diff --git a/.gitignore b/.gitignore index 6825061db1..43f20a137b 100644 --- a/.gitignore +++ b/.gitignore @@ -69,4 +69,10 @@ AGENTS.md microbench/build/ microbench/output/ -microbench/dramsim3* \ No newline at end of file +microbench/dramsim3* + +*.bin +*.db +*.log +*.gz +*.zstd \ No newline at end of file diff --git a/configs/common/FSConfig.py b/configs/common/FSConfig.py index dc66ed7833..d650b82f70 100644 --- a/configs/common/FSConfig.py +++ b/configs/common/FSConfig.py @@ -657,18 +657,23 @@ def makeBareMetalRiscvSystem(mem_mode, mdesc=None, cmdline=None): self.system_port = self.membus.cpu_side_ports return self -def makeBareMetalXiangshanSystem(mem_mode, mdesc=None, cmdline=None, np=1, ruby=False): - self = makeXiangshanPlatformSystem(mem_mode, mdesc, np=np, ruby=ruby) +def makeBareMetalXiangshanSystem(mem_mode, mdesc=None, cmdline=None, np=1, + ruby=False, num_threads=None): + self = makeXiangshanPlatformSystem(mem_mode, mdesc, np=np, ruby=ruby, + num_threads=num_threads) self.workload = RiscvBareMetal() self.workload.reset_vect = 0x80000000 return self -def makeXiangshanPlatformSystem(mem_mode, mdesc=None, np=1, ruby=False): +def makeXiangshanPlatformSystem(mem_mode, mdesc=None, np=1, ruby=False, + num_threads=None): self = System() if not mdesc: # generic system mdesc = SysConfig() + if num_threads is None: + num_threads = np self.mem_mode = mem_mode self.mem_ranges = [AddrRange(start=0x80000000, size=mdesc.mem())] print(self.mem_ranges) @@ -687,7 +692,11 @@ def makeXiangshanPlatformSystem(mem_mode, mdesc=None, np=1, ruby=False): self.lint = Clint() self.lint.pio = self.iobus.mem_side_ports self.lint.pio_addr = 0x38000000 - self.lint.num_threads = np + self.lint.num_threads = num_threads + + self.hartctrl = HartCtrl() + self.hartctrl.pio = self.iobus.mem_side_ports + self.hartctrl.num_threads = num_threads self.mmcs = NemuMMC() self.mmcs.pio = self.iobus.mem_side_ports @@ -700,6 +709,7 @@ def makeXiangshanPlatformSystem(mem_mode, mdesc=None, np=1, ruby=False): AddrRange(self.uartlite.pio_addr, self.uartlite.pio_addr + self.uartlite.pio_size), AddrRange(self.lint.pio_addr, self.lint.pio_addr + self.lint.pio_size), + AddrRange(self.hartctrl.pio_addr, self.hartctrl.pio_addr + self.hartctrl.pio_size), AddrRange(self.mmcs.pio_addr, self.mmcs.pio_addr + self.mmcs.pio_size), AddrRange(self.plic.pio_addr, self.plic.pio_addr + self.plic.pio_size), ] diff --git a/configs/common/Options.py b/configs/common/Options.py index 937bdecac4..b9c89ed25c 100644 --- a/configs/common/Options.py +++ b/configs/common/Options.py @@ -344,16 +344,14 @@ def addCommonOptions(parser, configure_xiangshan=False): "that are present under any of the roots. If not given, dump all " "stats. ") + parser.add_argument("--smt", action="store_true", default=False, + help=""" RISCV SMT support, which requires multitThread-supported gcpt restore and diff-ref-so""") + if configure_xiangshan: return # Following options are not available in XiangShan parser.add_argument("--checker", action="store_true") - parser.add_argument("--smt", action="store_true", default=False, - help=""" - Only used if multiple programs are specified. If true, - then the number of threads per cpu is same as the - number of programs.""") parser.add_argument( "--elastic-trace-en", action="store_true", help="""Enable capture of data dependency and instruction diff --git a/configs/common/xiangshan.py b/configs/common/xiangshan.py index ed78ebc922..e05644c05e 100644 --- a/configs/common/xiangshan.py +++ b/configs/common/xiangshan.py @@ -290,7 +290,7 @@ def resolve_xiangshan_ref_so(args: argparse.Namespace): if args.difftest_ref_so is not None: ref_so = args.difftest_ref_so print("Obtained ref_so from args.difftest_ref_so: ", ref_so) - elif args.num_cpus > 1 and "GCBV_MULTI_CORE_REF_SO" in os.environ: + elif (args.num_cpus > 1 or args.smt) and "GCBV_MULTI_CORE_REF_SO" in os.environ: ref_so = os.environ["GCBV_MULTI_CORE_REF_SO"] print("Obtained ref_so from GCBV_MULTI_CORE_REF_SO: ", ref_so) elif "GCBV_REF_SO" in os.environ: @@ -330,12 +330,12 @@ def config_xiangshan_inputs(args: argparse.Namespace, sys): if args.raw_cpt: # If using raw binary, no restorer is needed. gcpt_restorer = None - elif args.num_cpus > 1: + elif args.num_cpus > 1 or args.smt: if "GCB_MULTI_CORE_RESTORER" in os.environ: gcpt_restorer = os.environ["GCB_MULTI_CORE_RESTORER"] print("Obtained gcpt_restorer from GCB_MULTI_CORE_RESTORER: ", gcpt_restorer) else: - fatal("Plz set $GCB_MULTI_CORE_RESTORER when model Xiangshan with multi-core") + fatal("Plz set $GCB_MULTI_CORE_RESTORER when model Xiangshan with multi-context difftest") elif args.restore_rvv_cpt: if "GCBV_RESTORER" in os.environ: gcpt_restorer = os.environ["GCBV_RESTORER"] @@ -359,8 +359,8 @@ def config_xiangshan_inputs(args: argparse.Namespace, sys): print("Obtained gcpt_restorer from args.gcpt_restorer: ", args.gcpt_restorer) gcpt_restorer = args.gcpt_restorer - if args.num_cpus > 1: - print("Simulating a multi-core system, demanding a larger GCPT restorer size (2M).") + if args.num_cpus > 1 or args.smt: + print("Simulating a multi-context system, demanding a larger GCPT restorer size (2M).") sys.gcpt_restorer_size_limit = 2**20 elif args.restore_rvv_cpt: print("Simulating single core with RVV, demanding GCPT restorer size of 0x1000.") @@ -407,7 +407,7 @@ def config_difftest(cpu_list, args, sys): if not args.enable_difftest: return else: - if len(cpu_list) > 1: + if len(cpu_list) > 1 or args.smt: sys.enable_mem_dedup = True for cpu in cpu_list: cpu.enable_mem_dedup = True @@ -443,7 +443,12 @@ def _finish_xiangshan_system(args, test_sys, TestCPUClass, ruby): test_sys.cpu = [TestCPUClass(clk_domain=test_sys.cpu_clk_domain, cpu_id=i) for i in range(np)] # Configure MMU for trace-aware FS mode + if args.smt: + test_sys.multi_thread = True + for cpu in test_sys.cpu: + if args.smt: + cpu.numThreads = 2 cpu.mmu.pma_checker = PMAChecker( uncacheable=[AddrRange(0, size=0x80000000)]) cpu.mmu.functional = args.functional_tlb @@ -802,8 +807,11 @@ def build_xiangshan_system(args): TestCPUClass = get_xiangshan_cpu_class(args) ruby = bool(hasattr(args, 'ruby') and args.ruby) + num_threads = np * (2 if getattr(args, 'smt', False) else 1) - test_sys = makeBareMetalXiangshanSystem('timing', SysConfig(mem=args.mem_size), None, np=np, ruby=ruby) + test_sys = makeBareMetalXiangshanSystem( + 'timing', SysConfig(mem=args.mem_size), None, np=np, ruby=ruby, + num_threads=num_threads) if hasattr(args, 'enable_trace_mode') and args.enable_trace_mode: if bool(getattr(args, 'trace_timing_ptw', False)): diff --git a/src/arch/riscv/isa/vector/base/vector_mem.temp.isa b/src/arch/riscv/isa/vector/base/vector_mem.temp.isa index e97eef0940..2448a9ad95 100644 --- a/src/arch/riscv/isa/vector/base/vector_mem.temp.isa +++ b/src/arch/riscv/isa/vector/base/vector_mem.temp.isa @@ -1,5 +1,24 @@ output header {{ +#define FILL_AGNOSTIC_ELEM(Vd_bytes, elem_idx, eewb_) \ + std::fill_n((Vd_bytes) + (elem_idx) * (eewb_), (eewb_), 0xff) + +#define APPLY_VLOAD_AGNOSTIC(Vd_bytes, elem_num_per_vreg_, eewb_) \ + do { \ + for (size_t _i = 0; _i < vmi.re - vmi.rs; ++_i) { \ + const uint32_t _vdElemIdx = \ + (vmi.rs % (elem_num_per_vreg_)) + _i; \ + const size_t _ei = _i + vmi.rs; \ + const bool _is_tail = _ei >= rVl; \ + const bool _is_masked = !this->vm && !_is_tail && \ + !elem_mask(v0, _ei); \ + if ((_is_tail && machInst.vtype8.vta) || \ + (_is_masked && machInst.vtype8.vma)) { \ + FILL_AGNOSTIC_ELEM((Vd_bytes), _vdElemIdx, (eewb_)); \ + } \ + } \ + } while (0) + inline uint32_t calc_memsize(uint32_t rs, uint32_t re, uint32_t sew, uint32_t rVl) { uint32_t vend = std::min(rVl, re); @@ -147,6 +166,7 @@ Fault { %(op_decl)s; %(op_rd)s; + auto VdBytes = tmp_d0.as(); Addr EA; // EA = Rs1 + vmi.offset; @@ -172,6 +192,8 @@ Fault %(memacc_code)s; } + APPLY_VLOAD_AGNOSTIC(VdBytes, elem_num_per_vreg, eew / 8); + %(op_wb)s; return fault; } @@ -261,6 +283,7 @@ Fault %(op_decl)s; %(op_rd)s; + auto VdBytes = tmp_d0.as(); #if %(is_vecWhole)s // VM_REQUIRED(); @@ -299,6 +322,11 @@ Fault } } +#if %(is_vecWhole)s +#else + APPLY_VLOAD_AGNOSTIC(VdBytes, elem_num_per_vreg, eewb); +#endif + %(vfof_get_code)s; %(op_wb)s; return NoFault; diff --git a/src/arch/riscv/isa/vector/simple/vector_mem.temp.isa b/src/arch/riscv/isa/vector/simple/vector_mem.temp.isa index a8e5b71f99..4b64f5dac0 100644 --- a/src/arch/riscv/isa/vector/simple/vector_mem.temp.isa +++ b/src/arch/riscv/isa/vector/simple/vector_mem.temp.isa @@ -1,5 +1,24 @@ output header {{ +#define FILL_AGNOSTIC_ELEM(Vd_bytes, elem_idx, eewb_) \ + std::fill_n((Vd_bytes) + (elem_idx) * (eewb_), (eewb_), 0xff) + +#define APPLY_VLOAD_AGNOSTIC(Vd_bytes, elem_num_per_vreg_, eewb_) \ + do { \ + for (size_t _i = 0; _i < vmi.re - vmi.rs; ++_i) { \ + const uint32_t _vdElemIdx = \ + (vmi.rs % (elem_num_per_vreg_)) + _i; \ + const size_t _ei = _i + vmi.rs; \ + const bool _is_tail = _ei >= rVl; \ + const bool _is_masked = !this->vm && !_is_tail && \ + !elem_mask(v0, _ei); \ + if ((_is_tail && machInst.vtype8.vta) || \ + (_is_masked && machInst.vtype8.vma)) { \ + FILL_AGNOSTIC_ELEM((Vd_bytes), _vdElemIdx, (eewb_)); \ + } \ + } \ + } while (0) + inline uint32_t calc_memsize(uint32_t rs, uint32_t re, uint32_t sew, uint32_t rVl) { uint32_t vend = std::min(rVl, re); @@ -147,6 +166,7 @@ Fault { %(op_decl)s; %(op_rd)s; + auto VdBytes = tmp_d0.as(); Addr EA; // EA = Rs1 + vmi.offset; @@ -172,6 +192,8 @@ Fault %(memacc_code)s; } + APPLY_VLOAD_AGNOSTIC(VdBytes, elem_num_per_vreg, eew / 8); + %(op_wb)s; return fault; } @@ -261,6 +283,7 @@ Fault %(op_decl)s; %(op_rd)s; + auto VdBytes = tmp_d0.as(); #if %(is_vecWhole)s // VM_REQUIRED(); @@ -299,6 +322,11 @@ Fault } } +#if %(is_vecWhole)s +#else + APPLY_VLOAD_AGNOSTIC(VdBytes, elem_num_per_vreg, eewb); +#endif + %(vfof_get_code)s; %(op_wb)s; return NoFault; diff --git a/src/arch/riscv/tlb.cc b/src/arch/riscv/tlb.cc index 07b883fc30..b3f150ede9 100644 --- a/src/arch/riscv/tlb.cc +++ b/src/arch/riscv/tlb.cc @@ -2114,7 +2114,6 @@ TLB::doTranslate(const RequestPtr &req, ThreadContext *tc, return NoFault; } - PrivilegeMode TLB::getMemPriv(ThreadContext *tc, BaseMMU::Mode mode) { diff --git a/src/cpu/base.cc b/src/cpu/base.cc index 63c0e7964a..264e17bf4d 100644 --- a/src/cpu/base.cc +++ b/src/cpu/base.cc @@ -43,6 +43,7 @@ #include "cpu/base.hh" +#include #include #include #include @@ -208,40 +209,52 @@ BaseCPU::BaseCPU(const Params &p, bool is_checker) "of threads (%i).\n", params().isa.size(), numThreads); } - diffAllStates = std::make_shared(); + diffAllStates.resize(numThreads); + recentCommittedStores.resize(numThreads); + syncVisibleStoreReplayArmed.resize(numThreads, false); if (enableDifftest) { assert(params().difftest_ref_so.length() > 2); - diffAllStates->diff.nemu_reg = &(diffAllStates->referenceRegFile); - diffAllStates->diff.nemu_this_pc = 0x80000000u; - diffAllStates->diff.cpu_id = params().cpu_id; - warn("cpu_id set to %d\n", params().cpu_id); - - if (params().difftest_ref_so.find("spike") != std::string::npos) { - assert(!system->multiCore()); - diffAllStates->proxy = new SpikeProxy( - params().cpu_id, params().difftest_ref_so.c_str(), - params().nemuSDimg.size() && params().nemuSDCptBin.size()); - } else { - diffAllStates->proxy = - new NemuProxy(params().cpu_id, params().difftest_ref_so.c_str(), - params().nemuSDimg.size() && params().nemuSDCptBin.size(), system->enabledMemDedup(), - system->multiCore()); - } + for (ThreadID tid = 0; tid < numThreads; ++tid) { + diffAllStates[tid] = std::make_shared(); + auto diff_state = diffAllStates[tid]; + diff_state->diff.nemu_reg = &(diff_state->referenceRegFile); + diff_state->diff.nemu_this_pc = 0x80000000u; + diff_state->diff.cpu_id = difftestHartId(tid); + warn("difftest hart id set to %d for tid %d\n", + diff_state->diff.cpu_id, tid); + + if (params().difftest_ref_so.find("spike") != std::string::npos) { + assert(!system->multiContextDifftest()); + diff_state->proxy = new SpikeProxy( + params().cpu_id, params().difftest_ref_so.c_str(), + params().nemuSDimg.size() && params().nemuSDCptBin.size()); + } else { + diff_state->proxy = + new NemuProxy(params().cpu_id, params().difftest_ref_so.c_str(), + params().nemuSDimg.size() && params().nemuSDCptBin.size(), + system->enabledMemDedup(), + system->multiContextDifftest()); + } - warn("Difftest is enabled with ref so: %s.\n", params().difftest_ref_so.c_str()); + warn("Difftest is enabled with ref so: %s.\n", + params().difftest_ref_so.c_str()); - diffAllStates->proxy->regcpy(&(diffAllStates->gem5RegFile), REF_TO_DUT); - diffAllStates->diff.dynamic_config.ignore_illegal_mem_access = false; - diffAllStates->diff.dynamic_config.debug_difftest = false; - diffAllStates->proxy->update_config(&diffAllStates->diff.dynamic_config); - if (params().nemuSDimg.size() && params().nemuSDCptBin.size()) { - diffAllStates->proxy->sdcard_init(params().nemuSDimg.c_str(), - params().nemuSDCptBin.c_str()); + diff_state->proxy->regcpy(&(diff_state->gem5RegFile), REF_TO_DUT); + diff_state->diff.dynamic_config.ignore_illegal_mem_access = false; + diff_state->diff.dynamic_config.debug_difftest = false; + diff_state->proxy->update_config(&diff_state->diff.dynamic_config); + if (params().nemuSDimg.size() && params().nemuSDCptBin.size()) { + diff_state->proxy->sdcard_init(params().nemuSDimg.c_str(), + params().nemuSDCptBin.c_str()); + } + diff_state->diff.will_handle_intr = false; } - diffAllStates->diff.will_handle_intr = false; } else { warn("Difftest is disabled\n"); - diffAllStates->hasCommit = true; + for (ThreadID tid = 0; tid < numThreads; ++tid) { + diffAllStates[tid] = std::make_shared(); + diffAllStates[tid]->hasCommit = true; + } } if (dumpCommitFlag) { @@ -404,11 +417,14 @@ BaseCPU::startup() if (powerState->get() == enums::PwrState::UNDEFINED) powerState->set(enums::PwrState::ON); - if (system->multiCore()) { + if (system->multiContextDifftest()) { goldenMemPtr = system->getGoldenMemPtr(); _goldenMemManager = system->getGoldenMemManager(); - diffAllStates->proxy->initState(params().cpu_id, goldenMemPtr); + for (ThreadID tid = 0; tid < numThreads; ++tid) { + diffAllStates[tid]->proxy->initState(difftestHartId(tid), + goldenMemPtr); + } } else { goldenMemPtr = nullptr; _goldenMemManager = nullptr; @@ -417,6 +433,33 @@ BaseCPU::startup() } +void +BaseCPU::recordCommittedStore(ThreadID tid, const o3::DynInstPtr &inst) +{ + RecentCommittedStore recent; + + if (!system->multiContextDifftest() || !_goldenMemManager || + !inst->isStore() || inst->isAtomic() || + (inst->isStoreConditional() && !inst->lockedWriteSuccess()) || + !inst->memData || inst->effSize == 0 || + inst->effSize > sizeof(recent.data) || + !_goldenMemManager->inPmem(inst->physEffAddr)) { + return; + } + + auto &recent_history = recentCommittedStores.at(tid); + recent.valid = true; + recent.addr = inst->physEffAddr; + recent.size = inst->effSize; + recent.seq = inst->seqNum; + std::memcpy(recent.data, inst->memData, recent.size); + recent_history.push_back(recent); + constexpr size_t max_store_history = 16; + if (recent_history.size() > max_store_history) { + recent_history.pop_front(); + } +} + probing::PMUUPtr BaseCPU::pmuProbePoint(const char *name) { @@ -702,7 +745,7 @@ BaseCPU::takeOverFrom(BaseCPU *oldCPU) if (enable_diff) { warn("Take over difftest state to new CPU\n"); enableDifftest = enable_diff; - takeOverDiffAllStates(diff_all); + takeOverDiffAllStates(std::move(diff_all)); } } @@ -865,6 +908,12 @@ BaseCPU::GlobalStats::GlobalStats(statistics::Group *parent) hostOpRate = simOps / hostSeconds; } +int +BaseCPU::difftestHartId(ThreadID tid) const +{ + return params().cpu_id * numThreads + tid; +} + void BaseCPU::csrDiffMessage(uint64_t gem5_val, uint64_t ref_val, int error_num, uint64_t &error_reg, InstSeqNum seq, std::string error_csr_name, int &diff_at) @@ -883,6 +932,8 @@ BaseCPU::csrDiffMessage(uint64_t gem5_val, uint64_t ref_val, int error_num, uint std::pair BaseCPU::diffWithNEMU(ThreadID tid, InstSeqNum seq) { + auto diffAllStates = this->diffAllStates[tid]; + int diff_at = DiffAt::NoneDiff; bool npc_match = false; bool is_mmio = diffInfo.curInstStrictOrdered; @@ -966,7 +1017,7 @@ BaseCPU::diffWithNEMU(ThreadID tid, InstSeqNum seq) if (enableRVV) { if (diffInfo.inst->isVector()) { - readGem5Regs(); + readGem5Regs(tid); uint64_t* nemu_val = (uint64_t*)&(diffAllStates->referenceRegFile.vr[0]); uint64_t* gem5_val = (uint64_t*)&(diffAllStates->gem5RegFile.vr[0]); bool maybe_error = false; @@ -1431,35 +1482,104 @@ BaseCPU::diffWithNEMU(ThreadID tid, InstSeqNum seq) diffInfo.physEffAddr, diffInfo.effSize); } - if (system->multiCore() && (diffInfo.inst->isLoad() || diffInfo.inst->isAtomic()) && + if (system->multiContextDifftest() && + (diffInfo.inst->isLoad() || diffInfo.inst->isAtomic()) && _goldenMemManager->inPmem(diffInfo.physEffAddr)) { - warn("Difference on %s instr found in multicore mode, check in golden memory\n", - diffInfo.inst->isLoad() ? "load" : "amo"); - uint8_t *golden_ptr = diffInfo.goldenValue; + DPRINTF(Diff, + "Difference on %s instr found in multicore mode, " + "check in golden memory\n", + diffInfo.inst->isLoad() ? "load" : "amo"); + uint8_t current_golden_data[16] = {}; + panic_if(diffInfo.effSize > sizeof(current_golden_data), + "Unexpected large mem diff size: %u\n", + diffInfo.effSize); + _goldenMemManager->readGoldenMem(diffInfo.physEffAddr, + current_golden_data, + diffInfo.effSize); + uint8_t *golden_ptr = current_golden_data; + uint8_t *exec_golden_ptr = diffInfo.goldenValue; + const RecentCommittedStore *matched_recent_store = nullptr; + if (diffInfo.inst->isLoad()) { + const auto &recent_history = + recentCommittedStores.at(tid); + for (auto it = recent_history.rbegin(); + it != recent_history.rend(); ++it) { + if (!it->valid || + it->addr != diffInfo.physEffAddr || + it->size != diffInfo.effSize || + it->seq >= seq || + (seq - it->seq) > 256) { + continue; + } + if (memcmp(it->data, &gem5_val, + diffInfo.effSize) == 0) { + matched_recent_store = &(*it); + break; + } + } + } + auto sync_reg = [&]() { + diffAllStates->referenceRegFile[dest_tag] = gem5_val; + diffAllStates->proxy->regcpy( + &(diffAllStates->referenceRegFile), DUT_TO_REF); + }; - // a lambda function to sync memory and register from golden results to ref - auto sync_mem_reg = [&]() { - diffAllStates->proxy->memcpy(diffInfo.physEffAddr, golden_ptr, diffInfo.effSize, + // Sync both memory and register when the value is already + // globally visible in golden memory. + auto sync_mem_reg = [&](const uint8_t *mem_src) { + diffAllStates->proxy->memcpy(diffInfo.physEffAddr, + const_cast(mem_src), + diffInfo.effSize, DIFFTEST_TO_REF); - diffAllStates->referenceRegFile[dest_tag] = gem5_val; - diffAllStates->proxy->regcpy(&(diffAllStates->referenceRegFile), DUT_TO_REF); + sync_reg(); }; - if (diffInfo.inst->isLoad() && memcmp(golden_ptr, &gem5_val, diffInfo.effSize) == 0) { - DPRINTF(Diff, "Load content matched in golden memory. Sync from golden to ref\n"); - sync_mem_reg(); + if (diffInfo.inst->isLoad() && + memcmp(golden_ptr, &gem5_val, + diffInfo.effSize) == 0) { + DPRINTF(Diff, + "Load content matched in golden memory. " + "Sync from golden to ref\n"); + sync_mem_reg(golden_ptr); + continue; + } else if (diffInfo.inst->isLoad() && exec_golden_ptr && + memcmp(exec_golden_ptr, &gem5_val, + diffInfo.effSize) == 0) { + DPRINTF(Diff, + "Load content matched the execution-time " + "golden snapshot. Sync from the recorded " + "snapshot to ref\n"); + sync_mem_reg(exec_golden_ptr); + continue; + } else if (matched_recent_store) { + DPRINTF(Diff, + "Load content matched recent committed store " + "[sn:%llu] at addr %#lx. Syncing ref from the " + "store snapshot for this hart.\n", + matched_recent_store->seq, + diffInfo.physEffAddr); + sync_mem_reg(matched_recent_store->data); continue; } else if (diffInfo.inst->isAtomic()) { DPRINTF(Diff, "Golden mem old value: %#lx, GEM5 old value: %#lx\n", diffInfo.amoOldGoldenValue, gem5_val); DPRINTF(Diff, "New golden value: %#lx\n", *(uint64_t *)golden_ptr); - if (memcmp(&diffInfo.amoOldGoldenValue, &gem5_val, diffInfo.effSize) == 0) { + if (memcmp(&diffInfo.amoOldGoldenValue, &gem5_val, + diffInfo.effSize) == 0) { DPRINTF(Diff, "Atomic encountered, old value matched. Sync from golden to ref\n"); - sync_mem_reg(); + sync_mem_reg(golden_ptr); continue; - } else { - warn("Atomic old value not matched!\n"); } + } else if (diffInfo.inst->isLoad()) { + DPRINTF(Diff, + "Unresolved shared-memory load mismatch at " + "addr=%#lx gem5=%#lx current_golden=%#lx " + "exec_snapshot=%#lx; falling back to normal " + "difftest reporting.\n", + diffInfo.physEffAddr, gem5_val, + *(uint64_t *)golden_ptr, + exec_golden_ptr ? + *(uint64_t *)exec_golden_ptr : 0); } } @@ -1517,9 +1637,10 @@ BaseCPU::clearDiffMismatch(ThreadID tid, InstSeqNum seq) { void BaseCPU::reportDiffMismatch(ThreadID tid, InstSeqNum seq) { + auto diffAllStates = this->diffAllStates[tid]; warn("%s", diffMsg.str()); diffAllStates->proxy->isa_reg_display(); - displayGem5Regs(); + displayGem5Regs(tid); warn("start dump last %lu committed msg\n", diffInfo.lastCommittedMsg.size()); while (diffInfo.lastCommittedMsg.size()) { auto &inst = diffInfo.lastCommittedMsg.front(); @@ -1531,6 +1652,8 @@ BaseCPU::reportDiffMismatch(ThreadID tid, InstSeqNum seq) void BaseCPU::difftestStep(ThreadID tid, InstSeqNum seq) { + auto diffAllStates = this->diffAllStates[tid]; + bool should_diff = false; DPRINTF(DumpCommit, "[sn:%llu] %#lx, %s\n", seq, diffInfo.pc->instAddr(), diffInfo.inst->disassemble(diffInfo.pc->instAddr())); @@ -1550,22 +1673,26 @@ BaseCPU::difftestStep(ThreadID tid, InstSeqNum seq) should_diff = true; if (!diffAllStates->hasCommit && diffInfo.pc->instAddr() == 0x80000000u) { diffAllStates->hasCommit = true; - readGem5Regs(); + readGem5Regs(tid); diffAllStates->gem5RegFile.pc = diffInfo.pc->instAddr(); if (noHypeMode) { - auto start = pmemStart + pmemSize * diffAllStates->diff.cpu_id; - warn("Start memcpy to NEMU from %#lx, size=%lu\n", (uint64_t)start, pmemSize); + auto start = pmemStart + pmemSize * difftestHartId(tid); diffAllStates->proxy->memcpy(0x80000000u, start, pmemSize, DUT_TO_REF); } else if (enableMemDedup) { - warn("Let ref share a COW mirror of root memory\n"); - assert(diffAllStates->proxy->ref_get_backed_memory); - diffAllStates->proxy->ref_get_backed_memory(system->createCopyOnWriteBranch(), pmemSize); + if (system->multiContextDifftest()) { + assert(goldenMemPtr); + assert(diffAllStates->proxy->ref_get_backed_memory); + diffAllStates->proxy->ref_get_backed_memory( + system->createCopyOnWriteBranch(), pmemSize); + diffAllStates->proxy->memcpy_init( + 0x80000000u, goldenMemPtr, pmemSize, DUT_TO_REF); + } else { + assert(diffAllStates->proxy->ref_get_backed_memory); + diffAllStates->proxy->ref_get_backed_memory(system->createCopyOnWriteBranch(), pmemSize); + } } else { - warn("MemDedup disabled, copying pmem to NEMU\n"); - warn("Start memcpy to NEMU from %#lx, size=%lu\n", (uint64_t)pmemStart, pmemSize); diffAllStates->proxy->memcpy_init(0x80000000u, pmemStart, pmemSize, DUT_TO_REF); } - warn("Start regcpy to NEMU\n"); diffAllStates->proxy->regcpy(&(diffAllStates->gem5RegFile), DUT_TO_REF); } } @@ -1603,9 +1730,10 @@ BaseCPU::difftestStep(ThreadID tid, InstSeqNum seq) } void -BaseCPU::displayGem5Regs() +BaseCPU::displayGem5Regs(ThreadID tid) { - readGem5Regs(); + auto diffAllStates = this->diffAllStates[tid]; + readGem5Regs(tid); std::string str; //reg for (size_t i = 0; i < 32; i++) @@ -1712,8 +1840,9 @@ BaseCPU::displayGem5Regs() } void -BaseCPU::difftestRaiseIntr(uint64_t no) +BaseCPU::difftestRaiseIntr(uint64_t no, ThreadID tid) { + auto diffAllStates = this->diffAllStates[tid]; diffAllStates->diff.will_handle_intr = true; diffAllStates->proxy->raise_intr(no); } @@ -1721,19 +1850,24 @@ BaseCPU::difftestRaiseIntr(uint64_t no) void BaseCPU::clearGuideExecInfo() { - diffAllStates->diff.guide.force_raise_exception = false; - diffAllStates->diff.guide.force_set_jump_target = false; + for (auto &diffAllStates : this->diffAllStates) { + diffAllStates->diff.guide.force_raise_exception = false; + diffAllStates->diff.guide.force_set_jump_target = false; + } } void BaseCPU::enableDiffPrint() { - diffAllStates->diff.dynamic_config.debug_difftest = true; - diffAllStates->proxy->update_config(&diffAllStates->diff.dynamic_config); + for (auto &diffAllStates : this->diffAllStates) { + diffAllStates->diff.dynamic_config.debug_difftest = true; + diffAllStates->proxy->update_config(&diffAllStates->diff.dynamic_config); + } } -void BaseCPU::setSCSuccess(bool success, paddr_t addr) +void BaseCPU::setSCSuccess(bool success, paddr_t addr, ThreadID tid) { + auto diffAllStates = this->diffAllStates[tid]; diffAllStates->diff.sync.lrscValid = success; diffAllStates->diff.sync.lrscAddr = addr; // used for spike diff } @@ -1742,6 +1876,8 @@ void BaseCPU::setExceptionGuideExecInfo(uint64_t exception_num, uint64_t mtval, uint64_t stval, bool force_set_jump_target, uint64_t jump_target, ThreadID tid) { + auto diffAllStates = this->diffAllStates[tid]; + auto &gd = diffAllStates->diff.guide; gd.force_raise_exception = true; gd.exception_num = exception_num; @@ -1769,7 +1905,7 @@ BaseCPU::setExceptionGuideExecInfo(uint64_t exception_num, uint64_t mtval, uint6 void BaseCPU::checkL1DRefill(Addr paddr, const uint8_t* refill_data, size_t size) { assert(size == 64); - if (system->multiCore()) { + if (system->multiContextDifftest()) { uint8_t *golden_ptr = (uint8_t *)_goldenMemManager->guestToHost(paddr); if (memcmp(golden_ptr, refill_data, size)) { panic("Refill data diff with Golden addr %#lx with size %d\n", paddr, size); diff --git a/src/cpu/base.hh b/src/cpu/base.hh index 8fe6d55d61..21c13388db 100644 --- a/src/cpu/base.hh +++ b/src/cpu/base.hh @@ -42,6 +42,7 @@ #ifndef __CPU_BASE_HH__ #define __CPU_BASE_HH__ +#include #include #include @@ -138,6 +139,17 @@ struct DiffAllStates class BaseCPU : public ClockedObject { protected: + struct RecentCommittedStore + { + bool valid = false; + Addr addr = 0; + size_t size = 0; + InstSeqNum seq = 0; + uint8_t data[16] = {}; + }; + + std::vector> recentCommittedStores; + std::vector syncVisibleStoreReplayArmed; const unsigned IntRegIndexBase = 0; const unsigned FPRegIndexBase = 32; @@ -693,7 +705,7 @@ class BaseCPU : public ClockedObject bool enableRVV{false}; bool enableRVHDIFF{false}; bool enableSkipCSR{false}; - std::shared_ptr diffAllStates{}; + std::vector> diffAllStates{}; enum diffRegConfig { @@ -701,7 +713,7 @@ class BaseCPU : public ClockedObject diffCsrNum = 36, }; - virtual void readGem5Regs() + virtual void readGem5Regs(ThreadID tid) { panic("difftest:readGem5Regs() is not implemented\n"); } @@ -709,6 +721,7 @@ class BaseCPU : public ClockedObject void csrDiffMessage(uint64_t gem5_val, uint64_t ref_val, int error_num, uint64_t &error_reg, InstSeqNum seq, std::string error_csr_name,int &diff_at); std::pair diffWithNEMU(ThreadID tid, InstSeqNum seq); + int difftestHartId(ThreadID tid) const; std::stringstream diffMsg; void reportDiffMismatch(ThreadID tid, InstSeqNum seq); @@ -777,13 +790,25 @@ class BaseCPU : public ClockedObject void difftestStep(ThreadID tid, InstSeqNum seq); + void recordCommittedStore(ThreadID tid, const o3::DynInstPtr &inst); + void armSyncVisibleStoreReplay(ThreadID tid) + { + syncVisibleStoreReplayArmed.at(tid) = true; + } + bool consumeSyncVisibleStoreReplay(ThreadID tid) + { + bool armed = syncVisibleStoreReplayArmed.at(tid); + syncVisibleStoreReplayArmed.at(tid) = false; + return armed; + } + inline bool difftestEnabled() const { return enableDifftest; } - void displayGem5Regs(); + void displayGem5Regs(ThreadID tid); - void difftestRaiseIntr(uint64_t no); + void difftestRaiseIntr(uint64_t no, ThreadID tid = 0); - void setSCSuccess(bool success, paddr_t addr); + void setSCSuccess(bool success, paddr_t addr, ThreadID tid); void setExceptionGuideExecInfo(uint64_t exception_num, uint64_t mtval, uint64_t stval, // force set jump target @@ -793,14 +818,14 @@ class BaseCPU : public ClockedObject void enableDiffPrint(); - std::pair> getDiffAllStates() + std::pair>> getDiffAllStates() { return std::make_pair(enableDifftest, diffAllStates); } - void takeOverDiffAllStates(std::shared_ptr diffAllStates) + void takeOverDiffAllStates(std::vector> diffAllStates) { - this->diffAllStates = diffAllStates; + this->diffAllStates = std::move(diffAllStates); } int committedInstNum = 0; diff --git a/src/cpu/difftest.cc b/src/cpu/difftest.cc index 7293e51b9a..63665f194b 100644 --- a/src/cpu/difftest.cc +++ b/src/cpu/difftest.cc @@ -149,6 +149,12 @@ NemuProxy::NemuProxy(int coreid, const char *ref_so, bool enable_sdcard_diff, bo #endif multiCore = multi_core; + if (multiCore) { + nemuSetHartId = (void (*)(int))dlsym(handle, "difftest_set_mhartid"); + assert(nemuSetHartId); + nemuPutGmaddr = (void (*)(uint8_t *))dlsym(handle, "difftest_put_gmaddr"); + assert(nemuPutGmaddr); + } if (enable_sdcard_diff) { sdcard_init = (void (*)(const char *, const char *))dlsym( @@ -168,15 +174,18 @@ void NemuProxy::initState(int coreid, uint8_t *golden_mem) { if (multiCore) { - auto nemu_difftest_set_mhartid = (void (*)(int))dlsym(handle, "difftest_set_mhartid"); warn("Setting mhartid to %d\n", coreid); - assert(nemu_difftest_set_mhartid); - nemu_difftest_set_mhartid(coreid); - - auto nemu_difftest_put_gmaddr = (void (*)(uint8_t *ptr))dlsym(handle, "difftest_put_gmaddr"); + setHartId(coreid); warn("Setting gmaddr to %#lx\n", (uint64_t) golden_mem); - assert(nemu_difftest_put_gmaddr); - nemu_difftest_put_gmaddr(golden_mem); + nemuPutGmaddr(golden_mem); + } +} + +void +NemuProxy::setHartId(int coreid) +{ + if (multiCore) { + nemuSetHartId(coreid); } } diff --git a/src/cpu/difftest.hh b/src/cpu/difftest.hh index af4eee4d96..7d91201b4f 100644 --- a/src/cpu/difftest.hh +++ b/src/cpu/difftest.hh @@ -195,6 +195,7 @@ class RefProxy void (*sdcard_init)(const char *img_path, const char *sd_cpt_bin_path) = nullptr; virtual void initState(int coreid, uint8_t *golden_mem) = 0; + virtual void setHartId(int coreid) = 0; protected: bool multiCore; @@ -208,6 +209,11 @@ class NemuProxy : public RefProxy NemuProxy(int coreid, const char *ref_so, bool enable_sdcard_diff, bool enable_mem_dedup, bool multi_core); void initState(int coreid, uint8_t *golden_mem) override; + void setHartId(int coreid) override; + + private: + void (*nemuSetHartId)(int) = nullptr; + void (*nemuPutGmaddr)(uint8_t *) = nullptr; }; @@ -217,6 +223,7 @@ class SpikeProxy : public RefProxy SpikeProxy(int coreid, const char *ref_so, bool enable_sdcard_diff); void initState(int coreid, uint8_t *golden_mem) override { panic("Not implemented\n"); } + void setHartId(int coreid) override { panic("Not implemented\n"); } }; #define DIFFTEST_WIDTH 8 diff --git a/src/cpu/o3/FuncScheduler.py b/src/cpu/o3/FuncScheduler.py index 2d088a6032..7676f6d643 100644 --- a/src/cpu/o3/FuncScheduler.py +++ b/src/cpu/o3/FuncScheduler.py @@ -75,6 +75,11 @@ class PAgeSelector(BaseSelector): piece = Param.Int(2, "number of instructions in a group") +class SMTBasedSelector(BaseSelector): + type = 'SMTBasedSelector' + cxx_class = 'gem5::o3::SMTBasedSelector' + cxx_header = "cpu/o3/issue_queue.hh" + class IssueQue(SimObject): type = 'IssueQue' cxx_class = 'gem5::o3::IssueQue' @@ -85,7 +90,7 @@ class IssueQue(SimObject): inports = Param.Int(2, "") scheduleToExecDelay = Param.Cycles(2, "") oports = VectorParam.IssuePort("") - sel = Param.BaseSelector(BaseSelector(), "Selector for this IQ (default: age first)") + sel = Param.BaseSelector(SMTBasedSelector(), "Selector for this IQ (default: age first)") class Scheduler(SimObject): type = 'Scheduler' diff --git a/src/cpu/o3/SConscript b/src/cpu/o3/SConscript index 1ee4cf9448..463a8cdfc0 100755 --- a/src/cpu/o3/SConscript +++ b/src/cpu/o3/SConscript @@ -32,7 +32,7 @@ Import('*') if env['CONF']['TARGET_ISA'] != 'null': SimObject('FuncScheduler.py', sim_objects=['FUPool', 'SpecWakeupChannel', - 'IssuePort', 'IssueQue', 'BaseSelector', 'PAgeSelector', 'Scheduler']) + 'IssuePort', 'IssueQue', 'BaseSelector', 'PAgeSelector', 'SMTBasedSelector', 'Scheduler']) SimObject('FuncUnitConfig.py', sim_objects=[]) SimObject('BaseO3CPU.py', sim_objects=['BaseO3CPU'], enums=[ 'SMTFetchPolicy', 'SMTQueuePolicy', 'CommitPolicy', 'ROBWalkPolicy', 'ROBCompressPolicy', 'PerfRecord']) diff --git a/src/cpu/o3/comm.hh b/src/cpu/o3/comm.hh index cb88ad769f..ade70ed5e3 100644 --- a/src/cpu/o3/comm.hh +++ b/src/cpu/o3/comm.hh @@ -168,6 +168,12 @@ struct IssueStruct DynInstPtr insts[MaxWidth]; }; +struct SquashInfo +{ + InstSeqNum squashSn; + ThreadID squashTid; +}; + struct SquashVersion { uint8_t version; @@ -181,14 +187,23 @@ struct SquashVersion return (version + 1) % versionLimit; } bool largerThan(uint8_t other) const { - bool larger = version > other && version - other <= maxInflightSquash; - bool wrapped_larger = - version + versionLimit > other && - version + versionLimit - other <= maxInflightSquash; - if (!(larger || wrapped_larger || (version == other))) { + const uint8_t distance = (version + versionLimit - other) % versionLimit; + if (distance == 0) { + return false; + } + + if (distance <= maxInflightSquash) { + return true; + } + + if (versionLimit - distance <= maxInflightSquash) { + return false; + } + + if (version != other) { panic("SquashVersion: %d, other: %d\n", version, other); } - return larger || wrapped_larger; + return false; } void update(uint8_t v) { version = v; @@ -199,6 +214,7 @@ struct SquashVersion struct ResolveQueueEntry { + ThreadID resolvedTid; uint64_t resolvedFTQId; std::vector resolvedInstPC; }; @@ -246,6 +262,10 @@ struct TimeStruct }; /** Resolved control-flow PCs produced this cycle (fetch buffers/merges). */ std::vector resolvedCFIs; // *F + + unsigned iqCount; + unsigned ldstqCount; + unsigned robCount; }; IewComm iewInfo[MaxThreads]; // iew to rename, fetch diff --git a/src/cpu/o3/commit.cc b/src/cpu/o3/commit.cc index c06acc1221..c7b69d656c 100644 --- a/src/cpu/o3/commit.cc +++ b/src/cpu/o3/commit.cc @@ -42,6 +42,7 @@ #include "cpu/o3/commit.hh" #include +#include #include #include #include @@ -104,32 +105,35 @@ Commit::Commit(CPU *_cpu, branch_prediction::BPredUnit *_bp, const BaseO3CPUPara : commitPolicy(params.smtCommitPolicy), stuckCheckEvent([this]() { static std::vector debug_insts; - if (cpu->curCycle() - this->lastCommitCycle > 40000) { - if (traceMaybeExitOnPipelineDrainFromStuckCheck()) { - return; - } - if (auto inst = rob->readHeadInst(0)) { - warn("can't commit inst %s\n", inst->genDisassembly()); - debug_insts.insert( - debug_insts.begin(), rob->getInstList(0).begin(), - rob->getInstList(0).end()); - warn("dump rob front 10 insts\n"); - int i = 0; - for (auto inst = debug_insts.begin(); - inst != debug_insts.end() && i < 10; inst++, i++) { - warn("%s\n", (*inst)->genDisassembly()); + for (ThreadID tid = 0; tid < numThreads; tid++) { + if (cpu->curCycle() - this->lastCommitCycle[tid] > 40000) { + if (traceMaybeExitOnPipelineDrainFromStuckCheck()) { + return; } - } else { - warn("rob was empty, may be fetch or rename stuck\n"); + + if (auto inst = rob->readHeadInst(0)) { + warn("can't commit inst %s\n", inst->genDisassembly()); + debug_insts.insert( + debug_insts.begin(), rob->getInstList(tid).begin(), + rob->getInstList(tid).end()); + warn("dump rob front 10 insts\n"); + int i = 0; + for (auto inst = debug_insts.begin(); + inst != debug_insts.end() && i < 10; inst++, i++) { + warn("%s\n", (*inst)->genDisassembly()); + } + } else { + warn("rob was empty, may be fetch or rename stuck\n"); + } + panic( + "Commit stage is stucked for more than 40,000 cycles!\n" + "Thread: %d Last commit cycle: %lu, current cycle: %lu, suggested " + "--debug-start=%llu --debug-end=%llu\n", tid, + lastCommitCycle[tid], cpu->curCycle(), + cpu->cyclesToTicks(Cycles(lastCommitCycle[tid] - 200)), + cpu->cyclesToTicks(Cycles(lastCommitCycle[tid] + 200))); } - panic( - "Commit stage is stucked for more than 40,000 cycles!\n" - "Last commit cycle: %lu, current cycle: %lu, suggested " - "--debug-start=%llu --debug-end=%llu\n", - lastCommitCycle, cpu->curCycle(), - cpu->cyclesToTicks(Cycles(lastCommitCycle - 200)), - cpu->cyclesToTicks(Cycles(lastCommitCycle + 200))); } cpu->schedule(this->stuckCheckEvent, cpu->clockEdge(Cycles(40010))); }, "CommitStuckCheckEvent"), @@ -1184,342 +1188,391 @@ Commit::commitInsts() DPRINTF(Commit, "Trying to commit instructions in the ROB.\n"); unsigned num_committed = 0; + std::array num_committed_per_thread = {}; + std::array commit_width_per_thread = {}; DynInstPtr head_inst; - int commit_width = rob->countInstsOfGroups(commitWidth); + int commit_width = 0; + for (ThreadID tid : *activeThreads) { + commit_width_per_thread[tid] = + rob->countInstsOfGroups(tid, commitWidth); + commit_width += commit_width_per_thread[tid]; + } if (commit_width >= 0) { cpu->activityThisCycle(); } - // Commit as many instructions as possible until the commit bandwidth - // limit is reached, or it becomes impossible to commit any more. - while (num_committed < commit_width) { - // hardware transactionally memory - // If executing within a transaction, - // need to handle interrupts specially - - ThreadID commit_thread = getCommittingThread(); - - // Check for any interrupt that we've already squashed for - // and start processing it. - if (interrupt != NoFault) { - // If inside a transaction, postpone interrupts - if (executingHtmTransaction(commit_thread)) { - cpu->clearInterrupts(0); - toIEW->commitInfo[0].clearInterrupt = true; - interrupt = NoFault; - avoidQuiesceLiveLock = true; - } else { - handleInterrupt(); - } + // Commit each thread independently for up to its local commit window. + for (ThreadID commit_thread : *activeThreads) { + if (commitStatus[commit_thread] != Running && + commitStatus[commit_thread] != Idle && + commitStatus[commit_thread] != FetchTrapPending) { + continue; } - // ThreadID commit_thread = getCommittingThread(); - - if (commit_thread == -1) - break; - - head_inst = rob->readHeadInst(commit_thread); - - if (!rob->isHeadGroupReady(commit_thread)) { - if (debug::Commit && head_inst->readyToCommit()) { - InstSeqNum seqnum = rob->getHeadGroupLastDoneSeq(commit_thread); - DPRINTF( - Commit, - "[sn:%llu] Head is ready to commit, but the group is not all ready, last done inst [sn:%llu]\n", - head_inst->seqNum, seqnum); + while (num_committed < commit_width && + num_committed_per_thread[commit_thread] < + commit_width_per_thread[commit_thread]) { + // hardware transactionally memory + // If executing within a transaction, + // need to handle interrupts specially + + // Check for any interrupt that we've already squashed for + // and start processing it. + if (interrupt != NoFault) { + // If inside a transaction, postpone interrupts + if (executingHtmTransaction(commit_thread)) { + cpu->clearInterrupts(0); + toIEW->commitInfo[0].clearInterrupt = true; + interrupt = NoFault; + avoidQuiesceLiveLock = true; + } else { + handleInterrupt(); + } } - break; - } - ThreadID tid = head_inst->threadNumber; - - assert(tid == commit_thread); - - DPRINTF(Commit, - "Trying to commit head instruction, [tid:%i] [sn:%llu]\n", - tid, head_inst->seqNum); + head_inst = rob->readHeadInst(commit_thread); + + if (!rob->isHeadGroupReady(commit_thread)) { + if (debug::Commit && head_inst->readyToCommit()) { + InstSeqNum seqnum = + rob->getHeadGroupLastDoneSeq(commit_thread); + DPRINTF( + Commit, + "[sn:%llu] Head is ready to commit, but the group " + "is not all ready, last done inst [sn:%llu]\n", + head_inst->seqNum, seqnum); + } + break; + } - // If the head instruction is squashed, it is ready to retire - // (be removed from the ROB) at any time. - if (head_inst->isSquashed()) { + ThreadID tid = head_inst->threadNumber; - DPRINTF(Commit, "Retiring squashed instruction from " - "ROB.\n"); + assert(tid == commit_thread); - rob->retireHead(commit_thread); + DPRINTF(Commit, + "Trying to commit head instruction, [tid:%i] [sn:%llu]\n", + tid, head_inst->seqNum); - ++stats.commitSquashedInsts; - // Notify potential listeners that this instruction is squashed - ppSquash->notify(head_inst); + // If the head instruction is squashed, it is ready to retire + // (be removed from the ROB) at any time. + if (head_inst->isSquashed()) { - // Record that the number of ROB entries has changed. - changedROBNumEntries[tid] = true; - } else { - set(pc[tid], head_inst->pcState()); - traceMaybeInjectCtrlFlowChangeFault(tid, head_inst); + DPRINTF(Commit, "Retiring squashed instruction from " + "ROB.\n"); - // Try to commit the head instruction. - bool commit_success = commitHead(head_inst, num_committed); + rob->drainSquashedHead(commit_thread); - if (commit_success) { - cpu->perfCCT->updateInstPos(head_inst->seqNum, PerfRecord::AtCommit); - auto res = head_inst->getResult(); - if (res.is()) { - cpu->perfCCT->updateInstMeta(head_inst->seqNum, InstDetail::Result, res.as()); - } - cpu->perfCCT->commitMeta(head_inst->seqNum); + ++stats.commitSquashedInsts; + // Notify potential listeners that this instruction is squashed + ppSquash->notify(head_inst); - DPRINTF(CommitTrace, "CT: %s\n", head_inst->genDisassembly()); + // Record that the number of ROB entries has changed. + changedROBNumEntries[tid] = true; + } else { + set(pc[tid], head_inst->pcState()); + traceMaybeInjectCtrlFlowChangeFault(tid, head_inst); + + // Try to commit the head instruction. + bool commit_success = commitHead(head_inst, + num_committed_per_thread[tid]); + + if (commit_success) { + cpu->perfCCT->updateInstPos(head_inst->seqNum, + PerfRecord::AtCommit); + auto res = head_inst->getResult(); + if (res.is()) { + cpu->perfCCT->updateInstMeta( + head_inst->seqNum, InstDetail::Result, + res.as()); + } + cpu->perfCCT->commitMeta(head_inst->seqNum); - if (ismispred) { - ismispred = false; - stats.recovery_bubble += (cpu->curCycle() - lastCommitCycle) * renameWidth; - } - if (head_inst->mispredicted()) { - ismispred = true; - } + DPRINTF(CommitTrace, "CT [tid:%d]: %s\n", + head_inst->threadNumber, + head_inst->genDisassembly()); - lastCommitCycle = cpu->curCycle(); - const auto &head_rv_pc = head_inst->pcState().as(); - if (bp->isBTB()) { - auto dbbtb = dynamic_cast(bp); - bool miss = head_inst->mispredicted(); - if (head_inst->isReturn()) { - DPRINTF(RAS, "commit inst PC %x miss %d real target %x pred target %x\n", - head_inst->pcState().instAddr(), miss, - head_rv_pc.npc(), *(head_inst->predPC)); + if (ismispred) { + ismispred = false; + stats.recovery_bubble += + (cpu->curCycle() - lastCommitCycle[tid]) * + renameWidth; + } + if (head_inst->mispredicted()) { + ismispred = true; } - // FIXME: ignore mret/sret/uret in correspond with RTL - if (!head_inst->isNonSpeculative() && head_inst->isControl()) { - dbbtb->commitBranch(head_inst, miss); - if (!head_inst->isReturn() && head_inst->isIndirectCtrl() && miss) { - misPredIndirect[head_inst->pcState().instAddr()]++; + lastCommitCycle[tid] = cpu->curCycle(); + const auto &head_rv_pc = + head_inst->pcState().as(); + if (bp->isBTB()) { + auto dbbtb = dynamic_cast< + branch_prediction::btb_pred:: + DecoupledBPUWithBTB *>(bp); + bool miss = head_inst->mispredicted(); + if (head_inst->isReturn()) { + DPRINTF(RAS, "commit inst PC %x miss %d real target %x pred target %x\n", + head_inst->pcState().instAddr(), miss, + head_rv_pc.npc(), *(head_inst->predPC)); } - } - dbbtb->notifyInstCommit(head_inst); - } - if (traceMaybeExitOnLastTraceInst(head_inst)) { - return; - } - if (head_inst->isUpdateVsstatusSd()) { - auto v = cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VIRMODE, tid); - RiscvISA::HSTATUS hstatus = cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_HSTATUS, tid); - RiscvISA::VSSTATUS vsstatus = - cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, tid); - RiscvISA::VSSTATUS32 vsstatus32 = - cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, tid); - - if (v) { - if (hstatus.vsxl ==1) { - vsstatus32.sd = (vsstatus32.fs == 3) || (vsstatus32.vs == 3); - cpu->setMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, (RegVal)vsstatus32, tid); - } else { - vsstatus.sd = (vsstatus.fs == 3) || (vsstatus.vs == 3); - cpu->setMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, (RegVal)vsstatus, tid); + // FIXME: ignore mret/sret/uret in correspond with RTL + if (!head_inst->isNonSpeculative() && head_inst->isControl()) { + dbbtb->commitBranch(head_inst, miss); + if (!head_inst->isReturn() && + head_inst->isIndirectCtrl() && miss) { + misPredIndirect[head_inst->pcState().instAddr()]++; + } } + dbbtb->notifyInstCommit(head_inst); } + if (traceMaybeExitOnLastTraceInst(head_inst)) { + return; + } - } - if (head_inst->isUpdateMstatusSd()) { - updateMstatusSd(tid); - } + if (head_inst->isUpdateVsstatusSd()) { + auto v = cpu->readMiscRegNoEffect( + RiscvISA::MiscRegIndex::MISCREG_VIRMODE, tid); + RiscvISA::HSTATUS hstatus = + cpu->readMiscRegNoEffect( + RiscvISA::MiscRegIndex::MISCREG_HSTATUS, tid); + RiscvISA::VSSTATUS vsstatus = + cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, tid); + RiscvISA::VSSTATUS32 vsstatus32 = + cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, tid); + + if (v) { + if (hstatus.vsxl ==1) { + vsstatus32.sd = (vsstatus32.fs == 3) || (vsstatus32.vs == 3); + cpu->setMiscRegNoEffect( + RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, + (RegVal)vsstatus32, tid); + } else { + vsstatus.sd = (vsstatus.fs == 3) || (vsstatus.vs == 3); + cpu->setMiscRegNoEffect( + RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, + (RegVal)vsstatus, tid); + } + } - ++num_committed; - stats.committedInstType[tid][head_inst->opClass()]++; - ppCommit->notify(head_inst); + } + if (head_inst->isUpdateMstatusSd()) { + updateMstatusSd(tid); + } - // hardware transactional memory + ++num_committed; + ++num_committed_per_thread[tid]; + stats.committedInstType[tid][head_inst->opClass()]++; + ppCommit->notify(head_inst); - // update nesting depth - if (head_inst->isHtmStart()) - htmStarts[tid]++; + // hardware transactional memory - // sanity check - if (head_inst->inHtmTransactionalState()) { - assert(executingHtmTransaction(tid)); - } else { - assert(!executingHtmTransaction(tid)); - } + // update nesting depth + if (head_inst->isHtmStart()) + htmStarts[tid]++; - // update nesting depth - if (head_inst->isHtmStop()) - htmStops[tid]++; + // sanity check + if (head_inst->inHtmTransactionalState()) { + assert(executingHtmTransaction(tid)); + } else { + assert(!executingHtmTransaction(tid)); + } - changedROBNumEntries[tid] = true; + // update nesting depth + if (head_inst->isHtmStop()) + htmStops[tid]++; - // Set the doneSeqNum to the youngest committed instruction. - toIEW->commitInfo[tid].doneSeqNum = head_inst->seqNum; + changedROBNumEntries[tid] = true; - if (head_inst->getFtqId() > 1) { - toIEW->commitInfo[tid].doneFtqId = head_inst->getFtqId() - 1; - } - committedTargetId = head_inst->getFtqId(); - committedLoopIter = head_inst->getLoopIteration(); - - if (tid == 0) - canHandleInterrupts = !head_inst->isDelayedCommit(); - - // at this point store conditionals should either have - // been completed or predicated false - assert(!head_inst->isStoreConditional() || - head_inst->isCompleted() || - !head_inst->readPredicate()); - - // Updates misc. registers. - head_inst->updateMiscRegs(); - if (head_inst->staticInst->isVectorConfig()) { - auto vset = static_cast(head_inst->staticInst.get()); - if (!(vset->vtypeIsImm)) { - auto tc = head_inst->tcBase(); - RiscvISA::VTYPE new_vtype = head_inst->readMiscReg(RiscvISA::MISCREG_VTYPE); - tc->getDecoderPtr()->as().setVtype(new_vtype); + // Set the doneSeqNum to the youngest committed instruction. + toIEW->commitInfo[tid].doneSeqNum = head_inst->seqNum; + + if (head_inst->getFtqId() > 1) { + toIEW->commitInfo[tid].doneFtqId = head_inst->getFtqId() - 1; } - } - if (head_inst->isFloating() && head_inst->isLoad()){ - RiscvISA::STATUS status = cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_STATUS, tid); - status.sd = 1; - status.fs = 3; - cpu->setMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_STATUS, (RegVal)status, tid); - } - if (head_inst->isUpdateVsstatusSd()) { - auto v = cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VIRMODE, tid); - RiscvISA::HSTATUS hstatus = cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_HSTATUS, tid); - RiscvISA::VSSTATUS vsstatus = - cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, tid); - RiscvISA::VSSTATUS32 vsstatus32 = - cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, tid); - - if (v) { - if (hstatus.vsxl ==1) { - vsstatus32.sd = (vsstatus32.fs == 3) || (vsstatus.vs == 3); - cpu->setMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, (RegVal)vsstatus32, tid); - } else { - vsstatus.sd = (vsstatus.fs == 3) || (vsstatus.vs == 3); - cpu->setMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, (RegVal)vsstatus, tid); + committedTargetId = head_inst->getFtqId(); + committedLoopIter = head_inst->getLoopIteration(); + + if (tid == 0) + canHandleInterrupts = !head_inst->isDelayedCommit(); + + // at this point store conditionals should either have + // been completed or predicated false + assert(!head_inst->isStoreConditional() || + head_inst->isCompleted() || + !head_inst->readPredicate()); + + // Updates misc. registers. + head_inst->updateMiscRegs(); + if (head_inst->staticInst->isVectorConfig()) { + auto vset = static_cast( + head_inst->staticInst.get()); + if (!(vset->vtypeIsImm)) { + auto tc = head_inst->tcBase(); + RiscvISA::VTYPE new_vtype = + head_inst->readMiscReg( + RiscvISA::MISCREG_VTYPE); + tc->getDecoderPtr()->as().setVtype(new_vtype); } } - - } - - if (cpu->difftestEnabled()) { - diffInst(tid, head_inst); - } - - if (head_inst->isLoad()) { - Addr load_pc = head_inst->pcState().instAddr(); - Addr load_addr = head_inst->physEffAddr; - char buffer[8] = {0}; - if (head_inst->memData) { - std::memcpy(buffer, head_inst->memData, - std::min(head_inst->effSize, - sizeof(buffer))); + if (head_inst->isFloating() && head_inst->isLoad()) { + RiscvISA::STATUS status = cpu->readMiscRegNoEffect( + RiscvISA::MiscRegIndex::MISCREG_STATUS, tid); + status.sd = 1; + status.fs = 3; + cpu->setMiscRegNoEffect( + RiscvISA::MiscRegIndex::MISCREG_STATUS, + (RegVal)status, tid); } - Addr load_value = *((uint64_t *)buffer); - bool hit = loadTripleCounter.update(load_pc, load_addr, load_value); - if (hit) { - // same PC && same addr && same value - stats.loadTriple++; + if (head_inst->isUpdateVsstatusSd()) { + auto v = cpu->readMiscRegNoEffect( + RiscvISA::MiscRegIndex::MISCREG_VIRMODE, tid); + RiscvISA::HSTATUS hstatus = + cpu->readMiscRegNoEffect( + RiscvISA::MiscRegIndex::MISCREG_HSTATUS, tid); + RiscvISA::VSSTATUS vsstatus = + cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, tid); + RiscvISA::VSSTATUS32 vsstatus32 = + cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, tid); + + if (v) { + if (hstatus.vsxl ==1) { + vsstatus32.sd = (vsstatus32.fs == 3) || (vsstatus.vs == 3); + cpu->setMiscRegNoEffect( + RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, + (RegVal)vsstatus32, tid); + } else { + vsstatus.sd = (vsstatus.fs == 3) || (vsstatus.vs == 3); + cpu->setMiscRegNoEffect( + RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, + (RegVal)vsstatus, tid); + } + } + } - // EA reuse: compare to last committed EA of same static load - auto itEA = lastLoadEA.find(load_pc); - if (itEA != lastLoadEA.end() && itEA->second == load_addr) { - stats.loadEAReused++; + + if (head_inst->isReadBarrier() || + head_inst->isWriteBarrier()) { + cpu->armSyncVisibleStoreReplay(tid); } - lastLoadEA[load_pc] = load_addr; - // Producer stability: only if this load had a forwarding producer - if (head_inst->hasProducerStorePC()) { - stats.loadsWithProducer++; - const Addr prodPC = head_inst->producerStorePC(); - auto itP = lastLoadProducerStorePC.find(load_pc); - if (itP != lastLoadProducerStorePC.end() && itP->second == prodPC) { - stats.producerStable++; - } - lastLoadProducerStorePC[load_pc] = prodPC; - // optional: clear after use to avoid confusing later stages - head_inst->clearProducerStorePC(); + if (cpu->difftestEnabled()) { + diffInst(tid, head_inst); } - } + if (head_inst->isLoad()) { + Addr load_pc = head_inst->pcState().instAddr(); + Addr load_addr = head_inst->physEffAddr; + char buffer[8] = {0}; + if (head_inst->memData) { + std::memcpy(buffer, head_inst->memData, + std::min(head_inst->effSize, + sizeof(buffer))); + } + Addr load_value = *((uint64_t *)buffer); + bool hit = loadTripleCounter.update(load_pc, load_addr, load_value); + if (hit) { + // same PC && same addr && same value + stats.loadTriple++; + } + // EA reuse: compare to last committed EA of same static load + auto itEA = lastLoadEA.find(load_pc); + if (itEA != lastLoadEA.end() && itEA->second == load_addr) { + stats.loadEAReused++; + } + lastLoadEA[load_pc] = load_addr; + // Producer stability: only if this load had a forwarding producer + if (head_inst->hasProducerStorePC()) { + stats.loadsWithProducer++; + const Addr prodPC = head_inst->producerStorePC(); + auto itP = lastLoadProducerStorePC.find(load_pc); + if (itP != lastLoadProducerStorePC.end() && itP->second == prodPC) { + stats.producerStable++; + } + lastLoadProducerStorePC[load_pc] = prodPC; + + // optional: clear after use to avoid confusing later stages + head_inst->clearProducerStorePC(); + } + } - // Check instruction execution if it successfully commits and - // is not carrying a fault. - if (cpu->checker) { - cpu->checker->verify(head_inst); - } - cpu->traceFunctions(pc[tid]->instAddr()); - traceOnCommit(tid, head_inst); + // Check instruction execution if it successfully commits and + // is not carrying a fault. + if (cpu->checker) { + cpu->checker->verify(head_inst); + } - head_inst->staticInst->advancePC(*pc[tid]); + cpu->traceFunctions(pc[tid]->instAddr()); + traceOnCommit(tid, head_inst); - // Keep track of the last sequence number commited - lastCommitedSeqNum[tid] = head_inst->seqNum; + head_inst->staticInst->advancePC(*pc[tid]); - // If this is an instruction that doesn't play nicely with - // others squash everything and restart fetch - if (head_inst->isSquashAfter()) - squashAfter(tid, head_inst); + // Keep track of the last sequence number commited + lastCommitedSeqNum[tid] = head_inst->seqNum; - if (drainPending) { - if (pc[tid]->microPC() == 0 && interrupt == NoFault && - !thread[tid]->trapPending) { - // Last architectually committed instruction. - // Squash the pipeline, stall fetch, and use - // drainImminent to disable interrupts - DPRINTF(Drain, "Draining: %i:%s\n", tid, *pc[tid]); + // If this is an instruction that doesn't play nicely with + // others squash everything and restart fetch + if (head_inst->isSquashAfter()) squashAfter(tid, head_inst); - cpu->commitDrained(tid); - drainImminent = true; - } - } - bool onInstBoundary = !head_inst->isMicroop() || - head_inst->isLastMicroop() || - !head_inst->isDelayedCommit(); - - if (onInstBoundary) { - int count = 0; - Addr oldpc; - // Make sure we're not currently updating state while - // handling PC events. - assert(!thread[tid]->noSquashFromTC && - !thread[tid]->trapPending); - do { - oldpc = pc[tid]->instAddr(); - thread[tid]->pcEventQueue.service( - oldpc, thread[tid]->getTC()); - count++; - } while (oldpc != pc[tid]->instAddr()); - if (count > 1) { - DPRINTF(Commit, - "PC skip function event, stopping commit\n"); - break; - } - traceOnMacroCommit(tid); + if (drainPending) { + if (pc[tid]->microPC() == 0 && interrupt == NoFault && + !thread[tid]->trapPending) { + // Last architectually committed instruction. + // Squash the pipeline, stall fetch, and use + // drainImminent to disable interrupts + DPRINTF(Drain, "Draining: %i:%s\n", tid, *pc[tid]); + squashAfter(tid, head_inst); + cpu->commitDrained(tid); + drainImminent = true; + } } - // Check if an instruction just enabled interrupts and we've - // previously had an interrupt pending that was not handled - // because interrupts were subsequently disabled before the - // pipeline reached a place to handle the interrupt. In that - // case squash now to make sure the interrupt is handled. - // - // If we don't do this, we might end up in a live lock - // situation. - if (!interrupt && avoidQuiesceLiveLock && - onInstBoundary && cpu->checkInterrupts(0)) - squashAfter(tid, head_inst); - } else { - DPRINTF(Commit, "Unable to commit head instruction PC:%s " - "[tid:%i] [sn:%llu].\n", - head_inst->pcState(), tid ,head_inst->seqNum); - break; + bool onInstBoundary = !head_inst->isMicroop() || + head_inst->isLastMicroop() || + !head_inst->isDelayedCommit(); + + if (onInstBoundary) { + int count = 0; + Addr oldpc; + // Make sure we're not currently updating state while + // handling PC events. + assert(!thread[tid]->noSquashFromTC && + !thread[tid]->trapPending); + do { + oldpc = pc[tid]->instAddr(); + thread[tid]->pcEventQueue.service( + oldpc, thread[tid]->getTC()); + count++; + } while (oldpc != pc[tid]->instAddr()); + if (count > 1) { + DPRINTF(Commit, + "PC skip function event, stopping commit\n"); + break; + } + traceOnMacroCommit(tid); + } + + // Check if an instruction just enabled interrupts and we've + // previously had an interrupt pending that was not handled + // because interrupts were subsequently disabled before the + // pipeline reached a place to handle the interrupt. In that + // case squash now to make sure the interrupt is handled. + // + // If we don't do this, we might end up in a live lock + // situation. + if (!interrupt && avoidQuiesceLiveLock && + onInstBoundary && cpu->checkInterrupts(0)) + squashAfter(tid, head_inst); + } else { + DPRINTF(Commit, "Unable to commit head instruction PC:%s " + "[tid:%i] [sn:%llu].\n", + head_inst->pcState(), tid ,head_inst->seqNum); + break; + } } } } @@ -1569,6 +1622,8 @@ Commit::diffInst(ThreadID tid, const DynInstPtr &inst) { cpu->diffInfo.physEffAddr = inst->physEffAddr; cpu->diffInfo.effSize = inst->effSize; cpu->diffInfo.goldenValue = inst->getGolden(); + cpu->diffInfo.amoOldGoldenValue = inst->getAmoOldGoldenValue(); + cpu->recordCommittedStore(tid, inst); cpu->difftestStep(tid, inst->seqNum); } @@ -1599,9 +1654,12 @@ Commit::commitHead(const DynInstPtr &head_inst, unsigned inst_num) // Memory-ordering instructions such as sfence.vma must not execute // until older stores are visible; otherwise page-table updates may // race with the TLB invalidation. - if ((head_inst->isMemRef() || head_inst->isReturn() || - head_inst->isReadBarrier() || head_inst->isWriteBarrier()) && - (inst_num > 0 || !iewStage->flushStores(tid))) { + const bool needs_store_drain = + head_inst->isMemRef() || head_inst->isReturn() || + head_inst->isReadBarrier() || head_inst->isWriteBarrier(); + const bool stores_drained = + !needs_store_drain || iewStage->flushStores(tid, head_inst->seqNum); + if (needs_store_drain && (inst_num > 0 || !stores_drained)) { DPRINTF(Commit, "[tid:%i] [sn:%llu] " "Waiting for all stores to writeback.\n", @@ -1655,7 +1713,7 @@ Commit::commitHead(const DynInstPtr &head_inst, unsigned inst_num) if (inst_fault != NoFault) { traceLogInstFault(head_inst, inst_fault); - if (!iewStage->flushStores(tid) || inst_num > 0) { + if (!iewStage->flushStores(tid, head_inst->seqNum) || inst_num > 0) { DPRINTF(Commit, "[tid:%i] [sn:%llu] " "Stores outstanding, fault must wait.\n", @@ -1816,7 +1874,8 @@ Commit::commitHead(const DynInstPtr &head_inst, unsigned inst_num) if (head_inst->isStoreConditional()) { DPRINTF(Commit, "[tid:%i] [sn:%llu] Store Conditional success: %i\n", tid, head_inst->seqNum, head_inst->lockedWriteSuccess()); - cpu->setSCSuccess(head_inst->lockedWriteSuccess(), head_inst->physEffAddr); + cpu->setSCSuccess(head_inst->lockedWriteSuccess(), + head_inst->physEffAddr, tid); } // Update the commit rename map @@ -1962,6 +2021,13 @@ Commit::squashInflightAndUpdateVersion(ThreadID tid) DPRINTF(Commit, "Squashing in-flight renamed instructions\n"); for (unsigned i_idx = 0; i_idx < fromRename->size; i_idx++) { const DynInstPtr &inst = fromRename->insts[i_idx]; + if (inst->threadNumber != tid) { + DPRINTF(Commit, + "[tid:%i] [sn:%llu] Preserving other-thread in-flight " + "instruction during squash for tid %i\n", + inst->threadNumber, inst->seqNum, tid); + continue; + } DPRINTF(Commit, "[tid:%i] [sn:%llu] Squashing in-flight " "instruction PC %s\n", inst->threadNumber, inst->seqNum, inst->pcState()); @@ -1970,10 +2036,10 @@ Commit::squashInflightAndUpdateVersion(ThreadID tid) fixedbuffer[tid].clear(); - localSquashVer.update(localSquashVer.nextVersion()); - toIEW->commitInfo[tid].squashVersion = localSquashVer; + localSquashVer[tid].update(localSquashVer[tid].nextVersion()); + toIEW->commitInfo[tid].squashVersion = localSquashVer[tid]; DPRINTF(Commit, "Updating squash version to %u\n", - localSquashVer.getVersion()); + localSquashVer[tid].getVersion()); } void @@ -1994,7 +2060,9 @@ Commit::markCompletedInsts() fromIEW->insts[inst_num]->setCanCommit(); auto &inst = fromIEW->insts[inst_num]; - panic_if(!rob->findInst(0, inst->seqNum), "[sn:%llu] Committed instruction not found in ROB", + panic_if(!rob->findInst(inst->threadNumber, inst->seqNum), + "[tid:%i] [sn:%llu] Committed instruction not found in ROB", + inst->threadNumber, inst->seqNum); } } diff --git a/src/cpu/o3/commit.hh b/src/cpu/o3/commit.hh index cc13cff324..418dc0b779 100644 --- a/src/cpu/o3/commit.hh +++ b/src/cpu/o3/commit.hh @@ -196,7 +196,7 @@ class Commit }; std::list branchLog; - uint64_t lastCommitCycle = 0; + uint64_t lastCommitCycle[MaxThreads] = {0}; EventFunctionWrapper stuckCheckEvent; @@ -215,8 +215,6 @@ class Commit /** Returns the name of the Commit. */ std::string name() const; - uint64_t getLastCommitCycle() const { return lastCommitCycle; } - /** Registers probes. */ void regProbePoints(); @@ -430,7 +428,7 @@ class Commit /** Wire to read information from rename queue. */ TimeBuffer::wire fromRename; - SquashVersion localSquashVer; + SquashVersion localSquashVer[MaxThreads]; public: /** ROB interface. */ diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc index f43ae5e861..34f6844f37 100644 --- a/src/cpu/o3/cpu.cc +++ b/src/cpu/o3/cpu.cc @@ -134,13 +134,6 @@ CPU::CPU(const BaseO3CPUParams ¶ms) cpuStats(this), valuePred(params.valuePred) { - fatal_if(FullSystem && params.numThreads > 1, - "SMT is not supported in O3 in full system mode currently."); - - fatal_if(!FullSystem && params.numThreads < params.workload.size(), - "More workload items (%d) than threads (%d) on CPU %s.", - params.workload.size(), params.numThreads, name()); - if (!params.switched_out) { _status = Running; } else { @@ -205,7 +198,10 @@ CPU::CPU(const BaseO3CPUParams ¶ms) ThreadID active_threads; if (FullSystem) { - active_threads = 1; + // FS-SMT still uses one shared workload/system image, but the O3 core + // must provision per-thread architectural state for every hardware + // thread context exposed by the CPU. + active_threads = numThreads; } else { active_threads = params.workload.size(); @@ -282,9 +278,7 @@ CPU::CPU(const BaseO3CPUParams ¶ms) for (ThreadID tid = 0; tid < numThreads; ++tid) { if (FullSystem) { - // SMT is not supported in FS mode yet. - assert(numThreads == 1); - thread[tid] = new ThreadState(this, 0, NULL); + thread[tid] = new ThreadState(this, tid, NULL); } else { if (tid < params.workload.size()) { DPRINTF(O3CPU, "Workload[%i] process is %#x", tid, @@ -1382,10 +1376,10 @@ CPU::instDone(ThreadID tid, const DynInstPtr &inst) cpi_r.roll(1); } - uint64_t committedInsts = totalInsts(); + const uint64_t committedThreadInsts = thread[tid]->numInst; if (this->nextDumpInstCount && !dump_done - && committedInsts >= this->nextDumpInstCount) { + && committedThreadInsts >= this->nextDumpInstCount) { fprintf(stderr, "Will trigger stat dump and reset\n"); statistics::schedStatEvent(true, true, curTick(), 0); scheduleInstStop(tid,0,"Will trigger stat dump and reset"); @@ -1399,7 +1393,8 @@ CPU::instDone(ThreadID tid, const DynInstPtr &inst) // Check for instruction-count-based events. thread[tid]->comInstEventQueue.serviceEvents(thread[tid]->numInst); - if (this->warmupInstCount && !warmup_done && committedInsts >= this->warmupInstCount) { + if (this->warmupInstCount && !warmup_done && + committedThreadInsts >= this->warmupInstCount) { fprintf(stderr, "Will trigger stat dump and reset\n"); statistics::schedStatEvent(true, true, curTick(), 0); scheduleInstStop(tid,0,"Will trigger stat dump and reset"); @@ -1740,12 +1735,13 @@ CPU::htmSendAbortSignal(ThreadID tid, uint64_t htm_uid, } void -CPU::readGem5Regs() +CPU::readGem5Regs(ThreadID tid) { + auto diffAllStates = this->diffAllStates[tid]; for (int i = 0; i < 32; i++) { - diffAllStates->gem5RegFile[i] = readArchIntReg(i, 0); - diffAllStates->gem5RegFile[i + 32] = readArchFloatReg(i, 0); - readArchVecReg(i, (uint64_t*)&diffAllStates->gem5RegFile.vr[i], 0); + diffAllStates->gem5RegFile[i] = readArchIntReg(i, tid); + diffAllStates->gem5RegFile[i + 32] = readArchFloatReg(i, tid); + readArchVecReg(i, (uint64_t*)&diffAllStates->gem5RegFile.vr[i], tid); } } diff --git a/src/cpu/o3/cpu.hh b/src/cpu/o3/cpu.hh index 3a01e6cbbe..e49c00f5b0 100644 --- a/src/cpu/o3/cpu.hh +++ b/src/cpu/o3/cpu.hh @@ -739,7 +739,7 @@ class CPU : public BaseCPU HtmFailureFaultCause cause) override; //difftest virtual function - void readGem5Regs() override; + void readGem5Regs(ThreadID tid) override; private: /** Value predictor */ diff --git a/src/cpu/o3/decode.cc b/src/cpu/o3/decode.cc index 2b101f96c1..0901476f2b 100644 --- a/src/cpu/o3/decode.cc +++ b/src/cpu/o3/decode.cc @@ -72,6 +72,7 @@ Decode::Decode(CPU *_cpu, const BaseO3CPUParams ¶ms) iewToDecodeDelay(params.iewToDecodeDelay), commitToDecodeDelay(params.commitToDecodeDelay), fetchToDecodeDelay(params.fetchToDecodeDelay), + decodeToFetchDelay(params.decodeToFetchDelay), decodeWidth(params.decodeWidth), numThreads(params.numThreads), enableLoadFusion(params.enable_loadFusion), @@ -86,8 +87,15 @@ Decode::Decode(CPU *_cpu, const BaseO3CPUParams ¶ms) for (int i=0;i(decodeWidth); } - stallBuffer = boost::circular_buffer(decodeWidth * (fetchToDecodeDelay + 1)); - eachstallSize = boost::circular_buffer(fetchToDecodeDelay + 1); + // This buffer preserves the fetch->decode pipeline contents when decode + // stalls while TimeBuffer keeps advancing. Its depth matches the original + // forward pipeline window; fetch is backpressured before full to absorb + // both the decode->fetch feedback delay and the request already issued in + // the current cycle before decode computes backpressure. + const auto stallGroupDepth = fetchToDecodeDelay + 1; + stallBuffer = boost::circular_buffer( + decodeWidth * stallGroupDepth); + eachstallSize = boost::circular_buffer(stallGroupDepth); decodeStalls.resize(decodeWidth, StallReason::NoStall); @@ -130,8 +138,14 @@ Decode::DecodeStats::DecodeStats(CPU *cpu) : statistics::Group(cpu, "decode"), ADD_STAT(idleCycles, statistics::units::Cycle::get(), "Number of cycles decode is idle"), + ADD_STAT(smtidleCycles, statistics::units::Cycle::get(), + "Number of cycles fetch was idle per tid"), ADD_STAT(blockedCycles, statistics::units::Cycle::get(), "Number of cycles decode is blocked"), + ADD_STAT(smtblockedCycles, statistics::units::Cycle::get(), + "Number of cycles fetch has spent blocked per tid"), + ADD_STAT(smtnotactiveCycles, statistics::units::Cycle::get(), + "Number of cycles fetch no active per tid"), ADD_STAT(runCycles, statistics::units::Cycle::get(), "Number of cycles decode is running"), ADD_STAT(unblockCycles, statistics::units::Cycle::get(), @@ -171,6 +185,16 @@ Decode::DecodeStats::DecodeStats(CPU *cpu) mispredictedByPC.flags(statistics::total); mispredictedByNPC.flags(statistics::total); fusedInsts.init(128).flags(statistics::nozero); + + smtidleCycles + .init(4) + .flags(statistics::total); + smtblockedCycles + .init(4) + .flags(statistics::total); + smtnotactiveCycles + .init(4) + .flags(statistics::total); } void @@ -373,6 +397,38 @@ Decode::updateActivate() void Decode::moveInstsToBuffer() { + auto tryMoveHeadGroupToFixedBuffer = [&]() -> bool { + if (stallBuffer.empty()) { + return false; + } + + // stallbuffer moves to fixedbuffer in strict FIFO order. + ThreadID tid = stallBuffer.front()->threadNumber; + if (!fixedbuffer[tid].empty()) { + return false; + } + + int insts_from_stall = eachstallSize.front(); + eachstallSize.pop_front(); + for (int i = 0; i < insts_from_stall; ++i) { + const DynInstPtr &inst = stallBuffer.front(); + assert(tid == inst->threadNumber); + if (localSquashVer[tid].largerThan(inst->getVersion())) { + inst->setSquashed(); + } + assert(!fixedbuffer[inst->threadNumber].full()); + fixedbuffer[inst->threadNumber].push_back(inst); + stallBuffer.pop_front(); + } + + return true; + }; + + // Model one stage advance before latching the next cycle's input so a + // full stall buffer can still accept a new fetch bundle when its head + // group moves forward in the same cycle. + const bool moved_group = tryMoveHeadGroupToFixedBuffer(); + // do not support mixed thread instructions in one fetch group int insts_from_fetch = fromFetch->size; if (insts_from_fetch != 0) { @@ -392,23 +448,12 @@ Decode::moveInstsToBuffer() if (stallBuffer.empty()) { return; } - // stallbuffer move to fixedbuffer - ThreadID tid = stallBuffer.front()->threadNumber; - if (!fixedbuffer[tid].empty()) - return; - insts_from_fetch = eachstallSize.front(); - eachstallSize.pop_front(); - for (int i = 0; i < insts_from_fetch; ++i) { - const DynInstPtr &inst = stallBuffer.front(); - assert(tid == inst->threadNumber); - if (localSquashVer.largerThan(inst->getVersion())) { - inst->setSquashed(); - } - assert(!fixedbuffer[inst->threadNumber].full()); - fixedbuffer[inst->threadNumber].push_back(inst); - stallBuffer.pop_front(); - } + // If nothing advanced before latching new input, allow the current head + // (possibly the just-arrived group) to fill an empty stage this cycle. + if (!moved_group) { + tryMoveHeadGroupToFixedBuffer(); + } } void @@ -419,9 +464,10 @@ Decode::checkSquash() DPRINTF(Decode, "[tid:%i] Squashing instructions due to squash " "from commit.\n", i); squash(i); - localSquashVer.update(fromCommit->commitInfo[i].squashVersion.getVersion()); + localSquashVer[i].update( + fromCommit->commitInfo[i].squashVersion.getVersion()); DPRINTF(Decode, "Updating squash version to %u\n", - localSquashVer.getVersion()); + localSquashVer[i].getVersion()); } } } @@ -442,13 +488,36 @@ Decode::tick() // check threads stall & status ThreadID tid = InvalidThreadID; ThreadID blocked_tid = InvalidThreadID; + const bool fifoBackpressured = + !stallBuffer.empty() && + eachstallSize.size() + decodeToFetchDelay + 1 >= + eachstallSize.capacity(); + const ThreadID fifoHeadTid = + !stallBuffer.empty() ? stallBuffer.front()->threadNumber : InvalidThreadID; + const StallReason fifoBlockReason = + (fifoBackpressured && fifoHeadTid != InvalidThreadID && + stallSig->blockDecode[fifoHeadTid]) ? + stallSig->decodeBlockReason[fifoHeadTid] : + (fifoBackpressured ? StallReason::OtherFragStall : + StallReason::NoStall); for (int i = 0; i < numThreads; i++) { bool block = stallSig->blockDecode[i]; bool active = !block && !fixedbuffer[i].empty(); - stallSig->blockFetch[i] = block; + if(block){ + ++stats.smtblockedCycles[i]; + } + + if(!active) + { + ++stats.smtnotactiveCycles[i]; + } + + stallSig->blockFetch[i] = block || fifoBackpressured; stallSig->fetchBlockReason[i] = - block ? stallSig->decodeBlockReason[i] : StallReason::NoStall; + stallSig->blockFetch[i] ? + (block ? stallSig->decodeBlockReason[i] : fifoBlockReason) : + StallReason::NoStall; toFetch->decodeInfo[i].blockReason = stallSig->fetchBlockReason[i]; if (active) { if (tid == InvalidThreadID) @@ -539,6 +608,7 @@ Decode::decodeInsts(ThreadID tid) " early.\n",tid); // Should I change the status to idle? ++stats.idleCycles; + ++stats.smtidleCycles[tid]; StallReason stall = StallReason::NoStall; for (auto iter : fromFetch->fetchStallReason) { diff --git a/src/cpu/o3/decode.hh b/src/cpu/o3/decode.hh index a510d8dd9d..f2e39b56a6 100644 --- a/src/cpu/o3/decode.hh +++ b/src/cpu/o3/decode.hh @@ -236,6 +236,9 @@ class Decode /** Fetch to decode delay. */ Cycles fetchToDecodeDelay; + /** Decode to fetch feedback delay for stage backpressure. */ + Cycles decodeToFetchDelay; + /** The width of decode, in instructions. */ unsigned decodeWidth; @@ -256,8 +259,12 @@ class Decode /** Stat for total number of idle cycles. */ statistics::Scalar idleCycles; + + statistics::Vector smtidleCycles; /** Stat for total number of blocked cycles. */ statistics::Scalar blockedCycles; + statistics::Vector smtblockedCycles; + statistics::Vector smtnotactiveCycles; /** Stat for total number of normal running cycles. */ statistics::Scalar runCycles; /** Stat for total number of unblocking cycles. */ @@ -293,7 +300,7 @@ class Decode void setAllStalls(StallReason decodeStall); - SquashVersion localSquashVer; + SquashVersion localSquashVer[MaxThreads]; }; } // namespace o3 diff --git a/src/cpu/o3/fetch.cc b/src/cpu/o3/fetch.cc index 21c9cec4e6..ff31aa9bb9 100644 --- a/src/cpu/o3/fetch.cc +++ b/src/cpu/o3/fetch.cc @@ -98,7 +98,6 @@ Fetch::Fetch(CPU *_cpu, const BaseO3CPUParams ¶ms) fetchWidth(params.fetchWidth), decodeWidth(params.decodeWidth), retryPkt(), - retryTid(InvalidThreadID), cacheBlkSize(cpu->cacheLineSize()), fetchBufferSize(params.fetchBufferSize), fetchQueueSize(params.fetchQueueSize), @@ -148,6 +147,8 @@ Fetch::Fetch(CPU *_cpu, const BaseO3CPUParams ¶ms) threads[tid].data = new uint8_t[fetchBufferSize]; } + initDecodeScheduler(); + // Get the size of an instruction. // stallReason size should be the same as decodeWidth,renameWidth,dispWidth stallReason.resize(decodeWidth, StallReason::NoStall); @@ -203,8 +204,12 @@ Fetch::FetchStatGroup::FetchStatGroup(CPU *cpu, Fetch *fetch) "Number of cycles fetch has spent waiting for tlb"), ADD_STAT(idleCycles, statistics::units::Cycle::get(), "Number of cycles fetch was idle"), + ADD_STAT(smtidleCycles, statistics::units::Cycle::get(), + "Number of cycles fetch was idle per tid"), ADD_STAT(blockedCycles, statistics::units::Cycle::get(), "Number of cycles fetch has spent blocked"), + ADD_STAT(smtblockedCycles, statistics::units::Cycle::get(), + "Number of cycles fetch has spent blocked per tid"), ADD_STAT(miscStallCycles, statistics::units::Cycle::get(), "Number of cycles fetch has spent waiting on interrupts, or bad " "addresses, or out of MSHRs"), @@ -240,6 +245,10 @@ Fetch::FetchStatGroup::FetchStatGroup(CPU *cpu, Fetch *fetch) "Distribution of fetch status"), ADD_STAT(decodeStalls, statistics::units::Count::get(), "Number of decode stalls"), + ADD_STAT(smtdecodeStalls, statistics::units::Count::get(), + "Number of decode stalls per tid"), + ADD_STAT(smtftqempty, statistics::units::Count::get(), + "Number of ftq empty per tid"), ADD_STAT(decodeStallRate, statistics::units::Rate< statistics::units::Count, statistics::units::Cycle>::get(), "Number of decode stalls per cycle", @@ -335,6 +344,18 @@ Fetch::FetchStatGroup::FetchStatGroup(CPU *cpu, Fetch *fetch) } decodeStalls .prereq(decodeStalls); + smtdecodeStalls + .init(fetch->numThreads) + .flags(statistics::total); + smtftqempty + .init(fetch->numThreads) + .flags(statistics::total); + smtidleCycles + .init(fetch->numThreads) + .flags(statistics::total); + smtblockedCycles + .init(fetch->numThreads) + .flags(statistics::total); decodeStallRate .flags(statistics::total); fetchBubbles @@ -372,6 +393,41 @@ Fetch::setTimeBuffer(TimeBuffer *time_buffer) fromCommit = timeBuffer->getWire(-commitToFetchDelay); } +void +Fetch::initDecodeScheduler() +{ + // Initialize counters (same as before) + lsqCounter = new InstsCounter(); + iqCounter = new InstsCounter(); + robCounter = new InstsCounter(); + DPRINTF(Fetch, "Initialized SMT Decode Scheduler: 0\n"); + + for (ThreadID tid = 0; tid < numThreads; tid++) + { + lsqCounter->setCounter(tid, 0); + iqCounter->setCounter(tid, 0); + robCounter->setCounter(tid, 0); + } + DPRINTF(Fetch, "Initialized SMT Decode Scheduler: 1\n"); + + if (smtDecodePolicy == "icount") { + // Use ROB as default counter for icount + decodeScheduler = new ICountScheduler(numThreads, robCounter); + } + else if (smtDecodePolicy == "delayed") { + decodeScheduler = new DelayedICountScheduler(numThreads, robCounter, delayedSchedulerDelay); + } + else if (smtDecodePolicy == "multi_priority") { + decodeScheduler = new MultiPrioritySched(numThreads, {lsqCounter, iqCounter, robCounter}); + } + else { + // Default: round-robin like (use delayed with thread cycling) + decodeScheduler = new DelayedICountScheduler(numThreads, robCounter, numThreads); + } + + DPRINTF(Fetch, "Initialized SMT Decode Scheduler: %s\n", smtDecodePolicy.c_str()); +} + void Fetch::setActiveThreads(std::list *at_ptr) { @@ -423,6 +479,10 @@ Fetch::resetStage() { numInst = 0; interruptPending = false; + for (auto *pkt : retryPkt) { + delete pkt; + } + retryPkt.clear(); cacheBlocked = false; priorityList.clear(); @@ -452,7 +512,9 @@ Fetch::resetStage() } assert(dbpbtb); - dbpbtb->resetPC(threads[0].fetchpc->instAddr()); + for (ThreadID tid = 0; tid < numThreads; ++tid) { + dbpbtb->resetPC(tid, threads[tid].fetchpc->instAddr()); + } } bool @@ -550,8 +612,35 @@ Fetch::processMultiCacheLineCompletion(ThreadID tid, PacketPtr pkt) DPRINTF(Fetch, "[tid:%i] Waiting for remaining packets. Completed: %d, Total: %d\n", tid, threads[tid].cacheReq.completedPackets, threads[tid].cacheReq.packets.size()); - // Note: retry is handled completely by the standard gem5 recvReqRetry mechanism - // No need to handle retry here to avoid duplicate packet sending + bool waitingOnRetry = false; + for (const auto status : threads[tid].cacheReq.requestStatus) { + if (status == CacheWaitRetry) { + waitingOnRetry = true; + break; + } + } + + if (waitingOnRetry && cacheBlocked && !retryPkt.empty()) { + PacketPtr queuedPkt = retryPkt.front(); + const ThreadID queuedTid = + cpu->contextToThread(queuedPkt->req->contextId()); + const bool sameThreadRetry = queuedTid == tid && + threads[tid].cacheReq.findRequestIndex(queuedPkt->req) != SIZE_MAX; + + if (sameThreadRetry && icachePort.sendTimingReq(queuedPkt)) { + DPRINTF(Fetch, + "[tid:%i] Retrying matching queued I-cache packet %#lx " + "after sibling response\n", + tid, queuedPkt->req->getVaddr()); + updateCacheRequestStatusByRequest(tid, queuedPkt->req, + CacheWaitResponse); + ppFetchRequestSent->notify(queuedPkt->req); + retryPkt.erase(retryPkt.begin()); + if (retryPkt.empty()) { + cacheBlocked = false; + } + } + } return false; // Return false to indicate we're still waiting } @@ -619,8 +708,8 @@ Fetch::processCacheCompletion(PacketPtr pkt) } // Verify fetchBufferPC alignment with the supplying FSQ entry. - if (threads[tid].valid && dbpbtb->ftqHasFetching(0)) { - const auto &stream = dbpbtb->ftqFetchingTarget(0); + if (threads[tid].valid && dbpbtb->ftqHasFetching(tid)) { + const auto &stream = dbpbtb->ftqFetchingTarget(tid); if (threads[tid].startPC != stream.startPC) { panic("fetchBufferPC %#x should be aligned with FSQ startPC %#x", threads[tid].startPC, stream.startPC); @@ -650,7 +739,6 @@ Fetch::drainSanityCheck() const { assert(isDrained()); assert(retryPkt.size() == 0); - assert(retryTid == InvalidThreadID); assert(!cacheBlocked); assert(!interruptPending); @@ -756,7 +844,7 @@ Fetch::lookupAndUpdateNextPC(const DynInstPtr &inst, PCStateBase &next_pc) // Decoupled+BTB-only: compute next PC directly from the supplying FSQ entry. ThreadID tid = inst->threadNumber; assert(dbpbtb); - assert(dbpbtb->ftqHasFetching(0)); + assert(dbpbtb->ftqHasFetching(tid)); const auto &stream = dbpbtb->ftqFetchingTarget(tid); const Addr curr_pc = next_pc.instAddr(); @@ -902,6 +990,16 @@ Fetch::handleSuccessfulTranslation(ThreadID tid, const RequestPtr &mem_req, Addr fetchStats.cacheLines++; + if (cacheBlocked) { + DPRINTF(Fetch, "[tid:%i] I-cache port already waiting for retry, queueing %#lx\n", + tid, mem_req->getVaddr()); + + updateCacheRequestStatusByRequest(tid, mem_req, CacheWaitRetry); + setAllFetchStalls(StallReason::IcacheStall); + retryPkt.push_back(data_pkt); + return; + } + // Access the cache. if (!icachePort.sendTimingReq(data_pkt)) { DPRINTF(Fetch, "[tid:%i] Out of MSHRs!\n", tid); @@ -913,7 +1011,6 @@ Fetch::handleSuccessfulTranslation(ThreadID tid, const RequestPtr &mem_req, Addr mem_req->getVaddr()); setAllFetchStalls(StallReason::IcacheStall); retryPkt.push_back(data_pkt); - retryTid = tid; cacheBlocked = true; } else { DPRINTF(Fetch, "[tid:%i] Doing Icache access.\n", tid); @@ -965,7 +1062,7 @@ Fetch::handleTranslationFault(ThreadID tid, const RequestPtr &mem_req, const Fau // We will use a nop in order to carry the fault. DynInstPtr instruction = buildInst(tid, nopStaticInstPtr, nullptr, fetch_pc, fetch_pc, false); - instruction->setVersion(localSquashVer); + instruction->setVersion(localSquashVer[tid]); instruction->setNotAnInst(); instruction->setPredTarg(fetch_pc); @@ -1073,15 +1170,17 @@ Fetch::doSquash(PCStateBase &new_pc, const DynInstPtr squashInst, const InstSeqN // Reset the cache request after cancelling threads[tid].cacheReq.reset(); - // Get rid of the retrying packet if it was from this thread. - if (retryTid == tid) { - assert(cacheBlocked); - for (auto it : retryPkt) { - delete it; + // Drop any retry packets that belong to this squashed thread. + for (auto it = retryPkt.begin(); it != retryPkt.end();) { + if (cpu->contextToThread((*it)->req->contextId()) == tid) { + delete *it; + it = retryPkt.erase(it); + } else { + ++it; } - retryPkt.clear(); - retryTid = InvalidThreadID; - cacheBlocked = false; // clear cache blocked + } + if (retryPkt.empty()) { + cacheBlocked = false; } if (squashInst && !squashInst->isControl()) { @@ -1285,6 +1384,32 @@ Fetch::handleInterrupts() } } +ThreadID +Fetch::selectUnstalledThread() +{ + + // if (numThreads == 1) { + // return 0; + // } + for (ThreadID tid = 0; tid < numThreads; ++tid) { + if (!stallSig->blockFetch[tid]) { + lsqCounter->setCounter(tid, fromIEW->iewInfo[tid].ldstqCount); + iqCounter->setCounter(tid, fromIEW->iewInfo[tid].iqCount); + robCounter->setCounter(tid, fromIEW->iewInfo[tid].robCount); + + } else { + lsqCounter->setCounter(tid, UINT64_MAX); + iqCounter->setCounter(tid, UINT64_MAX); + robCounter->setCounter(tid, UINT64_MAX); + + } + DPRINTF(Fetch, "lsqCounter->setCounter: %d iqCounter->setCounter: %d robCounter->setCounter: %d\n",fromIEW->iewInfo[tid].ldstqCount,fromIEW->iewInfo[tid].iqCount,fromIEW->iewInfo[tid].robCount); + } + + ThreadID selected = decodeScheduler->getThread(); + return selected; +} + void Fetch::sendInstructionsToDecode() { @@ -1296,9 +1421,12 @@ Fetch::sendInstructionsToDecode() for (int i = 0; i < numThreads; i++) { if (!stallSig->blockFetch[i]) { any_thread_active = true; - break; + //break; + }else{ + fetchStats.smtdecodeStalls[i]++; } } + if (!any_thread_active) { // All threads are blocked, no instructions to send ThreadID blocked_tid = InvalidThreadID; @@ -1321,7 +1449,8 @@ Fetch::sendInstructionsToDecode() return; } - ThreadID tid = 0; // TODO: smt support + ThreadID tid =selectUnstalledThread(); + DPRINTF(Fetch, "select Unstalled [tid:%i]\n",tid); // fetch totally stalled if (stallSig->blockFetch[tid]) { @@ -1407,6 +1536,7 @@ Fetch::measureFrontendBubbles(unsigned insts_to_decode, ThreadID tid) if (stallSig->blockFetch[tid]) { fetchStats.decodeStalls++; + //fetchStats.smtdecodeStalls[tid]++; } } @@ -1459,35 +1589,42 @@ Fetch::handleIEWSignals() return; } - auto &incoming = fromIEW->iewInfo->resolvedCFIs; const bool had_pending_resolve = !resolveQueue.empty(); - uint8_t enqueueSize = fromIEW->iewInfo->resolvedCFIs.size(); uint8_t enqueueCount = 0; + uint8_t enqueueSize = 0; + + for (ThreadID tid = 0; tid < numThreads; ++tid) { + enqueueSize += fromIEW->iewInfo[tid].resolvedCFIs.size(); + } if (resolveQueueSize && resolveQueue.size() > resolveQueueSize - 4) { fetchStats.resolveQueueFullEvents++; fetchStats.resolveEnqueueFailEvent += enqueueSize; } else { + for (ThreadID tid = 0; tid < numThreads; ++tid) { + auto &incoming = fromIEW->iewInfo[tid].resolvedCFIs; + for (const auto &resolved : incoming) { + bool merged = false; + for (auto &queued : resolveQueue) { + if (queued.resolvedTid == tid && + queued.resolvedFTQId == resolved.ftqId) { + queued.resolvedInstPC.push_back(resolved.pc); + merged = true; + break; + } + } - for (const auto &resolved : incoming) { - bool merged = false; - for (auto &queued : resolveQueue) { - if (queued.resolvedFTQId == resolved.ftqId) { - queued.resolvedInstPC.push_back(resolved.pc); - merged = true; - break; + if (merged) { + continue; } - } - if (merged) { - continue; + ResolveQueueEntry new_entry; + new_entry.resolvedTid = tid; + new_entry.resolvedFTQId = resolved.ftqId; + new_entry.resolvedInstPC.push_back(resolved.pc); + resolveQueue.push_back(std::move(new_entry)); + enqueueCount++; } - - ResolveQueueEntry new_entry; - new_entry.resolvedFTQId = resolved.ftqId; - new_entry.resolvedInstPC.push_back(resolved.pc); - resolveQueue.push_back(std::move(new_entry)); - enqueueCount++; } fetchStats.resolveEnqueueCount.sample(enqueueCount); } @@ -1499,18 +1636,19 @@ Fetch::handleIEWSignals() // and fetch consuming them as predictor resolved updates. if (had_pending_resolve && !resolveQueue.empty()) { auto &entry = resolveQueue.front(); + ThreadID tid = entry.resolvedTid; unsigned int stream_id = entry.resolvedFTQId; - dbpbtb->prepareResolveUpdateEntries(stream_id, 0); + dbpbtb->prepareResolveUpdateEntries(stream_id, tid); for (const auto resolvedInstPC : entry.resolvedInstPC) { - dbpbtb->markCFIResolved(stream_id, resolvedInstPC, 0); + dbpbtb->markCFIResolved(stream_id, resolvedInstPC, tid); } - bool success = dbpbtb->resolveUpdate(stream_id, 0); + bool success = dbpbtb->resolveUpdate(stream_id, tid); if (success) { - dbpbtb->notifyResolveSuccess(); + dbpbtb->notifyResolveSuccess(tid); resolveQueue.pop_front(); fetchStats.resolveDequeueCount++; } else { - dbpbtb->notifyResolveFailure(); + dbpbtb->notifyResolveFailure(tid); } } } @@ -1549,8 +1687,10 @@ Fetch::handleCommitSignals(ThreadID tid) squash(*fromCommit->commitInfo[tid].pc, squash_seq, squash_inst, tid); - localSquashVer.update(fromCommit->commitInfo[tid].squashVersion.getVersion()); - DPRINTF(Fetch, "Updating squash version to %u\n", localSquashVer.getVersion()); + localSquashVer[tid].update( + fromCommit->commitInfo[tid].squashVersion.getVersion()); + DPRINTF(Fetch, "Updating squash version to %u\n", + localSquashVer[tid].getVersion()); auto mispred_inst = fromCommit->commitInfo[tid].mispredictInst; @@ -1658,8 +1798,8 @@ Fetch::buildInst(ThreadID tid, StaticInstPtr staticInst, instruction->isMov()); assert(dbpbtb); DPRINTF(DecoupleBP, "Set instruction %lu with fetch id %lu\n", - instruction->seqNum, dbpbtb->ftqHeadId(0)); - instruction->setFtqId(dbpbtb->ftqHeadId(0)); + instruction->seqNum, dbpbtb->ftqHeadId(tid)); + instruction->setFtqId(dbpbtb->ftqHeadId(tid)); #if TRACING_ON if (trace) { @@ -1734,6 +1874,7 @@ Fetch::prepareFetchAddress(ThreadID tid, bool &status_change) } else { if (fetchStatus[tid] == Idle) { ++fetchStats.idleCycles; + ++fetchStats.smtidleCycles[tid]; DPRINTF(Fetch, "[tid:%i] Fetch is idle!\n", tid); } // Status is Idle, so fetch should do nothing. @@ -1861,7 +2002,7 @@ Fetch::processSingleInstruction(ThreadID tid, PCStateBase &pc, tid, waitForVsetvl); } - instruction->setVersion(localSquashVer); + instruction->setVersion(localSquashVer[tid]); ppFetch->notify(instruction); numInst++; @@ -1996,6 +2137,7 @@ Fetch::sendNextCacheRequest(ThreadID tid, const PCStateBase &pc_state) { } if (ftqEmpty(tid)) { + ++fetchStats.smtftqempty[tid]; DPRINTF(Fetch, "[tid:%i] No FSQ entry available for next fetch\n", tid); return; } @@ -2003,8 +2145,22 @@ Fetch::sendNextCacheRequest(ThreadID tid, const PCStateBase &pc_state) { assert(dbpbtb); const auto &stream = dbpbtb->ftqFetchingTarget(tid); const Addr start_pc = stream.startPC; + const Addr current_pc = pc_state.instAddr(); threads[tid].startPC = start_pc; + if (current_pc < stream.startPC || + current_pc >= stream.predEndPC) { + auto &reset_pc = threads[tid].fetchpc->as(); + reset_pc.pc(stream.startPC); + reset_pc.npc(stream.startPC + 4); + reset_pc.uReset(); + DPRINTF(Fetch, + "[tid:%i] Resetting fetch PC to new FTQ stream start %s " + "(previous PC %#lx outside [%#lx, %#lx))\n", + tid, *threads[tid].fetchpc, current_pc, + stream.startPC, stream.predEndPC); + } + DPRINTF(Fetch, "[tid:%i] Issuing a pipelined I-cache access for new FSQ entry, " "starting at PC %#x (endPC %#x; original PC %s)\n", tid, start_pc, stream.predEndPC, pc_state); @@ -2014,36 +2170,32 @@ Fetch::sendNextCacheRequest(ThreadID tid, const PCStateBase &pc_state) { void Fetch::recvReqRetry() { - if (retryPkt.size() == 0) { - assert(retryTid == InvalidThreadID); + if (retryPkt.empty()) { // Access has been squashed since it was sent out. Just clear // the cache being blocked. cacheBlocked = false; return; } assert(cacheBlocked); - assert(retryTid != InvalidThreadID); - // Note: In multi-cacheline fetch, overall status may not be CacheWaitRetry - // if some requests have progressed while others still need retry. - // The presence of retryPkt itself indicates retry is needed. + retryPendingIcacheRequests(); +} - for (auto it = retryPkt.begin(); it != retryPkt.end();) { - if (icachePort.sendTimingReq(*it)) { - // Use new cache state management with specific RequestPtr - updateCacheRequestStatusByRequest(retryTid, (*it)->req, CacheWaitResponse); - // Notify Fetch Request probe when a retryPkt is successfully sent. - // Note that notify must be called before retryPkt is set to NULL. - ppFetchRequestSent->notify((*it)->req); - it = retryPkt.erase(it); - } else { - it++; +void +Fetch::retryPendingIcacheRequests() +{ + while (!retryPkt.empty()) { + PacketPtr pkt = retryPkt.front(); + if (!icachePort.sendTimingReq(pkt)) { + return; } - } - if (retryPkt.size() == 0) { - retryTid = InvalidThreadID; - cacheBlocked = false; + const ThreadID tid = cpu->contextToThread(pkt->req->contextId()); + updateCacheRequestStatusByRequest(tid, pkt->req, CacheWaitResponse); + ppFetchRequestSent->notify(pkt->req); + retryPkt.erase(retryPkt.begin()); } + + cacheBlocked = false; } void @@ -2058,6 +2210,7 @@ Fetch::profileStall(ThreadID tid) DPRINTF(Fetch, "Fetch has no active thread!\n"); } else if (fetchStatus[tid] == Blocked) { ++fetchStats.blockedCycles; + ++fetchStats.smtblockedCycles[tid]; DPRINTF(Fetch, "[tid:%i] Fetch is blocked!\n", tid); } else if (fetchStatus[tid] == Squashing) { ++fetchStats.squashCycles; diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh index 19091ef30e..18e6159022 100644 --- a/src/cpu/o3/fetch.hh +++ b/src/cpu/o3/fetch.hh @@ -65,6 +65,7 @@ #include "mem/port.hh" #include "sim/eventq.hh" #include "sim/probe/probe.hh" +#include "cpu/o3/smt_sched.hh" namespace gem5 { @@ -233,6 +234,18 @@ class Fetch /** To probe when a fetch request is successfully sent. */ ProbePointArg *ppFetchRequestSent; + // SMT Decode Scheduler + SMTScheduler* decodeScheduler; + + // Counters from backend structures (to be passed in) + InstsCounter* lsqCounter; + InstsCounter* iqCounter; + InstsCounter* robCounter; + + // Configuration parameters + std::string smtDecodePolicy ="multi_priority"; + int delayedSchedulerDelay; + public: /** Fetch constructor. */ Fetch(CPU *_cpu, const BaseO3CPUParams ¶ms); @@ -299,10 +312,19 @@ class Fetch /** For priority-based fetch policies, need to keep update priorityList */ void deactivateThread(ThreadID tid); + + // Function to initialize scheduler + void initDecodeScheduler(); + + // Select a thread that is not fetch-blocked, using scheduler + ThreadID selectUnstalledThread(); private: /** Reset this pipeline stage */ void resetStage(); + /** Retry queued I-cache packets once, stopping at the first new block. */ + void retryPendingIcacheRequests(); + /** Changes the status of this stage to active, and indicates this * to the CPU. */ @@ -657,12 +679,9 @@ class Fetch /** Is the cache blocked? If so no threads can access it. */ bool cacheBlocked; - /** The packet that is waiting to be retried. */ + /** Packets waiting for the next cache-issued retry callback. */ std::vector retryPkt; - /** The thread that is waiting on the cache to tell fetch to retry. */ - ThreadID retryTid; - /** Cache block size. */ unsigned int cacheBlkSize; @@ -1035,8 +1054,12 @@ class Fetch * the pipeline. */ statistics::Scalar idleCycles; + + statistics::Vector smtidleCycles; /** Total number of cycles spent blocked. */ statistics::Scalar blockedCycles; + + statistics::Vector smtblockedCycles; /** Total number of cycles spent in any other state. */ statistics::Scalar miscStallCycles; /** Total number of cycles spent in waiting for drains. */ @@ -1072,6 +1095,10 @@ class Fetch statistics::Vector fetchStatusDist; /** Number of decode stalls */ statistics::Scalar decodeStalls; + + statistics::Vector smtdecodeStalls; + + statistics::Vector smtftqempty; /** Number of decode stalls per cycle */ statistics::Formula decodeStallRate; /** Unutilized issue-pipeline slots while there is no backend-stall */ @@ -1107,7 +1134,7 @@ class Fetch statistics::Scalar traceMetaCleanupCommitCalls; } fetchStats; - SquashVersion localSquashVer; + SquashVersion localSquashVer[MaxThreads]; public: const FetchStatGroup &getFetchStats() { return fetchStats; } diff --git a/src/cpu/o3/iew.cc b/src/cpu/o3/iew.cc index c4ffd4cb50..2d4cdb34c3 100644 --- a/src/cpu/o3/iew.cc +++ b/src/cpu/o3/iew.cc @@ -525,7 +525,7 @@ IEW::squash(ThreadID tid) for (auto& dp : dispQue) { for (auto& it : dp) { - if (it->seqNum > fromCommit->commitInfo[tid].doneSeqNum) { + if (it->seqNum > fromCommit->commitInfo[tid].doneSeqNum && (it->threadNumber == tid)) { it->setSquashed(); } } @@ -799,13 +799,14 @@ IEW::checkSquash() for (int i = 0; i < numThreads; i++) { if (fromCommit->commitInfo[i].squash) { squash(i); - localSquashVer.update(fromCommit->commitInfo[i].squashVersion.getVersion()); - DPRINTF(IEW, "Updating squash version to %u\n", localSquashVer.getVersion()); + localSquashVer[i].update( + fromCommit->commitInfo[i].squashVersion.getVersion()); + DPRINTF(IEW, "Updating squash version to %u\n", + localSquashVer[i].getVersion()); fetchRedirect[i] = false; iewStats.stallEvents[ROBWalk]++; setAllStalls(StallReason::CommitSquash); - return; } if (fromCommit->commitInfo[i].robSquashing) { @@ -831,7 +832,7 @@ IEW::moveInstsToBuffer() for (int i = 0; i < insts_from_rename; ++i) { const DynInstPtr &inst = fromRename->insts[i]; assert(inst->threadNumber == tid); - if (localSquashVer.largerThan(inst->getVersion())) { + if (localSquashVer[tid].largerThan(inst->getVersion())) { inst->setSquashed(); } else { fixedbuffer[tid].push_back(inst); @@ -935,9 +936,9 @@ IEW::dispatchInsts() toRename->iewInfo[tid].robHeadStallReason = checkDispatchStall(tid, NumDQ, nullptr, -1); toRename->iewInfo[tid].lqHeadStallReason = - ldstQueue.lqEmpty() ? StallReason::NoStall : checkLSQStall(tid, true); + ldstQueue.lqEmpty(tid) ? StallReason::NoStall : checkLSQStall(tid, true); toRename->iewInfo[tid].sqHeadStallReason = - ldstQueue.sqEmpty() ? StallReason::NoStall : checkLSQStall(tid, false); + ldstQueue.sqEmpty(tid) ? StallReason::NoStall : checkLSQStall(tid, false); toRename->iewInfo[tid].blockReason = blockReason; } } @@ -1523,6 +1524,9 @@ IEW::executeInsts() while (threads != end) { ThreadID tid = *threads++; fetchRedirect[tid] = false; + toFetch->iewInfo[tid].ldstqCount=ldstQueue.getCount(tid); + toFetch->iewInfo[tid].robCount= rob->getThreadEntries(tid); + toFetch->iewInfo[tid].iqCount= scheduler->getIQInsts(tid); } // Uncomment this if you want to see all available instructions. @@ -1533,6 +1537,7 @@ IEW::executeInsts() ThreadID tid = *activeThreads->begin(); toFetch->iewInfo[tid].resolvedCFIs.clear(); + // Execute/writeback any instructions that are available. int insts_to_execute = fromIssue->size; fromIssue->size = 0; @@ -1548,6 +1553,11 @@ IEW::executeInsts() // executing ppExecute->notify(inst); + if (inst->isSplitStoreData() && + ldstQueue.splitStoreAddrSquashed(inst)) { + inst->setSquashed(); + } + // Check if the instruction is squashed; if so then skip it if (inst->isSquashed()) { DPRINTF(IEW, "Execute: Instruction was squashed. PC: %s, [tid:%i]" @@ -1682,8 +1692,8 @@ IEW::writebackInsts() DynInstPtr inst = toCommit->insts[inst_num]; ThreadID tid = inst->threadNumber; - if (inst->savedRequest && inst->isLoad()) { - inst->pf_source = inst->savedRequest->mainReq()->getPFSource(); + if (inst->isLoad()) { + inst->pf_source = ldstQueue.getLoadPFSource(inst); } DPRINTF(IEW, "Sending instructions to commit, [sn:%lli] PC %s.\n", diff --git a/src/cpu/o3/iew.hh b/src/cpu/o3/iew.hh index e63e7aff11..94dd9a0835 100644 --- a/src/cpu/o3/iew.hh +++ b/src/cpu/o3/iew.hh @@ -260,6 +260,10 @@ class IEW * the store queue or the store buffer to write back to. */ bool flushStores(ThreadID tid) { return ldstQueue.flushStores(tid); } + bool flushStores(ThreadID tid, InstSeqNum seq_num) + { + return ldstQueue.flushStores(tid, seq_num); + } /** Check if we need to squash after a load/store/branch is executed. */ void SquashCheckAfterExe(DynInstPtr inst); @@ -405,7 +409,7 @@ class IEW /** Scoreboard pointer. */ Scoreboard* scoreboard; - SquashVersion localSquashVer{0}; + SquashVersion localSquashVer[MaxThreads]; /** Value predictor */ valuepred::VPUnit *valuePred; diff --git a/src/cpu/o3/inst_queue.cc b/src/cpu/o3/inst_queue.cc index 89a027c3b1..db8ec407f4 100644 --- a/src/cpu/o3/inst_queue.cc +++ b/src/cpu/o3/inst_queue.cc @@ -53,6 +53,7 @@ #include "cpu/o3/dyn_inst.hh" #include "cpu/o3/dyn_inst_ptr.hh" #include "cpu/o3/fu_pool.hh" +#include "cpu/o3/iew.hh" #include "cpu/o3/issue_queue.hh" #include "cpu/o3/limits.hh" #include "debug/IQ.hh" @@ -151,7 +152,8 @@ InstructionQueue::InstructionQueue(CPU *cpu_ptr, IEW *iew_ptr, scheduler->setCPU(cpu_ptr, &iew_ptr->ldstQueue); scheduler->resetDepGraph(numPhysRegs); scheduler->setMemDepUnit(memDepUnit); - + scheduler->initIQICountSmtScheduler(numThreads); + resetState(); } @@ -757,7 +759,7 @@ InstructionQueue::commit(const InstSeqNum &inst, ThreadID tid) { DPRINTF(IQ, "[tid:%i] Committing instructions older than [sn:%llu]\n", tid,inst); - scheduler->doCommit(inst); + scheduler->doCommit(inst, tid); } int @@ -1121,7 +1123,9 @@ InstructionQueue::doSquash(ThreadID tid) DPRINTF(IQ, "[tid:%i] Squashing until sequence number %i!\n", tid, squashedSeqNum[tid]); - scheduler->doSquash(squashedSeqNum[tid]); + squashInfo.squashTid = tid; + squashInfo.squashSn = squashedSeqNum[tid]; + scheduler->doSquash(squashInfo); for (auto it = mdpAddrReplayLdInsts.begin(); it != mdpAddrReplayLdInsts.end();) { if (!it->inst || @@ -1134,7 +1138,7 @@ InstructionQueue::doSquash(ThreadID tid) } for (auto it = nonSpecInsts.begin(); it != nonSpecInsts.end();) { - if (it->first > squashedSeqNum[tid]) { + if (it->first > squashedSeqNum[tid] && (it->second->threadNumber == tid)) { auto& squashed_inst = it->second; if (!squashed_inst->isIssued() || (squashed_inst->isMemRef() && diff --git a/src/cpu/o3/inst_queue.hh b/src/cpu/o3/inst_queue.hh index db01710da9..f163ebb28e 100644 --- a/src/cpu/o3/inst_queue.hh +++ b/src/cpu/o3/inst_queue.hh @@ -427,6 +427,7 @@ class InstructionQueue /** The sequence number of the squashed instruction. */ InstSeqNum squashedSeqNum[MaxThreads]; + SquashInfo squashInfo; struct IQStats : public statistics::Group { diff --git a/src/cpu/o3/issue_queue.cc b/src/cpu/o3/issue_queue.cc index 1017698961..87a3e39dd7 100644 --- a/src/cpu/o3/issue_queue.cc +++ b/src/cpu/o3/issue_queue.cc @@ -140,6 +140,58 @@ PAgeSelector::select(ReadyQue::iterator begin, int portid) } } +void +SMTBasedSelector::setparent(Scheduler* scheduler, IssueQue* iq) +{ + BaseSelector::setparent(scheduler, iq); + + smtScheduler = iq->getIndependentIQICountScheduler(); +} + +ReadyQue::iterator +SMTBasedSelector::select(ReadyQue::iterator begin, int portid) +{ + if (begin == end) { + return end; + } + + ThreadID priorityThread = 0; + + if (smtScheduler) { + priorityThread = smtScheduler->getThread(); + + DPRINTF(Schedule, + "SMTBasedSelector: priority thread = %d\n", + priorityThread); + } + + for (auto it = begin; it != end; it++) { + auto& inst = *it; + + if (inst->threadNumber == priorityThread) { + DPRINTF(Schedule, + "[sn:%llu] selected by SMT policy (tid=%d)\n", + inst->seqNum, priorityThread); + return it; + } + } + + + for (auto it = begin; it != end; it++) { + auto& inst = *it; + + if (inst->threadNumber != priorityThread) { + DPRINTF(Schedule, + "[sn:%llu] selected by default (tid=%d, priority=%d)\n", + inst->seqNum, inst->threadNumber, priorityThread); + return it; + } + } + + DPRINTF(Schedule, "SMTBasedSelector: no available instruction\n"); + return begin; +} + bool IssueQue::select_policy::operator()(const DynInstPtr& a, const DynInstPtr& b) const { @@ -301,6 +353,9 @@ IssueQue::IssueQue(const IssueQueParams& params) if (storePipeAcc) numStorePipe++; } + + //Init InstsCounter + instsCounter = new InstsCounter(); } void @@ -327,7 +382,9 @@ IssueQue::checkScoreboard(const DynInstPtr& inst) } // check bypass data ready or not if (!scheduler->bypassScoreboard[src->flatIndex()]) [[unlikely]] { - auto dst_inst = scheduler->getInstByDstReg(src->flatIndex()); + auto dst_inst = scheduler->getInstByDstReg(src->flatIndex(), + inst->threadNumber, + inst->seqNum); assert(dst_inst); if (!dst_inst->isLoad()) panic("dst[sn:%llu] is not load, src[sn:%llu]", dst_inst->seqNum, inst->seqNum); warn_once( @@ -350,6 +407,9 @@ IssueQue::addToFu(const DynInstPtr& inst) } inst->setIssued(); POPINST(inst); + if (hasInstsCounter()) { + decInIQInstsCounter(inst->threadNumber); + } scheduler->addToFU(inst); } @@ -489,14 +549,16 @@ IssueQue::wakeUpDependents(const DynInstPtr& inst, bool speculative) for (auto& it : depgraph) { int srcIdx = it.first; auto& consumer = it.second; - if (consumer->readySrcIdx(srcIdx)) { - continue; - } - consumer->markSrcRegReady(srcIdx); + if(consumer->threadNumber == inst->threadNumber){ + if (consumer->readySrcIdx(srcIdx)) { + continue; + } + consumer->markSrcRegReady(srcIdx); - DPRINTF(Schedule, "[sn:%llu] src%d was woken\n", consumer->seqNum, srcIdx); - addIfReady(consumer); + DPRINTF(Schedule, "[sn:%llu] src%d was woken\n", consumer->seqNum, srcIdx); + addIfReady(consumer); + } } if (!speculative) { @@ -697,6 +759,9 @@ IssueQue::insert(const DynInstPtr& inst) selector->allocate(inst); inst->issueQue = this; instList.emplace_back(inst); + if (hasInstsCounter()) { + incInIQInstsCounter(inst->threadNumber); + } bool addToDepGraph = false; for (int i = 0; i < inst->numSrcRegs(); i++) { auto src = inst->renamedSrcIdx(i); @@ -743,20 +808,28 @@ IssueQue::insertNonSpec(const DynInstPtr& inst) } void -IssueQue::doCommit(const InstSeqNum seqNum) +IssueQue::doCommit(const InstSeqNum seqNum, ThreadID tid) { - while (!instList.empty() && instList.front()->seqNum <= seqNum) { - assert(instList.front()->isIssued()); - instList.pop_front(); + for (auto it = instList.begin(); it != instList.end();) { + const auto &inst = *it; + if (inst->threadNumber == tid && inst->seqNum <= seqNum) { + assert(inst->isIssued()); + it = instList.erase(it); + } else { + ++it; + } } } void -IssueQue::doSquash(const InstSeqNum seqNum) +IssueQue::doSquash(SquashInfo squashInfo) { for (auto it = instList.begin(); it != instList.end();) { - if ((*it)->seqNum > seqNum) { + if (((*it)->seqNum > squashInfo.squashSn) && ((*it)->threadNumber == squashInfo.squashTid)) { if (!(*it)->isIssued()) { + if (hasInstsCounter()) { + decInIQInstsCounter((*it)->threadNumber); + } POPINST((*it)); (*it)->setIssued(); } @@ -779,7 +852,7 @@ IssueQue::doSquash(const InstSeqNum seqNum) int size = inflightIssues[-i].size; for (int j = 0; j < size; j++) { auto& inst = inflightIssues[-i].insts[j]; - if (inst && inst->isSquashed()) { + if (inst && inst->isSquashed() && (inst->threadNumber == squashInfo.squashTid)) { inst = nullptr; } } @@ -788,7 +861,7 @@ IssueQue::doSquash(const InstSeqNum seqNum) // clear in depGraph for (auto& entrys : subDepGraph) { for (auto it = entrys.begin(); it != entrys.end();) { - if ((*it).second->isSquashed()) { + if ((*it).second->isSquashed() && ((*it).second->threadNumber == squashInfo.squashTid)) { it = entrys.erase(it); } else { it++; @@ -797,6 +870,33 @@ IssueQue::doSquash(const InstSeqNum seqNum) } } +void +IssueQue::incInIQInstsCounter(ThreadID tid) +{ + if (instsCounter) { + instsCounter->incCounter(tid); + } +} + +void +IssueQue::decInIQInstsCounter(ThreadID tid) +{ + if (instsCounter) { + instsCounter->decCounter(tid); + } +} + +void +IssueQue::initIndependentIQICountScheduler(int numThreads) +{ + assert(instsCounter != nullptr && "InstsCounter must be set first"); + + independentIQICountScheduler = new IndependentIQICountScheduler( + numThreads, instsCounter); + + DPRINTF(Schedule, "[%s] IndependentIQICountScheduler created.\n",iqname); +} + Scheduler::SpecWakeupCompletion::SpecWakeupCompletion(const DynInstPtr& inst, IssueQue* to, PendingWakeEventsType* owner) : Event(Stat_Event_Pri, AutoDelete), inst(inst), owner(owner), to_issue_queue(to) @@ -1143,18 +1243,28 @@ Scheduler::ready(OpClass op, int disp_seq) } DynInstPtr -Scheduler::getInstByDstReg(RegIndex flatIdx) +Scheduler::getInstByDstReg(RegIndex flatIdx, ThreadID tid, + InstSeqNum consumerSeqNum) { + DynInstPtr candidate = nullptr; + for (auto iq : issueQues) { - for (auto& inst : iq->instList) { - for (auto i = 0; i < inst->numDestRegs(); i++) { - if (inst->renamedDestIdx(i)->flatIndex() == flatIdx) { - return inst; + for (auto &inst : iq->instList) { + if (inst->threadNumber != tid || inst->seqNum >= consumerSeqNum) { + continue; + } + for (int i = 0; i < inst->numDestRegs(); i++) { + if (inst->renamedDestIdx(i)->flatIndex() != flatIdx) { + continue; + } + if (!candidate || inst->seqNum > candidate->seqNum) { + candidate = inst; } } } } - return nullptr; + + return candidate; } void @@ -1394,12 +1504,14 @@ Scheduler::loadCancel(const DynInstPtr& inst) for (auto& it : iq->subDepGraph[dst->flatIndex()]) { int srcIdx = it.first; auto& depInst = it.second; - if (depInst->readySrcIdx(srcIdx)) { - DPRINTF(Schedule, "cancel [sn:%llu], clear src p%d ready\n", depInst->seqNum, - depInst->renamedSrcIdx(srcIdx)->flatIndex()); - depInst->issueQue->cancel(depInst); - depInst->clearSrcRegReady(srcIdx); - dfs.push(depInst); + if(depInst->threadNumber == inst->threadNumber){ + if (depInst->readySrcIdx(srcIdx)) { + DPRINTF(Schedule, "cancel [sn:%llu], clear src p%d ready\n", depInst->seqNum, + depInst->renamedSrcIdx(srcIdx)->flatIndex()); + depInst->issueQue->cancel(depInst); + depInst->clearSrcRegReady(srcIdx); + dfs.push(depInst); + } } } } @@ -1512,19 +1624,19 @@ Scheduler::isDrained() } void -Scheduler::doCommit(const InstSeqNum seqNum) +Scheduler::doCommit(const InstSeqNum seqNum, ThreadID tid) { for (auto it : issueQues) { - it->doCommit(seqNum); + it->doCommit(seqNum, tid); } } void -Scheduler::doSquash(const InstSeqNum seqNum) +Scheduler::doSquash(SquashInfo squashInfo) { - DPRINTF(Schedule, "doSquash until seqNum %lu\n", seqNum); + DPRINTF(Schedule, "doSquash until seqNum %lu\n", squashInfo.squashSn); for (auto it : issueQues) { - it->doSquash(seqNum); + it->doSquash(squashInfo); } } @@ -1538,6 +1650,17 @@ Scheduler::getIQInsts() return total; } +uint32_t +Scheduler::getIQInsts(ThreadID tid) +{ + uint32_t total = 0; + for (auto iq : issueQues) { + total += iq->getInstsCounter()->getCounter(tid);; + } + return total; +} + + void Scheduler::setMainRdpOpt(bool enable) { @@ -1546,5 +1669,19 @@ Scheduler::setMainRdpOpt(bool enable) } } +void +Scheduler::initIQICountSmtScheduler(int numThreads) +{ + DPRINTF(Schedule, "Initializing IQ SMT schedulers for %d thread.\n", numThreads); + + // to do: add switch;add SMTSchedulingPolicy + for (auto iq : issueQues) { + InstsCounter* counter = iq->getInstsCounter(); + assert(counter); + iq->initIndependentIQICountScheduler(numThreads); + iq->selector->setparent(this, iq); + } +} + } } diff --git a/src/cpu/o3/issue_queue.hh b/src/cpu/o3/issue_queue.hh index f804595b54..bade5f78f8 100644 --- a/src/cpu/o3/issue_queue.hh +++ b/src/cpu/o3/issue_queue.hh @@ -16,12 +16,14 @@ #include "cpu/inst_seq.hh" #include "cpu/o3/dyn_inst.hh" #include "cpu/o3/dyn_inst_ptr.hh" +#include "cpu/o3/smt_sched.hh" #include "cpu/reg_class.hh" #include "cpu/timebuf.hh" #include "params/BaseSelector.hh" #include "params/IssuePort.hh" #include "params/IssueQue.hh" #include "params/PAgeSelector.hh" +#include "params/SMTBasedSelector.hh" #include "params/Scheduler.hh" #include "params/SpecWakeupChannel.hh" #include "sim/sim_object.hh" @@ -99,11 +101,25 @@ class PAgeSelector : public BaseSelector ReadyQue::iterator select(ReadyQue::iterator begin, int portid) override; }; +class SMTBasedSelector : public BaseSelector +{ + private: + IndependentIQICountScheduler* smtScheduler = nullptr; + public: + SMTBasedSelector(const SMTBasedSelectorParams& params) : BaseSelector(params) {} + void setparent(Scheduler* scheduler, IssueQue* iq) override; + void allocate(const DynInstPtr& inst) override { BaseSelector::allocate(inst);} + void deallocate(const DynInstPtr& inst) override { BaseSelector::deallocate(inst);} + ReadyQue::iterator select(ReadyQue::iterator begin, int portid) override; +}; + class IssueQue : public SimObject { friend class Scheduler; friend class BaseSelector; friend class PAgeSelector; + friend class InstsCounter; + friend class IndependentIQICountScheduler; std::string _name; const int inports; @@ -171,6 +187,10 @@ class IssueQue : public SimObject Scheduler* scheduler = nullptr; BaseSelector* selector = nullptr; + //iq smt scheduler + InstsCounter* instsCounter = nullptr; + IndependentIQICountScheduler* independentIQICountScheduler = nullptr; + struct IssueQueStats : public statistics::Group { IssueQueStats(statistics::Group* parent, IssueQue* que, std::string name); @@ -206,6 +226,21 @@ class IssueQue : public SimObject void setMainRdpOpt(bool enable) { enableMainRdpOpt = enable; } void resetDepGraph(int numPhysRegs); + void setInstsCounter(InstsCounter* counter) { instsCounter = counter;} + + InstsCounter* getInstsCounter() const {return instsCounter; } + + void incInIQInstsCounter(ThreadID tid); + void decInIQInstsCounter(ThreadID tid); + bool hasInstsCounter() const { return instsCounter != nullptr; } + + void initIndependentIQICountScheduler(int numThreads); + + void setIndependentIQICountScheduler( IndependentIQICountScheduler* _independentIQICountScheduler ) { + independentIQICountScheduler = _independentIQICountScheduler; + } + IndependentIQICountScheduler* getIndependentIQICountScheduler() { return independentIQICountScheduler; } + void tick(); bool ready(); int emptyEntries() const { return iqsize - instNum; } @@ -217,8 +252,8 @@ class IssueQue : public SimObject void retryMem(const DynInstPtr& inst); bool idle(); - void doCommit(const InstSeqNum inst); - void doSquash(const InstSeqNum seqNum); + void doCommit(const InstSeqNum inst, ThreadID tid); + void doSquash(SquashInfo squashInfo); int getIssueStages() { return scheduleToExecDelay; } int getId() { return IQID; } @@ -329,12 +364,14 @@ class Scheduler : public SimObject void setAllScoreBoard(PhysRegIdPtr reg); void setMemDepUnit(MemDepUnit* memDepUnit) { this->memDepUnit = memDepUnit; } void setMainRdpOpt(bool enable); + void initIQICountSmtScheduler(int numThreads); void tick(); void issueAndSelect(); void lookahead(std::deque& insts); bool ready(const DynInstPtr& inst, int disp_seq); - DynInstPtr getInstByDstReg(RegIndex flatIdx); + DynInstPtr getInstByDstReg(RegIndex flatIdx, ThreadID tid, + InstSeqNum consumerSeqNum); void addProducer(const DynInstPtr& inst); // return true if insert successful @@ -356,9 +393,10 @@ class Scheduler : public SimObject uint32_t getCorrectedOpLat(const DynInstPtr& inst); bool hasReadyInsts(); bool isDrained(); - void doCommit(const InstSeqNum seqNum); - void doSquash(const InstSeqNum seqNum); + void doCommit(const InstSeqNum seqNum, ThreadID tid); + void doSquash(SquashInfo squashInfo); uint32_t getIQInsts(); + uint32_t getIQInsts(ThreadID tid); SchedulerStats& getStats() { return stats; } }; diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc index 759f974cbc..1c18cf33b0 100644 --- a/src/cpu/o3/lsq.cc +++ b/src/cpu/o3/lsq.cc @@ -89,23 +89,96 @@ LSQ::DcachePort::DcachePort(LSQ *_lsq, CPU *_cpu) : std::list LSQ::SingleDataRequest::singleList; +namespace +{ + +bool +storeBufferEntryEligibleForLoad(const LSQ::StoreBufferEntry *entry, + ThreadID load_tid, InstSeqNum load_seq, + uint64_t visible_generation) +{ + if (!entry) { + return false; + } + + if (entry->tid == load_tid) { + return entry->seqNum < load_seq; + } + + return entry->generation != 0 && entry->generation <= visible_generation; +} + +bool +storeBufferByteEligibleForLoad(const LSQ::StoreBufferEntry *entry, + size_t byte_idx, ThreadID load_tid, + InstSeqNum load_seq, + uint64_t visible_generation) +{ + if (!entry) { + return false; + } + + if (entry->tid == load_tid) { + return entry->seqNum < load_seq; + } + + if (!entry->sending) { + return false; + } + + return byte_idx < entry->byteGenerations.size() && + entry->byteGenerations[byte_idx] != 0 && + entry->byteGenerations[byte_idx] <= visible_generation; +} + +uint64_t +storeBufferEligibleGeneration(const LSQ::StoreBufferEntry *entry, + ThreadID load_tid, InstSeqNum load_seq, + uint64_t visible_generation) +{ + if (!entry) { + return 0; + } + + uint64_t best_generation = 0; + if (storeBufferEntryEligibleForLoad(entry, load_tid, load_seq, + visible_generation)) { + best_generation = entry->generation; + } + if (storeBufferEntryEligibleForLoad(entry->vice, load_tid, load_seq, + visible_generation)) { + best_generation = std::max(best_generation, entry->vice->generation); + } + return best_generation; +} + +} // anonymous namespace + void -LSQ::StoreBufferEntry::reset(ThreadID tid, uint64_t block_vaddr, uint64_t block_paddr, +LSQ::StoreBufferEntry::reset(ThreadID tid, InstSeqNum seq_num, + uint64_t block_vaddr, uint64_t block_paddr, uint64_t offset, uint8_t *datas, uint64_t size, - const std::vector &mask) + const std::vector &mask, + uint64_t generation) { std::fill(validMask.begin(), validMask.begin() + offset, false); + std::fill(byteGenerations.begin(), byteGenerations.end(), 0); for (int i = 0; i < size; i++) { validMask[offset + i] = mask[i]; + if (mask[i]) { + byteGenerations[offset + i] = generation; + } } std::fill(validMask.begin() + offset + size, validMask.end(), false); memcpy(blockDatas.data() + offset, datas, size); this->tid = tid; + this->seqNum = seq_num; this->blockVaddr = block_vaddr; this->blockPaddr = block_paddr; + this->generation = generation; this->sending = false; this->request = nullptr; this->vice = nullptr; @@ -113,19 +186,23 @@ LSQ::StoreBufferEntry::reset(ThreadID tid, uint64_t block_vaddr, uint64_t block_ void LSQ::StoreBufferEntry::merge(uint64_t offset, uint8_t *datas, uint64_t size, - const std::vector &mask) + const std::vector &mask, + uint64_t generation) { assert(offset + size <= validMask.size()); for (uint64_t i = 0; i < size; ++i) { if (mask[i]) { blockDatas[offset + i] = datas[i]; validMask[offset + i] = true; + byteGenerations[offset + i] = generation; } } } bool -LSQ::StoreBufferEntry::recordForward(RequestPtr req, LSQRequest *lsqreq) +LSQ::StoreBufferEntry::recordForward(RequestPtr req, LSQRequest *lsqreq, + ThreadID load_tid, InstSeqNum load_seq, + uint64_t visible_generation) { int offset = req->getPaddr() & (validMask.size() - 1); // the offset in the split request @@ -136,13 +213,21 @@ LSQ::StoreBufferEntry::recordForward(RequestPtr req, LSQRequest *lsqreq) bool full_forward = true; for (int i = 0; i < req->getSize(); i++) { assert(goffset + i < lsqreq->_size); - if (vice && vice->validMask[offset + i]) { + const bool vice_eligible = + vice && vice->validMask[offset + i] && + storeBufferByteEligibleForLoad(vice, offset + i, load_tid, + load_seq, visible_generation); + const bool self_eligible = + validMask[offset + i] && + storeBufferByteEligibleForLoad(this, offset + i, load_tid, + load_seq, visible_generation); + if (vice_eligible) { // vice is newer assert(vice->blockVaddr == blockVaddr); lsqreq->SBforwardPackets.push_back( LSQRequest::FWDPacket{ .idx = goffset + i, .byte = vice->blockDatas[offset + i]}); - } else if (validMask[offset + i]) { + } else if (self_eligible) { lsqreq->SBforwardPackets.push_back( LSQRequest::FWDPacket{ .idx = goffset + i, .byte = blockDatas[offset + i]}); @@ -182,6 +267,40 @@ LSQ::StoreBuffer::size() const return _size; } +uint64_t +LSQ::StoreBuffer::size(ThreadID tid) const +{ + uint64_t count = 0; + for (size_t index = 0; index < data_vec.size(); ++index) { + if (!data_vld[index]) { + continue; + } + + auto *entry = data_vec[index]; + if (entry && entry->tid == tid) { + ++count; + } + } + return count; +} + +uint64_t +LSQ::StoreBuffer::size(ThreadID tid, InstSeqNum seq_num) const +{ + uint64_t count = 0; + for (size_t index = 0; index < data_vec.size(); ++index) { + if (!data_vld[index]) { + continue; + } + + auto *entry = data_vec[index]; + if (entry && entry->tid == tid && entry->seqNum < seq_num) { + ++count; + } + } + return count; +} + uint64_t LSQ::StoreBuffer::unsentSize() const { @@ -243,6 +362,47 @@ LSQ::StoreBuffer::getEvict() return data_vec[index]; } +LSQ::StoreBufferEntry * +LSQ::StoreBuffer::getEvict(const bool *eligible_tids, size_t num_threads) +{ + return getEvict(eligible_tids, nullptr, num_threads); +} + +LSQ::StoreBufferEntry * +LSQ::StoreBuffer::getEvict(const bool *eligible_tids, + const InstSeqNum *eligible_seq, + size_t num_threads) +{ + if (eligible_tids == nullptr && eligible_seq == nullptr) { + return getEvict(); + } + + for (auto it = lru_index.rbegin(); it != lru_index.rend(); ++it) { + auto *entry = data_vec[*it]; + if (!entry) { + continue; + } + + const ThreadID tid = entry->tid; + if (tid >= num_threads) { + continue; + } + if (eligible_tids && !eligible_tids[tid]) { + continue; + } + if (eligible_seq && + eligible_seq[tid] != static_cast(-1) && + entry->seqNum >= eligible_seq[tid]) { + continue; + } + + lru_index.erase(std::find(lru_index.begin(), lru_index.end(), *it)); + return entry; + } + + return nullptr; +} + LSQ::StoreBufferEntry * LSQ::StoreBuffer::createVice(StoreBufferEntry *entry) { @@ -368,6 +528,7 @@ LSQ::LSQ(CPU *cpu_ptr, IEW *iew_ptr, const BaseO3CPUParams ¶ms) params.StoreCompletionWidth); thread[tid].init(cpu, iew_ptr, params, this, tid); thread[tid].setDcachePort(&dcachePort); + _storeBufferFlushing[tid] = false; } std::vector store_buffer_entries; @@ -637,6 +798,14 @@ LSQ::insertStore(const DynInstPtr &store_inst) thread[tid].insertStore(store_inst); } +bool +LSQ::splitStoreAddrSquashed(const DynInstPtr &inst) +{ + ThreadID tid = inst->threadNumber; + + return thread[tid].splitStoreAddrSquashed(inst); +} + void LSQ::issueToLoadPipe(const DynInstPtr &inst) { @@ -705,18 +874,26 @@ LSQ::processWriteback() if (storeBufferBlocked()) { - // dont offload store to sbuffer when sbuffer is flushing DPRINTF(StoreBuffer, "Store buffer is blocking, skip SQ offload\n"); return; } + std::vector offload_quota(numThreads, 0); std::vector offload_demand(numThreads, 0); std::vector requester_tids; requester_tids.reserve(activeThreads->size()); + for (ThreadID tid : *activeThreads) { offload_demand[tid] = thread[tid].countStoreBufferOffloadableEntries( maxStoreBufferEntriesAcceptedFromSQPerCycle); - if (offload_demand[tid] != 0) { + // During a global sbuffer flush, only threads that requested the + // flush may keep draining older committed stores from their SQ. + // If both SMT threads are flushing simultaneously, both must still be + // allowed to make forward progress, otherwise they can deadlock while + // waiting on each other's flush bit. + const bool conti = + !storeBufferFlushing() || storeBufferFlushing(tid); + if (conti && offload_demand[tid] != 0) { requester_tids.push_back(tid); } } @@ -760,17 +937,23 @@ LSQ::processWriteback() ThreadID tid = *threads++; thread[tid].offloadToStoreBuffer(offload_quota[tid]); } + + // A fence/flush only waits for the requesting thread's sbuffer domain. + for (ThreadID tid = 0; tid < numThreads; ++tid) { + if (!storeBufferFlushing(tid) || + !storeBufferEmpty(tid, _storeBufferFlushBeforeSeq[tid])) { + continue; + } + + clearStoreBufferFlushing(tid); + cpu->activityThisCycle(); + } } void LSQ::storeBufferWriteback() { bool can_evict = true; - if (storeBufferFlushing() && storeBuffer.size() == 0) [[unlikely]] { - assert(storeBuffer.unsentSize() == 0); - clearStoreBufferFlushing(); - cpu->activityThisCycle(); - } // write request will stall one cycle // so 2 cycle send one write request @@ -810,12 +993,23 @@ LSQ::storeBufferWriteback() } if (cause) { - StoreBufferEntry *entry = storeBuffer.getEvict(); + StoreBufferEntry *entry = nullptr; + if (*cause == StoreBufferEvictCause::Flush) { + entry = storeBuffer.getEvict( + _storeBufferFlushing, _storeBufferFlushBeforeSeq, + numThreads); + } else { + entry = storeBuffer.getEvict(); + } + if (!entry) { + /* Disabled with the broad sbuffer watchdog above. */ + return; + } + /* Disabled with the broad sbuffer watchdog above. */ auto &owner_unit = thread[entry->tid]; recordStoreBufferEviction(*cause); DPRINTF(StoreBuffer, "Evicting sbuffer entry[%#x]\n", entry->blockPaddr); - if (debug::StoreBuffer) { DPRINTFR(StoreBuffer, "Dumping sbuffer entry data\n"); for (int i = 0; i < owner_unit.cacheLineSize(); i++) { @@ -901,6 +1095,20 @@ void LSQ::completeSbufferEvict(PacketPtr pkt) { auto request = dynamic_cast(pkt->senderState); + const Addr block_paddr = request->sbuffer_entry->blockPaddr; + invalidateOtherThreadStoreBufferBytes(request->sbuffer_entry->tid, + request->mainReq()->getPaddr(), + request->mainReq()->getByteEnable(), + request->sbuffer_entry->generation); + markStoreBufferBlockVisible(block_paddr, + request->sbuffer_entry->generation); + const bool replay_executed_loads = + cpu->consumeSyncVisibleStoreReplay(request->sbuffer_entry->tid); + notifyOtherThreadsStoreVisible(request->sbuffer_entry->tid, + request->mainReq()->getPaddr(), + request->mainReq()->getByteEnable(), + request->sbuffer_entry->seqNum, + replay_executed_loads); if (cpu->goldenMemManager() && cpu->goldenMemManager()->inPmem(request->mainReq()->getPaddr())) { Addr paddr = request->mainReq()->getPaddr(); @@ -912,6 +1120,7 @@ LSQ::completeSbufferEvict(PacketPtr pkt) } storeBuffer.release(request->sbuffer_entry); + reclaimStoreBufferBlockMetadata(block_paddr); DPRINTF(StoreBuffer, "finish entry[%#x] evict to cache, sbuffer size: %d, " "unsentsize: %d\n", @@ -1074,7 +1283,6 @@ LSQ::recvTimingResp(PacketPtr pkt) LSQRequest *request = dynamic_cast(pkt->senderState); panic_if(!request, "Got packet back with unknown sender state\n"); - thread[request->_port.lsqID].recvTimingResp(pkt); if (pkt->isInvalidate()) { @@ -1337,6 +1545,12 @@ LSQ::lqEmpty() const return true; } +bool +LSQ::lqEmpty(ThreadID tid) const +{ + return thread[tid].lqEmpty(); +} + bool LSQ::sqEmpty() const { @@ -1353,6 +1567,12 @@ LSQ::sqEmpty() const return true; } +bool +LSQ::sqEmpty(ThreadID tid) const +{ + return thread[tid].sqEmpty(); +} + bool LSQ::lqFull() { @@ -1419,6 +1639,29 @@ LSQ::getLSQHeadInst(ThreadID tid, bool isLoad) } } +int +LSQ::getLoadPFSource(const DynInstPtr &inst) const +{ + if (!inst || !inst->isLoad() || inst->lqIdx < 0) { + return -1; + } + + const auto &entry = thread[inst->threadNumber].loadQueue[inst->lqIdx]; + auto *request = entry.request(); + if (!request) { + return -1; + } + + // A load can retire through a split request or after replay/discard has + // detached some request state. Prefetch source is best-effort metadata, so + // only query a live sub-request when one still exists. + if (request->numReqs() == 0) { + return -1; + } + + return request->req()->getPFSource(); +} + bool LSQ::isStalled() { @@ -1466,12 +1709,245 @@ LSQ::hasStoresToWB(ThreadID tid) return thread.at(tid).hasStoresToWB(); } -bool LSQ::flushStores(ThreadID tid) +bool +LSQ::hasStoresToWBBefore(ThreadID tid, InstSeqNum seq_num) +{ + return thread.at(tid).hasStoresToWBBefore(seq_num); +} + +bool +LSQ::flushStores(ThreadID tid) +{ + _storeBufferFlushing[tid] = true; + _storeBufferFlushBeforeSeq[tid] = static_cast(-1); + const bool has_stores = hasStoresToWB(tid); + const bool sbuffer_empty = + storeBufferEmpty(tid, _storeBufferFlushBeforeSeq[tid]); + if (!has_stores && sbuffer_empty) { + clearStoreBufferFlushing(tid); + return true; + } + + return false; +} + +bool +LSQ::flushStores(ThreadID tid, InstSeqNum seq_num) +{ + _storeBufferFlushing[tid] = true; + _storeBufferFlushBeforeSeq[tid] = seq_num; + const bool has_older_stores = hasStoresToWBBefore(tid, seq_num); + const bool sbuffer_empty = storeBufferEmpty(tid, seq_num); + if (!has_older_stores && sbuffer_empty) { + clearStoreBufferFlushing(tid); + return true; + } + + return false; +} + +void +LSQ::requestGlobalStoreBufferFlush() { - _storeBufferFlushing = true; - // TODO:high performance shared SMT storebuffer flushing - bool t = !hasStoresToWB(tid) && storeBufferEmpty(); - return t; + for (ThreadID tid = 0; tid < numThreads; ++tid) { + _storeBufferFlushing[tid] = true; + _storeBufferFlushBeforeSeq[tid] = static_cast(-1); + } +} + +bool +LSQ::storeBufferHasConflict(ThreadID tid, Addr block_paddr) const +{ + for (ThreadID other_tid = 0; other_tid < numThreads; ++other_tid) { + if (other_tid == tid) { + continue; + } + + if (storeBuffer.get(other_tid, block_paddr)) { + return true; + } + } + + return false; +} + +uint64_t +LSQ::bumpStoreBufferBlockVersion(Addr block_paddr) +{ + auto &version = storeBufferBlockVersion[block_paddr]; + ++version; + if (version == 0) { + version = 1; + } + return version; +} + +uint64_t +LSQ::currentStoreBufferBlockVersion(Addr block_paddr) const +{ + auto it = storeBufferBlockVersion.find(block_paddr); + return it == storeBufferBlockVersion.end() ? 0 : it->second; +} + +void +LSQ::markStoreBufferBlockVisible(Addr block_paddr, uint64_t generation) +{ + auto &visible = storeBufferVisibleVersion[block_paddr]; + visible = std::max(visible, generation); + reclaimStoreBufferBlockMetadata(block_paddr); +} + +uint64_t +LSQ::currentStoreBufferVisibleVersion(Addr block_paddr) const +{ + auto it = storeBufferVisibleVersion.find(block_paddr); + return it == storeBufferVisibleVersion.end() ? 0 : it->second; +} + +LSQ::StoreBufferEntry * +LSQ::findForwardingStoreBufferEntry(Addr block_paddr, ThreadID load_tid, + InstSeqNum load_seq) const +{ + StoreBufferEntry *best_entry = nullptr; + uint64_t best_generation = 0; + const auto visible_generation = + currentStoreBufferVisibleVersion(block_paddr); + + for (ThreadID tid = 0; tid < numThreads; ++tid) { + auto entry = storeBuffer.get(tid, block_paddr); + if (!entry) { + continue; + } + + const uint64_t entry_generation = + storeBufferEligibleGeneration(entry, load_tid, load_seq, + visible_generation); + if (entry_generation == 0) { + continue; + } + + if (!best_entry || entry_generation > best_generation) { + best_entry = entry; + best_generation = entry_generation; + } + } + + return best_entry; +} + +bool +LSQ::hasLiveStoreBufferBlock(Addr block_paddr) const +{ + for (ThreadID tid = 0; tid < numThreads; ++tid) { + if (storeBuffer.get(tid, block_paddr)) { + return true; + } + } + return false; +} + +void +LSQ::reclaimStoreBufferBlockMetadata(Addr block_paddr) +{ + if (hasLiveStoreBufferBlock(block_paddr)) { + return; + } + + auto version_it = storeBufferBlockVersion.find(block_paddr); + if (version_it == storeBufferBlockVersion.end()) { + storeBufferVisibleVersion.erase(block_paddr); + return; + } + + auto visible_it = storeBufferVisibleVersion.find(block_paddr); + const uint64_t visible_generation = + visible_it == storeBufferVisibleVersion.end() ? 0 : visible_it->second; + if (visible_generation < version_it->second) { + return; + } + + storeBufferBlockVersion.erase(version_it); + if (visible_it != storeBufferVisibleVersion.end()) { + storeBufferVisibleVersion.erase(visible_it); + } +} + +void +LSQ::invalidateOtherThreadStoreBufferBytes( + ThreadID tid, Addr paddr, const std::vector &mask, + uint64_t generation) +{ + const Addr cache_block_mask = + ~((static_cast(cpu->cacheLineSize())) - 1); + const Addr block_paddr = paddr & cache_block_mask; + const Addr offset = paddr & ~cache_block_mask; + auto invalidate_entry = [&](StoreBufferEntry *entry) { + if (!entry || offset + mask.size() > entry->validMask.size()) { + return; + } + + if (!entry->sending) { + return; + } + + for (size_t i = 0; i < mask.size(); ++i) { + if (mask[i] && + entry->byteGenerations[offset + i] != 0 && + entry->byteGenerations[offset + i] <= generation) { + entry->validMask[offset + i] = false; + } + } + }; + + for (ThreadID other_tid = 0; other_tid < numThreads; ++other_tid) { + if (other_tid == tid) { + continue; + } + + auto entry = storeBuffer.get(other_tid, block_paddr); + if (!entry) { + continue; + } + + invalidate_entry(entry); + invalidate_entry(entry->vice); + } +} + +void +LSQ::notifyOtherThreadsStoreVisible(ThreadID tid, Addr store_paddr, + const std::vector &byte_enable, + InstSeqNum store_seq, + bool replay_executed_loads) +{ + if (numThreads <= 1) { + return; + } + + Request::Flags flags; + const Addr cache_block_mask = + ~((static_cast(cpu->cacheLineSize())) - 1); + RequestPtr req = std::make_shared( + store_paddr & cache_block_mask, cpu->cacheLineSize(), flags, + cpu->dataRequestorId()); + Packet pkt(req, MemCmd::InvalidateReq); + + for (ThreadID context_id = 0; context_id < numThreads; ++context_id) { + gem5::ThreadContext *tc = cpu->getContext(context_id); + bool no_squash = cpu->thread[context_id]->noSquashFromTC; + cpu->thread[context_id]->noSquashFromTC = true; + tc->getIsaPtr()->handleLockedSnoop(&pkt, cache_block_mask); + cpu->thread[context_id]->noSquashFromTC = no_squash; + } + + for (ThreadID other_tid = 0; other_tid < numThreads; ++other_tid) { + if (other_tid == tid) { + continue; + } + thread[other_tid].checkLocalStoreVisible(store_paddr, byte_enable, + store_seq, + replay_executed_loads); + } } int @@ -1529,6 +2005,48 @@ LSQ::dumpInsts(ThreadID tid) const thread.at(tid).dumpInsts(); } +void +LSQ::dumpStoreBufferState(ThreadID tid, InstSeqNum seq_num) const +{ + cprintf("Store buffer state for tid %i:\n", tid); + cprintf(" flushing=%d flushBeforeSeq=%llu\n", + _storeBufferFlushing[tid], + static_cast(_storeBufferFlushBeforeSeq[tid])); + cprintf(" storesToWB=%d hasStoresToWBBefore=%d\n", + thread.at(tid).numStoresToSbuffer(), + thread.at(tid).hasStoresToWBBefore(seq_num)); + cprintf(" sbufferSize(tid)=%llu sbufferSizeBeforeSeq=%llu\n", + static_cast(storeBuffer.size(tid)), + static_cast(storeBuffer.size(tid, seq_num))); +} + +void +LSQ::dumpStoreBuffer(ThreadID tid) const +{ + cprintf("Store buffer entries for tid %i:\n", tid); + const auto &entries = storeBuffer.entries(); + for (size_t index = 0; index < entries.size(); ++index) { + if (!storeBuffer.valid(index)) { + continue; + } + + auto *entry = entries[index]; + if (!entry || entry->tid != tid) { + continue; + } + + cprintf(" idx:%d seq:%llu paddr:%#lx vaddr:%#lx sending=%d vice=%d generation=%llu request=%p\n", + entry->index, + static_cast(entry->seqNum), + entry->blockPaddr, + entry->blockVaddr, + entry->sending, + entry->vice != nullptr, + static_cast(entry->generation), + entry->request); + } +} + bool LSQ::isMisaligned(const DynInstPtr& inst, Addr vaddr, int size) { @@ -1820,6 +2338,12 @@ LSQ::SplitDataRequest::mainReq() return _mainReq; } +RequestPtr +LSQ::SplitDataRequest::mainReq() const +{ + return _mainReq; +} + void LSQ::SplitDataRequest::initiateTranslation() { @@ -2028,14 +2552,47 @@ LSQ::LSQRequest::forward() } } -LSQ::LSQRequest::~LSQRequest() +void +LSQ::LSQRequest::detachLSQEntry() { - if (isAnyOutstandingRequest()) { - warn("numInTranslationFragments = %u, _numOutstandingPackets = %u\n", - numInTranslationFragments, _numOutstandingPackets); - std::raise(SIGINT); + if (!_inst) { + return; } + + if (isLoad() && _inst->lqIdx >= 0 && + _port.loadQueue[_inst->lqIdx].request() == this) { + DPRINTF(LSQ, "inst [sn:%llu] Detach LSQRequest from LQ entry\n", + _inst->seqNum); + _port.loadQueue[_inst->lqIdx].setRequest(nullptr); + } else if ((isAtomic() || _inst->isStore()) && _inst->sqIdx >= 0 && + _port.storeQueue[_inst->sqIdx].request() == this) { + DPRINTF(LSQ, "inst [sn:%llu] Detach LSQRequest from SQ entry\n", + _inst->seqNum); + _port.storeQueue[_inst->sqIdx].setRequest(nullptr); + } +} + +void +LSQ::LSQRequest::detachInflightLoad() +{ + if (!isLoad()) { + return; + } + + auto &inflight = _port.inflightLoads; + auto it = std::find(inflight.begin(), inflight.end(), this); + if (it != inflight.end()) { + DPRINTF(LSQ, "inst [sn:%llu] Detach LSQRequest from inflightLoads\n", + _inst ? _inst->seqNum : 0); + inflight.erase(it); + } +} + +LSQ::LSQRequest::~LSQRequest() +{ assert(!isAnyOutstandingRequest()); + detachLSQEntry(); + detachInflightLoad(); if (_inst && _inst->savedRequest == this) { DPRINTF(LSQ, "inst [sn:%llu] Deleting LSQRequest, savedRequest\n", _inst->seqNum); _inst->savedRequest = nullptr; @@ -2125,7 +2682,6 @@ LSQ::SingleDataRequest::recvTimingResp(PacketPtr pkt) mainReq()->isUncacheable(), cacheHit, *((uint64_t*)buffer)); } - if (isLoad()) { auto it = std::find(lsqUnit()->inflightLoads.begin(), lsqUnit()->inflightLoads.end(), this); if (it != lsqUnit()->inflightLoads.end()) { diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh index 788ff0ae29..6ebbe9d5dd 100644 --- a/src/cpu/o3/lsq.hh +++ b/src/cpu/o3/lsq.hh @@ -64,6 +64,7 @@ #include "cpu/inst_seq.hh" #include "cpu/o3/dyn_inst_ptr.hh" #include "cpu/o3/dyn_inst_xsmeta.hh" +#include "cpu/o3/limits.hh" #include "cpu/utils.hh" #include "enums/SMTQueuePolicy.hh" #include "mem/packet.hh" @@ -146,10 +147,13 @@ class LSQ public: const int index; ThreadID tid; + InstSeqNum seqNum = 0; Addr blockVaddr; Addr blockPaddr; std::vector blockDatas; std::vector validMask; + std::vector byteGenerations; + uint64_t generation = 0; bool sending; // the another same addr entry when sending // another cannot sending until self sending finished @@ -161,16 +165,20 @@ class LSQ { blockDatas.resize(size, 0); validMask.resize(size, false); + byteGenerations.resize(size, 0); } - void reset(ThreadID tid, uint64_t block_vaddr, uint64_t block_paddr, - uint64_t offset, uint8_t *datas, uint64_t size, - const std::vector &mask); + void reset(ThreadID tid, InstSeqNum seq_num, uint64_t block_vaddr, + uint64_t block_paddr, uint64_t offset, uint8_t *datas, + uint64_t size, const std::vector &mask, + uint64_t generation); void merge(uint64_t offset, uint8_t *datas, uint64_t size, - const std::vector &mask); + const std::vector &mask, uint64_t generation); - bool recordForward(RequestPtr req, LSQRequest *lsqreq); + bool recordForward(RequestPtr req, LSQRequest *lsqreq, + ThreadID load_tid, InstSeqNum load_seq, + uint64_t visible_generation); }; class StoreBuffer @@ -197,12 +205,21 @@ class LSQ void setData(std::vector &data_vec); bool full() const; uint64_t size() const; + uint64_t size(ThreadID tid) const; + uint64_t size(ThreadID tid, InstSeqNum seq_num) const; uint64_t unsentSize() const; + const std::vector &entries() const { return data_vec; } + bool valid(size_t index) const { return data_vld.at(index); } StoreBufferEntry *getEmpty(); void insert(StoreBufferEntry *entry); StoreBufferEntry *get(ThreadID tid, uint64_t addr) const; void update(int index); StoreBufferEntry *getEvict(); + StoreBufferEntry *getEvict(const bool *eligible_tids, + size_t num_threads); + StoreBufferEntry *getEvict(const bool *eligible_tids, + const InstSeqNum *eligible_seq, + size_t num_threads); StoreBufferEntry *createVice(StoreBufferEntry *entry); void release(StoreBufferEntry *entry); }; @@ -350,6 +367,8 @@ class LSQ AtomicOpFunctorPtr _amo_op; bool _hasStaleTranslation; bool _sbufferBypass; + bool _goldenSnapshotCaptured = false; + uint64_t _storeBufferGeneration = 0; struct FWDPacket { @@ -370,6 +389,14 @@ class LSQ /** Install the request in the LQ/SQ. */ void install(); + /** If the request is still installed in the current LQ/SQ slot, + * detach that slot so later scans do not observe a discarded or + * deleted request through the queue entry. */ + void detachLSQEntry(); + + /** Remove the request from the in-flight load tracker if present. */ + void detachInflightLoad(); + bool squashed() const override; @@ -476,6 +503,7 @@ class LSQ RequestPtr req(int idx = 0) { return _reqs.at(idx); } const RequestPtr req(int idx = 0) const { return _reqs.at(idx); } + size_t numReqs() const { return _reqs.size(); } Addr getVaddr(int idx = 0) const { return req(idx)->getVaddr(); } virtual void initiateTranslation() = 0; @@ -496,6 +524,13 @@ class LSQ return req(); } + virtual RequestPtr + mainReq() const + { + assert (_reqs.size() == 1); + return req(); + } + /** * Test if there is any in-flight translation or mem access request */ @@ -635,6 +670,8 @@ class LSQ void discard() { + detachLSQEntry(); + detachInflightLoad(); release(Flag::Discarded); } @@ -766,6 +803,7 @@ class LSQ virtual bool isCacheBlockHit(Addr blockAddr, Addr cacheBlockMask); virtual RequestPtr mainReq(); + virtual RequestPtr mainReq() const; virtual PacketPtr mainPacket(); virtual std::string name() const { return "SplitDataRequest"; } }; @@ -823,6 +861,7 @@ class LSQ void insertLoad(const DynInstPtr &load_inst); /** Inserts a store into the LSQ. */ void insertStore(const DynInstPtr &store_inst); + bool splitStoreAddrSquashed(const DynInstPtr &inst); /** Executes an amo inst. */ Fault executeAmo(const DynInstPtr &inst); @@ -938,8 +977,12 @@ class LSQ bool isEmpty() const; /** Returns if all of the LQs are empty. */ bool lqEmpty() const; + /** Returns if the LQ of a given thread is empty. */ + bool lqEmpty(ThreadID tid) const; /** Returns if all of the SQs are empty. */ bool sqEmpty() const; + /** Returns if the SQ of a given thread is empty. */ + bool sqEmpty(ThreadID tid) const; /** Returns if any of the LQs are full. */ bool lqFull(); @@ -954,6 +997,8 @@ class LSQ /** Returns whether the head instruction of sq has completed*/ const DynInstPtr& getLSQHeadInst(ThreadID tid, bool isLoad); + int getLoadPFSource(const DynInstPtr &inst) const; + /** * Returns if the LSQ is stalled due to a memory operation that must be * replayed. @@ -972,9 +1017,29 @@ class LSQ * to memory. */ bool hasStoresToWB(ThreadID tid); + bool hasStoresToWBBefore(ThreadID tid, InstSeqNum seq_num); // true if all stores are flushed bool flushStores(ThreadID tid); + bool flushStores(ThreadID tid, InstSeqNum seq_num); + void requestGlobalStoreBufferFlush(); + bool storeBufferHasConflict(ThreadID tid, Addr block_paddr) const; + uint64_t bumpStoreBufferBlockVersion(Addr block_paddr); + uint64_t currentStoreBufferBlockVersion(Addr block_paddr) const; + void markStoreBufferBlockVisible(Addr block_paddr, uint64_t generation); + uint64_t currentStoreBufferVisibleVersion(Addr block_paddr) const; + StoreBufferEntry *findForwardingStoreBufferEntry(Addr block_paddr, + ThreadID load_tid, + InstSeqNum load_seq) const; + bool hasLiveStoreBufferBlock(Addr block_paddr) const; + void reclaimStoreBufferBlockMetadata(Addr block_paddr); + void invalidateOtherThreadStoreBufferBytes( + ThreadID tid, Addr paddr, const std::vector &mask, + uint64_t generation); + void notifyOtherThreadsStoreVisible(ThreadID tid, Addr store_paddr, + const std::vector &byte_enable, + InstSeqNum store_seq, + bool replay_executed_loads); /** Returns the number of stores a specific thread has to write back. */ int numStoresToSbuffer(ThreadID tid); @@ -990,6 +1055,10 @@ class LSQ void dumpInsts() const; /** Debugging function to print out instructions from a specific thread. */ void dumpInsts(ThreadID tid) const; + /** Debugging function to print store-buffer flush state for a thread. */ + void dumpStoreBufferState(ThreadID tid, InstSeqNum seq_num) const; + /** Debugging function to print store-buffer entries for a thread. */ + void dumpStoreBuffer(ThreadID tid) const; bool isMisaligned(const DynInstPtr& inst, Addr vaddr, int size); @@ -1077,8 +1146,34 @@ class LSQ bool getDcacheWriteStall() { return dcacheWriteStall; } StoreBuffer &getStoreBuffer() { return storeBuffer; } bool storeBufferEmpty() const { return storeBuffer.size() == 0; } - bool storeBufferFlushing() const { return _storeBufferFlushing; } - void clearStoreBufferFlushing() { _storeBufferFlushing = false; } + bool storeBufferEmpty(ThreadID tid) const + { + return storeBuffer.size(tid) == 0; + } + bool storeBufferEmpty(ThreadID tid, InstSeqNum seq_num) const + { + return storeBuffer.size(tid, seq_num) == 0; + } + bool storeBufferFlushing(ThreadID tid) const { return _storeBufferFlushing[tid]; } + bool storeBufferFlushing() const + { + for (auto tid : *activeThreads) { + if (_storeBufferFlushing[tid]) + return true; + } + return false; + } + void clearStoreBufferFlushing(ThreadID tid) + { + _storeBufferFlushing[tid] = false; + _storeBufferFlushBeforeSeq[tid] = static_cast(-1); + } + void clearStoreBufferFlushing() { + for (auto tid : *activeThreads) { + _storeBufferFlushing[tid] = false; + _storeBufferFlushBeforeSeq[tid] = static_cast(-1); + } + } uint32_t getSbufferEvictThreshold() const { return sbufferEvictThreshold; } uint32_t getSbufferEntries() const { return sbufferEntries; } uint64_t getStoreBufferInactiveCycles() const @@ -1139,7 +1234,6 @@ class LSQ std::vector dcacheRefillDataRead; std::vector dcacheRefillDataWrite; std::vector dcacheRefillTagWrite; - bool isDcacheRefillTagWrite() const { for (auto stage : dcacheRefillTagWrite) { @@ -1168,7 +1262,12 @@ class LSQ const uint64_t storeBufferInactiveThreshold; const uint32_t maxStoreBufferEntriesAcceptedFromSQPerCycle = 2; StoreBuffer storeBuffer; - bool _storeBufferFlushing = false; + std::unordered_map storeBufferBlockVersion; + std::unordered_map storeBufferVisibleVersion; + bool _storeBufferFlushing[MaxThreads] = {false}; + InstSeqNum _storeBufferFlushBeforeSeq[MaxThreads] = { + static_cast(-1) + }; uint64_t storeBufferWritebackInactive = 0; StoreBufferEntry *blockedSbufferEntry = nullptr; ThreadID nextStoreBufferOffloadTid = InvalidThreadID; diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc index a358b9df19..f027cdb7db 100644 --- a/src/cpu/o3/lsq_unit.cc +++ b/src/cpu/o3/lsq_unit.cc @@ -182,10 +182,10 @@ LSQUnit::SQEntry::setStatus(SplitStoreStatus status) LSQUnit::WritebackRegEvent::WritebackRegEvent(const DynInstPtr &_inst, PacketPtr _pkt, LSQUnit *lsq_ptr) : Event(Default_Pri, AutoDelete), - inst(_inst), pkt(_pkt), lsqPtr(lsq_ptr) + inst(_inst), request(_inst->savedRequest), pkt(_pkt), lsqPtr(lsq_ptr) { - assert(_inst->savedRequest); - _inst->savedRequest->writebackScheduled(); + assert(request); + request->writebackScheduled(); } void @@ -195,8 +195,8 @@ LSQUnit::WritebackRegEvent::process() lsqPtr->writebackReg(inst, pkt); - assert(inst->savedRequest); - inst->savedRequest->writebackDone(); + assert(request); + request->writebackDone(); delete pkt; } @@ -349,19 +349,49 @@ LSQUnit::completeDataAccess(PacketPtr pkt) if (inst->isLoad() || inst->isAtomic()) { Addr addr = pkt->getAddr(); auto [enable_diff, diff_all_states] = cpu->getDiffAllStates(); - if (system->multiCore() && enable_diff && !request->_sbufferBypass && + if (system->multiContextDifftest() && enable_diff && + request->_sbufferBypass && + inst->isLoad() && + cpu->goldenMemManager()->inPmem(addr)) { + // A store-forwarded load may legitimately observe a value that + // is newer than the current shared golden memory snapshot. + // Keep the observed value on the instruction so difftest can + // repair the reference state for this hart if needed. + inst->setGolden(pkt->getPtr()); + } + if (system->multiContextDifftest() && enable_diff && + !request->_sbufferBypass && cpu->goldenMemManager()->inPmem(addr)) { - // check data with golden mem - uint8_t *golden_data = (uint8_t *)cpu->goldenMemManager()->guestToHost(addr); uint8_t *loaded_data = pkt->getPtr(); size_t size = pkt->getSize(); - if (memcmp(golden_data, loaded_data, size) == 0) { - assert(size == inst->effSize); - inst->setGolden(golden_data); + assert(size == inst->effSize); + + if (inst->isAtomic()) { + uint8_t current_golden[8] = {}; + panic_if(size > sizeof(current_golden), + "Unexpected AMO size %u at addr %#lx\n", + size, addr); + cpu->goldenMemManager()->readGoldenMem(addr, current_golden, + size); + + // Preserve the DUT-observed old value until completeStore() + // derives the post-AMO memory image. The golden old-value + // snapshot used by difftest is captured when the request + // is first sent, before later concurrent updates can + // advance shared memory. + inst->setGolden(loaded_data); } else { - panic("Data error at addr %#lx, size %d. %s\n", - addr, size, - goldenDiffStr(loaded_data, golden_data, size).c_str()); + // check data with golden mem + uint8_t *golden_data = + (uint8_t *)cpu->goldenMemManager()->guestToHost(addr); + if (memcmp(golden_data, loaded_data, size) != 0) { + DPRINTF(Diff, + "[tid:%d] [sn:%llu] Load sees value different from " + "current golden memory at addr %#lx, size %d. " + "Treating as concurrent update window. %s\n", + inst->threadNumber, inst->seqNum, addr, size, + goldenDiffStr(loaded_data, golden_data, size).c_str()); + } } } } @@ -737,6 +767,44 @@ LSQUnit::insertStore(const DynInstPtr& store_inst) storeQueue.back().set(store_inst); } +LSQUnit::LSQRequest * +LSQUnit::currentLoadRequest(const DynInstPtr &inst) +{ + return (inst && inst->lqIdx >= 0) ? loadQueue[inst->lqIdx].request() + : nullptr; +} + +LSQUnit::LSQRequest * +LSQUnit::currentStoreRequest(const DynInstPtr &inst) +{ + return (inst && inst->sqIdx >= 0) ? storeQueue[inst->sqIdx].request() + : nullptr; +} + +bool +LSQUnit::splitStoreAddrSquashed(const DynInstPtr &inst) +{ + if (!inst->isSplitStoreData()) { + return false; + } + + if (!storeQueue.isValidIdx(inst->sqIdx)) { + return true; + } + + auto sq_it = storeQueue.getIterator(inst->sqIdx); + if (!sq_it->valid()) { + return true; + } + + const auto &sta_inst = sq_it->instruction(); + if (!sta_inst || sta_inst->seqNum != inst->seqNum) { + return true; + } + + return sta_inst->isSquashed(); +} + bool LSQUnit::pipeLineNukeCheck(const DynInstPtr &load_inst, const DynInstPtr &store_inst) { @@ -746,9 +814,10 @@ LSQUnit::pipeLineNukeCheck(const DynInstPtr &load_inst, const DynInstPtr &store_ Addr store_eff_addr1 = store_inst->physEffAddr >> depCheckShift; Addr store_eff_addr2 = (store_inst->physEffAddr + store_inst->effSize - 1) >> depCheckShift; - LSQRequest* store_req = store_inst->savedRequest; + LSQRequest* store_req = currentStoreRequest(store_inst); + LSQRequest* load_req = currentLoadRequest(load_inst); // Dont perform pipe line nuke check for split load - bool load_is_splited = load_inst->savedRequest && load_inst->savedRequest->isSplit(); + bool load_is_splited = load_req && load_req->isSplit(); bool load_need_check = !load_is_splited && load_inst->effAddrValid() && (load_inst->lqIt >= store_inst->lqIt); bool store_need_check = store_req && store_req->isTranslationComplete() && @@ -828,7 +897,7 @@ LSQUnit::checkSnoop(PacketPtr pkt) DynInstPtr ld_inst = iter->instruction(); assert(ld_inst); - LSQRequest *request = ld_inst->savedRequest; + LSQRequest *request = iter->request(); // Check that this snoop didn't just invalidate our lock flag if (ld_inst->effAddrValid() && request && @@ -842,7 +911,7 @@ LSQUnit::checkSnoop(PacketPtr pkt) while (++iter != loadQueue.end()) { ld_inst = iter->instruction(); assert(ld_inst); - request = ld_inst->savedRequest;// iter->request(); + request = iter->request(); if (!ld_inst->effAddrValid() || ld_inst->strictlyOrdered()) continue; @@ -886,12 +955,131 @@ LSQUnit::checkSnoop(PacketPtr pkt) return; } +namespace +{ + +bool +overlapsVisibleStore(const o3::LSQ::LSQRequest *load_req, Addr store_paddr, + const std::vector &store_byte_enable) +{ + if (!load_req) { + return false; + } + + for (size_t req_idx = 0; req_idx < load_req->numReqs(); ++req_idx) { + const auto req = load_req->req(req_idx); + if (!req->hasPaddr()) { + continue; + } + + const Addr load_start = req->getPaddr(); + const Addr load_end = load_start + req->getSize(); + for (size_t byte_idx = 0; byte_idx < store_byte_enable.size(); + ++byte_idx) { + if (!store_byte_enable[byte_idx]) { + continue; + } + + const Addr byte_addr = store_paddr + byte_idx; + if (byte_addr >= load_start && byte_addr < load_end) { + return true; + } + } + } + + return false; +} + +} // anonymous namespace + +void +LSQUnit::checkLocalStoreVisible(Addr store_paddr, + const std::vector &store_byte_enable, + InstSeqNum store_seq, + bool replay_executed_loads) +{ + [[maybe_unused]] const InstSeqNum visible_store_seq = store_seq; + [[maybe_unused]] const bool replay_visible_loads = replay_executed_loads; + + if (loadQueue.empty()) { + return; + } + + const Addr block_addr = store_paddr & cacheBlockMask; + DynInstPtr oldest_violator = memDepViolator; + + for (auto it = loadQueue.begin(); it != loadQueue.end(); ++it) { + DynInstPtr ld_inst = it->instruction(); + if (!ld_inst || ld_inst->isSquashed() || ld_inst->needReplay() || + !ld_inst->effAddrValid() || ld_inst->strictlyOrdered()) { + continue; + } + + LSQRequest *request = it->request(); + // Replay/cancel paths can leave the dyninst carrying a stale + // savedRequest pointer after the active LQ request has been replaced + // or dropped. Only the current queue entry request is safe here. + if (!request || !request->isCacheBlockHit(block_addr, cacheBlockMask)) { + continue; + } + if (!overlapsVisibleStore(request, store_paddr, store_byte_enable)) { + continue; + } + if (ld_inst->memReqFlags & Request::LLSC) { + ld_inst->tcBase()->getIsaPtr()->handleLockedSnoopHit(ld_inst.get()); + } + + if (ld_inst->isExecuted()) { + DPRINTF(LSQUnit, + "Local visible store ignores already executed load " + "[sn:%lli] on addr %#x\n", + ld_inst->seqNum, store_paddr); + continue; + } + + ld_inst->hitExternalSnoop(true); + ld_inst->possibleLoadViolation(true); + DPRINTF(LSQUnit, + "Local visible store replays not-yet-executed load [sn:%lli] " + "on addr %#x\n", + ld_inst->seqNum, store_paddr); + ld_inst->setNukeReplay(); + loadSetReplay(ld_inst, request, true); + } + + if (oldest_violator && + (!memDepViolator || oldest_violator->seqNum < memDepViolator->seqNum)) { + memDepViolator = oldest_violator; + cpu->activityThisCycle(); + iewStage->SquashCheckAfterExe(oldest_violator); + } +} + Fault LSQUnit::checkViolations(typename LoadQueue::iterator& loadIt, const DynInstPtr& inst) { + LSQRequest *request = nullptr; + if (inst->isLoad()) { + if (inst->lqIdx >= 0) { + request = loadQueue[inst->lqIdx].request(); + } + } else if (inst->isStore() || inst->isAtomic()) { + if (inst->sqIdx >= 0) { + request = storeQueue[inst->sqIdx].request(); + } + } + + // Replay/cancel paths can drop the active LSQ request before the + // instruction is retried. In that window the dyninst may still carry a + // stale savedRequest pointer, so only the current LSQ entry request is + // safe to inspect here. + if (!request) { + return NoFault; + } + auto saved_it = loadIt; - for (auto req0 : inst->savedRequest->_reqs) { + for (auto req0 : request->_reqs) { Addr inst_eff_addr1 = req0->getPaddr() >> depCheckShift; Addr inst_eff_addr2 = (req0->getPaddr() + req0->getSize() - 1) >> depCheckShift; @@ -1000,13 +1188,11 @@ LSQUnit::loadSetReplay(DynInstPtr inst, LSQRequest* request, bool dropReqNow) // Reset DTB translation state inst->translationStarted(false); inst->translationCompleted(false); + inst->savedRequest = nullptr; // clear request in loadQueue loadQueue[inst->lqIdx].setRequest(nullptr); if (dropReqNow) { - // discard this request request->discard(); - // TODO: is this essential? - inst->savedRequest = nullptr; } DPRINTF(LoadPipeline, "Load [sn:%ld] set replay, dropReqNow: %d\n", inst->seqNum, dropReqNow); @@ -1058,8 +1244,9 @@ LSQUnit::loadDoTranslate(const DynInstPtr &inst) DPRINTF(LoadPipeline, "Load [sn:%llu] setTLBMissReplay\n", inst->seqNum); } - if (inst->savedRequest && inst->savedRequest->isTranslationComplete()) { - inst->setNormalLd(inst->savedRequest->isNormalLd()); + if (auto *request = currentLoadRequest(inst); + request && request->isTranslationComplete()) { + inst->setNormalLd(request->isNormalLd()); cpu->perfCCT->updateInstMeta(inst->seqNum, InstDetail::VAddress, inst->effAddr); cpu->perfCCT->updateInstMeta(inst->seqNum, InstDetail::PAddress, inst->physEffAddr); @@ -1074,7 +1261,7 @@ LSQUnit::loadDoSendRequest(const DynInstPtr &inst) DPRINTF(LoadPipeline, "loadDoSendRequest: load [sn:%lli]\n", inst->seqNum); assert(!inst->isSquashed()); Fault load_fault = inst->getFault(); - LSQRequest* request = inst->savedRequest; + LSQRequest* request = currentLoadRequest(inst); if (inst->effAddrValid()) { for (int i = 0; i < storePipeSx[1]->size; i++) { @@ -1120,9 +1307,9 @@ LSQUnit::loadDoSendRequest(const DynInstPtr &inst) } if (load_fault != NoFault && inst->translationCompleted() && - inst->savedRequest->isPartialFault() - && !inst->savedRequest->isComplete()) { - assert(inst->savedRequest->isSplit()); + request && request->isPartialFault() + && !request->isComplete()) { + assert(request->isSplit()); // If we have a partial fault where the mem access is not complete yet // then the cache must have been blocked. This load will be re-executed // when the cache gets unblocked. We will handle the fault when the @@ -1165,7 +1352,7 @@ LSQUnit::loadDoRecvData(const DynInstPtr &inst) DPRINTF(LoadPipeline, "loadDoRecvData: load [sn:%lli]\n", inst->seqNum); assert(!inst->isSquashed()); - LSQRequest* request = inst->savedRequest; + LSQRequest* request = currentLoadRequest(inst); if (inst->wakeUpEarly()) { auto& bus = getLsq()->bus; @@ -1257,7 +1444,7 @@ LSQUnit::loadDoRecvData(const DynInstPtr &inst) // No nuke happens, prepare the inst data // assert(request->isNormalLd() ? !request->isAnyOutstandingRequest() : true); - request = inst->savedRequest; + request = currentLoadRequest(inst); if (inst->fullForward()) { DPRINTF(LoadPipeline, "Load [sn:%llu] fullForward\n", inst->seqNum); assert(request); @@ -1314,13 +1501,25 @@ LSQUnit::executeLoadPipeSx() case 0: fault = loadDoTranslate(inst); break; - case 1: - iewStage->getScheduler()->specWakeUpFromLoadPipe(inst); - // Loads will mark themselves as executed, and their writeback - // event adds the instruction to the queue to commit + case 1: { fault = loadDoSendRequest(inst); + auto *request = currentLoadRequest(inst); + if (fault == NoFault && + !inst->replayOrSkipFollowingPipe() && + inst->readPredicate() && + inst->readMemAccPredicate() && + request && + request->isTranslationComplete() && + request->isMemAccessRequired()) { + iewStage->getScheduler()->specWakeUpFromLoadPipe( + inst); + } + // Loads will mark themselves as executed, and their + // writeback event adds the instruction to the queue + // to commit. iewStage->SquashCheckAfterExe(inst); break; + } case 2: fault = loadDoRecvData(inst); @@ -1377,10 +1576,12 @@ LSQUnit::executeLoadPipeSx() else if (inst->needCacheMissReplay()) iewStage->cacheMissLdReplay(inst); else if (inst->needMdpAddrReplay()) iewStage->mdpAddrReplayPipeDone(inst); else if (inst->needNukeReplay()) { - if (inst->cacheHit()) { - loadSetReplay(inst, inst->savedRequest, true); - } else if (inst->hasPendingCacheReq()) { - loadSetReplay(inst, inst->savedRequest, false); + if (auto *request = currentLoadRequest(inst); request) { + if (inst->cacheHit()) { + loadSetReplay(inst, request, true); + } else if (inst->hasPendingCacheReq()) { + loadSetReplay(inst, request, false); + } } inst->issueQue->retryMem(inst); } @@ -1410,7 +1611,10 @@ LSQUnit::executeLoadPipeSx() } if (i == loadPipeStages - 1 && !inst->needReplay()) { - if (inst->isNormalLd() || !inst->readMemAccPredicate()) iewStage->readyToFinish(inst); + if (inst->isExecuted() && + (inst->isNormalLd() || !inst->readMemAccPredicate())) { + iewStage->readyToFinish(inst); + } iewStage->activityThisCycle(); inst->endPipelining(); DPRINTF(LoadPipeline, "Load [sn:%llu] ready to finish\n", @@ -1538,6 +1742,10 @@ LSQUnit::executeStorePipeSx() continue; } + if (splitStoreAddrSquashed(inst)) { + inst->setSquashed(); + } + if (inst->isSquashed()) { DPRINTF(StorePipeline, "Execute: Instruction was squashed. PC: %s, [tid:%i]" " [sn:%llu]\n", inst->pcState(), inst->threadNumber, @@ -1752,7 +1960,31 @@ LSQUnit::commitStores(InstSeqNum &youngest_inst) if (x.instruction()->seqNum > youngest_inst) { break; } - assert(x.instruction()->isSplitStoreAddr() ? x.splitStoreFinish() : true); + // Commit can publish a new squash to IEW one cycle after IEW has + // already received an older doneMemSeqNum. If that stale + // doneMemSeqNum reaches here in the same cycle that ROB marks this + // store squashed, do not advance SQ writeback state past the + // squashed entry; IEW's next-cycle squash will remove it. + if (x.instruction()->isSquashed()) { + break; + } + if (x.instruction()->isSplitStoreAddr() && !x.splitStoreFinish()) { + panic("Split store reached commitStores unfinished: tid=%d " + "seq=%llu pc=%#lx youngest=%llu canCommit=%d " + "executed=%d squashed=%d addrReady=%d dataReady=%d " + "staFinish=%d stdFinish=%d canWB=%d completed=%d\n", + x.instruction()->threadNumber, + static_cast( + x.instruction()->seqNum), + x.instruction()->pcState().instAddr(), + static_cast(youngest_inst), + x.instruction()->readyToCommit(), + x.instruction()->isExecuted(), + x.instruction()->isSquashed(), + x.addrReady(), x.dataReady(), + x.staFinish(), x.stdFinish(), + x.canWB(), x.completed()); + } DPRINTF(LSQUnit, "Marking store as able to write back, PC " "%s [sn:%lli]\n", x.instruction()->pcState(), @@ -1765,6 +1997,31 @@ LSQUnit::commitStores(InstSeqNum &youngest_inst) } } +bool +LSQUnit::hasStoresToWBBefore(InstSeqNum seq_num) const +{ + if (storesToWB == 0) { + return false; + } + + for (auto it = storeQueue.begin(); it != storeQueue.end(); ++it) { + if (!it->valid() || !it->instruction()) { + continue; + } + + const auto &inst = it->instruction(); + if (inst->seqNum >= seq_num) { + break; + } + + if (it->canWB() && !it->completed()) { + return true; + } + } + + return false; +} + bool LSQUnit::writebackBlockedStore() { @@ -1772,8 +2029,25 @@ LSQUnit::writebackBlockedStore() return false; } - storeWBIt->request()->sendPacketToCache(); - if (storeWBIt->request()->isSent()) { + auto *request = storeWBIt->request(); + const auto &inst = storeWBIt->instruction(); + + if (request->mainReq()->hasPaddr() && + system->multiContextDifftest() && inst->isAtomic() && + cpu->goldenMemManager() && + cpu->goldenMemManager()->inPmem(request->mainReq()->getPaddr())) { + uint8_t issue_golden[8] = {}; + panic_if(request->_size > sizeof(issue_golden), + "Unexpected AMO size %u at addr %#lx\n", + request->_size, request->mainReq()->getPaddr()); + cpu->goldenMemManager()->readGoldenMem( + request->mainReq()->getPaddr(), issue_golden, request->_size); + std::memcpy(inst->getAmoOldGoldenValuePtr(), issue_golden, + request->_size); + } + + request->sendPacketToCache(); + if (request->isSent()) { storePostSend(); } return isStoreBlocked; @@ -1784,6 +2058,7 @@ LSQUnit::directStoreToCache() { DynInstPtr inst = storeWBIt->instruction(); LSQRequest* request = storeWBIt->request(); + if ((request->mainReq()->isLLSC() || request->mainReq()->isRelease()) && (storeWBIt.idx() != storeQueue.head())) { DPRINTF(LSQUnit, "Store idx:%i PC:%s to Addr:%#x " @@ -1832,6 +2107,28 @@ LSQUnit::directStoreToCache() } } + if (request->mainReq()->hasPaddr()) { + if (request->_storeBufferGeneration == 0) { + const Addr block_paddr = + request->mainReq()->getPaddr() & cacheBlockMask; + request->_storeBufferGeneration = + lsq->bumpStoreBufferBlockVersion(block_paddr); + } + + if (system->multiContextDifftest() && inst->isAtomic() && + cpu->goldenMemManager() && + cpu->goldenMemManager()->inPmem(request->mainReq()->getPaddr())) { + uint8_t issue_golden[8] = {}; + panic_if(request->_size > sizeof(issue_golden), + "Unexpected AMO size %u at addr %#lx\n", + request->_size, request->mainReq()->getPaddr()); + cpu->goldenMemManager()->readGoldenMem( + request->mainReq()->getPaddr(), issue_golden, request->_size); + std::memcpy(inst->getAmoOldGoldenValuePtr(), issue_golden, + request->_size); + } + } + if (request->mainReq()->isLocalAccess()) { assert(!inst->isStoreConditional()); assert(!inst->inHtmTransactionalState()); @@ -1893,6 +2190,7 @@ LSQUnit::offloadToStoreBuffer(uint32_t max_entries) { assert(!lsq->storeBufferBlocked()); if (isStoreBlocked) return; + if (max_entries == 0) return; uint32_t accepted_entries = 0; while (storesToWB > 0 && @@ -1923,17 +2221,20 @@ LSQUnit::offloadToStoreBuffer(uint32_t max_entries) request->mainReq()->isRelease() || request->mainReq()->isStrictlyOrdered() || inst->isStoreConditional()) { - DPRINTF(StoreBuffer, "Find atomic/SC store [sn:%llu]\n", storeWBIt->instruction()->seqNum); if (!(storeWBIt.idx() == storeQueue.head())) { - DPRINTF(StoreBuffer, "atomic/SC store waiting\n"); break; } - if (!storeBufferEmpty()) { - DPRINTF(StoreBuffer, "sbuffer need flush\n"); + if (request->mainReq()->hasPaddr()) { + const Addr block_paddr = + request->mainReq()->getPaddr() & cacheBlockMask; + if (lsq->storeBufferHasConflict(lsqID, block_paddr)) { + lsq->requestGlobalStoreBufferFlush(); + break; + } + } + if (!storeBufferEmpty(lsqID)) { lsq->flushStores(lsqID); break; - } else { - DPRINTF(StoreBuffer, "sbuffer finishing flushed\n"); } bool contin = directStoreToCache(); if (isStoreBlocked) { @@ -1956,8 +2257,9 @@ LSQUnit::offloadToStoreBuffer(uint32_t max_entries) uint64_t offset = vaddr - vbase; DPRINTF(LSQUnit, "Spilt store idx %d [sn:%lli] insert into sbuffer\n", i, inst->seqNum); assert(offset + req->getSize() <= storeWBIt->size()); - bool success = insertStoreBuffer(vaddr, paddr, (uint8_t *)storeWBIt->data() + offset, req->getSize(), - req->getByteEnable()); + bool success = insertStoreBuffer( + vaddr, paddr, (uint8_t *)storeWBIt->data() + offset, + req->getSize(), req->getByteEnable(), inst->seqNum); if (success) { request->_numOutstandingPackets++; } else { @@ -1977,8 +2279,9 @@ LSQUnit::offloadToStoreBuffer(uint32_t max_entries) Addr vaddr = request->getVaddr(); Addr paddr = request->mainReq()->getPaddr(); DPRINTF(LSQUnit, "Store [sn:%lli] insert into sbuffer\n", inst->seqNum); - bool success = insertStoreBuffer(vaddr, paddr, (uint8_t *)storeWBIt->data(), request->_size, - request->mainReq()->getByteEnable()); + bool success = insertStoreBuffer( + vaddr, paddr, (uint8_t *)storeWBIt->data(), request->_size, + request->mainReq()->getByteEnable(), inst->seqNum); if (!success) { break; } @@ -1990,7 +2293,10 @@ LSQUnit::offloadToStoreBuffer(uint32_t max_entries) } } -bool LSQUnit::insertStoreBuffer(Addr vaddr, Addr paddr, uint8_t* datas, uint64_t size, const std::vector& mask) +bool +LSQUnit::insertStoreBuffer(Addr vaddr, Addr paddr, uint8_t* datas, + uint64_t size, const std::vector& mask, + InstSeqNum store_seq) { auto &storeBuffer = lsq->getStoreBuffer(); // access range must in a cache block @@ -1998,14 +2304,18 @@ bool LSQUnit::insertStoreBuffer(Addr vaddr, Addr paddr, uint8_t* datas, uint64_t Addr blockVaddr = vaddr & cacheBlockMask; Addr blockPaddr = paddr & cacheBlockMask; Addr offset = paddr & ~cacheBlockMask; + // check request is not already in the storebuffer auto entry = storeBuffer.get(lsqID, blockPaddr); + const auto generation = lsq->bumpStoreBufferBlockVersion(blockPaddr); + if (entry) { if (entry->sending) { if (entry->vice) { // merge into vice entry = entry->vice; - entry->merge(offset, datas, size, mask); + entry->merge(offset, datas, size, mask, generation); + entry->generation = generation; DPRINTF(StoreBuffer, "Merging vice entry[%#x] for addr %#x\n", blockPaddr, paddr); } else { @@ -2017,14 +2327,18 @@ bool LSQUnit::insertStoreBuffer(Addr vaddr, Addr paddr, uint8_t* datas, uint64_t } stats.sbufferCreateVice++; auto vice = storeBuffer.createVice(entry); - vice->reset(lsqID, blockVaddr, blockPaddr, offset, datas, size, mask); + vice->reset(lsqID, store_seq, blockVaddr, blockPaddr, offset, + datas, size, mask, generation); + vice->generation = generation; DPRINTF(StoreBuffer, "Create new vice entry[%#x] for addr %#x\n", blockPaddr, paddr); } } else { // merge into unsent storeBuffer.update(entry->index); - entry->merge(offset, datas, size, mask); + entry->merge(offset, datas, size, mask, generation); + entry->seqNum = std::max(entry->seqNum, store_seq); + entry->generation = generation; DPRINTF(StoreBuffer, "Merging entry[%#x] for addr %#x\n", blockPaddr, paddr); } @@ -2037,7 +2351,9 @@ bool LSQUnit::insertStoreBuffer(Addr vaddr, Addr paddr, uint8_t* datas, uint64_t } // insert auto entry = storeBuffer.getEmpty(); - entry->reset(lsqID, blockVaddr, blockPaddr, offset, datas, size, mask); + entry->reset(lsqID, store_seq, blockVaddr, blockPaddr, offset, datas, + size, mask, generation); + entry->generation = generation; storeBuffer.insert(entry); DPRINTF(StoreBuffer, "Create new entry[%#x] for addr %#x\n", blockPaddr, paddr); @@ -2256,6 +2572,7 @@ LSQUnit::squash(const InstSeqNum &squashed_num) break; } } + } uint64_t @@ -2337,7 +2654,8 @@ LSQUnit::writebackReg(const DynInstPtr &inst, PacketPtr pkt) if (!htm_fault) { assert(dynamic_cast(inst->fault.get()) != nullptr || - inst->savedRequest->isPartialFault()); + (currentLoadRequest(inst) && + currentLoadRequest(inst)->isPartialFault())); } else if (!pkt->htmTransactionFailedInCache()) { // Situation in which the instruction has a hardware @@ -2358,8 +2676,12 @@ LSQUnit::writebackReg(const DynInstPtr &inst, PacketPtr pkt) } } - if (!inst->savedRequest->isNormalLd()) { - // Need to insert instruction into queue to commit + const bool finish_after_writeback = + !inst->isNormalLd() || !inst->inPipe(); + if (finish_after_writeback) { + // Normal loads usually wait for the last pipe stage to enqueue commit. + // If the response arrives after the load has already drained from the + // pipe, writeback must finish the instruction here. iewStage->readyToFinish(inst); iewStage->activityThisCycle(); } @@ -2383,14 +2705,51 @@ LSQUnit::completeStore(typename StoreQueue::iterator store_idx, bool from_sbuffe * store queue. */ DynInstPtr store_inst = store_idx->instruction(); auto request = store_idx->request(); - + // Predicated-off or zero-sized stores can legitimately reach completion + // without ever materializing a backing memory request. + const bool has_main_request = + request && request->numReqs() > 0; + const bool has_paddr = + has_main_request && request->mainReq()->hasPaddr(); DPRINTF(LSQUnit, "Completing store [sn:%lli], idx:%i, store head " "idx:%i\n", store_inst->seqNum, store_idx.idx() - 1, storeQueue.head() - 1); + if (!from_sbuffer && + (!store_inst->isStoreConditional() || store_inst->lockedWriteSuccess()) && + has_paddr) { + const Addr block_paddr = request->mainReq()->getPaddr() & cacheBlockMask; + auto generation = request->_storeBufferGeneration; + const bool replay_executed_loads = + store_inst->isAtomic() || cpu->consumeSyncVisibleStoreReplay(lsqID); + if (generation == 0) { + generation = lsq->bumpStoreBufferBlockVersion(block_paddr); + } + lsq->invalidateOtherThreadStoreBufferBytes( + lsqID, request->mainReq()->getPaddr(), + request->mainReq()->getByteEnable(), generation); + lsq->markStoreBufferBlockVisible(block_paddr, generation); + lsq->notifyOtherThreadsStoreVisible(lsqID, + request->mainReq()->getPaddr(), + request->mainReq()->getByteEnable(), store_inst->seqNum, + replay_executed_loads); + } + + if (from_sbuffer && + (!store_inst->isStoreConditional() || store_inst->lockedWriteSuccess()) && + has_paddr) { + auto generation = request->_storeBufferGeneration; + if (generation == 0) { + generation = lsq->bumpStoreBufferBlockVersion( + request->mainReq()->getPaddr() & cacheBlockMask); + request->_storeBufferGeneration = generation; + } + } + if (!from_sbuffer && (!store_inst->isStoreConditional() || store_inst->lockedWriteSuccess()) && cpu->goldenMemManager() && + has_paddr && cpu->goldenMemManager()->inPmem(request->mainReq()->getPaddr())) { Addr paddr = request->mainReq()->getPaddr(); if (!store_inst->isAtomic()) { @@ -2400,23 +2759,22 @@ LSQUnit::completeStore(typename StoreQueue::iterator store_idx, bool from_sbuffe request->_size); } else { uint8_t tmp_data[8]; - memset(tmp_data, 0, 8); - memcpy(tmp_data, store_inst->memData, request->_size); + memset(tmp_data, 0, sizeof(tmp_data)); assert(request->req()->getAtomicOpFunctor()); - // read golden memory to get the global latest value before this AMO is executed for further compare - cpu->goldenMemManager()->readGoldenMem(paddr, - store_inst->getAmoOldGoldenValuePtr(), request->_size); - cpu->diffInfo.amoOldGoldenValue = store_inst->getAmoOldGoldenValue(); + // The AMO response returns the old memory value. Capture it on the + // instruction so commit/difftest can use a per-inst golden copy + // under SMT, but derive the new memory image from the DUT-observed + // old value captured in goldenData. + memcpy(tmp_data, store_inst->getGolden(), request->_size); - // before amo operate on golden memory (*(request->req()->getAtomicOpFunctor()))(tmp_data); - // after amo operate on golden memory DPRINTF(LSQUnit, "AMO writing to golden memory at addr %#x, data %#lx, mask %#x, size %d\n", paddr, *((uint64_t *)(tmp_data)), 0xff, request->_size); cpu->goldenMemManager()->updateGoldenMem(paddr, tmp_data, 0xff, request->_size); + store_inst->setGolden(tmp_data); } } @@ -2522,11 +2880,15 @@ LSQUnit::trySendPacket(bool isLoad, PacketPtr data_pkt, bool &bank_conflict, boo request->packetSent(); if (isLoad) { - auto &storeBuffer = lsq->getStoreBuffer(); - auto entry = storeBuffer.get(lsqID, pkt->getAddr() & cacheBlockMask); + const Addr block_addr = pkt->getAddr() & cacheBlockMask; + auto entry = lsq->findForwardingStoreBufferEntry( + block_addr, lsqID, request->instruction()->seqNum); if (entry) { DPRINTF(StoreBuffer, "sbuffer entry[%#x] coverage %s\n", entry->blockPaddr, pkt->print()); - if (entry->recordForward(pkt->req, request)) { + if (entry->recordForward( + pkt->req, request, lsqID, + request->instruction()->seqNum, + lsq->currentStoreBufferVisibleVersion(block_addr))) { assert(request->isSplit()); // here must be split request stats.sbufferFullForward++; } else if (!request->SBforwardPackets.empty()) { @@ -2697,8 +3059,12 @@ LSQUnit::dumpInsts() const for (auto it = storeQueue.begin(); it != storeQueue.end(); ++it) { if (it->valid()) { const DynInstPtr &inst(it->instruction()); - cprintf("idx:%d %s.[sn:%llu] %s\n", it.idx(), inst->pcState(), inst->seqNum, - it->addrReady() ? "AddrReady" : "Not AddrReady"); + cprintf("idx:%d %s.[sn:%llu] %s squashed=%d canWB=%d completed=%d " + "dataReady=%d staFinish=%d stdFinish=%d\n", + it.idx(), inst->pcState(), inst->seqNum, + it->addrReady() ? "AddrReady" : "Not AddrReady", + inst->isSquashed(), it->canWB(), it->completed(), + it->dataReady(), it->staFinish(), it->stdFinish()); } } @@ -2930,19 +3296,37 @@ LSQUnit::read(LSQRequest *request, ssize_t load_idx) } if (request) { + request->SBforwardPackets.clear(); request->SQforwardPackets.clear(); + request->_sbufferBypass = false; + if (!load_inst->hasPendingCacheReq()) { + request->_goldenSnapshotCaptured = false; + } } // Check the SQ for any previous stores that might lead to forwarding auto store_it = load_inst->sqIt; - panic_if(store_it < storeWBIt, "[sn:%llu] Load instruction's store index is younger than store writeback index", - load_inst->seqNum); - // End once we've reached the top of the LSQ - while (store_it != storeWBIt && !load_inst->isDataPrefetch()) { + if (storeWBIt.dereferenceable()) { + panic_if(store_it < storeWBIt, + "[sn:%llu] Load instruction's store index is younger than " + "store writeback index", + load_inst->seqNum); + } + // End once we've reached the top of the LSQ. If storeWBIt is end(), there + // is no outstanding SQ forwarding window to scan. + while (storeWBIt.dereferenceable() && + store_it != storeWBIt && + !load_inst->isDataPrefetch()) { // Move the index to one younger store_it--; assert(store_it->valid()); assert(store_it->instruction()->seqNum < load_inst->seqNum); + auto store_req = store_it->request(); + + if (store_it->completed()) { + continue; + } + int store_size = store_it->size(); // Cache maintenance instructions go down via the store @@ -3077,9 +3461,6 @@ LSQUnit::read(LSQRequest *request, ssize_t load_idx) "addr %#x, data: %#lx\n", store_it->instruction()->seqNum, load_inst->seqNum, request->mainReq()->getPaddr(), *((uint64_t*)buffer)); } - - - load_inst->setFullForward(); // Don't need to do anything special for split loads. @@ -3131,11 +3512,13 @@ LSQUnit::read(LSQRequest *request, ssize_t load_idx) // sbuffer forward if (!load_inst->isDataPrefetch() && !request->isSplit()) { Addr blk_addr = request->mainReq()->getPaddr() & cacheBlockMask; - int offset = request->mainReq()->getPaddr() & ~cacheBlockMask; - auto &storeBuffer = lsq->getStoreBuffer(); - auto entry = storeBuffer.get(lsqID, blk_addr); + auto entry = lsq->findForwardingStoreBufferEntry( + blk_addr, lsqID, load_inst->seqNum); if (entry) { - if (entry->recordForward(request->mainReq(), request)) { + if (entry->recordForward(request->mainReq(), request, lsqID, + load_inst->seqNum, + lsq->currentStoreBufferVisibleVersion( + blk_addr))) { // full forward // no need to send to cache stats.sbufferFullForward++; @@ -3150,7 +3533,6 @@ LSQUnit::read(LSQRequest *request, ssize_t load_idx) DPRINTF(LoadPipeline, "Load [sn:%llu] forward from sbuffer, data: %lx\n", load_inst->seqNum, *((uint64_t*)buffer)); } - return NoFault; } } @@ -3196,9 +3578,21 @@ LSQUnit::read(LSQRequest *request, ssize_t load_idx) } else { DPRINTF(LoadPipeline, "Load [sn:%llu] sendPacketToCache\n", load_inst->seqNum); // if cannot forward from bus, do real cache access + bool should_capture_golden = + system->multiContextDifftest() && + cpu->goldenMemManager() && + cpu->goldenMemManager()->inPmem(request->mainReq()->getPaddr()) && + !request->_goldenSnapshotCaptured; request->buildPackets(); // if the cache is not blocked, do cache access request->sendPacketToCache(); + if (request->isSent() && should_capture_golden) { + uint8_t *issue_golden = + (uint8_t *)cpu->goldenMemManager()->guestToHost( + request->mainReq()->getPaddr()); + load_inst->setGolden(issue_golden); + request->_goldenSnapshotCaptured = true; + } if (!request->isSent() && !load_inst->needBankConflictReplay() && !load_inst->needMshrArbFailReplay() && !load_inst->needMshrAliasFailReplay() &&!load_inst->needHitInWriteBufferReplay()) { iewStage->blockMemInst(load_inst); diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh index b4e604310d..2e950ce1ce 100644 --- a/src/cpu/o3/lsq_unit.hh +++ b/src/cpu/o3/lsq_unit.hh @@ -154,6 +154,7 @@ class LSQUnit } LSQRequest* request() { return _request; } + const LSQRequest* request() const { return _request; } void setRequest(LSQRequest* r) { _request = r; } bool hasRequest() { return _request != nullptr; } /** Member accessors. */ @@ -212,6 +213,8 @@ class LSQUnit bool addrReady() const { return _addrReady; } bool dataReady() const { return _dataReady; } + bool staFinish() const { return _staFinish; } + bool stdFinish() const { return _stdFinish; } bool canForwardToLoad() const { return _addrReady && _dataReady; } bool splitStoreFinish() const { return _staFinish && _stdFinish; } @@ -302,6 +305,7 @@ class LSQUnit void insertLoad(const DynInstPtr &load_inst); /** Inserts a store instruction. */ void insertStore(const DynInstPtr &store_inst); + bool splitStoreAddrSquashed(const DynInstPtr &inst); /** Check for ordering violations in the LSQ. For a store squash if we * ever find a conflicting load. For a load, only squash if we @@ -326,6 +330,10 @@ class LSQUnit * of the intermediate invalidate. */ void checkSnoop(PacketPtr pkt); + void checkLocalStoreVisible(Addr store_paddr, + const std::vector &store_byte_enable, + InstSeqNum store_seq, + bool replay_executed_loads); /** Iq issues a load to load pipeline. */ void issueToLoadPipe(const DynInstPtr &inst); @@ -353,9 +361,12 @@ class LSQUnit /** Writes back stores. */ void offloadToStoreBuffer(uint32_t max_entries); - bool insertStoreBuffer(Addr vaddr, Addr paddr, uint8_t* datas, uint64_t size, const std::vector& mask); + bool insertStoreBuffer(Addr vaddr, Addr paddr, uint8_t* datas, + uint64_t size, const std::vector& mask, + InstSeqNum store_seq); bool storeBufferEmpty() { return lsq->storeBufferEmpty(); } + bool storeBufferEmpty(ThreadID tid) { return lsq->storeBufferEmpty(tid); } bool storeBufferSQWillFull() const { return storeQueue.size() > sqFullUpperLimit; @@ -380,6 +391,12 @@ class LSQUnit /** Check if there exists raw nuke between load and store. */ bool pipeLineNukeCheck(const DynInstPtr &load_inst, const DynInstPtr &store_inst); + /** Returns the current request attached to an active LQ entry. */ + LSQRequest *currentLoadRequest(const DynInstPtr &inst); + + /** Returns the current request attached to an active SQ entry. */ + LSQRequest *currentStoreRequest(const DynInstPtr &inst); + /** Returns the number of free LQ entries. */ unsigned numFreeLoadEntries(); @@ -438,8 +455,11 @@ class LSQUnit /** Returns if there are any stores to writeback. */ bool hasStoresToWB() { return storesToWB > 0; } + /** Returns if there are older stores/atomics still pending writeback. */ + bool hasStoresToWBBefore(InstSeqNum seq_num) const; + /** Returns the number of stores to writeback. */ - int numStoresToSbuffer() { return storesToWB; } + int numStoresToSbuffer() const { return storesToWB; } /** Update loadCompletedIdx and storeCompletedIdx */ void updateCompletedIdx(); @@ -570,6 +590,9 @@ class LSQUnit /** Instruction whose results are being written back. */ DynInstPtr inst; + /** Request that owns the delayed writeback lifecycle. */ + LSQRequest *request; + /** The packet that would have been sent to memory. */ PacketPtr pkt; diff --git a/src/cpu/o3/rename.cc b/src/cpu/o3/rename.cc index d3e51e2c86..0be9a0906e 100644 --- a/src/cpu/o3/rename.cc +++ b/src/cpu/o3/rename.cc @@ -78,6 +78,8 @@ Rename::Rename(CPU *_cpu, const BaseO3CPUParams ¶ms) fixedbuffer[tid] = boost::circular_buffer(renameWidth); renameMap[tid] = nullptr; stalls[tid] = {false, false}; + finalCommitSeq[tid] = 0; + releaseSeq[tid] = 0; } assert(decodeToRenameDelay == 1); @@ -260,6 +262,8 @@ Rename::resetStage() for (ThreadID tid = 0; tid < numThreads; tid++) { stalls[tid].iew = false; + finalCommitSeq[tid] = 0; + releaseSeq[tid] = 0; } } @@ -415,7 +419,15 @@ Rename::tick() updateActivate(); - if (wroteToTimeBuffer || releaseSeq < finalCommitSeq) { + bool release_pending = false; + for (ThreadID tid = 0; tid < numThreads; ++tid) { + if (releaseSeq[tid] < finalCommitSeq[tid]) { + release_pending = true; + break; + } + } + + if (wroteToTimeBuffer || release_pending) { DPRINTF(Activity, "Activity this cycle.\n"); cpu->activityThisCycle(); } @@ -426,21 +438,26 @@ Rename::releasePhysRegs() { // Release physical registers up to releaseWidth auto threads = activeThreads->begin(); - if (releaseSeq + releaseWidth < finalCommitSeq) { - releaseSeq += releaseWidth; - } else { - releaseSeq = finalCommitSeq; - } while (threads != activeThreads->end()) { ThreadID tid = *threads++; - removeFromHistory(releaseSeq, tid); - // If we committed this cycle then doneSeqNum will be > 0 + if (releaseSeq[tid] + releaseWidth < finalCommitSeq[tid]) { + releaseSeq[tid] += releaseWidth; + } else { + releaseSeq[tid] = finalCommitSeq[tid]; + } + + removeFromHistory(releaseSeq[tid], tid); + // doneSeqNum is also reused as a squash-progress marker while the + // ROB is walking younger entries. Only real commit progress should + // release physical registers. if (fromCommit->commitInfo[tid].doneSeqNum != 0 && - !fromCommit->commitInfo[tid].squash) { + !fromCommit->commitInfo[tid].squash && + !fromCommit->commitInfo[tid].robSquashing) { - finalCommitSeq = fromCommit->commitInfo[tid].doneSeqNum; - releaseSeq = historyBuffer->empty() ? 0 : historyBuffer[tid].back().instSeqNum; + finalCommitSeq[tid] = fromCommit->commitInfo[tid].doneSeqNum; + releaseSeq[tid] = + historyBuffer[tid].empty() ? 0 : historyBuffer[tid].back().instSeqNum; } } } @@ -600,7 +617,7 @@ Rename::moveInstsToBuffer() for (int i = 0; i < insts_from_decode; ++i) { const DynInstPtr &inst = fromDecode->insts[i]; assert(inst->threadNumber == tid); - if (localSquashVer.largerThan(inst->getVersion())) { + if (localSquashVer[tid].largerThan(inst->getVersion())) { inst->setSquashed(); } else { assert(!fixedbuffer[tid].full()); @@ -625,9 +642,10 @@ Rename::checkSquash() squash(fromCommit->commitInfo[i].doneSeqNum, i); - localSquashVer.update(fromCommit->commitInfo[i].squashVersion.getVersion()); + localSquashVer[i].update( + fromCommit->commitInfo[i].squashVersion.getVersion()); DPRINTF(Rename, "Updating squash version to %u\n", - localSquashVer.getVersion()); + localSquashVer[i].getVersion()); } } } diff --git a/src/cpu/o3/rename.hh b/src/cpu/o3/rename.hh index 50c566b31a..ed03f62d8e 100644 --- a/src/cpu/o3/rename.hh +++ b/src/cpu/o3/rename.hh @@ -277,9 +277,9 @@ class Rename */ std::list historyBuffer[MaxThreads]; - InstSeqNum finalCommitSeq = 0; + InstSeqNum finalCommitSeq[MaxThreads] = {}; - InstSeqNum releaseSeq = 0; + InstSeqNum releaseSeq[MaxThreads] = {}; void tryFreePReg(PhysRegIdPtr phys_reg); @@ -451,7 +451,7 @@ class Rename StallReason checkRenameStallFromIEW(ThreadID tid); - SquashVersion localSquashVer; + SquashVersion localSquashVer[MaxThreads]; /** Value predictor */ valuepred::VPUnit *valuePred; diff --git a/src/cpu/o3/rob.cc b/src/cpu/o3/rob.cc index 4e007804c2..d57ea8b0df 100644 --- a/src/cpu/o3/rob.cc +++ b/src/cpu/o3/rob.cc @@ -297,15 +297,23 @@ ROB::countInsts(ThreadID tid) return instList[tid].size(); } +uint32_t +ROB::countInstsOfGroups(ThreadID tid, int groups) +{ + int sum = 0; + auto it = threadGroups[tid].begin(); + for (int i = 0; i < groups && it != threadGroups[tid].end(); i++, it++) { + sum += *it; + } + return sum; +} + uint32_t ROB::countInstsOfGroups(int groups) { int sum = 0; for (ThreadID tid = 0; tid < numThreads; tid++) { - auto it = threadGroups[tid].begin(); - for (int i = 0; i < groups && it != threadGroups[tid].end(); i++, it++) { - sum += *it; - } + sum += countInstsOfGroups(tid, groups); } return sum; } @@ -420,6 +428,36 @@ ROB::retireHead(ThreadID tid) cpu->removeFrontInst(head_inst); } +void +ROB::drainSquashedHead(ThreadID tid) +{ + stats.writes++; + + assert(numInstsInROB > 0); + + InstIt head_it = instList[tid].begin(); + + DynInstPtr head_inst = std::move(*head_it); + instList[tid].erase(head_it); + + assert(head_inst->readyToCommit()); + assert(head_inst->isSquashed()); + + DPRINTF(ROB, "[tid:%i] Draining squashed head instruction, " + "instruction PC %s, [sn:%llu]\n", tid, head_inst->pcState(), + head_inst->seqNum); + + --numInstsInROB; + + commitGroup(head_inst, tid); + + head_inst->clearInROB(); + + updateHead(); + + cpu->removeFrontInst(head_inst); +} + bool ROB::isHeadGroupReady(ThreadID tid) { diff --git a/src/cpu/o3/rob.hh b/src/cpu/o3/rob.hh index d9b3e9999b..94b93d2593 100644 --- a/src/cpu/o3/rob.hh +++ b/src/cpu/o3/rob.hh @@ -164,6 +164,11 @@ class ROB */ void retireHead(ThreadID tid); + /** Drains a squashed head instruction from a specific thread without + * marking it committed. + */ + void drainSquashedHead(ThreadID tid); + /** Is the oldest instruction across all threads ready. */ // bool isHeadReady(); @@ -256,6 +261,7 @@ class ROB return sum; } + uint32_t countInstsOfGroups(ThreadID tid, int groups); uint32_t countInstsOfGroups(int groups); bool (ROB::*allocateNewGroup)(const DynInstPtr inst, ThreadID tid); diff --git a/src/cpu/o3/smt_sched.hh b/src/cpu/o3/smt_sched.hh index d5222e758d..74198c44fd 100644 --- a/src/cpu/o3/smt_sched.hh +++ b/src/cpu/o3/smt_sched.hh @@ -28,6 +28,8 @@ class InstsCounter uint64_t getCounter(ThreadID tid) { return counter[tid]; } void setCounter(ThreadID tid, uint64_t value) { counter[tid] = value; } + void incCounter(ThreadID tid, uint64_t value = 1) { counter[tid] += value; } + void decCounter(ThreadID tid, uint64_t value = 1) { counter[tid] -= value; } }; class SMTScheduler @@ -36,7 +38,8 @@ class SMTScheduler int numThreads; public: SMTScheduler(int numThreads) : numThreads(numThreads) {} - virtual ThreadID getThread(); + virtual ~SMTScheduler() = default; + virtual ThreadID getThread() = 0; }; @@ -124,7 +127,28 @@ class MultiPrioritySched : public SMTScheduler } }; +class IndependentIQICountScheduler : public SMTScheduler { +private: + InstsCounter* counter; // Counter for this IQ only +public: + IndependentIQICountScheduler(int numThreads, InstsCounter* counter) + : SMTScheduler(numThreads), counter(counter){} + + ThreadID getThread() override { + ThreadID selectedTid = 0; + uint64_t minCount = counter->getCounter(0); + + for (ThreadID tid = 1; tid < numThreads; ++tid) { + uint64_t count = counter->getCounter(tid); + if (count < minCount) { + minCount = count; + selectedTid = tid; + } + } + return selectedTid; + } +}; }} #endif diff --git a/src/cpu/pred/BranchPredictor.py b/src/cpu/pred/BranchPredictor.py index 402d5a84aa..044ff2885e 100644 --- a/src/cpu/pred/BranchPredictor.py +++ b/src/cpu/pred/BranchPredictor.py @@ -1021,6 +1021,7 @@ class BTBRAS(TimedBaseBTBPredictor): cxx_class = 'gem5::branch_prediction::btb_pred::BTBRAS' cxx_header = 'cpu/pred/btb/ras.hh' + numThreads = Param.Unsigned(Parent.numThreads, "Number of threads") numEntries = Param.Unsigned(32, "Number of entries in the RAS") ctrWidth = Param.Unsigned(8, "Width of the counter") numInflightEntries = Param.Unsigned(384, "Number of inflight entries") diff --git a/src/cpu/pred/btb/abtb.cc b/src/cpu/pred/btb/abtb.cc index c4876e8158..8013900e83 100644 --- a/src/cpu/pred/btb/abtb.cc +++ b/src/cpu/pred/btb/abtb.cc @@ -166,28 +166,42 @@ AheadBTB::setTrace() std::vector AheadBTB::processEntries(const std::vector& entries, Addr startAddr) { - int hitNum = entries.size(); - bool hit = hitNum > 0; + auto processed_entries = entries; + // Sort by instruction order + std::sort(processed_entries.begin(), processed_entries.end(), + [](const BTBEntry &a, const BTBEntry &b) { + return a.pc < b.pc; + }); + + auto it = std::remove_if(processed_entries.begin(), processed_entries.end(), + [startAddr](const BTBEntry &e) { + return e.pc < startAddr; + }); + processed_entries.erase(it, processed_entries.end()); + + Addr abtb_end = (startAddr + predictWidth) & + ~mask(floorLog2(predictWidth) - 1); + it = std::remove_if(processed_entries.begin(), processed_entries.end(), + [abtb_end](const BTBEntry &e) { + return e.pc >= abtb_end; + }); + processed_entries.erase(it, processed_entries.end()); + + int hitNum = processed_entries.size(); + bool hit = hitNum > 0; + // Update prediction statistics if (hit) { DPRINTF(ABTB, "BTB: lookup hit, dumping hit entry\n"); btbStats.predHit += hitNum; - for (auto &entry: entries) { + for (auto &entry: processed_entries) { printTickedBTBEntry(entry); } } else { btbStats.predMiss++; DPRINTF(ABTB, "BTB: lookup miss\n"); } - - auto processed_entries = entries; - - // Sort by instruction order - std::sort(processed_entries.begin(), processed_entries.end(), - [](const BTBEntry &a, const BTBEntry &b) { - return a.pc < b.pc; - }); return processed_entries; } @@ -299,12 +313,13 @@ AheadBTB::putPCHistory(Addr startAddr, std::vector &stagePreds) { meta = std::make_shared(); + const uint8_t asidHash = stagePreds.empty() ? 0 : stagePreds.front().asidHash; // Lookup all matching entries in BTB - auto find_entries = lookup(startAddr); - + auto find_entries = lookup(startAddr, asidHash); + // Process BTB entries auto processed_entries = processEntries(find_entries, startAddr); - + // Fill predictions for each pipeline stage fillStagePredictions(processed_entries, stagePreds); @@ -313,8 +328,9 @@ AheadBTB::putPCHistory(Addr startAddr, } std::shared_ptr -AheadBTB::getPredictionMeta() +AheadBTB::getPredictionMeta(ThreadID tid) { + (void)tid; // Lazy-initialize meta so callers never observe a null pointer // This avoids early-cycle crashes when prediction hasn't populated meta yet if (!meta) { @@ -342,13 +358,13 @@ AheadBTB::recoverHist(const boost::dynamic_bitset<> &history, const FetchTarget * @return Vector of matching BTB entries */ std::vector -AheadBTB::lookupSingleBlock(Addr block_pc) +AheadBTB::lookupSingleBlock(Addr block_pc, uint8_t asidHash) { std::vector res; if (block_pc & 0x1) { return res; // ignore false hit when lowest bit is 1 } - Addr btb_idx = getIndex(block_pc); + Addr btb_idx = getIndex(block_pc, asidHash); auto btb_set = btb[btb_idx]; assert(btb_idx < numSets); // AheadBTB always uses ahead-pipelined implementation: @@ -356,7 +372,7 @@ AheadBTB::lookupSingleBlock(Addr block_pc) DPRINTF(AheadPipeline, "AheadBTB: pushing set for ahead-pipelined stages, idx %ld\n", btb_idx); aheadReadBtbEntries.push(std::make_tuple(block_pc, btb_idx, btb_set)); - Addr tag_curStartpc = getTag(block_pc);// abtb uses current FB pc to get tag + Addr tag_curStartpc = getTag(block_pc, asidHash);// abtb uses current FB pc to get tag Addr pc = 0; Addr idx_prvStartpc = 0;// abtb uses previous FB pc to get index BTBSet set; @@ -391,7 +407,7 @@ AheadBTB::lookupSingleBlock(Addr block_pc) } std::vector -AheadBTB::lookup(Addr block_pc) +AheadBTB::lookup(Addr block_pc, uint8_t asidHash) { std::vector res; if (block_pc & 0x1) { @@ -399,7 +415,7 @@ AheadBTB::lookup(Addr block_pc) } // AheadBTB always uses single block lookup - res = lookupSingleBlock(block_pc); + res = lookupSingleBlock(block_pc, asidHash); return res; } @@ -593,12 +609,12 @@ AheadBTB::updateUsingS3Pred(FullBTBPrediction &s3Pred, const Addr previousPC) for (auto &entry : entries_to_update) { Addr startPC = s3Pred.bbStart; - Addr btb_tag = getTag(startPC); // use last pc to get tag + Addr btb_tag = getTag(startPC, s3Pred.asidHash); // use last pc to get tag if (previousPC == 0) { DPRINTF(ABTB, "AheadBTB: no previous PC, skipping update\n"); return; } - Addr btb_idx = getIndex(previousPC); // use last pc to get idx + Addr btb_idx = getIndex(previousPC, s3Pred.asidHash); // use last pc to get idx BranchInfo takenbranchinfo; takenbranchinfo.pc = s3Pred.getTakenEntry().pc; takenbranchinfo.target = s3Pred.getTakenEntry().target; @@ -669,7 +685,7 @@ AheadBTB::update(const FetchTarget &stream) // 4. Update BTB entries - each entry uses its own PC to calculate index and tag for (auto &entry : entries_to_update) { Addr startPC = stream.getRealStartPC(); - Addr btb_tag = getTag(startPC); // use current pc to get tag + Addr btb_tag = getTag(startPC, stream.asidHash); // use current pc to get tag // AheadBTB always uses ahead-pipelined update logic Addr previousPC = getPreviousPC(stream); @@ -677,7 +693,7 @@ AheadBTB::update(const FetchTarget &stream) DPRINTF(ABTB, "AheadBTB: no previous PC, skipping update\n"); return; } - Addr btb_idx = getIndex(previousPC); // use last pc to get idx + Addr btb_idx = getIndex(previousPC, stream.asidHash); // use last pc to get idx entry.source = getComponentIdx(); // mark the entry source as AheadBTB updateBTBEntry(btb_idx, btb_tag, entry, stream.exeBranchInfo, stream.exeTaken); } diff --git a/src/cpu/pred/btb/abtb.hh b/src/cpu/pred/btb/abtb.hh index 9e7abc6260..e5e29f7ffd 100644 --- a/src/cpu/pred/btb/abtb.hh +++ b/src/cpu/pred/btb/abtb.hh @@ -147,7 +147,7 @@ class AheadBTB : public TimedBaseBTBPredictor /** Get prediction BTBMeta * @return Returns the prediction meta */ - std::shared_ptr getPredictionMeta() override; + std::shared_ptr getPredictionMeta(ThreadID tid = 0) override; // not used void specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) override; @@ -224,8 +224,9 @@ class AheadBTB : public TimedBaseBTBPredictor * @param inst_PC The branch to look up. * @return Returns the index into the BTB. */ - inline Addr getIndex(Addr instPC) { - return (instPC >> idxShiftAmt) & idxMask; + inline Addr getIndex(Addr instPC, uint8_t asidHash) { + Addr baseIndex = (instPC >> idxShiftAmt) & idxMask; + return xorAsidHashIntoIndex(baseIndex, floorLog2(numSets), asidHash); } /** Returns the tag bits of a given address. @@ -234,8 +235,9 @@ class AheadBTB : public TimedBaseBTBPredictor * @param inst_PC The branch's address. * @return Returns the tag bits. */ - inline Addr getTag(Addr instPC) { - return (instPC >> tagShiftAmt) & tagMask; + inline Addr getTag(Addr instPC, uint8_t asidHash) { + Addr baseTag = (instPC >> tagShiftAmt) & tagMask; + return injectAsidHashIntoTag(baseTag, tagBits, asidHash); } @@ -365,13 +367,13 @@ class AheadBTB : public TimedBaseBTBPredictor * @param inst_PC The address of the block to look up. * @return Returns all hit BTB entries. */ - std::vector lookup(Addr block_pc); + std::vector lookup(Addr block_pc, uint8_t asidHash); /** Helper function to lookup entries in a single block * @param block_pc The aligned PC to lookup * @return Vector of matching BTB entries */ - std::vector lookupSingleBlock(Addr block_pc); + std::vector lookupSingleBlock(Addr block_pc, uint8_t asidHash); /** The BTB structure: * - Organized as numSets sets diff --git a/src/cpu/pred/btb/btb_ittage.cc b/src/cpu/pred/btb/btb_ittage.cc index 58828467cd..2f3ca7fe9d 100644 --- a/src/cpu/pred/btb/btb_ittage.cc +++ b/src/cpu/pred/btb/btb_ittage.cc @@ -38,6 +38,8 @@ ittageStats(this, p.numPredictors) tableIndexMasks.resize(numPredictors); tableTagBits.resize(numPredictors); tableTagMasks.resize(numPredictors); + threadHistory.resize(MaxThreads); + threadMeta.resize(MaxThreads); for (unsigned int i = 0; i < p.numPredictors; ++i) { //initialize ittage predictor assert(tableSizes.size() >= numPredictors); @@ -53,9 +55,15 @@ ittageStats(this, p.numPredictors) assert(tablePcShifts.size() >= numPredictors); - tagFoldedHist.push_back(PathFoldedHist((int)histLengths[i], (int)tableTagBits[i], (int)16)); - altTagFoldedHist.push_back(PathFoldedHist((int)histLengths[i], (int)tableTagBits[i]-1, (int)16)); - indexFoldedHist.push_back(PathFoldedHist((int)histLengths[i], (int)tableIndexBits[i], (int)16)); + for (ThreadID tid = 0; tid < MaxThreads; ++tid) { + auto &state = threadHistory[tid]; + state.tagFoldedHist.emplace_back( + (int)histLengths[i], (int)tableTagBits[i], (int)16); + state.altTagFoldedHist.emplace_back( + (int)histLengths[i], (int)tableTagBits[i] - 1, (int)16); + state.indexFoldedHist.emplace_back( + (int)histLengths[i], (int)tableIndexBits[i], (int)16); + } } // useAlt.resize(128); // for (unsigned i = 0; i < useAlt.size(); ++i) { @@ -64,6 +72,27 @@ ittageStats(this, p.numPredictors) usefulResetCnt = 0; } +ThreadID +BTBITTAGE::predictorTid(const std::vector &stagePreds) const +{ + assert(!stagePreds.empty()); + return stagePreds.front().tid; +} + +BTBITTAGE::ThreadHistoryState & +BTBITTAGE::historyState(ThreadID tid) +{ + assert(tid < threadHistory.size()); + return threadHistory[tid]; +} + +const BTBITTAGE::ThreadHistoryState & +BTBITTAGE::historyState(ThreadID tid) const +{ + assert(tid < threadHistory.size()); + return threadHistory[tid]; +} + void BTBITTAGE::tickStart() { @@ -73,8 +102,10 @@ void BTBITTAGE::tick() {} void -BTBITTAGE::lookupHelper(Addr startAddr, const std::vector &btbEntries, IndirectTargets& results) +BTBITTAGE::lookupHelper(Addr startAddr, const std::vector &btbEntries, + IndirectTargets& results, ThreadID tid, uint8_t asidHash) { + (void)asidHash; DPRINTF(ITTAGE, "lookupHelper startAddr: %#lx\n", startAddr); std::vector preds; for (auto &btb_entry : btbEntries) { @@ -150,7 +181,7 @@ BTBITTAGE::lookupHelper(Addr startAddr, const std::vector &btbEntries, } // Note: predTargetHit will be updated in the update phase when we know the actual target TagePrediction pred(btb_entry.pc, main_info, alt_info, use_alt, main_target); - meta->preds[btb_entry.pc] = pred; + threadMeta[tid]->preds[btb_entry.pc] = pred; } } } @@ -162,17 +193,20 @@ BTBITTAGE::dryRunCycle(Addr startPC) { void BTBITTAGE::putPCHistory(Addr stream_start, const bitset &history, std::vector &stagePreds) { + const ThreadID tid = predictorTid(stagePreds); + const uint8_t asidHash = stagePreds.empty() ? 0 : stagePreds.front().asidHash; + const auto &state = historyState(tid); if (debugPC == stream_start) { debugFlag = true; } DPRINTF(ITTAGE, "putPCHistory startAddr: %#lx\n", stream_start); // clear old metas - meta = std::make_shared(); + threadMeta[tid] = std::make_shared(); // assign history for meta - meta->tagFoldedHist = tagFoldedHist; - meta->altTagFoldedHist = altTagFoldedHist; - meta->indexFoldedHist = indexFoldedHist; + threadMeta[tid]->tagFoldedHist = state.tagFoldedHist; + threadMeta[tid]->altTagFoldedHist = state.altTagFoldedHist; + threadMeta[tid]->indexFoldedHist = state.indexFoldedHist; lookupEntries.clear(); lookupIndices.clear(); @@ -181,8 +215,9 @@ BTBITTAGE::putPCHistory(Addr stream_start, const bitset &history, std::vectorusefulMask = std::move(useful_mask); + threadMeta[tid]->usefulMask = std::move(useful_mask); for (int s = getDelay(); s < stagePreds.size(); s++) { auto &stage_pred = stagePreds[s]; stage_pred.indirectTargets.clear(); - lookupHelper(stream_start, stage_pred.btbEntries, stage_pred.indirectTargets); + lookupHelper(stream_start, stage_pred.btbEntries, + stage_pred.indirectTargets, tid, asidHash); } DPRINTF(ITTAGE, "putPCHistory end\n"); debugFlag = false; } std::shared_ptr -BTBITTAGE::getPredictionMeta() { - return meta; +BTBITTAGE::getPredictionMeta(ThreadID tid) { + if (tid >= threadMeta.size()) { + return nullptr; + } + return threadMeta[tid]; } void @@ -367,8 +406,9 @@ BTBITTAGE::update(const FetchTarget &stream) unsigned startTable = main_found ? main_info.table + 1 : 0; for (int ti = startTable; ti < numPredictors; ti++) { - Addr newIndex = getTageIndex(startAddr, ti, updateIndexFoldedHist[ti].get()); - Addr newTag = getTageTag(startAddr, ti, updateTagFoldedHist[ti].get(), updateAltTagFoldedHist[ti].get()); + Addr newIndex = getTageIndex(startAddr, ti, updateIndexFoldedHist[ti].get(), stream.asidHash); + Addr newTag = getTageTag(startAddr, ti, updateTagFoldedHist[ti].get(), + updateAltTagFoldedHist[ti].get(), stream.asidHash); assert(newIndex < tageTable[ti].size()); auto &newEntry = tageTable[ti][newIndex]; @@ -402,7 +442,8 @@ BTBITTAGE::updateCounter(bool taken, unsigned width, short &counter) { } Addr -BTBITTAGE::getTageTag(Addr pc, int t, uint64_t foldedHist, uint64_t altFoldedHist) +BTBITTAGE::getTageTag(Addr pc, int t, uint64_t foldedHist, uint64_t altFoldedHist, + uint8_t asidHash) { // Create mask for tableTagBits[t] uint64_t mask = ((1ULL << tableTagBits[t]) - 1); @@ -414,30 +455,33 @@ BTBITTAGE::getTageTag(Addr pc, int t, uint64_t foldedHist, uint64_t altFoldedHis uint64_t altTagBits = (altFoldedHist << 1); // XOR all components - return (pcBits ^ foldedHist ^ altTagBits) & mask; + return injectAsidHashIntoTag((pcBits ^ foldedHist ^ altTagBits) & mask, + tableTagBits[t], asidHash); } Addr -BTBITTAGE::getTageTag(Addr pc, int t) +BTBITTAGE::getTageTag(Addr pc, int t, uint8_t asidHash) { - return getTageTag(pc, t, tagFoldedHist[t].get(), altTagFoldedHist[t].get()); + const auto &state = historyState(0); + return getTageTag(pc, t, state.tagFoldedHist[t].get(), + state.altTagFoldedHist[t].get(), asidHash); } Addr -BTBITTAGE::getTageIndex(Addr pc, int t, uint64_t foldedHist) +BTBITTAGE::getTageIndex(Addr pc, int t, uint64_t foldedHist, uint8_t asidHash) { // Create mask for tableIndexBits[t] uint64_t mask = ((1ULL << tableIndexBits[t]) - 1); // Extract lower bits of PC and XOR with folded history uint64_t pcBits = (pc >> floorLog2(blockSize)); - return (pcBits ^ foldedHist) & mask; + return xorAsidHashIntoIndex((pcBits ^ foldedHist) & mask, tableIndexBits[t], asidHash); } Addr -BTBITTAGE::getTageIndex(Addr pc, int t) +BTBITTAGE::getTageIndex(Addr pc, int t, uint8_t asidHash) { - return getTageIndex(pc, t, indexFoldedHist[t].get()); + return getTageIndex(pc, t, historyState(0).indexFoldedHist[t].get(), asidHash); } bool @@ -478,8 +522,10 @@ BTBITTAGE::satDecrement(int min, short &counter) * @param target The target address of the branch */ void -BTBITTAGE::doUpdateHist(const boost::dynamic_bitset<> &history, bool taken, Addr pc, Addr target) +BTBITTAGE::doUpdateHist(const boost::dynamic_bitset<> &history, bool taken, + Addr pc, Addr target, ThreadID tid) { + auto &state = historyState(tid); if (debug::ITTAGEHistory) { // if debug flag is off, do not use to_string since it's too slow std::string buf; boost::to_string(history, buf); @@ -492,7 +538,9 @@ BTBITTAGE::doUpdateHist(const boost::dynamic_bitset<> &history, bool taken, Addr for (int t = 0; t < numPredictors; t++) { for (int type = 0; type < 3; type++) { - auto &foldedHist = type == 0 ? indexFoldedHist[t] : type == 1 ? tagFoldedHist[t] : altTagFoldedHist[t]; + auto &foldedHist = type == 0 ? state.indexFoldedHist[t] + : type == 1 ? state.tagFoldedHist[t] + : state.altTagFoldedHist[t]; // since we have folded path history, we can put arbitrary shamt here, and it wouldn't make a difference foldedHist.update(history, 2, taken, pc, target); DPRINTF(ITTAGEHistory, "t: %d, type: %d, foldedHist _folded 0x%lx\n", t, type, foldedHist.get()); @@ -503,7 +551,7 @@ BTBITTAGE::doUpdateHist(const boost::dynamic_bitset<> &history, bool taken, Addr bool BTBITTAGE::tageHit() { - auto meta = getPredictionMeta(); + auto meta = getPredictionMeta(0); auto preds = std::static_pointer_cast(meta)->preds; bool hit = false; for (auto & [pc, pred] : preds) { @@ -531,7 +579,7 @@ void BTBITTAGE::specUpdatePHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) { auto [pc, target, taken] = pred.getPHistInfo(); - doUpdateHist(history, taken, pc, target); + doUpdateHist(history, taken, pc, target, pred.tid); } /** @@ -550,18 +598,28 @@ BTBITTAGE::specUpdatePHist(const boost::dynamic_bitset<> &history, FullBTBPredic void BTBITTAGE::recoverPHist(const boost::dynamic_bitset<> &history, const FetchTarget &entry, int shamt, bool cond_taken) { + auto &state = historyState(entry.tid); std::shared_ptr predMeta = std::static_pointer_cast(entry.predMetas[getComponentIdx()]); for (int i = 0; i < numPredictors; i++) { - tagFoldedHist[i].recover(predMeta->tagFoldedHist[i]); - altTagFoldedHist[i].recover(predMeta->altTagFoldedHist[i]); - indexFoldedHist[i].recover(predMeta->indexFoldedHist[i]); + state.tagFoldedHist[i].recover(predMeta->tagFoldedHist[i]); + state.altTagFoldedHist[i].recover(predMeta->altTagFoldedHist[i]); + state.indexFoldedHist[i].recover(predMeta->indexFoldedHist[i]); } - doUpdateHist(history, cond_taken, entry.getControlPC(), entry.getTakenTarget()); + doUpdateHist(history, cond_taken, entry.getControlPC(), + entry.getTakenTarget(), entry.tid); } void BTBITTAGE::checkFoldedHist(const boost::dynamic_bitset<> &hist, const char * when) { + checkFoldedHist(hist, 0, when); +} + +void +BTBITTAGE::checkFoldedHist(const boost::dynamic_bitset<> &hist, ThreadID tid, + const char * when) +{ + auto &state = historyState(tid); if (debugFlag) { DPRINTF(ITTAGE, "checking folded history when %s\n", when); std::string hist_str; @@ -572,7 +630,9 @@ BTBITTAGE::checkFoldedHist(const boost::dynamic_bitset<> &hist, const char * whe for (int type = 0; type < 2; type++) { DPRINTF(ITTAGE, "t: %d, type: %d\n", t, type); std::string buf2, buf3; - auto &foldedHist = type == 0 ? indexFoldedHist[t] : type == 1 ? tagFoldedHist[t] : altTagFoldedHist[t]; + auto &foldedHist = type == 0 ? state.indexFoldedHist[t] + : type == 1 ? state.tagFoldedHist[t] + : state.altTagFoldedHist[t]; foldedHist.check(hist); } } diff --git a/src/cpu/pred/btb/btb_ittage.hh b/src/cpu/pred/btb/btb_ittage.hh index e86b45817b..7db7e39350 100644 --- a/src/cpu/pred/btb/btb_ittage.hh +++ b/src/cpu/pred/btb/btb_ittage.hh @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -10,6 +11,7 @@ #include "base/statistics.hh" #include "base/types.hh" #include "cpu/inst_seq.hh" +#include "cpu/o3/limits.hh" #include "cpu/pred/btb/common.hh" #include "cpu/pred/btb/folded_hist.hh" #include "cpu/pred/btb/timed_base_pred.hh" @@ -30,6 +32,7 @@ class BTBITTAGE : public TimedBaseBTBPredictor { using defer = std::shared_ptr; using bitset = boost::dynamic_bitset<>; + static constexpr unsigned MaxThreads = o3::MaxThreads; public: typedef BTBITTAGEParams Params; @@ -99,7 +102,7 @@ class BTBITTAGE : public TimedBaseBTBPredictor const boost::dynamic_bitset<> &history, std::vector &stagePreds) override; - std::shared_ptr getPredictionMeta() override; + std::shared_ptr getPredictionMeta(ThreadID tid = 0) override; // speculative update 3 folded history, according history and pred.taken // the other specUpdateHist methods are left blank @@ -116,30 +119,34 @@ class BTBITTAGE : public TimedBaseBTBPredictor // check folded hists after speculative update and recover void checkFoldedHist(const bitset &history, const char *when); + void checkFoldedHist(const bitset &history, ThreadID tid, const char *when); private: // return provided - void lookupHelper(Addr stream_start, const std::vector &btbEntries, IndirectTargets& results); + void lookupHelper(Addr stream_start, const std::vector &btbEntries, + IndirectTargets& results, ThreadID tid, uint8_t asidHash); // use blockPC - Addr getTageIndex(Addr pc, int table); + Addr getTageIndex(Addr pc, int table, uint8_t asidHash = 0); // use blockPC (uint64_t version for performance) - Addr getTageIndex(Addr pc, int table, uint64_t foldedHist); + Addr getTageIndex(Addr pc, int table, uint64_t foldedHist, uint8_t asidHash = 0); // use blockPC - Addr getTageTag(Addr pc, int table); + Addr getTageTag(Addr pc, int table, uint8_t asidHash = 0); // use blockPC (uint64_t version for performance) - Addr getTageTag(Addr pc, int table, uint64_t foldedHist, uint64_t altFoldedHist); + Addr getTageTag(Addr pc, int table, uint64_t foldedHist, uint64_t altFoldedHist, + uint8_t asidHash = 0); Addr getOffset(Addr pc) { return (pc & (blockSize - 1)) >> 1; } // Update branch history - void doUpdateHist(const bitset &history, bool taken, Addr pc, Addr target); + void doUpdateHist(const bitset &history, bool taken, Addr pc, Addr target, + ThreadID tid); const unsigned numPredictors; @@ -151,9 +158,14 @@ class BTBITTAGE : public TimedBaseBTBPredictor std::vector tableTagMasks; std::vector tablePcShifts; std::vector histLengths; - std::vector tagFoldedHist; - std::vector altTagFoldedHist; - std::vector indexFoldedHist; + struct ThreadHistoryState + { + std::vector tagFoldedHist; + std::vector altTagFoldedHist; + std::vector indexFoldedHist; + }; + + std::vector threadHistory; LFSR64 allocLFSR; @@ -261,7 +273,10 @@ class BTBITTAGE : public TimedBaseBTBPredictor } } TageMeta; - std::shared_ptr meta; + std::vector> threadMeta; + ThreadID predictorTid(const std::vector &stagePreds) const; + ThreadHistoryState &historyState(ThreadID tid); + const ThreadHistoryState &historyState(ThreadID tid) const; public: diff --git a/src/cpu/pred/btb/btb_mgsc.cc b/src/cpu/pred/btb/btb_mgsc.cc index 9011dbbce6..8586816f02 100755 --- a/src/cpu/pred/btb/btb_mgsc.cc +++ b/src/cpu/pred/btb/btb_mgsc.cc @@ -60,41 +60,64 @@ BTBMGSC::initStorage() assert(isPowerOf2(numCtrsPerLine)); numCtrsPerLineBits = log2i(numCtrsPerLine); + threadHistory.resize(MaxThreads); + threadMeta.resize(MaxThreads); + auto bwTableSize = allocPredTable(bwTable, bwTableNum, bwTableIdxWidth); - for (unsigned int i = 0; i < bwTableNum; ++i) { - indexBwFoldedHist.push_back(GlobalBwFoldedHist(bwHistLen[i], bwTableIdxWidth - numCtrsPerLineBits, 16)); + for (ThreadID tid = 0; tid < MaxThreads; ++tid) { + auto &state = threadHistory[tid]; + for (unsigned int i = 0; i < bwTableNum; ++i) { + state.indexBwFoldedHist.emplace_back( + bwHistLen[i], bwTableIdxWidth - numCtrsPerLineBits, 16); + } } bwIndex.resize(bwTableNum); auto lTableSize = allocPredTable(lTable, lTableNum, lTableIdxWidth); - indexLFoldedHist.resize(numEntriesFirstLocalHistories); - for (unsigned int i = 0; i < lTableNum; ++i) { - for (unsigned int k = 0; k < numEntriesFirstLocalHistories; ++k) { - indexLFoldedHist[k].push_back(LocalFoldedHist(lHistLen[i], lTableIdxWidth - numCtrsPerLineBits, 16)); + for (ThreadID tid = 0; tid < MaxThreads; ++tid) { + auto &state = threadHistory[tid]; + state.indexLFoldedHist.resize(numEntriesFirstLocalHistories); + for (unsigned int i = 0; i < lTableNum; ++i) { + for (unsigned int k = 0; k < numEntriesFirstLocalHistories; ++k) { + state.indexLFoldedHist[k].push_back(LocalFoldedHist( + lHistLen[i], lTableIdxWidth - numCtrsPerLineBits, 16)); + } } } lIndex.resize(lTableNum); auto iTableSize = allocPredTable(iTable, iTableNum, iTableIdxWidth); - for (unsigned int i = 0; i < iTableNum; ++i) { - assert(iHistLen[i] >= 0); - assert(static_cast(iHistLen[i]) < 63); - assert(pow2(static_cast(iHistLen[i])) <= iTableSize); - indexIFoldedHist.push_back(ImliFoldedHist(iHistLen[i], iTableIdxWidth - numCtrsPerLineBits, 16)); + for (ThreadID tid = 0; tid < MaxThreads; ++tid) { + auto &state = threadHistory[tid]; + for (unsigned int i = 0; i < iTableNum; ++i) { + assert(iHistLen[i] >= 0); + assert(static_cast(iHistLen[i]) < 63); + assert(pow2(static_cast(iHistLen[i])) <= iTableSize); + state.indexIFoldedHist.emplace_back( + iHistLen[i], iTableIdxWidth - numCtrsPerLineBits, 16); + } } iIndex.resize(iTableNum); auto gTableSize = allocPredTable(gTable, gTableNum, gTableIdxWidth); - for (unsigned int i = 0; i < gTableNum; ++i) { - assert(gTable.size() >= gTableNum); - indexGFoldedHist.push_back(GlobalFoldedHist(gHistLen[i], gTableIdxWidth - numCtrsPerLineBits, 16)); + for (ThreadID tid = 0; tid < MaxThreads; ++tid) { + auto &state = threadHistory[tid]; + for (unsigned int i = 0; i < gTableNum; ++i) { + assert(gTable.size() >= gTableNum); + state.indexGFoldedHist.emplace_back( + gHistLen[i], gTableIdxWidth - numCtrsPerLineBits, 16); + } } gIndex.resize(gTableNum); auto pTableSize = allocPredTable(pTable, pTableNum, pTableIdxWidth); - for (unsigned int i = 0; i < pTableNum; ++i) { - assert(pTable.size() >= pTableNum); - indexPFoldedHist.push_back(PathFoldedHist(pHistLen[i], pTableIdxWidth - numCtrsPerLineBits, 2)); + for (ThreadID tid = 0; tid < MaxThreads; ++tid) { + auto &state = threadHistory[tid]; + for (unsigned int i = 0; i < pTableNum; ++i) { + assert(pTable.size() >= pTableNum); + state.indexPFoldedHist.emplace_back( + pHistLen[i], pTableIdxWidth - numCtrsPerLineBits, 2); + } } pIndex.resize(pTableNum); @@ -219,6 +242,27 @@ BTBMGSC::BTBMGSC(const Params &p) #endif BTBMGSC::~BTBMGSC() {} +ThreadID +BTBMGSC::predictorTid(const std::vector &stagePreds) const +{ + assert(!stagePreds.empty()); + return stagePreds.front().tid; +} + +BTBMGSC::ThreadHistoryState & +BTBMGSC::historyState(ThreadID tid) +{ + assert(tid < threadHistory.size()); + return threadHistory[tid]; +} + +const BTBMGSC::ThreadHistoryState & +BTBMGSC::historyState(ThreadID tid) const +{ + assert(tid < threadHistory.size()); + return threadHistory[tid]; +} + // Set up tracing for debugging void BTBMGSC::setTrace() @@ -347,34 +391,41 @@ BTBMGSC::calculateWeightScaleDiff(int total_sum, int scale_percsum, int percsum) * @return TagePrediction containing main and alternative predictions */ BTBMGSC::MgscPrediction -BTBMGSC::generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC, const TageInfoForMGSC &tage_info) +BTBMGSC::generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC, + const TageInfoForMGSC &tage_info, + ThreadID tid) { DPRINTF(MGSC, "generateSinglePrediction for btbEntry: %#lx, always taken %d\n", btb_entry.pc, btb_entry.alwaysTaken); + const auto &state = historyState(tid); // Calculate indices for all tables for (unsigned int i = 0; i < bwTableNum; ++i) { - bwIndex[i] = getHistIndex(startPC, bwTableIdxWidth - numCtrsPerLineBits, indexBwFoldedHist[i].get()); + bwIndex[i] = getHistIndex(startPC, bwTableIdxWidth - numCtrsPerLineBits, + state.indexBwFoldedHist[i].get()); } for (unsigned int i = 0; i < lTableNum; ++i) { lIndex[i] = getHistIndex(startPC, lTableIdxWidth - numCtrsPerLineBits, - indexLFoldedHist[getPcIndex(startPC, log2(numEntriesFirstLocalHistories))][i].get()); + state.indexLFoldedHist[getPcIndex(startPC, log2(numEntriesFirstLocalHistories))][i].get()); } // std::string buf; // boost::to_string(indexLFoldedHist[getPcIndex(startPC, log2(numEntriesFirstLocalHistories))][0].getAsBitset(), buf); // DPRINTF(MGSC, "startPC: %#lx, local index: %d, local_folded_hist: %s\n", startPC, lIndex[0], buf.c_str()); for (unsigned int i = 0; i < iTableNum; ++i) { - iIndex[i] = getHistIndex(startPC, iTableIdxWidth - numCtrsPerLineBits, indexIFoldedHist[i].get()); + iIndex[i] = getHistIndex(startPC, iTableIdxWidth - numCtrsPerLineBits, + state.indexIFoldedHist[i].get()); } for (unsigned int i = 0; i < gTableNum; ++i) { - gIndex[i] = getHistIndex(startPC, gTableIdxWidth - numCtrsPerLineBits, indexGFoldedHist[i].get()); + gIndex[i] = getHistIndex(startPC, gTableIdxWidth - numCtrsPerLineBits, + state.indexGFoldedHist[i].get()); } for (unsigned int i = 0; i < pTableNum; ++i) { - pIndex[i] = getHistIndex(startPC, pTableIdxWidth - numCtrsPerLineBits, indexPFoldedHist[i].get()); + pIndex[i] = getHistIndex(startPC, pTableIdxWidth - numCtrsPerLineBits, + state.indexPFoldedHist[i].get()); } for (unsigned int i = 0; i < biasTableNum; ++i) { @@ -468,7 +519,8 @@ BTBMGSC::generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC */ void BTBMGSC::lookupHelper(const Addr &startPC, const std::vector &btbEntries, - const std::unordered_map &tageInfoForMgscs, CondTakens &results) + const std::unordered_map &tageInfoForMgscs, + CondTakens &results, ThreadID tid) { DPRINTF(MGSC, "lookupHelper startAddr: %#lx\n", startPC); @@ -478,8 +530,9 @@ BTBMGSC::lookupHelper(const Addr &startPC, const std::vector &btbEntri if (btb_entry.isCond && btb_entry.valid) { auto tage_info = tageInfoForMgscs.find(btb_entry.pc); if (tage_info != tageInfoForMgscs.end()) { - auto pred = generateSinglePrediction(btb_entry, startPC, tage_info->second); - meta->preds[btb_entry.pc] = pred; + auto pred = generateSinglePrediction(btb_entry, startPC, + tage_info->second, tid); + threadMeta[tid]->preds[btb_entry.pc] = pred; results.push_back({btb_entry.pc, pred.taken || btb_entry.alwaysTaken}); } else { assert(false); @@ -504,6 +557,8 @@ void BTBMGSC::putPCHistory(Addr stream_start, const boost::dynamic_bitset<> &history, std::vector &stagePreds) { + const ThreadID tid = predictorTid(stagePreds); + const auto &state = historyState(tid); DPRINTF(MGSC, "putPCHistory startAddr: %#lx\n", stream_start); // IMPORTANT: when this function is called, @@ -515,25 +570,29 @@ BTBMGSC::putPCHistory(Addr stream_start, const boost::dynamic_bitset<> &history, } // Clear old prediction metadata and save current history state - meta = std::make_shared(); - meta->indexBwFoldedHist = indexBwFoldedHist; - meta->indexLFoldedHist = indexLFoldedHist; - meta->indexIFoldedHist = indexIFoldedHist; - meta->indexGFoldedHist = indexGFoldedHist; - meta->indexPFoldedHist = indexPFoldedHist; + threadMeta[tid] = std::make_shared(); + threadMeta[tid]->indexBwFoldedHist = state.indexBwFoldedHist; + threadMeta[tid]->indexLFoldedHist = state.indexLFoldedHist; + threadMeta[tid]->indexIFoldedHist = state.indexIFoldedHist; + threadMeta[tid]->indexGFoldedHist = state.indexGFoldedHist; + threadMeta[tid]->indexPFoldedHist = state.indexPFoldedHist; for (int s = getDelay(); s < stagePreds.size(); s++) { // TODO: only lookup once for one btb entry in different stages auto &stage_pred = stagePreds[s]; stage_pred.condTakens.clear(); - lookupHelper(stream_start, stage_pred.btbEntries, stage_pred.tageInfoForMgscs, stage_pred.condTakens); + lookupHelper(stream_start, stage_pred.btbEntries, + stage_pred.tageInfoForMgscs, stage_pred.condTakens, tid); } } std::shared_ptr -BTBMGSC::getPredictionMeta() +BTBMGSC::getPredictionMeta(ThreadID tid) { - return meta; + if (tid >= threadMeta.size()) { + return nullptr; + } + return threadMeta[tid]; } /** @@ -1068,10 +1127,11 @@ BTBMGSC::doUpdateHist(const boost::dynamic_bitset<> &history, int shamt, bool ta void BTBMGSC::specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) { + auto &state = historyState(pred.tid); int shamt; bool cond_taken; std::tie(shamt, cond_taken) = pred.getHistInfo(); - doUpdateHist(history, shamt, cond_taken, indexGFoldedHist); // use global history to update G folded history + doUpdateHist(history, shamt, cond_taken, state.indexGFoldedHist); // use global history to update G folded history } /** @@ -1089,8 +1149,9 @@ BTBMGSC::specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPredictio void BTBMGSC::specUpdatePHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) { + auto &state = historyState(pred.tid); auto [pc, target, taken] = pred.getPHistInfo(); - doUpdateHist(history, 2, taken, indexPFoldedHist, pc, target); // only path history needs pc! + doUpdateHist(history, 2, taken, state.indexPFoldedHist, pc, target); // only path history needs pc! } @@ -1109,10 +1170,11 @@ BTBMGSC::specUpdatePHist(const boost::dynamic_bitset<> &history, FullBTBPredicti void BTBMGSC::specUpdateBwHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) { + auto &state = historyState(pred.tid); int shamt; bool cond_taken; std::tie(shamt, cond_taken) = pred.getBwHistInfo(); - doUpdateHist(history, shamt, cond_taken, indexBwFoldedHist); + doUpdateHist(history, shamt, cond_taken, state.indexBwFoldedHist); } /** @@ -1130,12 +1192,13 @@ BTBMGSC::specUpdateBwHist(const boost::dynamic_bitset<> &history, FullBTBPredict void BTBMGSC::specUpdateIHist(FullBTBPrediction &pred) { + auto &state = historyState(pred.tid); int shamt; bool cond_taken; std::tie(shamt, cond_taken) = pred.getBwHistInfo(); // IMLI uses counter only, pass empty bitset (not used by ImliFoldedHist::update) boost::dynamic_bitset<> dummy; - doUpdateHist(dummy, shamt, cond_taken, indexIFoldedHist); + doUpdateHist(dummy, shamt, cond_taken, state.indexIFoldedHist); } /** @@ -1153,11 +1216,12 @@ BTBMGSC::specUpdateIHist(FullBTBPrediction &pred) void BTBMGSC::specUpdateLHist(const std::vector> &history, FullBTBPrediction &pred) { + auto &state = historyState(pred.tid); int shamt; bool cond_taken; std::tie(shamt, cond_taken) = pred.getHistInfo(); doUpdateHist(history[getPcIndex(pred.bbStart, log2(numEntriesFirstLocalHistories))], shamt, cond_taken, - indexLFoldedHist[getPcIndex(pred.bbStart, log2(numEntriesFirstLocalHistories))]); + state.indexLFoldedHist[getPcIndex(pred.bbStart, log2(numEntriesFirstLocalHistories))]); } /** @@ -1179,11 +1243,12 @@ BTBMGSC::recoverHist(const boost::dynamic_bitset<> &history, const FetchTarget & if (!isEnabled()) { return; // No recover when disabled } + auto &state = historyState(entry.tid); std::shared_ptr predMeta = std::static_pointer_cast(entry.predMetas[getComponentIdx()]); for (int i = 0; i < gTableNum; i++) { - indexGFoldedHist[i].recover(predMeta->indexGFoldedHist[i]); + state.indexGFoldedHist[i].recover(predMeta->indexGFoldedHist[i]); } - doUpdateHist(history, shamt, cond_taken, indexGFoldedHist); + doUpdateHist(history, shamt, cond_taken, state.indexGFoldedHist); } /** @@ -1205,11 +1270,13 @@ BTBMGSC::recoverPHist(const boost::dynamic_bitset<> &history, const FetchTarget if (!isEnabled()) { return; // No recover when disabled } + auto &state = historyState(entry.tid); std::shared_ptr predMeta = std::static_pointer_cast(entry.predMetas[getComponentIdx()]); for (int i = 0; i < pTableNum; i++) { - indexPFoldedHist[i].recover(predMeta->indexPFoldedHist[i]); + state.indexPFoldedHist[i].recover(predMeta->indexPFoldedHist[i]); } - doUpdateHist(history, 2, cond_taken, indexPFoldedHist, entry.getControlPC(), entry.getTakenTarget()); + doUpdateHist(history, 2, cond_taken, state.indexPFoldedHist, + entry.getControlPC(), entry.getTakenTarget()); } /** @@ -1231,11 +1298,12 @@ BTBMGSC::recoverBwHist(const boost::dynamic_bitset<> &history, const FetchTarget if (!isEnabled()) { return; // No recover when disabled } + auto &state = historyState(entry.tid); std::shared_ptr predMeta = std::static_pointer_cast(entry.predMetas[getComponentIdx()]); for (int i = 0; i < bwTableNum; i++) { - indexBwFoldedHist[i].recover(predMeta->indexBwFoldedHist[i]); + state.indexBwFoldedHist[i].recover(predMeta->indexBwFoldedHist[i]); } - doUpdateHist(history, shamt, cond_taken, indexBwFoldedHist); + doUpdateHist(history, shamt, cond_taken, state.indexBwFoldedHist); } /** @@ -1257,13 +1325,14 @@ BTBMGSC::recoverIHist(const FetchTarget &entry, int shamt, bool cond_taken) if (!isEnabled()) { return; // No recover when disabled } + auto &state = historyState(entry.tid); std::shared_ptr predMeta = std::static_pointer_cast(entry.predMetas[getComponentIdx()]); for (int i = 0; i < iTableNum; i++) { - indexIFoldedHist[i].recover(predMeta->indexIFoldedHist[i]); + state.indexIFoldedHist[i].recover(predMeta->indexIFoldedHist[i]); } // IMLI uses counter only, pass empty bitset (not used by ImliFoldedHist::update) boost::dynamic_bitset<> dummy; - doUpdateHist(dummy, shamt, cond_taken, indexIFoldedHist); + doUpdateHist(dummy, shamt, cond_taken, state.indexIFoldedHist); } /** @@ -1286,14 +1355,15 @@ BTBMGSC::recoverLHist(const std::vector> &history, const if (!isEnabled()) { return; // No recover when disabled } + auto &state = historyState(entry.tid); std::shared_ptr predMeta = std::static_pointer_cast(entry.predMetas[getComponentIdx()]); for (unsigned int k = 0; k < numEntriesFirstLocalHistories; ++k) { for (int i = 0; i < lTableNum; i++) { - indexLFoldedHist[k][i].recover(predMeta->indexLFoldedHist[k][i]); + state.indexLFoldedHist[k][i].recover(predMeta->indexLFoldedHist[k][i]); } } doUpdateHist(history[getPcIndex(entry.startPC, log2(numEntriesFirstLocalHistories))], shamt, cond_taken, - indexLFoldedHist[getPcIndex(entry.startPC, log2(numEntriesFirstLocalHistories))]); + state.indexLFoldedHist[getPcIndex(entry.startPC, log2(numEntriesFirstLocalHistories))]); } #ifndef UNIT_TEST @@ -1414,6 +1484,15 @@ void BTBMGSC::checkFoldedHist(const boost::dynamic_bitset<> &Ghistory, const boost::dynamic_bitset<> &PHistory, const std::vector> &LHistory, const char *when) { + checkFoldedHist(Ghistory, PHistory, LHistory, 0, when); +} + +void +BTBMGSC::checkFoldedHist(const boost::dynamic_bitset<> &Ghistory, const boost::dynamic_bitset<> &PHistory, + const std::vector> &LHistory, + ThreadID tid, const char *when) +{ + auto &state = historyState(tid); DPRINTF(MGSC, "checking folded history when %s\n", when); if (debug::MGSC) { std::string hist_str; @@ -1421,17 +1500,17 @@ BTBMGSC::checkFoldedHist(const boost::dynamic_bitset<> &Ghistory, const boost::d DPRINTF(MGSC, "history:\t%s\n", hist_str.c_str()); } for (int t = 0; t < gTableNum; t++) { - auto &foldedHist = indexGFoldedHist[t]; + auto &foldedHist = state.indexGFoldedHist[t]; foldedHist.check(Ghistory); } for (int t = 0; t < pTableNum; t++) { - auto &foldedHist = indexPFoldedHist[t]; + auto &foldedHist = state.indexPFoldedHist[t]; foldedHist.check(PHistory); } for (int t = 0; t < lTableNum; t++) { - assert(LHistory.size() == indexLFoldedHist.size()); + assert(LHistory.size() == state.indexLFoldedHist.size()); for (int i = 0; i < LHistory.size(); i++) { - auto &foldedHist = indexLFoldedHist[i][t]; + auto &foldedHist = state.indexLFoldedHist[i][t]; foldedHist.check(LHistory[i]); } } diff --git a/src/cpu/pred/btb/btb_mgsc.hh b/src/cpu/pred/btb/btb_mgsc.hh index 100fc639a4..6ff29b13c8 100755 --- a/src/cpu/pred/btb/btb_mgsc.hh +++ b/src/cpu/pred/btb/btb_mgsc.hh @@ -14,6 +14,7 @@ #include "base/sat_counter.hh" #include "base/types.hh" +#include "cpu/o3/limits.hh" #include "cpu/pred/btb/common.hh" #include "cpu/pred/btb/folded_hist.hh" #include "cpu/pred/btb/timed_base_pred.hh" @@ -39,6 +40,7 @@ namespace test { class BTBMGSC : public TimedBaseBTBPredictor { + static constexpr unsigned MaxThreads = o3::MaxThreads; public: #ifdef UNIT_TEST BTBMGSC(); @@ -157,7 +159,7 @@ class BTBMGSC : public TimedBaseBTBPredictor void putPCHistory(Addr startAddr, const boost::dynamic_bitset<> &history, std::vector &stagePreds) override; - std::shared_ptr getPredictionMeta() override; + std::shared_ptr getPredictionMeta(ThreadID tid = 0) override; // speculative update all folded history, according history and pred.taken void specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) override; @@ -191,6 +193,9 @@ class BTBMGSC : public TimedBaseBTBPredictor // check folded hists after speculative update and recover void checkFoldedHist(const boost::dynamic_bitset<> &Ghistory, const boost::dynamic_bitset<> &PHistory, const std::vector> &LHistory, const char *when); // Check GHR folded + void checkFoldedHist(const boost::dynamic_bitset<> &Ghistory, const boost::dynamic_bitset<> &PHistory, + const std::vector> &LHistory, + ThreadID tid, const char *when); // Check GHR folded // Calculate MGSC weight index Addr getPcIndex(Addr pc, unsigned tableIndexBits); @@ -247,7 +252,8 @@ class BTBMGSC : public TimedBaseBTBPredictor // Look up predictions in MGSC tables for a stream of instructions void lookupHelper(const Addr &stream_start, const std::vector &btbEntries, - const std::unordered_map &tageInfoForMgscs, CondTakens &results); + const std::unordered_map &tageInfoForMgscs, + CondTakens &results, ThreadID tid); // Calculate MGSC history index with folded history Addr getHistIndex(Addr pc, unsigned tableIndexBits, uint64_t foldedHist); @@ -277,7 +283,8 @@ class BTBMGSC : public TimedBaseBTBPredictor // Helper method to generate prediction for a single BTB entry MgscPrediction generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC, - const TageInfoForMGSC &tage_info); + const TageInfoForMGSC &tage_info, + ThreadID tid); // Helper method to prepare BTB entries for update std::vector prepareUpdateEntries(const FetchTarget &stream); @@ -353,12 +360,16 @@ class BTBMGSC : public TimedBaseBTBPredictor bool enablePCThreshold; Addr focusBranchPC; - // Folded history for index calculation - std::vector indexBwFoldedHist; - std::vector> indexLFoldedHist; - std::vector indexIFoldedHist; - std::vector indexGFoldedHist; - std::vector indexPFoldedHist; + struct ThreadHistoryState + { + std::vector indexBwFoldedHist; + std::vector> indexLFoldedHist; + std::vector indexIFoldedHist; + std::vector indexGFoldedHist; + std::vector indexPFoldedHist; + }; + + std::vector threadHistory; // The actual MGSC prediction tables (table x index x line) std::vector>> bwTable; @@ -552,8 +563,9 @@ class BTBMGSC : public TimedBaseBTBPredictor static const std::unordered_map &preds(const BTBMGSC &mgsc) { - assert(mgsc.meta); - return mgsc.meta->preds; + assert(!mgsc.threadMeta.empty()); + assert(mgsc.threadMeta[0]); + return mgsc.threadMeta[0]->preds; } }; #endif @@ -594,7 +606,10 @@ class BTBMGSC : public TimedBaseBTBPredictor } } MgscMeta; - std::shared_ptr meta; + std::vector> threadMeta; + ThreadID predictorTid(const std::vector &stagePreds) const; + ThreadHistoryState &historyState(ThreadID tid); + const ThreadHistoryState &historyState(ThreadID tid) const; }; // Close conditional namespace wrapper for testing diff --git a/src/cpu/pred/btb/btb_tage.cc b/src/cpu/pred/btb/btb_tage.cc index 201f79c6a6..672b729a1a 100644 --- a/src/cpu/pred/btb/btb_tage.cc +++ b/src/cpu/pred/btb/btb_tage.cc @@ -104,6 +104,9 @@ tageStats(this, p.numPredictors, p.numBanks) tableTagBits.resize(numPredictors); tableTagMasks.resize(numPredictors); + threadHistory.resize(MaxThreads); + threadMeta.resize(MaxThreads); + for (unsigned int i = 0; i < numPredictors; ++i) { //initialize ittage predictor assert(tableSizes.size() >= numPredictors); @@ -121,10 +124,15 @@ tageStats(this, p.numPredictors, p.numBanks) tableTagMasks[i].resize(tableTagBits[i], true); assert(tablePcShifts.size() >= numPredictors); - - tagFoldedHist.push_back(PathFoldedHist((int)histLengths[i], (int)tableTagBits[i], 16)); - altTagFoldedHist.push_back(PathFoldedHist((int)histLengths[i], (int)tableTagBits[i]-1, 16)); - indexFoldedHist.push_back(PathFoldedHist((int)histLengths[i], (int)tableIndexBits[i], 16)); + for (ThreadID tid = 0; tid < MaxThreads; ++tid) { + auto &state = threadHistory[tid]; + state.tagFoldedHist.emplace_back( + (int)histLengths[i], (int)tableTagBits[i], 16); + state.altTagFoldedHist.emplace_back( + (int)histLengths[i], (int)tableTagBits[i] - 1, 16); + state.indexFoldedHist.emplace_back( + (int)histLengths[i], (int)tableIndexBits[i], 16); + } } usefulResetCnt = 0; @@ -143,6 +151,27 @@ BTBTAGE::~BTBTAGE() { } +ThreadID +BTBTAGE::predictorTid(const std::vector &stagePreds) const +{ + assert(!stagePreds.empty()); + return stagePreds.front().tid; +} + +BTBTAGE::ThreadHistoryState & +BTBTAGE::historyState(ThreadID tid) +{ + assert(tid < threadHistory.size()); + return threadHistory[tid]; +} + +const BTBTAGE::ThreadHistoryState & +BTBTAGE::historyState(ThreadID tid) const +{ + assert(tid < threadHistory.size()); + return threadHistory[tid]; +} + // Set up tracing for debugging void BTBTAGE::setTrace() @@ -197,8 +226,11 @@ BTBTAGE::tickStart() {} BTBTAGE::TagePrediction BTBTAGE::generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC, - std::shared_ptr predMeta) { + std::shared_ptr predMeta, + ThreadID tid, + uint8_t asidHash) { DPRINTF(TAGE, "generateSinglePrediction for btbEntry: %#lx\n", btb_entry.pc); + const auto &state = historyState(tid); // Find main and alternative predictions bool provided = false; @@ -212,11 +244,13 @@ BTBTAGE::generateSinglePrediction(const BTBEntry &btb_entry, for (int i = numPredictors - 1; i >= 0; --i) { // Calculate index and tag: use snapshot if provided, otherwise use current folded history // Tag includes position XOR (like RTL: tag = tempTag ^ cfiPosition) - Addr index = predMeta ? getTageIndex(startPC, i, predMeta->indexFoldedHist[i].get()) - : getTageIndex(startPC, i); + Addr index = predMeta ? getTageIndex(startPC, i, predMeta->indexFoldedHist[i].get(), asidHash) + : getTageIndex(startPC, i, state.indexFoldedHist[i].get(), asidHash); Addr tag = predMeta ? getTageTag(startPC, i, - predMeta->tagFoldedHist[i].get(), predMeta->altTagFoldedHist[i].get(), position) - : getTageTag(startPC, i, position); + predMeta->tagFoldedHist[i].get(), predMeta->altTagFoldedHist[i].get(), + position, asidHash) + : getTageTag(startPC, i, state.tagFoldedHist[i].get(), + state.altTagFoldedHist[i].get(), position, asidHash); bool match = false; // for each table, only one way can be matched TageEntry matching_entry; @@ -295,7 +329,8 @@ BTBTAGE::generateSinglePrediction(const BTBEntry &btb_entry, */ void BTBTAGE::lookupHelper(const Addr &startPC, const std::vector &btbEntries, - std::unordered_map &tageInfoForMgscs, CondTakens& results) + std::unordered_map &tageInfoForMgscs, + CondTakens& results, ThreadID tid, uint8_t asidHash) { DPRINTF(TAGE, "lookupHelper startAddr: %#lx\n", startPC); @@ -303,8 +338,8 @@ BTBTAGE::lookupHelper(const Addr &startPC, const std::vector &btbEntri for (auto &btb_entry : btbEntries) { // Only predict for valid conditional branches if (btb_entry.isCond && btb_entry.valid) { - auto pred = generateSinglePrediction(btb_entry, startPC); - meta->preds[btb_entry.pc] = pred; + auto pred = generateSinglePrediction(btb_entry, startPC, nullptr, tid, asidHash); + threadMeta[tid]->preds[btb_entry.pc] = pred; tageStats.updateStatsWithTagePrediction(pred, true); results.push_back({btb_entry.pc, pred.taken || btb_entry.alwaysTaken}); tageInfoForMgscs[btb_entry.pc].tage_pred_taken = pred.taken; @@ -346,6 +381,9 @@ BTBTAGE::dryRunCycle(Addr startPC) { */ void BTBTAGE::putPCHistory(Addr startPC, const bitset &history, std::vector &stagePreds) { + const ThreadID tid = predictorTid(stagePreds); + const uint8_t asidHash = stagePreds.empty() ? 0 : stagePreds.front().asidHash; + const auto &state = historyState(tid); // Record prediction bank for next tick's conflict detection lastPredBankId = getBankId(startPC); predBankValid = true; @@ -363,24 +401,28 @@ BTBTAGE::putPCHistory(Addr startPC, const bitset &history, std::vector(); - meta->tagFoldedHist = tagFoldedHist; - meta->altTagFoldedHist = altTagFoldedHist; - meta->indexFoldedHist = indexFoldedHist; - meta->history = history; + threadMeta[tid] = std::make_shared(); + threadMeta[tid]->tagFoldedHist = state.tagFoldedHist; + threadMeta[tid]->altTagFoldedHist = state.altTagFoldedHist; + threadMeta[tid]->indexFoldedHist = state.indexFoldedHist; + threadMeta[tid]->history = history; for (int s = getDelay(); s < stagePreds.size(); s++) { // TODO: only lookup once for one btb entry in different stages auto &stage_pred = stagePreds[s]; stage_pred.condTakens.clear(); - lookupHelper(startPC, stage_pred.btbEntries, stage_pred.tageInfoForMgscs, stage_pred.condTakens); + lookupHelper(startPC, stage_pred.btbEntries, stage_pred.tageInfoForMgscs, + stage_pred.condTakens, tid, asidHash); } } std::shared_ptr -BTBTAGE::getPredictionMeta() { - return meta; +BTBTAGE::getPredictionMeta(ThreadID tid) { + if (tid >= threadMeta.size()) { + return nullptr; + } + return threadMeta[tid]; } /** @@ -561,6 +603,7 @@ BTBTAGE::handleNewEntryAllocation(const Addr &startPC, bool actual_taken, unsigned start_table, std::shared_ptr meta, + uint8_t asidHash, uint64_t &allocated_table, uint64_t &allocated_index, uint64_t &allocated_way) { @@ -573,9 +616,9 @@ BTBTAGE::handleNewEntryAllocation(const Addr &startPC, unsigned position = getBranchIndexInBlock(entry.pc, startPC); for (unsigned ti = start_table; ti < numPredictors; ++ti) { - Addr newIndex = getTageIndex(startPC, ti, meta->indexFoldedHist[ti].get()); + Addr newIndex = getTageIndex(startPC, ti, meta->indexFoldedHist[ti].get(), asidHash); Addr newTag = getTageTag(startPC, ti, - meta->tagFoldedHist[ti].get(), meta->altTagFoldedHist[ti].get(), position); + meta->tagFoldedHist[ti].get(), meta->altTagFoldedHist[ti].get(), position, asidHash); auto &set = tageTable[ti][newIndex]; @@ -704,7 +747,8 @@ BTBTAGE::update(const FetchTarget &stream) { TagePrediction recomputed; if (updateOnRead) { // if update on read is enabled, re-read providers using snapshot // Re-read providers using snapshot (do not rely on prediction-time main/alt) - recomputed = generateSinglePrediction(btb_entry, startAddr, predMeta); + recomputed = generateSinglePrediction(btb_entry, startAddr, predMeta, + stream.tid, stream.asidHash); // Track differences for statistics auto it = predMeta->preds.find(btb_entry.pc); if (it != predMeta->preds.end() && recomputed.taken != it->second.taken) { @@ -734,7 +778,8 @@ BTBTAGE::update(const FetchTarget &stream) { start_table = main_info.table + 1; // start from the table after the main prediction table } alloc_success = handleNewEntryAllocation(startAddr, btb_entry, actual_taken, - start_table, predMeta, allocated_table, allocated_index, allocated_way); + start_table, predMeta, stream.asidHash, + allocated_table, allocated_index, allocated_way); } #ifndef UNIT_TEST @@ -817,7 +862,8 @@ BTBTAGE::updateCounter(bool taken, unsigned width, short &counter) { // Calculate TAGE tag with folded history - optimized version using bitwise operations Addr -BTBTAGE::getTageTag(Addr pc, int t, uint64_t foldedHist, uint64_t altFoldedHist, Addr position) +BTBTAGE::getTageTag(Addr pc, int t, uint64_t foldedHist, uint64_t altFoldedHist, + Addr position, uint8_t asidHash) { // Create mask for tableTagBits[t] to limit result size Addr mask = (1ULL << tableTagBits[t]) - 1; @@ -833,17 +879,20 @@ BTBTAGE::getTageTag(Addr pc, int t, uint64_t foldedHist, uint64_t altFoldedHist, Addr altTagBits = (altFoldedHist << 1) & mask; // XOR all components together, including position (like RTL) - return pcBits ^ foldedBits ^ altTagBits ^ position; + return injectAsidHashIntoTag(pcBits ^ foldedBits ^ altTagBits ^ position, + tableTagBits[t], asidHash); } Addr -BTBTAGE::getTageTag(Addr pc, int t, Addr position) +BTBTAGE::getTageTag(Addr pc, int t, Addr position, uint8_t asidHash) { - return getTageTag(pc, t, tagFoldedHist[t].get(), altTagFoldedHist[t].get(), position); + const auto &state = historyState(0); + return getTageTag(pc, t, state.tagFoldedHist[t].get(), + state.altTagFoldedHist[t].get(), position, asidHash); } Addr -BTBTAGE::getTageIndex(Addr pc, int t, uint64_t foldedHist) +BTBTAGE::getTageIndex(Addr pc, int t, uint64_t foldedHist, uint8_t asidHash) { // Create mask for tableIndexBits[t] to limit result size Addr mask = (1ULL << tableIndexBits[t]) - 1; @@ -852,13 +901,13 @@ BTBTAGE::getTageIndex(Addr pc, int t, uint64_t foldedHist) Addr pcBits = (pc >> pcShift) & mask; Addr foldedBits = foldedHist & mask; - return pcBits ^ foldedBits; + return xorAsidHashIntoIndex(pcBits ^ foldedBits, tableIndexBits[t], asidHash); } Addr -BTBTAGE::getTageIndex(Addr pc, int t) +BTBTAGE::getTageIndex(Addr pc, int t, uint8_t asidHash) { - return getTageIndex(pc, t, indexFoldedHist[t].get()); + return getTageIndex(pc, t, historyState(0).indexFoldedHist[t].get(), asidHash); } bool @@ -920,8 +969,10 @@ BTBTAGE::getBankId(Addr pc) const * @param taken Whether the branch was taken */ void -BTBTAGE::doUpdateHist(const boost::dynamic_bitset<> &history, bool taken, Addr pc, Addr target) +BTBTAGE::doUpdateHist(const boost::dynamic_bitset<> &history, bool taken, + Addr pc, Addr target, ThreadID tid) { + auto &state = historyState(tid); if (debug::TAGEHistory) { // if debug flag is off, do not use to_string since it's too slow std::string buf; boost::to_string(history, buf); @@ -934,7 +985,9 @@ BTBTAGE::doUpdateHist(const boost::dynamic_bitset<> &history, bool taken, Addr p for (int t = 0; t < numPredictors; t++) { for (int type = 0; type < 3; type++) { - auto &foldedHist = type == 0 ? indexFoldedHist[t] : type == 1 ? tagFoldedHist[t] : altTagFoldedHist[t]; + auto &foldedHist = type == 0 ? state.indexFoldedHist[t] + : type == 1 ? state.tagFoldedHist[t] + : state.altTagFoldedHist[t]; // since we have folded path history, we can put arbitrary shamt here, and it wouldn't make a difference foldedHist.update(history, 2, taken, pc, target); DPRINTF(TAGEHistory, "t: %d, type: %d, foldedHist _folded 0x%lx\n", t, type, foldedHist.get()); @@ -958,7 +1011,7 @@ void BTBTAGE::specUpdatePHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) { auto [pc, target, taken] = pred.getPHistInfo(); - doUpdateHist(history, taken, pc, target); + doUpdateHist(history, taken, pc, target, pred.tid); } /** @@ -978,19 +1031,29 @@ void BTBTAGE::recoverPHist(const boost::dynamic_bitset<> &history, const FetchTarget &entry, int shamt, bool cond_taken) { + auto &state = historyState(entry.tid); std::shared_ptr predMeta = std::static_pointer_cast(entry.predMetas[getComponentIdx()]); for (int i = 0; i < numPredictors; i++) { - tagFoldedHist[i].recover(predMeta->tagFoldedHist[i]); - altTagFoldedHist[i].recover(predMeta->altTagFoldedHist[i]); - indexFoldedHist[i].recover(predMeta->indexFoldedHist[i]); + state.tagFoldedHist[i].recover(predMeta->tagFoldedHist[i]); + state.altTagFoldedHist[i].recover(predMeta->altTagFoldedHist[i]); + state.indexFoldedHist[i].recover(predMeta->indexFoldedHist[i]); } - doUpdateHist(history, cond_taken, entry.getControlPC(), entry.getTakenTarget()); + doUpdateHist(history, cond_taken, entry.getControlPC(), + entry.getTakenTarget(), entry.tid); } // Check folded history after speculative update and recovery void BTBTAGE::checkFoldedHist(const boost::dynamic_bitset<> &hist, const char * when) { + checkFoldedHist(hist, 0, when); +} + +void +BTBTAGE::checkFoldedHist(const boost::dynamic_bitset<> &hist, ThreadID tid, + const char * when) +{ + auto &state = historyState(tid); DPRINTF(TAGE, "checking folded history when %s\n", when); if (debug::TAGEHistory) { std::string hist_str; @@ -1000,7 +1063,9 @@ BTBTAGE::checkFoldedHist(const boost::dynamic_bitset<> &hist, const char * when) for (int t = 0; t < numPredictors; t++) { for (int type = 0; type < 3; type++) { std::string buf2, buf3; - auto &foldedHist = type == 0 ? indexFoldedHist[t] : type == 1 ? tagFoldedHist[t] : altTagFoldedHist[t]; + auto &foldedHist = type == 0 ? state.indexFoldedHist[t] + : type == 1 ? state.tagFoldedHist[t] + : state.altTagFoldedHist[t]; foldedHist.check(hist); } } diff --git a/src/cpu/pred/btb/btb_tage.hh b/src/cpu/pred/btb/btb_tage.hh index 969cc43e8d..c4038a1858 100644 --- a/src/cpu/pred/btb/btb_tage.hh +++ b/src/cpu/pred/btb/btb_tage.hh @@ -4,12 +4,14 @@ #include #include #include +#include #include #include #include "base/sat_counter.hh" #include "base/types.hh" #include "cpu/inst_seq.hh" +#include "cpu/o3/limits.hh" #include "cpu/pred/btb/common.hh" #include "cpu/pred/btb/folded_hist.hh" #include "cpu/pred/btb/timed_base_pred.hh" @@ -43,6 +45,7 @@ class BTBTAGE : public TimedBaseBTBPredictor { using defer = std::shared_ptr; using bitset = boost::dynamic_bitset<>; + static constexpr unsigned MaxThreads = o3::MaxThreads; public: #ifdef UNIT_TEST // Test constructor @@ -125,7 +128,7 @@ class BTBTAGE : public TimedBaseBTBPredictor const boost::dynamic_bitset<> &history, std::vector &stagePreds) override; - std::shared_ptr getPredictionMeta() override; + std::shared_ptr getPredictionMeta(ThreadID tid = 0) override; // speculative update 3 folded history, according history and pred.taken // the other specUpdateHist methods are left blank @@ -163,6 +166,7 @@ class BTBTAGE : public TimedBaseBTBPredictor // check folded hists after speculative update and recover virtual void checkFoldedHist(const bitset &history, const char *when); + void checkFoldedHist(const bitset &history, ThreadID tid, const char *when); #ifndef UNIT_TEST protected: @@ -170,21 +174,23 @@ class BTBTAGE : public TimedBaseBTBPredictor // Look up predictions in TAGE tables for a stream of instructions void lookupHelper(const Addr &startPC, const std::vector &btbEntries, - std::unordered_map &tageInfoForMgscs, CondTakens& results); + std::unordered_map &tageInfoForMgscs, + CondTakens& results, ThreadID tid, uint8_t asidHash); // Calculate TAGE index for a given PC and table - Addr getTageIndex(Addr pc, int table); + Addr getTageIndex(Addr pc, int table, uint8_t asidHash = 0); // Calculate TAGE index with folded history (uint64_t version for performance) - Addr getTageIndex(Addr pc, int table, uint64_t foldedHist); + Addr getTageIndex(Addr pc, int table, uint64_t foldedHist, uint8_t asidHash = 0); // Calculate TAGE tag for a given PC and table // position: branch position within the block (xored into tag like RTL) - Addr getTageTag(Addr pc, int table, Addr position = 0); + Addr getTageTag(Addr pc, int table, Addr position = 0, uint8_t asidHash = 0); // Calculate TAGE tag with folded history (uint64_t version for performance) // position: branch position within the block (xored into tag like RTL) - Addr getTageTag(Addr pc, int table, uint64_t foldedHist, uint64_t altFoldedHist, Addr position = 0); + Addr getTageTag(Addr pc, int table, uint64_t foldedHist, uint64_t altFoldedHist, + Addr position = 0, uint8_t asidHash = 0); // Get offset within a block for a given PC Addr getOffset(Addr pc) { @@ -199,7 +205,8 @@ class BTBTAGE : public TimedBaseBTBPredictor unsigned getBankId(Addr pc) const; // Update branch history - void doUpdateHist(const bitset &history, bool taken, Addr pc, Addr target); + void doUpdateHist(const bitset &history, bool taken, Addr pc, Addr target, + ThreadID tid); // Number of TAGE predictor tables const unsigned numPredictors; @@ -225,14 +232,14 @@ class BTBTAGE : public TimedBaseBTBPredictor // History lengths for each table std::vector histLengths; - // Folded history for tag calculation - std::vector tagFoldedHist; - - // Folded history for alternative tag calculation - std::vector altTagFoldedHist; + struct ThreadHistoryState + { + std::vector tagFoldedHist; + std::vector altTagFoldedHist; + std::vector indexFoldedHist; + }; - // Folded history for index calculation - std::vector indexFoldedHist; + std::vector threadHistory; // Linear feedback shift register for allocation LFSR64 allocLFSR; @@ -414,7 +421,9 @@ private: // If predMeta is nullptr, use current folded history (prediction path) TagePrediction generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC, - const std::shared_ptr predMeta = nullptr); + const std::shared_ptr predMeta = nullptr, + ThreadID tid = 0, + uint8_t asidHash = 0); // Helper method to prepare BTB entries for update std::vector prepareUpdateEntries(const FetchTarget &stream); @@ -431,6 +440,7 @@ private: bool actual_taken, unsigned main_table, std::shared_ptr meta, + uint8_t asidHash, uint64_t &allocated_table, uint64_t &allocated_index, uint64_t &allocated_way); @@ -440,7 +450,11 @@ private: void updateLRU(int table, Addr index, unsigned way); unsigned getLRUVictim(int table, Addr index); - std::shared_ptr meta; + std::vector> threadMeta; + + ThreadID predictorTid(const std::vector &stagePreds) const; + ThreadHistoryState &historyState(ThreadID tid); + const ThreadHistoryState &historyState(ThreadID tid) const; }; // Close conditional namespace wrapper for testing diff --git a/src/cpu/pred/btb/btb_tage_ub.cc b/src/cpu/pred/btb/btb_tage_ub.cc index 5ea3338aa6..1ecebb7b8d 100644 --- a/src/cpu/pred/btb/btb_tage_ub.cc +++ b/src/cpu/pred/btb/btb_tage_ub.cc @@ -309,8 +309,9 @@ BTBTAGEUpperBound::putPCHistory(Addr startAddr, const bitset &history, } std::shared_ptr -BTBTAGEUpperBound::getPredictionMeta() +BTBTAGEUpperBound::getPredictionMeta(ThreadID tid) { + (void)tid; return ubMeta; } diff --git a/src/cpu/pred/btb/btb_tage_ub.hh b/src/cpu/pred/btb/btb_tage_ub.hh index f97792c713..b4aae9e7cc 100644 --- a/src/cpu/pred/btb/btb_tage_ub.hh +++ b/src/cpu/pred/btb/btb_tage_ub.hh @@ -95,7 +95,7 @@ class BTBTAGEUpperBound : public BTBTAGE const boost::dynamic_bitset<> &history, std::vector &stagePreds) override; - std::shared_ptr getPredictionMeta() override; + std::shared_ptr getPredictionMeta(ThreadID tid = 0) override; void specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) override; diff --git a/src/cpu/pred/btb/btb_ubtb.cc b/src/cpu/pred/btb/btb_ubtb.cc index 755d8d8460..5f809713c3 100644 --- a/src/cpu/pred/btb/btb_ubtb.cc +++ b/src/cpu/pred/btb/btb_ubtb.cc @@ -137,7 +137,8 @@ void UBTB::putPCHistory(Addr startAddr, const boost::dynamic_bitset<> &history, std::vector &stagePreds) { meta = std::make_shared(); - auto it = lookup(startAddr); + const uint8_t asidHash = stagePreds.empty() ? 0 : stagePreds.front().asidHash; + auto it = lookup(startAddr, asidHash); auto& entry = meta->hit_entry; entry = (it != ubtb.end()) ? *it : TickedUBTBEntry(); @@ -151,23 +152,29 @@ UBTB::putPCHistory(Addr startAddr, const boost::dynamic_bitset<> &history, std:: } UBTB::UBTBIter -UBTB::lookup(Addr startAddr) +UBTB::lookup(Addr startAddr, uint8_t asidHash) { if (startAddr & 0x1) { return ubtb.end(); // ignore false hit when lowest bit is 1 } - Addr current_tag = getTag(startAddr); + Addr current_tag = getTag(startAddr, asidHash); + Addr block_end = (startAddr + predictWidth) & ~mask(floorLog2(predictWidth) - 1); DPRINTF(UBTB, "UBTB: Doing tag comparison for tag %#lx\n", current_tag); auto it = std::find_if(ubtb.begin(), ubtb.end(), - [current_tag](const TickedUBTBEntry &way) { return way.valid && way.tag == current_tag; }); + [current_tag, startAddr, block_end](const TickedUBTBEntry &way) { + return way.valid && way.tag == current_tag && + way.pc >= startAddr && way.pc < block_end; + }); if (it != ubtb.end()) { // Found a hit - verify no duplicates - auto duplicate = std::find_if(std::next(it), ubtb.end(), [current_tag](const TickedUBTBEntry &way) { - return way.valid && way.tag == current_tag; + auto duplicate = std::find_if(std::next(it), ubtb.end(), + [current_tag, startAddr, block_end](const TickedUBTBEntry &way) { + return way.valid && way.tag == current_tag && + way.pc >= startAddr && way.pc < block_end; }); if (duplicate != ubtb.end()) { DPRINTF(UBTB, "UBTB: Multiple hits found in uBTB for the same tag %#lx\n", current_tag); @@ -184,7 +191,8 @@ UBTB::lookup(Addr startAddr) void -UBTB::replaceOldEntry(UBTBIter oldEntryIter, const BTBEntry &newTakenEntry, Addr startAddr) +UBTB::replaceOldEntry(UBTBIter oldEntryIter, const BTBEntry &newTakenEntry, + Addr startAddr, uint8_t asidHash) { assert(newTakenEntry.valid); TickedUBTBEntry newEntry = TickedUBTBEntry(newTakenEntry, curTick()); @@ -192,7 +200,7 @@ UBTB::replaceOldEntry(UBTBIter oldEntryIter, const BTBEntry &newTakenEntry, Addr newEntry.target = newTakenEntry.target; newEntry.ctr = 0; // have a bug here:ubtb will accept ctr from mbtb, reset it to 0 at here // important: update tag (mbtb and ubtb have different tags, even diffferent tag length) - newEntry.tag = getTag(startAddr); + newEntry.tag = getTag(startAddr, asidHash); *oldEntryIter = newEntry; } @@ -213,13 +221,14 @@ UBTB::updateUsingS3Pred(FullBTBPrediction &s3Pred) auto startAddr = s3Pred.bbStart; UBTBIter oldEntryIter = lastPred.hit_entry; takenEntry.source = getComponentIdx(); - updateNewEntry(oldEntryIter, takenEntry, startAddr); + updateNewEntry(oldEntryIter, takenEntry, startAddr, s3Pred.asidHash); } -void UBTB::updateNewEntry(UBTBIter oldEntryIter, const BTBEntry &takenEntry, const Addr startAddr) +void UBTB::updateNewEntry(UBTBIter oldEntryIter, const BTBEntry &takenEntry, + const Addr startAddr, uint8_t asidHash) { //using the FB final taken branch to update uBTB if (oldEntryIter != ubtb.end()) { @@ -259,7 +268,7 @@ void UBTB::updateNewEntry(UBTBIter oldEntryIter, const BTBEntry &takenEntry, con } // Replace the entry with the new prediction - replaceOldEntry(toBeReplacedIter, takenEntry, startAddr); + replaceOldEntry(toBeReplacedIter, takenEntry, startAddr, asidHash); } else if (oldEntryIter != ubtb.end() && takenEntry.valid) { ubtbStats.s1Hits3Taken++; @@ -269,7 +278,7 @@ void UBTB::updateNewEntry(UBTBIter oldEntryIter, const BTBEntry &takenEntry, con updateUCtr(oldEntryIter->uctr, false); if (oldEntryIter->uctr == 0) { // replace the old entry with the new one - replaceOldEntry(oldEntryIter, takenEntry, startAddr); + replaceOldEntry(oldEntryIter, takenEntry, startAddr, asidHash); } } else { // S0 and S3 predict the same (brpc and target) @@ -294,13 +303,15 @@ UBTB::update(const FetchTarget &stream) // Use BTBEntry instead of BranchInfo; make it invalid when not taken BTBEntry takenEntry = stream.exeTaken ? BTBEntry(stream.exeBranchInfo) : BTBEntry(); auto startAddr = stream.getRealStartPC(); - Addr oldtag = getTag(startAddr); + Addr oldtag = getTag(startAddr, stream.asidHash); + Addr block_end = (startAddr + predictWidth) & ~mask(floorLog2(predictWidth) - 1); UBTBIter oldEntryIter = ubtb.end(); oldEntryIter = meta->hit_entry.valid ? - std::find_if(ubtb.begin(), ubtb.end(), [oldtag](const TickedUBTBEntry &e) { - return e.valid && e.tag == oldtag; + std::find_if(ubtb.begin(), ubtb.end(), [oldtag, startAddr, block_end](const TickedUBTBEntry &e) { + return e.valid && e.tag == oldtag && + e.pc >= startAddr && e.pc < block_end; }) : ubtb.end(); if (stream.exeTaken) { @@ -315,7 +326,7 @@ UBTB::update(const FetchTarget &stream) // Verify uBTB state assert(ubtb.size() <= numEntries); if (!usingS3Pred) { - updateNewEntry(oldEntryIter, takenEntry, startAddr); + updateNewEntry(oldEntryIter, takenEntry, startAddr, stream.asidHash); } } diff --git a/src/cpu/pred/btb/btb_ubtb.hh b/src/cpu/pred/btb/btb_ubtb.hh index 5c394ac9cc..4898cec009 100644 --- a/src/cpu/pred/btb/btb_ubtb.hh +++ b/src/cpu/pred/btb/btb_ubtb.hh @@ -141,7 +141,7 @@ class UBTB : public TimedBaseBTBPredictor /** Get prediction BTBMeta * @return Returns the prediction meta */ - std::shared_ptr getPredictionMeta() override + std::shared_ptr getPredictionMeta(ThreadID tid = 0) override { return meta; } @@ -218,8 +218,9 @@ class UBTB : public TimedBaseBTBPredictor * @param startPC The start address of the fetch block * @return Returns the tag bits. */ - inline Addr getTag(Addr startPC) { - return (startPC >> 1) & tagMask; + inline Addr getTag(Addr startPC, uint8_t asidHash) { + Addr baseTag = (startPC >> 1) & tagMask; + return injectAsidHashIntoTag(baseTag, tagBits, asidHash); } void updateUCtr(unsigned &ctr, bool inc) { @@ -231,7 +232,7 @@ class UBTB : public TimedBaseBTBPredictor * @param startAddr The FB start address to look up * @return Iterator to the matching entry if found, or ubtb.end() if not found */ - UBTBIter lookup(Addr startAddr); + UBTBIter lookup(Addr startAddr, uint8_t asidHash); /** helper method called by putPCHistory: Check uBTB entry pc range and update statistics * @param entry The uBTB entry to check @@ -251,10 +252,12 @@ class UBTB : public TimedBaseBTBPredictor * @param oldEntry Iterator to the entry to replace * @param newPrediction The new prediction to store */ - void replaceOldEntry(UBTBIter oldEntryIter, const BTBEntry &newTakenEntry, Addr startAddr); + void replaceOldEntry(UBTBIter oldEntryIter, const BTBEntry &newTakenEntry, + Addr startAddr, uint8_t asidHash); //using the FB final taken branch to update uBTB - void updateNewEntry(UBTBIter oldEntryIter, const BTBEntry &takenEntry, const Addr startAddr); + void updateNewEntry(UBTBIter oldEntryIter, const BTBEntry &takenEntry, + const Addr startAddr, uint8_t asidHash); /** The uBTB structure: diff --git a/src/cpu/pred/btb/common.hh b/src/cpu/pred/btb/common.hh index e00e7fbcf7..9c2cdf2c8f 100644 --- a/src/cpu/pred/btb/common.hh +++ b/src/cpu/pred/btb/common.hh @@ -1,6 +1,7 @@ #ifndef __CPU_PRED_BTB_STREAM_STRUCT_HH__ #define __CPU_PRED_BTB_STREAM_STRUCT_HH__ +#include #include #include @@ -18,6 +19,49 @@ namespace branch_prediction { namespace btb_pred { +inline uint8_t +foldAsidHash16To4(uint16_t asid) +{ + return (asid & 0xf) ^ ((asid >> 4) & 0xf) ^ + ((asid >> 8) & 0xf) ^ ((asid >> 12) & 0xf); +} + +inline Addr +expandAsidHash(uint8_t asid_hash, unsigned bits) +{ + if (bits == 0) { + return 0; + } + + Addr expanded = 0; + for (unsigned shift = 0; shift < bits; shift += 4) { + expanded |= static_cast(asid_hash) << shift; + } + return expanded & mask(bits); +} + +inline Addr +injectAsidHashIntoTag(Addr base_tag, unsigned tag_bits, uint8_t asid_hash) +{ + if (tag_bits == 0) { + return 0; + } + + const unsigned hash_bits = std::min(4, tag_bits); + const Addr hash_mask = mask(hash_bits); + return (base_tag & ~hash_mask) | (static_cast(asid_hash) & hash_mask); +} + +inline Addr +xorAsidHashIntoIndex(Addr base_index, unsigned index_bits, uint8_t asid_hash) +{ + if (index_bits == 0) { + return 0; + } + + return (base_index ^ expandAsidHash(asid_hash, index_bits)) & mask(index_bits); +} + enum EndType { END_CALL=0, @@ -276,6 +320,7 @@ using IndirectTargets = std::vector>; struct FetchTarget { ThreadID tid; + uint8_t asidHash; Addr startPC; // start pc of the stream bool predTaken; // whether the FetchTarget has taken branch Addr predEndPC; // predicted stream end pc (fall through pc) @@ -323,7 +368,9 @@ struct FetchTarget int s3Source; // which stage the prediction comes from FetchTarget() - : startPC(0), + : tid(0), + asidHash(0), + startPC(0), predTaken(false), predEndPC(0), predBranchInfo(BranchInfo()), @@ -452,6 +499,7 @@ struct FetchTarget struct FullBTBPrediction { ThreadID tid; + uint8_t asidHash; Addr bbStart; std::vector btbEntries; // for BTB, only assigned when hit, sorted by inst order // for conditional branch predictors, mapped with lowest bits of branches @@ -472,6 +520,8 @@ struct FullBTBPrediction int s3Source; FullBTBPrediction() : + tid(0), + asidHash(0), bbStart(0), btbEntries(), condTakens(), diff --git a/src/cpu/pred/btb/decoupled_bpred.cc b/src/cpu/pred/btb/decoupled_bpred.cc index 37cf705814..e5fd8c68a0 100644 --- a/src/cpu/pred/btb/decoupled_bpred.cc +++ b/src/cpu/pred/btb/decoupled_bpred.cc @@ -2,11 +2,13 @@ #include +#include "arch/riscv/regs/misc.hh" #include "base/debug_helper.hh" #include "base/output.hh" #include "cpu/o3/cpu.hh" #include "cpu/o3/dyn_inst.hh" #include "cpu/pred/btb/folded_hist.hh" +#include "cpu/thread_context.hh" #include "debug/BTB.hh" #include "debug/DecoupleBPHist.hh" #include "debug/DecoupleBPVerbose.hh" @@ -21,6 +23,19 @@ namespace branch_prediction namespace btb_pred { +uint8_t +DecoupledBPUWithBTB::getThreadAsidHash(ThreadID tid) const +{ + if (!cpu) { + return 0; + } + + const RegVal satp = + cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_SATP, tid); + const uint16_t asid = (satp >> 44) & mask(16); + return foldAsidHash16To4(asid); +} + void DecoupledBPUWithBTB::consumeFetchTarget(unsigned fetched_inst_num, ThreadID tid) { @@ -45,8 +60,7 @@ DecoupledBPUWithBTB::DecoupledBPUWithBTB(const DecoupledBPUWithBTBParams &p) // uras(p.uras), bpDBSwitches(p.bpDBSwitches), numStages(p.numStages), - ftq(2, p.ftq_size), - historyManager(16), // TODO: fix this + ftq(p.numThreads, p.ftq_size), resolveBlockThreshold(p.resolveBlockThreshold), dbpBtbStats(this, p.numStages, p.fsq_size, maxInstsNum) { @@ -86,6 +100,12 @@ DecoupledBPUWithBTB::DecoupledBPUWithBTB(const DecoupledBPUWithBTBParams &p) printf("\n"); } + historyManagers.reserve(numThreads); + resolveDequeueFailCounters.assign(numThreads, 0); + for (ThreadID tid = 0; tid < numThreads; ++tid) { + historyManagers.emplace_back(16); + } + for (int tid=0;tidgetContext(tid); + if (!tc || tc->status() != gem5::ThreadContext::Active) { + continue; + } + } + + nextPredictTid = (tid + 1) % numThreads; + return tid; + } + + return InvalidThreadID; +} + void DecoupledBPUWithBTB::tick() @@ -122,6 +162,9 @@ DecoupledBPUWithBTB::tick() DPRINTF(Override, "DecoupledBPUWithBTB::tick()\n"); ThreadID curTid = scheduleThread(); + if (curTid == InvalidThreadID) { + return; + } // On squash, reset state if there was a valid prediction. bool squashOccurred = false; @@ -180,14 +223,17 @@ DecoupledBPUWithBTB::requestNewPrediction(ThreadID tid) { auto& thread = threads[tid]; auto& predsOfEachStage = threads[tid].predsOfEachStage; + const uint8_t asid_hash = getThreadAsidHash(tid); DPRINTF(Override, "Requesting new prediction for PC %#lx\n", thread.s0PC); - - // Initialize prediction state for each stage + // Reset all stage-local prediction fields before components fill them. + clearPreds(tid); for (int i = 0; i < numStages; i++) { predsOfEachStage[i].tid = tid; + predsOfEachStage[i].asidHash = asid_hash; predsOfEachStage[i].bbStart = thread.s0PC; + predsOfEachStage[i].predSource = i; } // Query each predictor component with current PC and history @@ -300,7 +346,7 @@ DecoupledBPUWithBTB::generateFinalPredAndCreateBubbles(ThreadID tid) if (ubtb->isEnabled()) { ubtb->updateUsingS3Pred(predsOfEachStage[numStages - 1]); } - if (abtb->isEnabled() && ftq.backId(tid)) { + if (abtb->isEnabled() && !ftq.empty(tid)) { auto previous_block_startpc = ftq.back(tid).startPC; abtb->updateUsingS3Pred(predsOfEachStage[numStages - 1], previous_block_startpc); } else if (abtb->isEnabled()) { @@ -428,8 +474,14 @@ DecoupledBPUWithBTB::handleSquash(ThreadID tid, unsigned target_id, // Find the target being squashed if (!ftq.hasTarget(target_id, tid)) { - assert(!ftq.empty(tid)); - DPRINTF(DecoupleBP, "The squashing target is insane, ignore squash on it"); + DPRINTF(DecoupleBP, + "Ignore squash for tid %u on missing FTQ target %u; " + "recovering predictor state from redirect PC %#lx\n", + tid, target_id, redirect_pc); + ftq.clear(tid); + clearPreds(tid); + threads[tid].validprediction = false; + threads[tid].s0PC = redirect_pc; return; } @@ -577,7 +629,7 @@ DecoupledBPUWithBTB::commit(unsigned target_id, ThreadID tid) if (!ftq.empty(tid)) printTarget(ftq.front(tid)); - historyManager.commit(target_id); + historyManagers[tid].commit(target_id); } bool @@ -615,26 +667,26 @@ DecoupledBPUWithBTB::resolveUpdate(unsigned &target_id, ThreadID tid) } void -DecoupledBPUWithBTB::notifyResolveSuccess() +DecoupledBPUWithBTB::notifyResolveSuccess(ThreadID tid) { - resolveDequeueFailCounter = 0; + resolveDequeueFailCounters[tid] = 0; } void -DecoupledBPUWithBTB::notifyResolveFailure() +DecoupledBPUWithBTB::notifyResolveFailure(ThreadID tid) { - resolveDequeueFailCounter++; - if (resolveDequeueFailCounter >= resolveBlockThreshold) { - blockPredictionOnce(); - resolveDequeueFailCounter = 0; + auto &failCounter = resolveDequeueFailCounters[tid]; + failCounter++; + if (failCounter >= resolveBlockThreshold) { + blockPredictionOnce(tid); + failCounter = 0; } } void -DecoupledBPUWithBTB::blockPredictionOnce() +DecoupledBPUWithBTB::blockPredictionOnce(ThreadID tid) { - // smtTODO - threads[0].blockPredictionPending = true; + threads[tid].blockPredictionPending = true; } void @@ -745,6 +797,7 @@ DecoupledBPUWithBTB::createFetchTargetEntry(ThreadID tid) // Create a new fetch target entry FetchTarget entry; entry.tid = tid; + entry.asidHash = finalPred.asidHash; entry.startPC = s0PC; // Extract branch prediction information @@ -779,7 +832,7 @@ DecoupledBPUWithBTB::createFetchTargetEntry(ThreadID tid) // Save predictors' metadata for (int i = 0; i < numComponents; i++) { - entry.predMetas[i] = components[i]->getPredictionMeta(); + entry.predMetas[i] = components[i]->getPredictionMeta(tid); } // Initialize default resolution state @@ -814,7 +867,8 @@ DecoupledBPUWithBTB::fillAheadPipeline(FetchTarget &entry) } void -DecoupledBPUWithBTB::checkHistory(const boost::dynamic_bitset<> &history) +DecoupledBPUWithBTB::checkHistory(const boost::dynamic_bitset<> &history, + ThreadID tid) { // This function performs a crucial validation of branch history consistency // It rebuilds the "ideal" history from HistoryManager's records and compares @@ -825,7 +879,7 @@ DecoupledBPUWithBTB::checkHistory(const boost::dynamic_bitset<> &history) boost::dynamic_bitset<> ideal_hash_hist(historyBits, 0); // Iterate through all speculative history entries stored in HistoryManager - for (const auto entry: historyManager.getSpeculativeHist()) { + for (const auto entry: historyManagers[tid].getSpeculativeHist()) { // Only process entries that have non-zero shift amount (actual branches) if (entry.shamt != 0) { // Accumulate total history bits @@ -868,6 +922,12 @@ DecoupledBPUWithBTB::resetPC(Addr new_pc) threads[i].s0PC = new_pc; } +void +DecoupledBPUWithBTB::resetPC(ThreadID tid, Addr new_pc) +{ + threads[tid].s0PC = new_pc; +} + Addr DecoupledBPUWithBTB::getPreservedReturnAddr(const DynInstPtr &dynInst) { @@ -915,7 +975,7 @@ DecoupledBPUWithBTB::updateHistoryForPrediction(FetchTarget &entry) histShiftIn(shamt, taken, s0History); // Update history manager and verify TAGE folded history - historyManager.addSpeculativeHist( + historyManagers[tid].addSpeculativeHist( entry.startPC, shamt, taken, entry.predBranchInfo, ftq.backId(tid) + 1); // Get prediction information for global backward history updates @@ -938,16 +998,17 @@ DecoupledBPUWithBTB::updateHistoryForPrediction(FetchTarget &entry) #ifndef NDEBUG if (tage->isEnabled()) { - tage->checkFoldedHist(s0PHistory, "speculative update"); + tage->checkFoldedHist(s0PHistory, tid, "speculative update"); } if (ittage->isEnabled()) { - ittage->checkFoldedHist(s0PHistory, "speculative update"); + ittage->checkFoldedHist(s0PHistory, tid, "speculative update"); } if (microtage->isEnabled()) { - microtage->checkFoldedHist(s0PHistory, "speculative update"); + microtage->checkFoldedHist(s0PHistory, tid, "speculative update"); } if (mgsc->isEnabled()) { - mgsc->checkFoldedHist(s0History, s0PHistory, s0LHistory, "speculative update"); + mgsc->checkFoldedHist(s0History, s0PHistory, s0LHistory, tid, + "speculative update"); } #endif } @@ -1023,31 +1084,33 @@ DecoupledBPUWithBTB::recoverHistoryForSquash( // Update history manager with appropriate branch info if (squash_type == SQUASH_CTRL) { - historyManager.squash(target_id, real_shamt, real_taken, target.exeBranchInfo); + historyManagers[tid].squash(target_id, real_shamt, real_taken, + target.exeBranchInfo); } else { - historyManager.squash(target_id, real_shamt, real_taken, BranchInfo()); + historyManagers[tid].squash(target_id, real_shamt, real_taken, + BranchInfo()); } // Perform history consistency checks when not a fast build variant #ifndef NDEBUG - checkHistory(s0History); + checkHistory(s0History, tid); if (tage->isEnabled()) { - tage->checkFoldedHist(s0PHistory, + tage->checkFoldedHist(s0PHistory, tid, squash_type == SQUASH_CTRL ? "control squash" : squash_type == SQUASH_OTHER ? "non control squash" : "trap squash"); } if (ittage->isEnabled()) { - ittage->checkFoldedHist(s0PHistory, + ittage->checkFoldedHist(s0PHistory, tid, squash_type == SQUASH_CTRL ? "control squash" : squash_type == SQUASH_OTHER ? "non control squash" : "trap squash"); } if (microtage->isEnabled()) { - microtage->checkFoldedHist(s0PHistory, + microtage->checkFoldedHist(s0PHistory, tid, squash_type == SQUASH_CTRL ? "control squash" : squash_type == SQUASH_OTHER ? "non control squash" : "trap squash"); } if (mgsc->isEnabled()) { - mgsc->checkFoldedHist(s0History, s0PHistory, s0LHistory, + mgsc->checkFoldedHist(s0History, s0PHistory, s0LHistory, tid, squash_type == SQUASH_CTRL ? "control squash" : squash_type == SQUASH_OTHER ? "non control squash" : "trap squash"); } diff --git a/src/cpu/pred/btb/decoupled_bpred.hh b/src/cpu/pred/btb/decoupled_bpred.hh index 288450001f..0a46c1a4e5 100644 --- a/src/cpu/pred/btb/decoupled_bpred.hh +++ b/src/cpu/pred/btb/decoupled_bpred.hh @@ -75,8 +75,7 @@ class DecoupledBPUWithBTB : public BPredUnit // FetchTargetId fetchHeadFtqId{1}; // next FSQ id to be consumed by fetch CPU *cpu; - - const int numThreads = 2; + ThreadID nextPredictTid = 0; unsigned predictWidth; // max predict width, default 64 unsigned maxInstsNum; @@ -141,11 +140,11 @@ class DecoupledBPUWithBTB : public BPredUnit bool blockPredictionPending{false}; } threads[MaxThreads]; - HistoryManager historyManager; - unsigned resolveDequeueFailCounter{0}; + std::vector historyManagers; + std::vector resolveDequeueFailCounters; const unsigned resolveBlockThreshold; - ThreadID scheduleThread() { return 0; } + ThreadID scheduleThread(); void processNewPrediction(ThreadID tid); @@ -188,10 +187,9 @@ class DecoupledBPUWithBTB : public BPredUnit void generateFinalPredAndCreateBubbles(ThreadID tid); void clearPreds(ThreadID tid) { - for (auto &stagePred : threads[tid].predsOfEachStage) { - stagePred.condTakens.clear(); - stagePred.indirectTargets.clear(); - stagePred.btbEntries.clear(); + for (int i = 0; i < threads[tid].predsOfEachStage.size(); ++i) { + threads[tid].predsOfEachStage[i] = FullBTBPrediction(); + threads[tid].predsOfEachStage[i].predSource = i; } } @@ -332,6 +330,7 @@ class DecoupledBPUWithBTB : public BPredUnit } void setCpu(CPU *_cpu) { cpu = _cpu; } + uint8_t getThreadAsidHash(ThreadID tid) const; void consumeFetchTarget(unsigned fetched_inst_num, ThreadID tid); @@ -425,7 +424,7 @@ class DecoupledBPUWithBTB : public BPredUnit void overrideStats(OverrideReason overrideReason); - void checkHistory(const boost::dynamic_bitset<> &history); + void checkHistory(const boost::dynamic_bitset<> &history, ThreadID tid); Addr getPreservedReturnAddr(const DynInstPtr &dynInst); @@ -704,6 +703,7 @@ class DecoupledBPUWithBTB : public BPredUnit unsigned control_inst_size = 0); void resetPC(Addr new_pc); + void resetPC(ThreadID tid, Addr new_pc); // Helper functions for update bool resolveUpdate(unsigned &target_id, ThreadID tid); @@ -711,9 +711,9 @@ class DecoupledBPUWithBTB : public BPredUnit void markCFIResolved(unsigned &target, uint64_t resolvedInstPC, ThreadID tid); void updatePredictorComponents(FetchTarget &target); void updateStatistics(const FetchTarget &target); - void notifyResolveSuccess(); - void notifyResolveFailure(); - void blockPredictionOnce(); + void notifyResolveSuccess(ThreadID tid); + void notifyResolveFailure(ThreadID tid); + void blockPredictionOnce(ThreadID tid); /** * @brief Types of control flow instructions for misprediction tracking diff --git a/src/cpu/pred/btb/ftq.cc b/src/cpu/pred/btb/ftq.cc index 3642ef7162..b8abfe7996 100644 --- a/src/cpu/pred/btb/ftq.cc +++ b/src/cpu/pred/btb/ftq.cc @@ -1,3 +1,5 @@ +#include + #include "ftq.hh" namespace gem5 @@ -53,6 +55,19 @@ FetchTargetQueue::squashAfter(FetchTargetId squashId, ThreadID tid) queue[tid].fetchptr = squashId + 1; } +void +FetchTargetQueue::clear(ThreadID tid) +{ + const FetchTargetId nextTargetId = std::max( + queue[tid].fetchptr, + queue[tid].baseTargetId + + static_cast(queue[tid].cap.size())); + + queue[tid].cap.clear(); + queue[tid].baseTargetId = nextTargetId; + queue[tid].fetchptr = nextTargetId; +} + } } diff --git a/src/cpu/pred/btb/ftq.hh b/src/cpu/pred/btb/ftq.hh index c43d071447..c762cd0b83 100644 --- a/src/cpu/pred/btb/ftq.hh +++ b/src/cpu/pred/btb/ftq.hh @@ -80,6 +80,7 @@ public: void finishTarget(ThreadID tid); void commitTarget(ThreadID tid); void squashAfter(FetchTargetId targetId, ThreadID tid); + void clear(ThreadID tid); }; } diff --git a/src/cpu/pred/btb/mbtb.cc b/src/cpu/pred/btb/mbtb.cc index 4ab8445677..de1e764fce 100644 --- a/src/cpu/pred/btb/mbtb.cc +++ b/src/cpu/pred/btb/mbtb.cc @@ -299,8 +299,9 @@ MBTB::putPCHistory(Addr startAddr, std::vector &stagePreds) { meta = std::make_shared(); + const uint8_t asidHash = stagePreds.empty() ? 0 : stagePreds.front().asidHash; // Lookup all matching entries in BTB - auto find_entries = lookup(startAddr, meta); + auto find_entries = lookup(startAddr, asidHash, meta); // Process BTB entries auto processed_entries = processEntries(find_entries, startAddr); @@ -313,8 +314,9 @@ MBTB::putPCHistory(Addr startAddr, } std::shared_ptr -MBTB::getPredictionMeta() +MBTB::getPredictionMeta(ThreadID tid) { + (void)tid; return meta; } @@ -334,7 +336,7 @@ MBTB::recoverHist(const boost::dynamic_bitset<> &history, const FetchTarget &ent * @return Vector of matching BTB entries */ std::vector -MBTB::lookupSingleBlock(Addr block_pc) +MBTB::lookupSingleBlock(Addr block_pc, uint8_t asidHash) { std::vector res; if (block_pc & 0x1) { @@ -345,11 +347,11 @@ MBTB::lookupSingleBlock(Addr block_pc) auto& target_sram = (sram_id == 0) ? sram0 : sram1; auto& target_mru = (sram_id == 0) ? mru0 : mru1; - Addr btb_idx = getIndex(block_pc); + Addr btb_idx = getIndex(block_pc, asidHash); auto& btb_set = target_sram[btb_idx]; assert(btb_idx < numSets); - Addr current_tag = getTag(block_pc); + Addr current_tag = getTag(block_pc, asidHash); DPRINTF(BTB, "BTB: Doing tag comparison for SRAM%d index 0x%lx tag %#lx\n", sram_id, btb_idx, current_tag); @@ -364,7 +366,7 @@ MBTB::lookupSingleBlock(Addr block_pc) } std::vector -MBTB::lookup(Addr block_pc, std::shared_ptr meta) +MBTB::lookup(Addr block_pc, uint8_t asidHash, std::shared_ptr meta) { std::vector res; if (block_pc & 0x1) { @@ -375,15 +377,15 @@ MBTB::lookup(Addr block_pc, std::shared_ptr meta) // Calculate 32B aligned address Addr alignedPC = block_pc & ~(blockSize - 1); // Lookup first 32B block - res = lookupSingleBlock(alignedPC); + res = lookupSingleBlock(alignedPC, asidHash); // Lookup next 32B block - auto nextBlockRes = lookupSingleBlock(alignedPC + blockSize); + auto nextBlockRes = lookupSingleBlock(alignedPC + blockSize, asidHash); // Merge results res.insert(res.end(), nextBlockRes.begin(), nextBlockRes.end()); // lookup victim cache if victim cache is enabled if (victimCacheSize > 0) { - auto victimResults = lookupVictimCache(block_pc); + auto victimResults = lookupVictimCache(block_pc, asidHash); if (!victimResults.empty()) { DPRINTF(BTB, "Victim cache hit for lookup at %#lx\n", block_pc); btbStats.victimCacheHit++; @@ -459,7 +461,7 @@ MBTB::getAndSetNewBTBEntry(FetchTarget &stream) } // Set tag and update stream metadata for use in update() - entry_to_write.tag = getTag(entry_to_write.pc); + entry_to_write.tag = getTag(entry_to_write.pc, stream.asidHash); stream.updateNewBTBEntry = entry_to_write; stream.updateIsOldEntry = is_old_entry; } @@ -507,7 +509,7 @@ MBTB::updateBTBEntry(const BTBEntry& entry, const FetchTarget &stream) auto& target_mru = (sram_id == 0) ? mru0 : mru1; // Calculate index and tag for this entry - Addr btb_idx = getIndex(entry.pc); + Addr btb_idx = getIndex(entry.pc, stream.asidHash); // Look for matching entry in the target SRAM bool found = false; @@ -563,7 +565,7 @@ MBTB::buildUpdatedEntry(const BTBEntry& req_entry, ? BTBEntry(*existing_entry) : req_entry; // Always recalculate tag based on the actual PC being written - entry_to_write.tag = getTag(entry_to_write.pc); + entry_to_write.tag = getTag(entry_to_write.pc, stream.asidHash); entry_to_write.resolved = false; // reset resolved status // Update saturating counter and alwaysTaken @@ -722,7 +724,7 @@ MBTB::prepareUpdateEntries(const FetchTarget &stream) { * Victim cache operations implementation */ std::vector -MBTB::lookupVictimCache(Addr block_pc) +MBTB::lookupVictimCache(Addr block_pc, uint8_t asidHash) { std::vector results; Addr alignedPC = block_pc & ~(blockSize - 1); @@ -734,7 +736,7 @@ MBTB::lookupVictimCache(Addr block_pc) Addr entryAlignedPC = entry.pc & ~(blockSize - 1); // Check if this entry is in either of the two 32B blocks we're looking for if (entryAlignedPC == alignedPC || entryAlignedPC == (alignedPC + blockSize)) { - Addr current_tag = getTag(entry.pc); + Addr current_tag = getTag(entry.pc, asidHash); if (entry.tag == current_tag) { results.push_back(entry); DPRINTF(BTB, "Victim cache hit for pc %#lx\n", entry.pc); diff --git a/src/cpu/pred/btb/mbtb.hh b/src/cpu/pred/btb/mbtb.hh index d736d0f55c..3b2ec76fe4 100644 --- a/src/cpu/pred/btb/mbtb.hh +++ b/src/cpu/pred/btb/mbtb.hh @@ -147,7 +147,7 @@ class MBTB : public TimedBaseBTBPredictor /** Get prediction BTBMeta * @return Returns the prediction meta */ - std::shared_ptr getPredictionMeta() override; + std::shared_ptr getPredictionMeta(ThreadID tid = 0) override; // not used void specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) override; @@ -215,8 +215,9 @@ class MBTB : public TimedBaseBTBPredictor * @param inst_PC The branch to look up. * @return Returns the index into the BTB. */ - inline Addr getIndex(Addr instPC) { - return (instPC >> idxShiftAmt) & idxMask; + inline Addr getIndex(Addr instPC, uint8_t asidHash) { + Addr baseIndex = (instPC >> idxShiftAmt) & idxMask; + return xorAsidHashIntoIndex(baseIndex, floorLog2(numSets), asidHash); } /** Returns the tag bits of a given address. @@ -225,8 +226,9 @@ class MBTB : public TimedBaseBTBPredictor * @param inst_PC The branch's address. * @return Returns the tag bits. */ - inline Addr getTag(Addr instPC) { - return (instPC >> tagShiftAmt) & tagMask; + inline Addr getTag(Addr instPC, uint8_t asidHash) { + Addr baseTag = (instPC >> tagShiftAmt) & tagMask; + return injectAsidHashIntoTag(baseTag, tagBits, asidHash); } /** Update the 2-bit saturating counter for conditional branches @@ -340,16 +342,16 @@ class MBTB : public TimedBaseBTBPredictor * @param inst_PC The address of the block to look up. * @return Returns all hit BTB entries. */ - std::vector lookup(Addr block_pc, std::shared_ptr meta); + std::vector lookup(Addr block_pc, uint8_t asidHash, std::shared_ptr meta); /** Helper function to lookup entries in a single block * @param block_pc The aligned PC to lookup * @return Vector of matching BTB entries */ - std::vector lookupSingleBlock(Addr block_pc); + std::vector lookupSingleBlock(Addr block_pc, uint8_t asidHash); /** Victim cache operations */ - std::vector lookupVictimCache(Addr block_pc); + std::vector lookupVictimCache(Addr block_pc, uint8_t asidHash); void insertVictimCache(const TickedBTBEntry& evicted_entry); bool eraseFromVictimCacheByPC(Addr pc); diff --git a/src/cpu/pred/btb/microtage.cc b/src/cpu/pred/btb/microtage.cc index bcc57db899..ae291fd3eb 100644 --- a/src/cpu/pred/btb/microtage.cc +++ b/src/cpu/pred/btb/microtage.cc @@ -97,6 +97,9 @@ tageStats(this, p.numPredictors, p.numBanks) } // Initialize base table for fallback predictions + threadHistory.resize(MaxThreads); + threadMeta.resize(MaxThreads); + for (unsigned int i = 0; i < numPredictors; ++i) { //initialize ittage predictor assert(tableSizes.size() >= numPredictors); @@ -113,9 +116,15 @@ tageStats(this, p.numPredictors, p.numBanks) assert(tablePcShifts.size() >= numPredictors); - tagFoldedHist.push_back(PathFoldedHist((int)histLengths[i], (int)tableTagBits[i], 16)); - altTagFoldedHist.push_back(PathFoldedHist((int)histLengths[i], (int)tableTagBits[i]-1, 16)); - indexFoldedHist.push_back(PathFoldedHist((int)histLengths[i], (int)tableIndexBits[i], 16)); + for (ThreadID tid = 0; tid < MaxThreads; ++tid) { + auto &state = threadHistory[tid]; + state.tagFoldedHist.emplace_back( + (int)histLengths[i], (int)tableTagBits[i], 16); + state.altTagFoldedHist.emplace_back( + (int)histLengths[i], (int)tableTagBits[i] - 1, 16); + state.indexFoldedHist.emplace_back( + (int)histLengths[i], (int)tableIndexBits[i], 16); + } } usefulResetCnt = 0; @@ -129,6 +138,27 @@ MicroTAGE::~MicroTAGE() { } +ThreadID +MicroTAGE::predictorTid(const std::vector &stagePreds) const +{ + assert(!stagePreds.empty()); + return stagePreds.front().tid; +} + +MicroTAGE::ThreadHistoryState & +MicroTAGE::historyState(ThreadID tid) +{ + assert(tid < threadHistory.size()); + return threadHistory[tid]; +} + +const MicroTAGE::ThreadHistoryState & +MicroTAGE::historyState(ThreadID tid) const +{ + assert(tid < threadHistory.size()); + return threadHistory[tid]; +} + // Set up tracing for debugging void MicroTAGE::setTrace() @@ -183,8 +213,11 @@ MicroTAGE::tickStart() {} MicroTAGE::TagePrediction MicroTAGE::generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC, - std::shared_ptr predMeta) { + std::shared_ptr predMeta, + ThreadID tid, + uint8_t asidHash) { DPRINTF(UTAGE, "generateSinglePrediction for btbEntry: %#lx\n", btb_entry.pc); + const auto &state = historyState(tid); bool provided = false; TageTableInfo main_info; @@ -197,11 +230,13 @@ MicroTAGE::generateSinglePrediction(const BTBEntry &btb_entry, // Calculate index and tag: use snapshot if provided, otherwise use current folded history // Tag includes position XOR (like RTL: tag = tempTag ^ cfiPosition) Addr index = predMeta ? getTageIndex(startPC, i, - predMeta->indexFoldedHist[i].get()) - : getTageIndex(startPC, i); + predMeta->indexFoldedHist[i].get(), asidHash) + : getTageIndex(startPC, i, state.indexFoldedHist[i].get(), asidHash); Addr tag = predMeta ? getTageTag(startPC, i, - predMeta->tagFoldedHist[i].get(),predMeta->altTagFoldedHist[i].get(), position) - : getTageTag(startPC, i, tagFoldedHist[i].get(),altTagFoldedHist[i].get(), position); + predMeta->tagFoldedHist[i].get(),predMeta->altTagFoldedHist[i].get(), + position, asidHash) + : getTageTag(startPC, i, state.tagFoldedHist[i].get(), + state.altTagFoldedHist[i].get(), position, asidHash); bool match = false; // for each table, only one way can be matched TageEntry matching_entry; @@ -257,7 +292,8 @@ MicroTAGE::generateSinglePrediction(const BTBEntry &btb_entry, * @return Map of branch PC addresses to their predicted outcomes */ void -MicroTAGE::lookupHelper(const Addr &startPC, const std::vector &btbEntries, CondTakens& results) +MicroTAGE::lookupHelper(const Addr &startPC, const std::vector &btbEntries, + CondTakens& results, ThreadID tid, uint8_t asidHash) { DPRINTF(UTAGE, "lookupHelper startAddr: %#lx\n", startPC); @@ -265,8 +301,9 @@ MicroTAGE::lookupHelper(const Addr &startPC, const std::vector &btbEnt for (auto &btb_entry : btbEntries) { // Only predict for valid conditional branches if (btb_entry.isCond && btb_entry.valid) { - auto pred = generateSinglePrediction(btb_entry, startPC); - meta->preds[btb_entry.pc] = pred; + auto pred = generateSinglePrediction(btb_entry, startPC, nullptr, + tid, asidHash); + threadMeta[tid]->preds[btb_entry.pc] = pred; tageStats.updateStatsWithTagePrediction(pred, true); results.push_back({btb_entry.pc, pred.taken || btb_entry.alwaysTaken}); } @@ -297,6 +334,9 @@ MicroTAGE::dryRunCycle(Addr startPC) { */ void MicroTAGE::putPCHistory(Addr startPC, const bitset &history, std::vector &stagePreds) { + const ThreadID tid = predictorTid(stagePreds); + const uint8_t asidHash = stagePreds.empty() ? 0 : stagePreds.front().asidHash; + const auto &state = historyState(tid); // Record prediction bank for next tick's conflict detection lastPredBankId = getBankId(startPC); predBankValid = true; @@ -314,30 +354,36 @@ MicroTAGE::putPCHistory(Addr startPC, const bitset &history, std::vector(); - meta->tagFoldedHist = tagFoldedHist; - meta->altTagFoldedHist = altTagFoldedHist; - meta->indexFoldedHist = indexFoldedHist; - meta->aheadIndexFoldedHistValid = !aheadindexFoldedHist.empty(); - if (meta->aheadIndexFoldedHistValid) { - meta->aheadIndexFoldedHist = aheadindexFoldedHist.front(); + threadMeta[tid] = std::make_shared(); + threadMeta[tid]->tagFoldedHist = state.tagFoldedHist; + threadMeta[tid]->altTagFoldedHist = state.altTagFoldedHist; + threadMeta[tid]->indexFoldedHist = state.indexFoldedHist; + threadMeta[tid]->aheadIndexFoldedHistValid = + !state.aheadIndexFoldedHist.empty(); + if (threadMeta[tid]->aheadIndexFoldedHistValid) { + threadMeta[tid]->aheadIndexFoldedHist = + state.aheadIndexFoldedHist.front(); } else { - meta->aheadIndexFoldedHist.clear(); + threadMeta[tid]->aheadIndexFoldedHist.clear(); } - meta->history = history; + threadMeta[tid]->history = history; for (int s = getDelay(); s < stagePreds.size(); s++) { // TODO: only lookup once for one btb entry in different stages auto &stage_pred = stagePreds[s]; stage_pred.condTakens.clear(); - lookupHelper(startPC, stage_pred.btbEntries, stage_pred.condTakens); + lookupHelper(startPC, stage_pred.btbEntries, stage_pred.condTakens, + tid, asidHash); } } std::shared_ptr -MicroTAGE::getPredictionMeta() { - return meta; +MicroTAGE::getPredictionMeta(ThreadID tid) { + if (tid >= threadMeta.size()) { + return nullptr; + } + return threadMeta[tid]; } /** @@ -495,6 +541,7 @@ MicroTAGE::handleNewEntryAllocation(const Addr &startPC, bool actual_taken, unsigned start_table, std::shared_ptr meta, + uint8_t asidHash, uint64_t &allocated_table, uint64_t &allocated_index, uint64_t &allocated_way) { @@ -508,9 +555,10 @@ MicroTAGE::handleNewEntryAllocation(const Addr &startPC, for (unsigned ti = start_table; ti < numPredictors; ++ti) { Addr newIndex = getTageIndex(startPC, ti, - meta->indexFoldedHist[ti].get()); + meta->indexFoldedHist[ti].get(), asidHash); Addr newTag = getTageTag(startPC, ti, - meta->tagFoldedHist[ti].get(), meta->altTagFoldedHist[ti].get(), position); + meta->tagFoldedHist[ti].get(), meta->altTagFoldedHist[ti].get(), + position, asidHash); auto &set = tageTable[ti][newIndex]; @@ -638,7 +686,8 @@ MicroTAGE::update(const FetchTarget &stream) { TagePrediction recomputed; if (updateOnRead) { // if update on read is enabled, re-read providers using snapshot // Re-read providers using snapshot (do not rely on prediction-time main/alt) - recomputed = generateSinglePrediction(btb_entry, startAddr, predMeta); + recomputed = generateSinglePrediction(btb_entry, startAddr, predMeta, + stream.tid, stream.asidHash); } else { // otherwise, use the prediction from the prediction-time main/alt auto pred_it = predMeta->preds.find(btb_entry.pc); if (pred_it != predMeta->preds.end()) { @@ -646,7 +695,8 @@ MicroTAGE::update(const FetchTarget &stream) { } else { DPRINTF(UTAGE, "update: missing predMeta entry for pc %#lx, recompute with snapshot\n", btb_entry.pc); - recomputed = generateSinglePrediction(btb_entry, startAddr, predMeta); + recomputed = generateSinglePrediction(btb_entry, startAddr, predMeta, + stream.tid, stream.asidHash); } } if (recomputed.mainprovided) { @@ -669,7 +719,8 @@ MicroTAGE::update(const FetchTarget &stream) { start_table = main_info.table + 1; // start from the table after the main prediction table } alloc_success = handleNewEntryAllocation(startAddr, btb_entry, actual_taken, - start_table, predMeta, allocated_table, allocated_index, allocated_way); + start_table, predMeta, stream.asidHash, + allocated_table, allocated_index, allocated_way); } #ifndef UNIT_TEST @@ -751,7 +802,8 @@ MicroTAGE::updateCounter(bool taken, unsigned width, short &counter) { // Calculate TAGE tag with folded history - optimized version using bitwise operations Addr -MicroTAGE::getTageTag(Addr pc, int t, uint64_t foldedHist, uint64_t altFoldedHist, Addr position) +MicroTAGE::getTageTag(Addr pc, int t, uint64_t foldedHist, uint64_t altFoldedHist, + Addr position, uint8_t asidHash) { // Create mask for tableTagBits[t] to limit result size Addr mask = (1ULL << tableTagBits[t]) - 1; @@ -766,11 +818,12 @@ MicroTAGE::getTageTag(Addr pc, int t, uint64_t foldedHist, uint64_t altFoldedHis Addr altTagBits = (altFoldedHist << 1) & mask; // XOR all components together, including position (like RTL) - return pcBits ^ foldedBits ^ position ^ altTagBits; + return injectAsidHashIntoTag(pcBits ^ foldedBits ^ position ^ altTagBits, + tableTagBits[t], asidHash); } Addr -MicroTAGE::getTageIndex(Addr pc, int t, uint64_t foldedHist) +MicroTAGE::getTageIndex(Addr pc, int t, uint64_t foldedHist, uint8_t asidHash) { // Create mask for tableIndexBits[t] to limit result size Addr mask = (1ULL << tableIndexBits[t]) - 1; @@ -779,13 +832,13 @@ MicroTAGE::getTageIndex(Addr pc, int t, uint64_t foldedHist) Addr pcBits = (pc >> pcShift) & mask; Addr foldedBits = foldedHist & mask; - return pcBits ^ foldedBits; + return xorAsidHashIntoIndex(pcBits ^ foldedBits, tableIndexBits[t], asidHash); } Addr -MicroTAGE::getTageIndex(Addr pc, int t) +MicroTAGE::getTageIndex(Addr pc, int t, uint8_t asidHash) { - return getTageIndex(pc, t, indexFoldedHist[t].get()); + return getTageIndex(pc, t, historyState(0).indexFoldedHist[t].get(), asidHash); } bool @@ -851,23 +904,26 @@ MicroTAGE::getBankId(Addr pc) const * @param taken Whether the branch was taken */ void -MicroTAGE::doUpdateHist(const boost::dynamic_bitset<> &history, bool taken, Addr pc, Addr target) +MicroTAGE::doUpdateHist(const boost::dynamic_bitset<> &history, bool taken, + Addr pc, Addr target, ThreadID tid) { + auto &state = historyState(tid); if (debug::TAGEHistory) { // if debug flag is off, do not use to_string since it's too slow std::string buf; boost::to_string(history, buf); DPRINTF(TAGEHistory, "in doUpdateHist, taken %d, pc %#lx, history %s\n", taken, pc, buf.c_str()); } - if (!aheadindexFoldedHist.empty()) { - indexFoldedHist = aheadindexFoldedHist.front(); + if (!state.aheadIndexFoldedHist.empty()) { + state.indexFoldedHist = state.aheadIndexFoldedHist.front(); } if (!taken) { - if (debug::TAGEHistory && !aheadindexFoldedHist.empty()) { + if (debug::TAGEHistory && !state.aheadIndexFoldedHist.empty()) { bool mismatch = false; for (int t = 0; t < numPredictors; t++) { - if (indexFoldedHist[t].get() != aheadindexFoldedHist.front()[t].get()) { + if (state.indexFoldedHist[t].get() != + state.aheadIndexFoldedHist.front()[t].get()) { mismatch = true; break; } @@ -883,22 +939,23 @@ MicroTAGE::doUpdateHist(const boost::dynamic_bitset<> &history, bool taken, Addr for (int t = 0; t < numPredictors; t++) { // Update tag folded history immediately so tag calculation always sees current history. - tagFoldedHist[t].update(history, 2, taken, pc, target); - altTagFoldedHist[t].update(history, 2, taken, pc, target); + state.tagFoldedHist[t].update(history, 2, taken, pc, target); + state.altTagFoldedHist[t].update(history, 2, taken, pc, target); DPRINTF(TAGEHistory, "t: %d, tag 0x%lx, altTag 0x%lx\n", - t, tagFoldedHist[t].get(), altTagFoldedHist[t].get()); + t, state.tagFoldedHist[t].get(), + state.altTagFoldedHist[t].get()); } // Prepare next-cycle index folded history and delay its visibility by one cycle. - auto nextIndexFoldedHist = indexFoldedHist; + auto nextIndexFoldedHist = state.indexFoldedHist; for (int t = 0; t < numPredictors; t++) { nextIndexFoldedHist[t].update(history, 2, taken, pc, target); DPRINTF(TAGEHistory, "t: %d, index foldedHist(next) _folded 0x%lx\n", t, nextIndexFoldedHist[t].get()); } - aheadindexFoldedHist.push(nextIndexFoldedHist); - if (aheadindexFoldedHist.size() > 1) { - aheadindexFoldedHist.pop(); + state.aheadIndexFoldedHist.push(nextIndexFoldedHist); + if (state.aheadIndexFoldedHist.size() > 1) { + state.aheadIndexFoldedHist.pop(); } } @@ -918,7 +975,7 @@ void MicroTAGE::specUpdatePHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) { auto [pc, target, taken] = pred.getPHistInfo(); - doUpdateHist(history, taken, pc, target); + doUpdateHist(history, taken, pc, target, pred.tid); } /** @@ -938,6 +995,7 @@ void MicroTAGE::recoverPHist(const boost::dynamic_bitset<> &history, const FetchTarget &entry, int shamt, bool cond_taken) { + auto &state = historyState(entry.tid); std::shared_ptr predMeta = std::static_pointer_cast(entry.predMetas[getComponentIdx()]); if (!predMeta) { DPRINTF(UTAGE, "recoverPHist: no prediction metadata, cannot recover\n"); @@ -945,21 +1003,22 @@ MicroTAGE::recoverPHist(const boost::dynamic_bitset<> &history, } // Restore current folded index history exactly to prediction-time state. for (int i = 0; i < numPredictors; i++) { - indexFoldedHist[i].recover(predMeta->indexFoldedHist[i]); + state.indexFoldedHist[i].recover(predMeta->indexFoldedHist[i]); } // Restore delayed index folded history slot exactly to prediction-time state. - while (!aheadindexFoldedHist.empty()) { - aheadindexFoldedHist.pop(); + while (!state.aheadIndexFoldedHist.empty()) { + state.aheadIndexFoldedHist.pop(); } if (predMeta->aheadIndexFoldedHistValid) { assert(predMeta->aheadIndexFoldedHist.size() == numPredictors); - aheadindexFoldedHist.push(predMeta->aheadIndexFoldedHist); + state.aheadIndexFoldedHist.push(predMeta->aheadIndexFoldedHist); } if (debug::TAGEHistory) { bool queue_valid_mismatch = - (predMeta->aheadIndexFoldedHistValid != !aheadindexFoldedHist.empty()); + (predMeta->aheadIndexFoldedHistValid != + !state.aheadIndexFoldedHist.empty()); if (queue_valid_mismatch) { DPRINTF(TAGEHistory, "recoverPHist: ahead queue valid mismatch after restore, cond_taken %d\n", @@ -968,16 +1027,25 @@ MicroTAGE::recoverPHist(const boost::dynamic_bitset<> &history, } for (int i = 0; i < numPredictors; i++) { - altTagFoldedHist[i].recover(predMeta->altTagFoldedHist[i]); - tagFoldedHist[i].recover(predMeta->tagFoldedHist[i]); + state.altTagFoldedHist[i].recover(predMeta->altTagFoldedHist[i]); + state.tagFoldedHist[i].recover(predMeta->tagFoldedHist[i]); } - doUpdateHist(history, cond_taken, entry.getControlPC(), entry.getTakenTarget()); + doUpdateHist(history, cond_taken, entry.getControlPC(), + entry.getTakenTarget(), entry.tid); } // Check folded history after speculative update and recovery void MicroTAGE::checkFoldedHist(const boost::dynamic_bitset<> &hist, const char * when) { + checkFoldedHist(hist, 0, when); +} + +void +MicroTAGE::checkFoldedHist(const boost::dynamic_bitset<> &hist, ThreadID tid, + const char * when) +{ + auto &state = historyState(tid); DPRINTF(UTAGE, "checking folded history when %s\n", when); if (debug::TAGEHistory) { std::string hist_str; @@ -989,13 +1057,13 @@ MicroTAGE::checkFoldedHist(const boost::dynamic_bitset<> &hist, const char * whe // aheadindexFoldedHist in doUpdateHist(). During consistency checks // right after speculative/recovery updates, compare against the staged // next-cycle value when available. - if (!aheadindexFoldedHist.empty()) { - aheadindexFoldedHist.front()[t].check(hist); + if (!state.aheadIndexFoldedHist.empty()) { + state.aheadIndexFoldedHist.front()[t].check(hist); } else { - indexFoldedHist[t].check(hist); + state.indexFoldedHist[t].check(hist); } - tagFoldedHist[t].check(hist); - altTagFoldedHist[t].check(hist); + state.tagFoldedHist[t].check(hist); + state.altTagFoldedHist[t].check(hist); } } diff --git a/src/cpu/pred/btb/microtage.hh b/src/cpu/pred/btb/microtage.hh index da593f6787..c181117bca 100644 --- a/src/cpu/pred/btb/microtage.hh +++ b/src/cpu/pred/btb/microtage.hh @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -11,6 +12,7 @@ #include "base/sat_counter.hh" #include "base/types.hh" #include "cpu/inst_seq.hh" +#include "cpu/o3/limits.hh" #include "cpu/pred/btb/folded_hist.hh" #include "cpu/pred/btb/timed_base_pred.hh" @@ -42,6 +44,7 @@ namespace test { class MicroTAGE : public TimedBaseBTBPredictor { using bitset = boost::dynamic_bitset<>; + static constexpr unsigned MaxThreads = o3::MaxThreads; public: #ifdef UNIT_TEST // Test constructor @@ -121,7 +124,7 @@ class MicroTAGE : public TimedBaseBTBPredictor const boost::dynamic_bitset<> &history, std::vector &stagePreds) override; - std::shared_ptr getPredictionMeta() override; + std::shared_ptr getPredictionMeta(ThreadID tid = 0) override; // speculative update 3 folded history, according history and pred.taken // the other specUpdateHist methods are left blank @@ -159,23 +162,26 @@ class MicroTAGE : public TimedBaseBTBPredictor // check folded hists after speculative update and recover void checkFoldedHist(const bitset &history, const char *when); + void checkFoldedHist(const bitset &history, ThreadID tid, const char *when); #ifndef UNIT_TEST private: #endif // Look up predictions in TAGE tables for a stream of instructions - void lookupHelper(const Addr &startPC, const std::vector &btbEntries, CondTakens& results); + void lookupHelper(const Addr &startPC, const std::vector &btbEntries, + CondTakens& results, ThreadID tid, uint8_t asidHash); // Calculate TAGE index for a given PC and table - Addr getTageIndex(Addr pc, int table); + Addr getTageIndex(Addr pc, int table, uint8_t asidHash = 0); // Calculate TAGE index with folded history (uint64_t version for performance) - Addr getTageIndex(Addr pc, int table, uint64_t foldedHist); + Addr getTageIndex(Addr pc, int table, uint64_t foldedHist, uint8_t asidHash = 0); // Calculate TAGE tag with folded history (uint64_t version for performance) // position: branch position within the block (xored into tag like RTL) - Addr getTageTag(Addr pc, int table, uint64_t foldedHist, uint64_t altFoldedHist, Addr position = 0); + Addr getTageTag(Addr pc, int table, uint64_t foldedHist, uint64_t altFoldedHist, + Addr position = 0, uint8_t asidHash = 0); // Get branch index within a prediction block unsigned getBranchIndexInBlock(Addr branchPC, Addr startPC); @@ -185,7 +191,8 @@ class MicroTAGE : public TimedBaseBTBPredictor unsigned getBankId(Addr pc) const; // Update branch history - void doUpdateHist(const bitset &history, bool taken, Addr pc, Addr target); + void doUpdateHist(const bitset &history, bool taken, Addr pc, Addr target, + ThreadID tid); // Number of TAGE predictor tables const unsigned numPredictors; @@ -205,14 +212,15 @@ class MicroTAGE : public TimedBaseBTBPredictor // History lengths for each table std::vector histLengths; - // Folded history for tag calculation - std::vector tagFoldedHist; - - // Folded history for alternative tag calculation - std::vector altTagFoldedHist; + struct ThreadHistoryState + { + std::vector tagFoldedHist; + std::vector altTagFoldedHist; + std::vector indexFoldedHist; + std::queue> aheadIndexFoldedHist; + }; - // Folded history for index calculation - std::vector indexFoldedHist; + std::vector threadHistory; // Maximum history length, not used unsigned maxHistLen; @@ -259,8 +267,6 @@ class MicroTAGE : public TimedBaseBTBPredictor unsigned lastPredBankId; // Bank ID of last prediction bool predBankValid; // Whether lastPredBankId is valid - std::queue> aheadindexFoldedHist; - #ifdef UNIT_TEST typedef uint64_t Scalar; #else @@ -351,7 +357,9 @@ private: // If predMeta is nullptr, use current folded history (prediction path) TagePrediction generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC, - const std::shared_ptr predMeta = nullptr); + const std::shared_ptr predMeta = nullptr, + ThreadID tid = 0, + uint8_t asidHash = 0); // Helper method to prepare BTB entries for update std::vector prepareUpdateEntries(const FetchTarget &stream); @@ -368,11 +376,15 @@ private: bool actual_taken, unsigned main_table, std::shared_ptr meta, + uint8_t asidHash, uint64_t &allocated_table, uint64_t &allocated_index, uint64_t &allocated_way); - std::shared_ptr meta; + std::vector> threadMeta; + ThreadID predictorTid(const std::vector &stagePreds) const; + ThreadHistoryState &historyState(ThreadID tid); + const ThreadHistoryState &historyState(ThreadID tid) const; }; // Close conditional namespace wrapper for testing diff --git a/src/cpu/pred/btb/ras.cc b/src/cpu/pred/btb/ras.cc index 4dabf6dabf..8dd5b80aea 100644 --- a/src/cpu/pred/btb/ras.cc +++ b/src/cpu/pred/btb/ras.cc @@ -21,28 +21,13 @@ namespace btb_pred { : TimedBaseBTBPredictor(), numEntries(numEntries), ctrWidth(ctrWidth), - numInflightEntries(numInflightEntries) + numInflightEntries(numInflightEntries), + maxCtr((1 << ctrWidth) - 1), + numThreads(1), + threadStates(numThreads) { - // Initialize RAS state - ssp = 0; - nsp = 0; - sctr = 0; - stack.resize(numEntries); - maxCtr = (1 << ctrWidth) - 1; - TOSW = 0; - TOSR = 0; - inflightPtrDec(TOSR); - BOS = 0; - inflightStack.resize(numInflightEntries); - - // Initialize stack entries - for (auto &entry : stack) { - entry.data.ctr = 0; - entry.data.retAddr = 0x80000000L; - } - for (auto &entry : inflightStack) { - entry.data.ctr = 0; - entry.data.retAddr = 0x80000000L; + for (auto &state : threadStates) { + initThreadState(state); } } #else @@ -51,49 +36,61 @@ namespace btb_pred { : TimedBaseBTBPredictor(p), numEntries(p.numEntries), ctrWidth(p.ctrWidth), - numInflightEntries(p.numInflightEntries), - rasStats(this) + numInflightEntries(p.numInflightEntries), + maxCtr((1 << ctrWidth) - 1), + numThreads(p.numThreads), + threadStates(numThreads), + rasStats(this) { - // Initialize RAS state - ssp = 0; - nsp = 0; - sctr = 0; - stack.resize(numEntries); - maxCtr = (1 << ctrWidth) - 1; - TOSW = 0; - TOSR = 0; - inflightPtrDec(TOSR); - BOS = 0; - inflightStack.resize(numInflightEntries); - - // Initialize stack entries - for (auto &entry : stack) { - entry.data.ctr = 0; - entry.data.retAddr = 0x80000000L; - } - for (auto &entry : inflightStack) { - entry.data.ctr = 0; - entry.data.retAddr = 0x80000000L; + for (auto &state : threadStates) { + initThreadState(state); } } #endif void -BTBRAS::checkCorrectness() { +BTBRAS::initThreadState(ThreadRASState &state) +{ + state.TOSW = 0; + state.TOSR = 0; + inflightPtrDec(state.TOSR); + state.BOS = 0; + state.ssp = 0; + state.nsp = 0; + state.sctr = 0; + state.meta.reset(); + + state.stack.resize(numEntries); + state.inflightStack.resize(numInflightEntries); + + for (auto &entry : state.stack) { + entry.data.ctr = 0; + entry.data.retAddr = 0x80000000L; + } + for (auto &entry : state.inflightStack) { + entry.data.ctr = 0; + entry.data.retAddr = 0x80000000L; + entry.nos = 0; + } +} + +void +BTBRAS::checkCorrectness(ThreadID tid) { + auto &state = threadStates[tid]; /* - auto tosr = TOSR; - int checkssp = ssp; - while (inflightInRange(tosr)) { - if (!inflightStack[tosr].data.ctr) { + auto tosr = state.TOSR; + int checkssp = state.ssp; + while (inflightInRange(state, tosr)) { + if (!state.inflightStack[tosr].data.ctr) { checkssp = (checkssp - 1 + numEntries) % numEntries; } else { // just dec sctr, fixme here } - tosr = inflightStack[tosr].nos; + tosr = state.inflightStack[tosr].nos; } - if (checkssp != (nsp + numEntries - 1) % numEntries) { + if (checkssp != (state.nsp + numEntries - 1) % numEntries) { DPRINTF(RAS, "NSP and SSP check failed\n"); - printStack("checkCorrectness"); + printStack("checkCorrectness", tid); }*/ } @@ -102,28 +99,39 @@ BTBRAS::putPCHistory(Addr startAddr, const boost::dynamic_bitset<> &history, std::vector &stagePreds) { assert(getDelay() < stagePreds.size()); - meta = std::make_shared(); + const ThreadID tid = stagePreds.back().tid; + assert(tid < numThreads); + auto &state = threadStates[tid]; + state.meta = std::make_shared(); DPRINTFR(RAS, "putPC startAddr %lx", startAddr); - // checkCorrectness(); + // checkCorrectness(tid); + auto top = getTop_meta(tid); for (int i = getDelay(); i < stagePreds.size(); i++) { - stagePreds[i].returnTarget = getTop_meta().retAddr; // stack[sp].retAddr; + stagePreds[i].returnTarget = top.retAddr; } /* if (stagePreds.back().btbEntry.slots[0].isCall || stagePreds.back().btbEntry.slots[0].isReturn || stagePreds.back().btbEntry.slots[1].isCall || stagePreds.back().btbEntry.slots[1].isReturn) { - printStack("putPCHistory"); + printStack("putPCHistory", tid); } */ } std::shared_ptr -BTBRAS::getPredictionMeta() +BTBRAS::getPredictionMeta(ThreadID tid) { - return meta; + if (tid >= threadStates.size()) { + return nullptr; + } + return threadStates[tid].meta; } void BTBRAS::specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) { + const ThreadID tid = pred.tid; + assert(tid < numThreads); + auto &state = threadStates[tid]; + assert(state.meta); // do push & pops on prediction // pred.returnTarget = stack[sp].retAddr; auto takenEntry = pred.getTakenEntry(); @@ -131,11 +139,11 @@ BTBRAS::specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction if (takenEntry.isCall) { Addr retAddr = takenEntry.pc + takenEntry.size; - push(retAddr); + push(tid, retAddr); } if (takenEntry.isReturn) { // do pop - pop(); + pop(tid); } if (takenEntry.isCall) { DPRINTFR(RAS, "IsCall spec PC %lx\n", takenEntry.pc); @@ -145,36 +153,39 @@ BTBRAS::specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction } if (takenEntry.isCall || takenEntry.isReturn) - printStack("after specUpdateHist"); - DPRINTFR(RAS, "meta TOSR %d TOSW %d\n", meta->TOSR, meta->TOSW); + printStack("after specUpdateHist", tid); + DPRINTFR(RAS, "meta TOSR %d TOSW %d\n", state.meta->TOSR, state.meta->TOSW); } void BTBRAS::recoverHist(const boost::dynamic_bitset<> &history, const FetchTarget &entry, int shamt, bool cond_taken) { + const ThreadID tid = entry.tid; + assert(tid < numThreads); + auto &state = threadStates[tid]; auto takenEntry = entry.exeBranchInfo; /* if (takenEntry.isCall || takenEntry.isReturn) { - printStack("before recoverHist"); + printStack("before recoverHist", tid); }*/ // recover sp and tos first auto meta_ptr = std::static_pointer_cast(entry.predMetas[getComponentIdx()]); DPRINTF(RAS, "recover called, meta TOSR %d TOSW %d ssp %d sctr %u entry PC %lx end PC %lx\n", meta_ptr->TOSR, meta_ptr->TOSW, meta_ptr->ssp, meta_ptr->sctr, entry.startPC, entry.predEndPC); - TOSR = meta_ptr->TOSR; - TOSW = meta_ptr->TOSW; - ssp = meta_ptr->ssp; - sctr = meta_ptr->sctr; + state.TOSR = meta_ptr->TOSR; + state.TOSW = meta_ptr->TOSW; + state.ssp = meta_ptr->ssp; + state.sctr = meta_ptr->sctr; Addr retAddr = takenEntry.pc + takenEntry.size; // do push & pops on control squash if (entry.exeTaken) { if (takenEntry.isCall) { - push(retAddr); + push(tid, retAddr); } if (takenEntry.isReturn) { - pop(); + pop(tid); //TOSW = (TOSR + 1) % numInflightEntries; } } @@ -186,7 +197,7 @@ BTBRAS::recoverHist(const boost::dynamic_bitset<> &history, const FetchTarget &e DPRINTF(RAS, "IsRet expect target %lx, preded %lx, pred taken %d pred target %lx\n", takenEntry.target, meta_ptr->target, entry.predTaken, entry.predBranchInfo.target); } - printStack("after recoverHist"); + printStack("after recoverHist", tid); } } @@ -194,83 +205,89 @@ BTBRAS::recoverHist(const boost::dynamic_bitset<> &history, const FetchTarget &e void BTBRAS::update(const FetchTarget &entry) { + const ThreadID tid = entry.tid; + assert(tid < numThreads); + auto &state = threadStates[tid]; auto meta_ptr = std::static_pointer_cast(entry.predMetas[getComponentIdx()]); auto takenEntry = entry.exeBranchInfo; if (entry.exeTaken) { - if (meta_ptr->ssp != nsp || meta_ptr->sctr != stack[nsp].data.ctr) { + if (meta_ptr->ssp != state.nsp || meta_ptr->sctr != state.stack[state.nsp].data.ctr) { DPRINTF(RAS, "ssp and nsp mismatch, recovering, ssp = %d, sctr = %d, nsp = %d, nctr = %d\n", - meta_ptr->ssp, meta_ptr->sctr, nsp, stack[nsp].data.ctr); - nsp = meta_ptr->ssp; + meta_ptr->ssp, meta_ptr->sctr, state.nsp, state.stack[state.nsp].data.ctr); + state.nsp = meta_ptr->ssp; } else DPRINTF(RAS, "ssp and nsp match, ssp = %d, sctr = %d, nsp = %d, nctr = %d\n", - meta_ptr->ssp, meta_ptr->sctr, nsp, stack[nsp].data.ctr); + meta_ptr->ssp, meta_ptr->sctr, state.nsp, state.stack[state.nsp].data.ctr); if (takenEntry.isCall) { DPRINTF(RAS, "real update call BTB hit %d meta TOSR %d TOSW %d\n entry PC %lx", entry.isHit, meta_ptr->TOSR, meta_ptr->TOSW, entry.startPC); Addr retAddr = takenEntry.pc + takenEntry.size; - push_stack(retAddr); - BOS = inflightPtrPlus1(meta_ptr->TOSW); + push_stack(tid, retAddr); + state.BOS = inflightPtrPlus1(meta_ptr->TOSW); } if (takenEntry.isReturn) { DPRINTF(RAS, "update ret entry PC %lx\n", entry.startPC); - pop_stack(); + pop_stack(tid); } } if (takenEntry.isCall || takenEntry.isReturn) { - printStack("after update(commit)"); + printStack("after update(commit)", tid); } } void -BTBRAS::push_stack(Addr retAddr) +BTBRAS::push_stack(ThreadID tid, Addr retAddr) { - auto tos = stack[nsp]; + auto &state = threadStates[tid]; + auto tos = state.stack[state.nsp]; if (tos.data.retAddr == retAddr && tos.data.ctr < maxCtr) { - stack[nsp].data.ctr++; + state.stack[state.nsp].data.ctr++; } else { // push new entry - ptrInc(nsp); - stack[nsp].data.retAddr = retAddr; - stack[nsp].data.ctr = 0; + ptrInc(state.nsp); + state.stack[state.nsp].data.retAddr = retAddr; + state.stack[state.nsp].data.ctr = 0; } // ++ndepth; } void -BTBRAS::push(Addr retAddr) +BTBRAS::push(ThreadID tid, Addr retAddr) { + auto &state = threadStates[tid]; rasStats.Pushes++; DPRINTF(RAS, "doing push "); // update ssp and sctr first // meta has recorded their old value - auto topAddr = getTop(); - if (retAddr == topAddr.retAddr && sctr < maxCtr) { - sctr++; + auto topAddr = getTop(tid); + if (retAddr == topAddr.retAddr && state.sctr < maxCtr) { + state.sctr++; } else { - ptrInc(ssp); - sctr = 0; + ptrInc(state.ssp); + state.sctr = 0; // do not update non-spec stack here } // push will always enter inflight queue RASInflightEntry t; t.data.retAddr = retAddr; - t.data.ctr = sctr; - t.nos = TOSR; - inflightStack[TOSW] = t; - TOSR = TOSW; - inflightPtrInc(TOSW); + t.data.ctr = state.sctr; + t.nos = state.TOSR; + state.inflightStack[state.TOSW] = t; + state.TOSR = state.TOSW; + inflightPtrInc(state.TOSW); } void -BTBRAS::pop_stack() +BTBRAS::pop_stack(ThreadID tid) { + auto &state = threadStates[tid]; //if (ndepth) { - auto tos = stack[nsp]; + auto tos = state.stack[state.nsp]; if (tos.data.ctr > 0) { - stack[nsp].data.ctr--; + state.stack[state.nsp].data.ctr--; } else { - ptrDec(nsp); + ptrDec(state.nsp); } //--ndepth; //} else { @@ -280,30 +297,31 @@ BTBRAS::pop_stack() } void -BTBRAS::pop() +BTBRAS::pop(ThreadID tid) { + auto &state = threadStates[tid]; // DPRINTFR(RAS, "doing pop ndepth = %d", ndepth); rasStats.Pops++; // pop may need to deal with committed stack - if (inflightInRange(TOSR)) { - DPRINTF(RAS, "Select from inflight, addr %lx\n", inflightStack[TOSR].data.retAddr); - TOSR = inflightStack[TOSR].nos; - if (sctr > 0) { - sctr--; + if (inflightInRange(state, state.TOSR)) { + DPRINTF(RAS, "Select from inflight, addr %lx\n", state.inflightStack[state.TOSR].data.retAddr); + state.TOSR = state.inflightStack[state.TOSR].nos; + if (state.sctr > 0) { + state.sctr--; } else { - ptrDec(ssp); - auto newTop = getTop(); - sctr = newTop.ctr; + ptrDec(state.ssp); + auto newTop = getTop(tid); + state.sctr = newTop.ctr; } } else /*if (ndepth)*/ { // TOSR not valid, operate on committed stack DPRINTF(RAS, "in committed range\n"); - if (sctr > 0) { - sctr--; + if (state.sctr > 0) { + state.sctr--; } else { - ptrDec(ssp); - auto newTop = getTop(); - sctr = newTop.ctr; + ptrDec(state.ssp); + auto newTop = getTop(tid); + state.sctr = newTop.ctr; } } //else { @@ -351,12 +369,12 @@ BTBRAS::inflightPtrPlus1(int ptr) { } bool -BTBRAS::inflightInRange(int &ptr) +BTBRAS::inflightInRange(const ThreadRASState &state, int ptr) { - if (TOSW > BOS) { - return ptr >= BOS && ptr < TOSW; - } else if (TOSW < BOS) { - return ptr < TOSW || ptr >= BOS; + if (state.TOSW > state.BOS) { + return ptr >= state.BOS && ptr < state.TOSW; + } else if (state.TOSW < state.BOS) { + return ptr < state.TOSW || ptr >= state.BOS; } else { // empty inflight queue return false; @@ -364,64 +382,79 @@ BTBRAS::inflightInRange(int &ptr) } BTBRAS::RASEssential -BTBRAS::getTop() +BTBRAS::getTop(ThreadID tid) { + auto &state = threadStates[tid]; // results may come from two sources: inflight queue and committed stack - if (inflightInRange(TOSR)) { + if (inflightInRange(state, state.TOSR)) { // result come from inflight queue - DPRINTF(RAS, "Select from inflight, addr %lx\n", inflightStack[TOSR].data.retAddr); + DPRINTF(RAS, "Select from inflight, addr %lx\n", + state.inflightStack[state.TOSR].data.retAddr); // additional check: if nos is out of bound, check if commit stack top == inflight[nos] /* - if (!inflightInRange(inflightStack[TOSR].nos)) { - auto top = stack[nsp]; - if (top.data.retAddr != inflightStack[inflightStack[TOSR].nos].data.retAddr || top.data.ctr != inflightStack[inflightStack[TOSR].nos].data.ctr) { + if (!inflightInRange(state, state.inflightStack[state.TOSR].nos)) { + auto top = state.stack[state.nsp]; + if (top.data.retAddr != + state.inflightStack[ + state.inflightStack[state.TOSR].nos].data.retAddr || + top.data.ctr != + state.inflightStack[ + state.inflightStack[state.TOSR].nos].data.ctr) { // inflight[nos] is not the same as stack[nsp] DPRINTF(RAS, "Error: inflight[nos] is not the same as stack[nsp]\n"); - printStack("Error case stack dump"); + printStack("Error case stack dump", tid); } }*/ - return inflightStack[TOSR].data; + return state.inflightStack[state.TOSR].data; } else { // result come from commit queue - DPRINTF(RAS, "Select from stack, addr %lx\n", stack[ssp].data.retAddr); - return stack[ssp].data; + DPRINTF(RAS, "Select from stack, addr %lx\n", state.stack[state.ssp].data.retAddr); + return state.stack[state.ssp].data; } } BTBRAS::RASEssential -BTBRAS::getTop_meta() { +BTBRAS::getTop_meta(ThreadID tid) { + auto &state = threadStates[tid]; + assert(state.meta); // results may come from two sources: inflight queue and committed stack - if (inflightInRange(TOSR)) { + if (inflightInRange(state, state.TOSR)) { // result come from inflight queue - DPRINTF(RAS, "Select from inflight, addr %lx\n", inflightStack[TOSR].data.retAddr); - meta->ssp = ssp; - meta->sctr = sctr; - meta->TOSR = TOSR; - meta->TOSW = TOSW; - meta->target = inflightStack[TOSR].data.retAddr; + DPRINTF(RAS, "Select from inflight, addr %lx\n", + state.inflightStack[state.TOSR].data.retAddr); + state.meta->ssp = state.ssp; + state.meta->sctr = state.sctr; + state.meta->TOSR = state.TOSR; + state.meta->TOSW = state.TOSW; + state.meta->target = state.inflightStack[state.TOSR].data.retAddr; // additional check: if nos is out of bound, check if commit stack top == inflight[nos] /* - if (!inflightInRange(inflightStack[TOSR].nos)) { - auto top = stack[nsp]; - if (top.data.retAddr != inflightStack[inflightStack[TOSR].nos].data.retAddr || top.data.ctr != inflightStack[inflightStack[TOSR].nos].data.ctr) { + if (!inflightInRange(state, state.inflightStack[state.TOSR].nos)) { + auto top = state.stack[state.nsp]; + if (top.data.retAddr != + state.inflightStack[ + state.inflightStack[state.TOSR].nos].data.retAddr || + top.data.ctr != + state.inflightStack[ + state.inflightStack[state.TOSR].nos].data.ctr) { // inflight[nos] is not the same as stack[nsp] DPRINTF(RAS, "Error: inflight[nos] is not the same as stack[nsp]\n"); - printStack("Error case stack dump"); + printStack("Error case stack dump", tid); } }*/ - return inflightStack[TOSR].data; + return state.inflightStack[state.TOSR].data; } else { // result come from commit queue - meta->ssp = ssp; - meta->sctr = sctr; - meta->TOSR = TOSR; - meta->TOSW = TOSW; - meta->target = stack[ssp].data.retAddr; - DPRINTF(RAS, "Select from stack, addr %lx\n", stack[ssp].data.retAddr); - return stack[ssp].data; + state.meta->ssp = state.ssp; + state.meta->sctr = state.sctr; + state.meta->TOSR = state.TOSR; + state.meta->TOSW = state.TOSW; + state.meta->target = state.stack[state.ssp].data.retAddr; + DPRINTF(RAS, "Select from stack, addr %lx\n", state.stack[state.ssp].data.retAddr); + return state.stack[state.ssp].data; } } diff --git a/src/cpu/pred/btb/ras.hh b/src/cpu/pred/btb/ras.hh index 0055446013..19bb1f0e15 100644 --- a/src/cpu/pred/btb/ras.hh +++ b/src/cpu/pred/btb/ras.hh @@ -94,7 +94,7 @@ namespace btb_pred { void putPCHistory(Addr startAddr, const boost::dynamic_bitset<> &history, std::vector &stagePreds) override; - std::shared_ptr getPredictionMeta() override; + std::shared_ptr getPredictionMeta(ThreadID tid = 0) override; void specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) override; @@ -112,14 +112,28 @@ namespace btb_pred { Addr getTopAddrFromMetas(const FetchTarget &stream); private: + struct ThreadRASState + { + int TOSW = 0; // inflight pointer to the write top of stack + int TOSR = 0; // inflight pointer to the read top of stack + int BOS = 0; // inflight pointer to the bottom of stack + int ssp = 0; // speculative stack pointer + int nsp = 0; // committed stack pointer + int sctr = 0; + std::vector stack; + std::vector inflightStack; + std::shared_ptr meta; + }; - void push(Addr retAddr); + void initThreadState(ThreadRASState &state); - void pop(); + void push(ThreadID tid, Addr retAddr); - void push_stack(Addr retAddr); - - void pop_stack(); + void pop(ThreadID tid); + + void push_stack(ThreadID tid, Addr retAddr); + + void pop_stack(ThreadID tid); void ptrInc(int &ptr); @@ -129,38 +143,43 @@ namespace btb_pred { void inflightPtrDec(int &ptr); - bool inflightInRange(int &ptr); + bool inflightInRange(const ThreadRASState &state, int ptr); int inflightPtrPlus1(int ptr); - void checkCorrectness(); + void checkCorrectness(ThreadID tid); - RASEssential getTop(); + RASEssential getTop(ThreadID tid); - RASEssential getTop_meta(); + RASEssential getTop_meta(ThreadID tid); - void printStack(const char *when) { - DPRINTF(RAS, "printStack when %s: \n", when); + void printStack(const char *when, ThreadID tid) { + auto &state = threadStates[tid]; + DPRINTF(RAS, "[tid:%u] printStack when %s: \n", tid, when); for (int i = 0; i < numEntries; i++) { - DPRINTFR(RAS, "entry [%d], retAddr %#lx, ctr %d", i, stack[i].data.retAddr, stack[i].data.ctr); - if (ssp == i) { + DPRINTFR(RAS, "entry [%d], retAddr %#lx, ctr %d", i, + state.stack[i].data.retAddr, state.stack[i].data.ctr); + if (state.ssp == i) { DPRINTFR(RAS, " <-- SSP"); } - if (nsp == i) { + if (state.nsp == i) { DPRINTFR(RAS, " <-- NSP"); } DPRINTFR(RAS, "\n"); } DPRINTFR(RAS, "non-volatile stack:\n"); for (int i = 0; i < numInflightEntries; i++) { - DPRINTFR(RAS, "entry [%d] retAddr %#lx, ctr %u nos %d", i, inflightStack[i].data.retAddr, inflightStack[i].data.ctr, inflightStack[i].nos); - if (TOSW == i) { + DPRINTFR(RAS, "entry [%d] retAddr %#lx, ctr %u nos %d", i, + state.inflightStack[i].data.retAddr, + state.inflightStack[i].data.ctr, + state.inflightStack[i].nos); + if (state.TOSW == i) { DPRINTFR(RAS, " <-- TOSW"); } - if (TOSR == i) { + if (state.TOSR == i) { DPRINTFR(RAS, " <-- TOSR"); } - if (BOS == i) { + if (state.BOS == i) { DPRINTFR(RAS, " <-- BOS"); } DPRINTFR(RAS, "\n"); @@ -190,27 +209,11 @@ namespace btb_pred { unsigned numInflightEntries; - int TOSW; // inflight pointer to the write top of stack - - int TOSR; // inflight pointer to the read top of stack - - int BOS; // inflight pointer to the bottom of stack - int maxCtr; - int ssp; // spec sp - - int nsp; // non-spec sp - - int sctr; - - //int ndepth; - - std::vector stack; - - std::vector inflightStack; + unsigned numThreads; - std::shared_ptr meta; + std::vector threadStates; #ifdef UNIT_TEST typedef uint64_t Scalar; diff --git a/src/cpu/pred/btb/test/btb_tage.test.cc b/src/cpu/pred/btb/test/btb_tage.test.cc index 6b2eddc1ef..e0b75613fe 100644 --- a/src/cpu/pred/btb/test/btb_tage.test.cc +++ b/src/cpu/pred/btb/test/btb_tage.test.cc @@ -314,7 +314,7 @@ TEST_F(BTBTAGETest, HistoryUpdate) { // Test case 1: Update with taken branch (PHR shifts in 2 bits from PC hash) // Correct order: first update folded histories with pre-update PHR, then mutate PHR - tage->doUpdateHist(history, true, pc, target); + tage->doUpdateHist(history, true, pc, target, 0); applyPathHistoryTaken(history, pc, target); // Verify folded history matches the ideal fold of the updated PHR @@ -322,7 +322,7 @@ TEST_F(BTBTAGETest, HistoryUpdate) { // Test case 2: Update with not-taken branch (PHR unchanged, folded update is no-op) boost::dynamic_bitset<> before_not_taken = history; - tage->doUpdateHist(history, false, pc, target); + tage->doUpdateHist(history, false, pc, target, 0); // Verify folded history remains consistent tage->checkFoldedHist(history, "not-taken update"); @@ -458,9 +458,9 @@ TEST_F(BTBTAGETest, HistoryRecoveryCorrectness) { std::vector originalIndexFoldedHist; for (int i = 0; i < tage->numPredictors; i++) { - originalTagFoldedHist.push_back(tage->tagFoldedHist[i]); - originalAltTagFoldedHist.push_back(tage->altTagFoldedHist[i]); - originalIndexFoldedHist.push_back(tage->indexFoldedHist[i]); + originalTagFoldedHist.push_back(tage->threadHistory[0].tagFoldedHist[i]); + originalAltTagFoldedHist.push_back(tage->threadHistory[0].altTagFoldedHist[i]); + originalIndexFoldedHist.push_back(tage->threadHistory[0].indexFoldedHist[i]); } // Make a prediction @@ -491,9 +491,9 @@ TEST_F(BTBTAGETest, HistoryRecoveryCorrectness) { // Verify recovery produced the expected history for (int i = 0; i < tage->numPredictors; i++) { - tage->tagFoldedHist[i].check(expectedHistory); - tage->altTagFoldedHist[i].check(expectedHistory); - tage->indexFoldedHist[i].check(expectedHistory); + tage->threadHistory[0].tagFoldedHist[i].check(expectedHistory); + tage->threadHistory[0].altTagFoldedHist[i].check(expectedHistory); + tage->threadHistory[0].indexFoldedHist[i].check(expectedHistory); } } diff --git a/src/cpu/pred/btb/timed_base_pred.hh b/src/cpu/pred/btb/timed_base_pred.hh index fce1a6aef1..db611fef25 100644 --- a/src/cpu/pred/btb/timed_base_pred.hh +++ b/src/cpu/pred/btb/timed_base_pred.hh @@ -61,7 +61,10 @@ class TimedBaseBTBPredictor: public SimObject const boost::dynamic_bitset<> &history, std::vector &stagePreds) {} - virtual std::shared_ptr getPredictionMeta() { return nullptr; } + virtual std::shared_ptr getPredictionMeta(ThreadID tid = 0) + { + return nullptr; + } virtual void specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) {} virtual void specUpdatePHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) {} diff --git a/src/cpu/pred/btb/uras.cc b/src/cpu/pred/btb/uras.cc index c507956d0e..53825d818a 100644 --- a/src/cpu/pred/btb/uras.cc +++ b/src/cpu/pred/btb/uras.cc @@ -85,8 +85,9 @@ BTBuRAS::putPCHistory(Addr startAddr, const boost::dynamic_bitset<> &history, } std::shared_ptr -BTBuRAS::getPredictionMeta() +BTBuRAS::getPredictionMeta(ThreadID tid) { + (void)tid; std::shared_ptr meta_void_ptr = std::make_shared(meta); return meta_void_ptr; } diff --git a/src/cpu/pred/btb/uras.hh b/src/cpu/pred/btb/uras.hh index cdcde96b54..4ba12b3099 100644 --- a/src/cpu/pred/btb/uras.hh +++ b/src/cpu/pred/btb/uras.hh @@ -43,7 +43,7 @@ class BTBuRAS : public TimedBaseBTBPredictor void putPCHistory(Addr startAddr, const boost::dynamic_bitset<> &history, std::vector &stagePreds) override; - std::shared_ptr getPredictionMeta() override; + std::shared_ptr getPredictionMeta(ThreadID tid = 0) override; void specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) override; @@ -161,4 +161,4 @@ struct NonSpecRASTrace : public Record { } // namespace branch_prediction } // namespace gem5 -#endif // __CPU_PRED_BTB_URAS_HH__ \ No newline at end of file +#endif // __CPU_PRED_BTB_URAS_HH__ diff --git a/src/cpu/simple/base.cc b/src/cpu/simple/base.cc index fc91c8d2f3..27adf7f598 100644 --- a/src/cpu/simple/base.cc +++ b/src/cpu/simple/base.cc @@ -519,13 +519,14 @@ BaseSimpleCPU::readMiscReg(int misc_reg, ThreadID tid) } void -BaseSimpleCPU::readGem5Regs() +BaseSimpleCPU::readGem5Regs(ThreadID tid) { + auto diffAllStates = this->diffAllStates[tid]; for (int i = 0; i < 32; i++) { diffAllStates->gem5RegFile[i] = - threadContexts[curThread]->getReg(RegId(IntRegClass, i)); + threadContexts[tid]->getReg(RegId(IntRegClass, i)); diffAllStates->gem5RegFile[i + 32] = - threadContexts[curThread]->getReg(RegId(FloatRegClass, i)); + threadContexts[tid]->getReg(RegId(FloatRegClass, i)); } } diff --git a/src/cpu/simple/base.hh b/src/cpu/simple/base.hh index b289ac778f..bcdd7c9066 100644 --- a/src/cpu/simple/base.hh +++ b/src/cpu/simple/base.hh @@ -207,7 +207,7 @@ class BaseSimpleCPU : public BaseCPU RegVal readMiscReg(int misc_reg, ThreadID tid) override; - void readGem5Regs() override; + void readGem5Regs(ThreadID tid) override; }; } // namespace gem5 diff --git a/src/dev/riscv/HartCtrl.py b/src/dev/riscv/HartCtrl.py new file mode 100644 index 0000000000..242c10cccd --- /dev/null +++ b/src/dev/riscv/HartCtrl.py @@ -0,0 +1,13 @@ +from m5.params import * +from m5.proxy import * + +from m5.objects.Device import BasicPioDevice + + +class HartCtrl(BasicPioDevice): + type = 'HartCtrl' + cxx_header = "dev/riscv/hart_ctrl.hh" + cxx_class = 'gem5::HartCtrl' + pio_addr = 0x39001000 + pio_size = Param.Addr(0x1000, "Hart control register space size") + num_threads = Param.Int("Number of threads in the system.") diff --git a/src/dev/riscv/SConscript b/src/dev/riscv/SConscript index 15bf707400..267399e9c0 100755 --- a/src/dev/riscv/SConscript +++ b/src/dev/riscv/SConscript @@ -34,6 +34,7 @@ SimObject('HiFive.py', sim_objects=['HiFive', 'GenericRiscvPciHost'], SimObject('LupV.py', sim_objects=['LupV'], tags='riscv isa') SimObject('Clint.py', sim_objects=['Clint'], tags='riscv isa') SimObject('Lint.py', sim_objects=['Lint'], tags='riscv isa') +SimObject('HartCtrl.py', sim_objects=['HartCtrl'], tags='riscv isa') SimObject('PlicDevice.py', sim_objects=['PlicIntDevice'], tags='riscv isa') SimObject('Plic.py', sim_objects=['Plic'], tags='riscv isa') SimObject('RTC.py', sim_objects=['RiscvRTC'], tags='riscv isa') @@ -55,6 +56,7 @@ Source('hifive.cc', tags='riscv isa') Source('lupv.cc', tags='riscv isa') Source('clint.cc', tags='riscv isa') Source('lint.cc', tags='riscv isa') +Source('hart_ctrl.cc', tags='riscv isa') Source('plic_device.cc', tags='riscv isa') Source('plic.cc', tags='riscv isa') Source('rtc.cc', tags='riscv isa') diff --git a/src/dev/riscv/hart_ctrl.cc b/src/dev/riscv/hart_ctrl.cc new file mode 100644 index 0000000000..b0afe6c8a9 --- /dev/null +++ b/src/dev/riscv/hart_ctrl.cc @@ -0,0 +1,98 @@ +#include "dev/riscv/hart_ctrl.hh" + +#include "cpu/thread_context.hh" +#include "mem/packet_access.hh" +#include "sim/system.hh" + +namespace gem5 +{ + +HartCtrl::HartCtrl(const Params &p) + : BasicPioDevice(p, p.pio_size), + hartResetState(p.num_threads, 1) +{ + if (!hartResetState.empty()) { + // Hart 0 is the boot hart and is considered released by default. + hartResetState[0] = 0; + } +} + +Tick +HartCtrl::read(PacketPtr pkt) +{ + assert(pkt->getAddr() >= pioAddr && pkt->getAddr() < pioAddr + pioSize); + assert(pkt->getSize() > 0 && pkt->getSize() <= sizeof(uint64_t)); + + const Addr offset = pkt->getAddr() - pioAddr; + panic_if(offset % sizeof(uint64_t) != 0, + "HartCtrl only supports 64-bit aligned accesses: addr=%#lx", + pkt->getAddr()); + + const ThreadID tid = offset / sizeof(uint64_t); + panic_if(tid >= hartResetState.size(), + "HartCtrl access out of range: tid=%u addr=%#lx", + tid, pkt->getAddr()); + + pkt->setLE(hartResetState[tid]); + pkt->makeAtomicResponse(); + return pioDelay; +} + +Tick +HartCtrl::write(PacketPtr pkt) +{ + assert(pkt->getAddr() >= pioAddr && pkt->getAddr() < pioAddr + pioSize); + assert(pkt->getSize() > 0 && pkt->getSize() <= sizeof(uint64_t)); + + const Addr offset = pkt->getAddr() - pioAddr; + panic_if(offset % sizeof(uint64_t) != 0, + "HartCtrl only supports 64-bit aligned accesses: addr=%#lx", + pkt->getAddr()); + + const ThreadID tid = offset / sizeof(uint64_t); + panic_if(tid >= hartResetState.size(), + "HartCtrl access out of range: tid=%u addr=%#lx", + tid, pkt->getAddr()); + + uint64_t value = 0; + switch (pkt->getSize()) { + case sizeof(uint8_t): + value = pkt->getLE(); + break; + case sizeof(uint16_t): + value = pkt->getLE(); + break; + case sizeof(uint32_t): + value = pkt->getLE(); + break; + case sizeof(uint64_t): + value = pkt->getLE(); + break; + default: + panic("Unsupported HartCtrl write size %u\n", pkt->getSize()); + } + + hartResetState[tid] = value; + + if (value == 0) { + tryWakeHart(tid); + } + + pkt->makeAtomicResponse(); + return pioDelay; +} + +void +HartCtrl::tryWakeHart(ThreadID tid) +{ + panic_if(tid >= sys->threads.size(), + "HartCtrl wake target %u out of system thread range %zu", + tid, sys->threads.size()); + + auto *tc = sys->threads[tid]; + panic_if(!tc, "HartCtrl target %u has no thread context", tid); + + tc->activate(); +} + +} // namespace gem5 diff --git a/src/dev/riscv/hart_ctrl.hh b/src/dev/riscv/hart_ctrl.hh new file mode 100644 index 0000000000..5fe47306f6 --- /dev/null +++ b/src/dev/riscv/hart_ctrl.hh @@ -0,0 +1,33 @@ +// +// Created for Xiangshan bare-metal hart control MMIO. +// + +#ifndef GEM5_HART_CTRL_HH +#define GEM5_HART_CTRL_HH + +#include + +#include "dev/io_device.hh" +#include "params/HartCtrl.hh" + +namespace gem5 +{ + +class HartCtrl : public BasicPioDevice +{ + public: + typedef HartCtrlParams Params; + explicit HartCtrl(const Params &p); + + Tick read(PacketPtr pkt) override; + Tick write(PacketPtr pkt) override; + + private: + void tryWakeHart(ThreadID tid); + + std::vector hartResetState; +}; + +} // namespace gem5 + +#endif // GEM5_HART_CTRL_HH diff --git a/src/sim/system.cc b/src/sim/system.cc index 7bc4ec37ce..c640334f4d 100644 --- a/src/sim/system.cc +++ b/src/sim/system.cc @@ -562,8 +562,8 @@ void System::initState() } // have to initiate golden memory after checkpoint restored - if (numCPUs > 1 && enableDifftest) { - warn("Creating golden memory for multi-core difftest\n"); + if (multiContextDifftest()) { + warn("Creating golden memory for multi-context difftest\n"); assert(enableMemDedup); goldenMem = dedupMemManager.createCopyOnWriteBranch(); goldenMemManager.initGoldenMem(physmem.getStartaddr(), memSize(), goldenMem); diff --git a/src/sim/system.hh b/src/sim/system.hh index db49b66926..1dca935d6e 100644 --- a/src/sim/system.hh +++ b/src/sim/system.hh @@ -416,6 +416,11 @@ class System : public SimObject, public PCEventScope bool multiCore() const { return numCPUs > 1; } + bool multiContextDifftest() const + { + return enableDifftest && (multiCore() || multiThread); + } + uint8_t *getGoldenMemPtr() const { return goldenMem; } GoldenGloablMem *getGoldenMemManager() { return &goldenMemManager; }