diff --git a/.gitignore b/.gitignore
index 6825061db1..43f20a137b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -69,4 +69,10 @@ AGENTS.md
 
 microbench/build/
 microbench/output/
-microbench/dramsim3*
\ No newline at end of file
+microbench/dramsim3*
+
+*.bin
+*.db
+*.log
+*.gz
+*.zstd
diff --git a/configs/common/FSConfig.py b/configs/common/FSConfig.py
index dc66ed7833..d650b82f70 100644
--- a/configs/common/FSConfig.py
+++ b/configs/common/FSConfig.py
@@ -657,18 +657,23 @@ def makeBareMetalRiscvSystem(mem_mode, mdesc=None, cmdline=None):
     self.system_port = self.membus.cpu_side_ports
     return self
 
-def makeBareMetalXiangshanSystem(mem_mode, mdesc=None, cmdline=None, np=1, ruby=False):
-    self = makeXiangshanPlatformSystem(mem_mode, mdesc, np=np, ruby=ruby)
+def makeBareMetalXiangshanSystem(mem_mode, mdesc=None, cmdline=None, np=1,
+                                 ruby=False, num_threads=None):  # cmdline is accepted but not used in this body
+    self = makeXiangshanPlatformSystem(mem_mode, mdesc, np=np, ruby=ruby,
+                                       num_threads=num_threads)  # None is passed through; the callee substitutes np
     self.workload = RiscvBareMetal()
     self.workload.reset_vect = 0x80000000
     return self
 
 
-def makeXiangshanPlatformSystem(mem_mode, mdesc=None, np=1, ruby=False):
+def makeXiangshanPlatformSystem(mem_mode, mdesc=None, np=1, ruby=False,
+                                num_threads=None):
     self = System()
     if not mdesc:
         # generic system
         mdesc = SysConfig()
+    if num_threads is None:
+        num_threads = np
     self.mem_mode = mem_mode
     self.mem_ranges = [AddrRange(start=0x80000000, size=mdesc.mem())]
     print(self.mem_ranges)
@@ -687,7 +692,11 @@ def makeXiangshanPlatformSystem(mem_mode, mdesc=None, np=1, ruby=False):
     self.lint = Clint()
     self.lint.pio = self.iobus.mem_side_ports
     self.lint.pio_addr = 0x38000000
-    self.lint.num_threads = np
+    self.lint.num_threads = num_threads
+
+    self.hartctrl = HartCtrl()
+    self.hartctrl.pio = self.iobus.mem_side_ports
+    self.hartctrl.num_threads = num_threads
 
     self.mmcs = NemuMMC()
     self.mmcs.pio = self.iobus.mem_side_ports
@@ -700,6 +709,7 @@ def makeXiangshanPlatformSystem(mem_mode, mdesc=None, np=1, ruby=False):
             AddrRange(self.uartlite.pio_addr, self.uartlite.pio_addr +
             self.uartlite.pio_size),
             AddrRange(self.lint.pio_addr, self.lint.pio_addr + self.lint.pio_size),
+            AddrRange(self.hartctrl.pio_addr, self.hartctrl.pio_addr + self.hartctrl.pio_size),
             AddrRange(self.mmcs.pio_addr, self.mmcs.pio_addr + self.mmcs.pio_size),
             AddrRange(self.plic.pio_addr, self.plic.pio_addr + self.plic.pio_size),
             ]
diff --git a/configs/common/Options.py b/configs/common/Options.py
index 937bdecac4..b9c89ed25c 100644
--- a/configs/common/Options.py
+++ b/configs/common/Options.py
@@ -344,16 +344,14 @@ def addCommonOptions(parser, configure_xiangshan=False):
         "that are present under any of the roots. If not given, dump all "
         "stats. ")
 
+    parser.add_argument("--smt", action="store_true", default=False,  # registered before the XiangShan early return below
+                        help="""RISC-V SMT support, which requires a multi-thread-capable gcpt restorer and diff-ref-so""")
+
     if configure_xiangshan:
         return
     # Following options are not available in XiangShan
 
     parser.add_argument("--checker", action="store_true")
-    parser.add_argument("--smt", action="store_true", default=False,
-                        help="""
-                      Only used if multiple programs are specified. If true,
-                      then the number of threads per cpu is same as the
-                      number of programs.""")
     parser.add_argument(
         "--elastic-trace-en", action="store_true",
         help="""Enable capture of data dependency and instruction
diff --git a/configs/common/xiangshan.py b/configs/common/xiangshan.py
index ed78ebc922..e05644c05e 100644
--- a/configs/common/xiangshan.py
+++ b/configs/common/xiangshan.py
@@ -290,7 +290,7 @@ def resolve_xiangshan_ref_so(args: argparse.Namespace):
     if args.difftest_ref_so is not None:
         ref_so = args.difftest_ref_so
         print("Obtained ref_so from args.difftest_ref_so: ", ref_so)
-    elif args.num_cpus > 1 and "GCBV_MULTI_CORE_REF_SO" in os.environ:
+    elif (args.num_cpus > 1 or args.smt) and "GCBV_MULTI_CORE_REF_SO" in os.environ:
         ref_so = os.environ["GCBV_MULTI_CORE_REF_SO"]
         print("Obtained ref_so from GCBV_MULTI_CORE_REF_SO: ", ref_so)
     elif "GCBV_REF_SO" in os.environ:
@@ -330,12 +330,12 @@ def config_xiangshan_inputs(args: argparse.Namespace, sys):
         if args.raw_cpt:
             # If using raw binary, no restorer is needed.
             gcpt_restorer = None
-        elif args.num_cpus > 1:
+        elif args.num_cpus > 1 or args.smt:
             if "GCB_MULTI_CORE_RESTORER" in os.environ:
                 gcpt_restorer = os.environ["GCB_MULTI_CORE_RESTORER"]
                 print("Obtained gcpt_restorer from GCB_MULTI_CORE_RESTORER: ", gcpt_restorer)
             else:
-                fatal("Plz set $GCB_MULTI_CORE_RESTORER when model Xiangshan with multi-core")
+                fatal("Plz set $GCB_MULTI_CORE_RESTORER when model Xiangshan with multi-context difftest")
         elif args.restore_rvv_cpt:
             if "GCBV_RESTORER" in os.environ:
                 gcpt_restorer = os.environ["GCBV_RESTORER"]
@@ -359,8 +359,8 @@ def config_xiangshan_inputs(args: argparse.Namespace, sys):
         print("Obtained gcpt_restorer from args.gcpt_restorer: ", args.gcpt_restorer)
         gcpt_restorer = args.gcpt_restorer
 
-    if args.num_cpus > 1:
-        print("Simulating a multi-core system, demanding a larger GCPT restorer size (2M).")
+    if args.num_cpus > 1 or args.smt:
+        print("Simulating a multi-context system, demanding a larger GCPT restorer size (2M).")
         sys.gcpt_restorer_size_limit = 2**20
     elif args.restore_rvv_cpt:
         print("Simulating single core with RVV, demanding GCPT restorer size of 0x1000.")
@@ -407,7 +407,7 @@ def config_difftest(cpu_list, args, sys):
     if not args.enable_difftest:
         return
     else:
-        if len(cpu_list) > 1:
+        if len(cpu_list) > 1 or args.smt:
             sys.enable_mem_dedup = True
             for cpu in cpu_list:
                 cpu.enable_mem_dedup = True
@@ -443,7 +443,12 @@ def _finish_xiangshan_system(args, test_sys, TestCPUClass, ruby):
     test_sys.cpu = [TestCPUClass(clk_domain=test_sys.cpu_clk_domain, cpu_id=i)
                     for i in range(np)]
     # Configure MMU for trace-aware FS mode
+    if args.smt:
+        test_sys.multi_thread = True
+
     for cpu in test_sys.cpu:
+        if args.smt:
+            cpu.numThreads = 2
         cpu.mmu.pma_checker = PMAChecker(
             uncacheable=[AddrRange(0, size=0x80000000)])
         cpu.mmu.functional = args.functional_tlb
@@ -802,8 +807,11 @@ def build_xiangshan_system(args):
 
     TestCPUClass = get_xiangshan_cpu_class(args)
     ruby = bool(hasattr(args, 'ruby') and args.ruby)
+    num_threads = np * (2 if getattr(args, 'smt', False) else 1)
 
-    test_sys = makeBareMetalXiangshanSystem('timing', SysConfig(mem=args.mem_size), None, np=np, ruby=ruby)
+    test_sys = makeBareMetalXiangshanSystem(
+        'timing', SysConfig(mem=args.mem_size), None, np=np, ruby=ruby,
+        num_threads=num_threads)
 
     if hasattr(args, 'enable_trace_mode') and args.enable_trace_mode:
         if bool(getattr(args, 'trace_timing_ptw', False)):
diff --git a/src/arch/riscv/isa/vector/base/vector_mem.temp.isa b/src/arch/riscv/isa/vector/base/vector_mem.temp.isa
index e97eef0940..2448a9ad95 100644
--- a/src/arch/riscv/isa/vector/base/vector_mem.temp.isa
+++ b/src/arch/riscv/isa/vector/base/vector_mem.temp.isa
@@ -1,5 +1,24 @@
 output header {{
 
+#define FILL_AGNOSTIC_ELEM(Vd_bytes, elem_idx, eewb_) \
+    std::fill_n((Vd_bytes) + (elem_idx) * (eewb_), (eewb_), 0xff)  // set every byte of element elem_idx (eewb_ bytes wide) to 0xff
+
+#define APPLY_VLOAD_AGNOSTIC(Vd_bytes, elem_num_per_vreg_, eewb_)            \
+    do {                                                                     \
+        for (size_t _i = 0; _i < vmi.re - vmi.rs; ++_i) {                    \
+            const uint32_t _vdElemIdx =                                      \
+                (vmi.rs % (elem_num_per_vreg_)) + _i;                        \
+            const size_t _ei = _i + vmi.rs;                                  \
+            const bool _is_tail = _ei >= rVl;                                \
+            const bool _is_masked = !this->vm && !_is_tail &&                \
+                !elem_mask(v0, _ei);                                         \
+            if ((_is_tail && machInst.vtype8.vta) ||                         \
+                (_is_masked && machInst.vtype8.vma)) {                       \
+                FILL_AGNOSTIC_ELEM((Vd_bytes), _vdElemIdx, (eewb_));         \
+            }                                                                \
+        }                                                                    \
+    } while (0)  // expands against vmi, rVl, v0, machInst and this->vm, which must be in scope at the use site
+
+
 inline uint32_t
 calc_memsize(uint32_t rs, uint32_t re, uint32_t sew, uint32_t rVl) {
     uint32_t vend = std::min(rVl, re);
@@ -147,6 +166,7 @@ Fault
 {
     %(op_decl)s;
     %(op_rd)s;
+    auto VdBytes = tmp_d0.as<uint8_t>();
 
     Addr EA;
     // EA = Rs1 + vmi.offset;
@@ -172,6 +192,8 @@ Fault
         %(memacc_code)s;
     }
 
+    APPLY_VLOAD_AGNOSTIC(VdBytes, elem_num_per_vreg, eew / 8);
+
     %(op_wb)s;
     return fault;
 }
@@ -261,6 +283,7 @@ Fault
 
     %(op_decl)s;
     %(op_rd)s;
+    auto VdBytes = tmp_d0.as<uint8_t>();
 
 #if %(is_vecWhole)s
     // VM_REQUIRED();
@@ -299,6 +322,11 @@ Fault
         }
     }
 
+#if %(is_vecWhole)s
+#else
+    APPLY_VLOAD_AGNOSTIC(VdBytes, elem_num_per_vreg, eewb);
+#endif
+
     %(vfof_get_code)s;
     %(op_wb)s;
     return NoFault;
diff --git a/src/arch/riscv/isa/vector/simple/vector_mem.temp.isa b/src/arch/riscv/isa/vector/simple/vector_mem.temp.isa
index a8e5b71f99..4b64f5dac0 100644
--- a/src/arch/riscv/isa/vector/simple/vector_mem.temp.isa
+++ b/src/arch/riscv/isa/vector/simple/vector_mem.temp.isa
@@ -1,5 +1,24 @@
 output header {{
 
+#define FILL_AGNOSTIC_ELEM(Vd_bytes, elem_idx, eewb_) \
+    std::fill_n((Vd_bytes) + (elem_idx) * (eewb_), (eewb_), 0xff)  // set every byte of element elem_idx (eewb_ bytes wide) to 0xff
+
+#define APPLY_VLOAD_AGNOSTIC(Vd_bytes, elem_num_per_vreg_, eewb_)            \
+    do {                                                                     \
+        for (size_t _i = 0; _i < vmi.re - vmi.rs; ++_i) {                    \
+            const uint32_t _vdElemIdx =                                      \
+                (vmi.rs % (elem_num_per_vreg_)) + _i;                        \
+            const size_t _ei = _i + vmi.rs;                                  \
+            const bool _is_tail = _ei >= rVl;                                \
+            const bool _is_masked = !this->vm && !_is_tail &&                \
+                !elem_mask(v0, _ei);                                         \
+            if ((_is_tail && machInst.vtype8.vta) ||                         \
+                (_is_masked && machInst.vtype8.vma)) {                       \
+                FILL_AGNOSTIC_ELEM((Vd_bytes), _vdElemIdx, (eewb_));         \
+            }                                                                \
+        }                                                                    \
+    } while (0)  // expands against vmi, rVl, v0, machInst and this->vm, which must be in scope at the use site
+
+
 inline uint32_t
 calc_memsize(uint32_t rs, uint32_t re, uint32_t sew, uint32_t rVl) {
     uint32_t vend = std::min(rVl, re);
@@ -147,6 +166,7 @@ Fault
 {
     %(op_decl)s;
     %(op_rd)s;
+    auto VdBytes = tmp_d0.as<uint8_t>();
 
     Addr EA;
     // EA = Rs1 + vmi.offset;
@@ -172,6 +192,8 @@ Fault
         %(memacc_code)s;
     }
 
+    APPLY_VLOAD_AGNOSTIC(VdBytes, elem_num_per_vreg, eew / 8);
+
     %(op_wb)s;
     return fault;
 }
@@ -261,6 +283,7 @@ Fault
 
     %(op_decl)s;
     %(op_rd)s;
+    auto VdBytes = tmp_d0.as<uint8_t>();
 
 #if %(is_vecWhole)s
     // VM_REQUIRED();
@@ -299,6 +322,11 @@ Fault
         }
     }
 
+#if %(is_vecWhole)s
+#else
+    APPLY_VLOAD_AGNOSTIC(VdBytes, elem_num_per_vreg, eewb);
+#endif
+
     %(vfof_get_code)s;
     %(op_wb)s;
     return NoFault;
diff --git a/src/arch/riscv/tlb.cc b/src/arch/riscv/tlb.cc
index 07b883fc30..b3f150ede9 100644
--- a/src/arch/riscv/tlb.cc
+++ b/src/arch/riscv/tlb.cc
@@ -2114,7 +2114,6 @@ TLB::doTranslate(const RequestPtr &req, ThreadContext *tc,
 
     return NoFault;
 }
-
 PrivilegeMode
 TLB::getMemPriv(ThreadContext *tc, BaseMMU::Mode mode)
 {
diff --git a/src/cpu/base.cc b/src/cpu/base.cc
index 63c0e7964a..264e17bf4d 100644
--- a/src/cpu/base.cc
+++ b/src/cpu/base.cc
@@ -43,6 +43,7 @@
 
 #include "cpu/base.hh"
 
+#include <algorithm>
 #include <iostream>
 #include <sstream>
 #include <string>
@@ -208,40 +209,52 @@ BaseCPU::BaseCPU(const Params &p, bool is_checker)
               "of threads (%i).\n", params().isa.size(), numThreads);
     }
 
-    diffAllStates = std::make_shared<DiffAllStates>();
+    diffAllStates.resize(numThreads);
+    recentCommittedStores.resize(numThreads);
+    syncVisibleStoreReplayArmed.resize(numThreads, false);
     if (enableDifftest) {
         assert(params().difftest_ref_so.length() > 2);
-        diffAllStates->diff.nemu_reg = &(diffAllStates->referenceRegFile);
-        diffAllStates->diff.nemu_this_pc = 0x80000000u;
-        diffAllStates->diff.cpu_id = params().cpu_id;
-        warn("cpu_id set to %d\n", params().cpu_id);
-
-        if (params().difftest_ref_so.find("spike") != std::string::npos) {
-            assert(!system->multiCore());
-            diffAllStates->proxy = new SpikeProxy(
-                params().cpu_id, params().difftest_ref_so.c_str(),
-                params().nemuSDimg.size() && params().nemuSDCptBin.size());
-        } else {
-            diffAllStates->proxy =
-                new NemuProxy(params().cpu_id, params().difftest_ref_so.c_str(),
-                              params().nemuSDimg.size() && params().nemuSDCptBin.size(), system->enabledMemDedup(),
-                              system->multiCore());
-        }
+        for (ThreadID tid = 0; tid < numThreads; ++tid) {
+            diffAllStates[tid] = std::make_shared<DiffAllStates>();
+            auto diff_state = diffAllStates[tid];
+            diff_state->diff.nemu_reg = &(diff_state->referenceRegFile);
+            diff_state->diff.nemu_this_pc = 0x80000000u;
+            diff_state->diff.cpu_id = difftestHartId(tid);
+            warn("difftest hart id set to %d for tid %d\n",
+                 diff_state->diff.cpu_id, tid);
+
+            if (params().difftest_ref_so.find("spike") != std::string::npos) {  // reference .so path names spike
+                assert(!system->multiContextDifftest());  // the Spike proxy is only built when multi-context difftest is off
+                diff_state->proxy = new SpikeProxy(
+                    params().cpu_id, params().difftest_ref_so.c_str(),
+                    params().nemuSDimg.size() && params().nemuSDCptBin.size());
+            } else {
+                diff_state->proxy =  // NOTE(review): built with params().cpu_id while diff.cpu_id and initState() use difftestHartId(tid) - confirm for numThreads > 1
+                    new NemuProxy(params().cpu_id, params().difftest_ref_so.c_str(),
+                                  params().nemuSDimg.size() && params().nemuSDCptBin.size(),
+                                  system->enabledMemDedup(),
+                                  system->multiContextDifftest());
+            }
 
-        warn("Difftest is enabled with ref so: %s.\n", params().difftest_ref_so.c_str());
+            warn("Difftest is enabled with ref so: %s.\n",
+                 params().difftest_ref_so.c_str());
 
-        diffAllStates->proxy->regcpy(&(diffAllStates->gem5RegFile), REF_TO_DUT);
-        diffAllStates->diff.dynamic_config.ignore_illegal_mem_access = false;
-        diffAllStates->diff.dynamic_config.debug_difftest = false;
-        diffAllStates->proxy->update_config(&diffAllStates->diff.dynamic_config);
-        if (params().nemuSDimg.size() && params().nemuSDCptBin.size()) {
-            diffAllStates->proxy->sdcard_init(params().nemuSDimg.c_str(),
-                               params().nemuSDCptBin.c_str());
+            diff_state->proxy->regcpy(&(diff_state->gem5RegFile), REF_TO_DUT);
+            diff_state->diff.dynamic_config.ignore_illegal_mem_access = false;
+            diff_state->diff.dynamic_config.debug_difftest = false;
+            diff_state->proxy->update_config(&diff_state->diff.dynamic_config);
+            if (params().nemuSDimg.size() && params().nemuSDCptBin.size()) {
+                diff_state->proxy->sdcard_init(params().nemuSDimg.c_str(),
+                                   params().nemuSDCptBin.c_str());
+            }
+            diff_state->diff.will_handle_intr = false;
         }
-        diffAllStates->diff.will_handle_intr = false;
     } else {
         warn("Difftest is disabled\n");
-        diffAllStates->hasCommit = true;
+        for (ThreadID tid = 0; tid < numThreads; ++tid) {
+            diffAllStates[tid] = std::make_shared<DiffAllStates>();
+            diffAllStates[tid]->hasCommit = true;
+        }
     }
 
     if (dumpCommitFlag) {
@@ -404,11 +417,14 @@ BaseCPU::startup()
     if (powerState->get() == enums::PwrState::UNDEFINED)
         powerState->set(enums::PwrState::ON);
 
-    if (system->multiCore()) {
+    if (system->multiContextDifftest()) {
         goldenMemPtr = system->getGoldenMemPtr();
         _goldenMemManager = system->getGoldenMemManager();
 
-        diffAllStates->proxy->initState(params().cpu_id, goldenMemPtr);
+        for (ThreadID tid = 0; tid < numThreads; ++tid) {
+            diffAllStates[tid]->proxy->initState(difftestHartId(tid),
+                                                 goldenMemPtr);
+        }
     } else {
         goldenMemPtr = nullptr;
         _goldenMemManager = nullptr;
@@ -417,6 +433,33 @@ BaseCPU::startup()
 
 }
 
+void
+BaseCPU::recordCommittedStore(ThreadID tid, const o3::DynInstPtr &inst)
+{  // Append a snapshot of a committed store to thread tid's bounded history
+    RecentCommittedStore recent;  // declared before the guard so sizeof(recent.data) can bound effSize
+
+    if (!system->multiContextDifftest() || !_goldenMemManager ||
+        !inst->isStore() || inst->isAtomic() ||
+        (inst->isStoreConditional() && !inst->lockedWriteSuccess()) ||  // a failed store-conditional is not recorded
+        !inst->memData || inst->effSize == 0 ||
+        inst->effSize > sizeof(recent.data) ||  // snapshot buffer is fixed-size; larger stores are skipped
+        !_goldenMemManager->inPmem(inst->physEffAddr)) {
+        return;  // nothing is recorded for instructions that fail any check above
+    }
+
+    auto &recent_history = recentCommittedStores.at(tid);
+    recent.valid = true;
+    recent.addr = inst->physEffAddr;
+    recent.size = inst->effSize;
+    recent.seq = inst->seqNum;
+    std::memcpy(recent.data, inst->memData, recent.size);  // size was bounded by sizeof(recent.data) above
+    recent_history.push_back(recent);
+    constexpr size_t max_store_history = 16;
+    if (recent_history.size() > max_store_history) {
+        recent_history.pop_front();  // drop the oldest entry so at most max_store_history remain
+    }
+}
+
 probing::PMUUPtr
 BaseCPU::pmuProbePoint(const char *name)
 {
@@ -702,7 +745,7 @@ BaseCPU::takeOverFrom(BaseCPU *oldCPU)
     if (enable_diff) {
         warn("Take over difftest state to new CPU\n");
         enableDifftest = enable_diff;
-        takeOverDiffAllStates(diff_all);
+        takeOverDiffAllStates(std::move(diff_all));
     }
 }
 
@@ -865,6 +908,12 @@ BaseCPU::GlobalStats::GlobalStats(statistics::Group *parent)
     hostOpRate = simOps / hostSeconds;
 }
 
+int
+BaseCPU::difftestHartId(ThreadID tid) const
+{
+    return params().cpu_id * numThreads + tid;  // core-major numbering: equals cpu_id when numThreads == 1
+}
+
 void
 BaseCPU::csrDiffMessage(uint64_t gem5_val, uint64_t ref_val, int error_num, uint64_t &error_reg, InstSeqNum seq,
                         std::string error_csr_name, int &diff_at)
@@ -883,6 +932,8 @@ BaseCPU::csrDiffMessage(uint64_t gem5_val, uint64_t ref_val, int error_num, uint
 std::pair<int, bool>
 BaseCPU::diffWithNEMU(ThreadID tid, InstSeqNum seq)
 {
+    auto diffAllStates = this->diffAllStates[tid];
+
     int diff_at = DiffAt::NoneDiff;
     bool npc_match = false;
     bool is_mmio = diffInfo.curInstStrictOrdered;
@@ -966,7 +1017,7 @@ BaseCPU::diffWithNEMU(ThreadID tid, InstSeqNum seq)
 
     if (enableRVV) {
         if (diffInfo.inst->isVector()) {
-            readGem5Regs();
+            readGem5Regs(tid);
             uint64_t* nemu_val = (uint64_t*)&(diffAllStates->referenceRegFile.vr[0]);
             uint64_t* gem5_val = (uint64_t*)&(diffAllStates->gem5RegFile.vr[0]);
             bool maybe_error = false;
@@ -1431,35 +1482,104 @@ BaseCPU::diffWithNEMU(ThreadID tid, InstSeqNum seq)
                                         diffInfo.physEffAddr, diffInfo.effSize);
                 }
 
-                if (system->multiCore() && (diffInfo.inst->isLoad() || diffInfo.inst->isAtomic()) &&
+                if (system->multiContextDifftest() &&
+                    (diffInfo.inst->isLoad() || diffInfo.inst->isAtomic()) &&
                     _goldenMemManager->inPmem(diffInfo.physEffAddr)) {
-                    warn("Difference on %s instr found in multicore mode, check in golden memory\n",
-                         diffInfo.inst->isLoad() ? "load" : "amo");
-                    uint8_t *golden_ptr = diffInfo.goldenValue;
+                    DPRINTF(Diff,
+                            "Difference on %s instr found in multicore mode, "
+                            "check in golden memory\n",
+                            diffInfo.inst->isLoad() ? "load" : "amo");
+                    uint8_t current_golden_data[16] = {};
+                    panic_if(diffInfo.effSize > sizeof(current_golden_data),
+                             "Unexpected large mem diff size: %u\n",
+                             diffInfo.effSize);
+                    _goldenMemManager->readGoldenMem(diffInfo.physEffAddr,
+                                                     current_golden_data,
+                                                     diffInfo.effSize);
+                    uint8_t *golden_ptr = current_golden_data;
+                    uint8_t *exec_golden_ptr = diffInfo.goldenValue;
+                    const RecentCommittedStore *matched_recent_store = nullptr;
+                    if (diffInfo.inst->isLoad()) {
+                        const auto &recent_history =
+                            recentCommittedStores.at(tid);
+                        for (auto it = recent_history.rbegin();
+                             it != recent_history.rend(); ++it) {
+                            if (!it->valid ||
+                                it->addr != diffInfo.physEffAddr ||
+                                it->size != diffInfo.effSize ||
+                                it->seq >= seq ||
+                                (seq - it->seq) > 256) {
+                                continue;
+                            }
+                            if (memcmp(it->data, &gem5_val,
+                                       diffInfo.effSize) == 0) {
+                                matched_recent_store = &(*it);
+                                break;
+                            }
+                        }
+                    }
+                    auto sync_reg = [&]() {
+                        diffAllStates->referenceRegFile[dest_tag] = gem5_val;
+                        diffAllStates->proxy->regcpy(
+                            &(diffAllStates->referenceRegFile), DUT_TO_REF);
+                    };
 
-                    // a lambda function to sync memory and register from golden results to ref
-                    auto sync_mem_reg = [&]() {
-                        diffAllStates->proxy->memcpy(diffInfo.physEffAddr, golden_ptr, diffInfo.effSize,
+                    // Sync both memory and register when the value is already
+                    // globally visible in golden memory.
+                    auto sync_mem_reg = [&](const uint8_t *mem_src) {
+                        diffAllStates->proxy->memcpy(diffInfo.physEffAddr,
+                                                     const_cast<uint8_t *>(mem_src),
+                                                     diffInfo.effSize,
                                                      DIFFTEST_TO_REF);
-                        diffAllStates->referenceRegFile[dest_tag] = gem5_val;
-                        diffAllStates->proxy->regcpy(&(diffAllStates->referenceRegFile), DUT_TO_REF);
+                        sync_reg();
                     };
 
-                    if (diffInfo.inst->isLoad() && memcmp(golden_ptr, &gem5_val, diffInfo.effSize) == 0) {
-                        DPRINTF(Diff, "Load content matched in golden memory. Sync from golden to ref\n");
-                        sync_mem_reg();
+                    if (diffInfo.inst->isLoad() &&
+                               memcmp(golden_ptr, &gem5_val,
+                                      diffInfo.effSize) == 0) {
+                        DPRINTF(Diff,
+                                "Load content matched in golden memory. "
+                                "Sync from golden to ref\n");
+                        sync_mem_reg(golden_ptr);
+                        continue;
+                    } else if (diffInfo.inst->isLoad() && exec_golden_ptr &&
+                               memcmp(exec_golden_ptr, &gem5_val,
+                                      diffInfo.effSize) == 0) {
+                        DPRINTF(Diff,
+                                "Load content matched the execution-time "
+                                "golden snapshot. Sync from the recorded "
+                                "snapshot to ref\n");
+                        sync_mem_reg(exec_golden_ptr);
+                        continue;
+                    } else if (matched_recent_store) {
+                        DPRINTF(Diff,
+                                "Load content matched recent committed store "
+                                "[sn:%llu] at addr %#lx. Syncing ref from the "
+                                "store snapshot for this hart.\n",
+                                matched_recent_store->seq,
+                                diffInfo.physEffAddr);
+                        sync_mem_reg(matched_recent_store->data);
                         continue;
                     } else if (diffInfo.inst->isAtomic()) {
                         DPRINTF(Diff, "Golden mem old value: %#lx, GEM5 old value: %#lx\n", diffInfo.amoOldGoldenValue,
                                 gem5_val);
                         DPRINTF(Diff, "New golden value: %#lx\n", *(uint64_t *)golden_ptr);
-                        if (memcmp(&diffInfo.amoOldGoldenValue, &gem5_val, diffInfo.effSize) == 0) {
+                        if (memcmp(&diffInfo.amoOldGoldenValue, &gem5_val,
+                                   diffInfo.effSize) == 0) {
                             DPRINTF(Diff, "Atomic encountered, old value matched. Sync from golden to ref\n");
-                            sync_mem_reg();
+                            sync_mem_reg(golden_ptr);
                             continue;
-                        } else {
-                            warn("Atomic old value not matched!\n");
                         }
+                    } else if (diffInfo.inst->isLoad()) {
+                        DPRINTF(Diff,
+                                "Unresolved shared-memory load mismatch at "
+                                "addr=%#lx gem5=%#lx current_golden=%#lx "
+                                "exec_snapshot=%#lx; falling back to normal "
+                                "difftest reporting.\n",
+                                diffInfo.physEffAddr, gem5_val,
+                                *(uint64_t *)golden_ptr,
+                                exec_golden_ptr ?
+                                    *(uint64_t *)exec_golden_ptr : 0);
                     }
                 }
 
@@ -1517,9 +1637,10 @@ BaseCPU::clearDiffMismatch(ThreadID tid, InstSeqNum seq) {
 void
 BaseCPU::reportDiffMismatch(ThreadID tid, InstSeqNum seq)
 {
+    auto diffAllStates = this->diffAllStates[tid];
     warn("%s", diffMsg.str());
     diffAllStates->proxy->isa_reg_display();
-    displayGem5Regs();
+    displayGem5Regs(tid);
     warn("start dump last %lu committed msg\n", diffInfo.lastCommittedMsg.size());
     while (diffInfo.lastCommittedMsg.size()) {
         auto &inst = diffInfo.lastCommittedMsg.front();
@@ -1531,6 +1652,8 @@ BaseCPU::reportDiffMismatch(ThreadID tid, InstSeqNum seq)
 void
 BaseCPU::difftestStep(ThreadID tid, InstSeqNum seq)
 {
+    auto diffAllStates = this->diffAllStates[tid];
+
     bool should_diff = false;
     DPRINTF(DumpCommit, "[sn:%llu] %#lx, %s\n",
             seq, diffInfo.pc->instAddr(), diffInfo.inst->disassemble(diffInfo.pc->instAddr()));
@@ -1550,22 +1673,26 @@ BaseCPU::difftestStep(ThreadID tid, InstSeqNum seq)
         should_diff = true;
         if (!diffAllStates->hasCommit && diffInfo.pc->instAddr() == 0x80000000u) {
             diffAllStates->hasCommit = true;
-            readGem5Regs();
+            readGem5Regs(tid);
             diffAllStates->gem5RegFile.pc = diffInfo.pc->instAddr();
             if (noHypeMode) {
-                auto start = pmemStart + pmemSize * diffAllStates->diff.cpu_id;
-                warn("Start memcpy to NEMU from %#lx, size=%lu\n", (uint64_t)start, pmemSize);
+                auto start = pmemStart + pmemSize * difftestHartId(tid);
                 diffAllStates->proxy->memcpy(0x80000000u, start, pmemSize, DUT_TO_REF);
             } else if (enableMemDedup) {
-                warn("Let ref share a COW mirror of root memory\n");
-                assert(diffAllStates->proxy->ref_get_backed_memory);
-                diffAllStates->proxy->ref_get_backed_memory(system->createCopyOnWriteBranch(), pmemSize);
+                if (system->multiContextDifftest()) {
+                    assert(goldenMemPtr);
+                    assert(diffAllStates->proxy->ref_get_backed_memory);
+                    diffAllStates->proxy->ref_get_backed_memory(
+                        system->createCopyOnWriteBranch(), pmemSize);
+                    diffAllStates->proxy->memcpy_init(
+                        0x80000000u, goldenMemPtr, pmemSize, DUT_TO_REF);
+                } else {
+                    assert(diffAllStates->proxy->ref_get_backed_memory);
+                    diffAllStates->proxy->ref_get_backed_memory(system->createCopyOnWriteBranch(), pmemSize);
+                }
             } else {
-                warn("MemDedup disabled, copying pmem to NEMU\n");
-                warn("Start memcpy to NEMU from %#lx, size=%lu\n", (uint64_t)pmemStart, pmemSize);
                 diffAllStates->proxy->memcpy_init(0x80000000u, pmemStart, pmemSize, DUT_TO_REF);
             }
-            warn("Start regcpy to NEMU\n");
             diffAllStates->proxy->regcpy(&(diffAllStates->gem5RegFile), DUT_TO_REF);
         }
     }
@@ -1603,9 +1730,10 @@ BaseCPU::difftestStep(ThreadID tid, InstSeqNum seq)
 }
 
 void
-BaseCPU::displayGem5Regs()
+BaseCPU::displayGem5Regs(ThreadID tid)
 {
-    readGem5Regs();
+    auto diffAllStates = this->diffAllStates[tid];
+    readGem5Regs(tid);
     std::string str;
     //reg
     for (size_t i = 0; i < 32; i++)
@@ -1712,8 +1840,9 @@ BaseCPU::displayGem5Regs()
 }
 
 void
-BaseCPU::difftestRaiseIntr(uint64_t no)
+BaseCPU::difftestRaiseIntr(uint64_t no, ThreadID tid)
 {
+    const auto &diffAllStates = this->diffAllStates.at(tid);  // bounds-checked per-thread state; reference avoids a shared_ptr copy
     diffAllStates->diff.will_handle_intr = true;
     diffAllStates->proxy->raise_intr(no);
 }
@@ -1721,19 +1850,24 @@ BaseCPU::difftestRaiseIntr(uint64_t no)
 void
 BaseCPU::clearGuideExecInfo()
 {
-    diffAllStates->diff.guide.force_raise_exception = false;
-    diffAllStates->diff.guide.force_set_jump_target = false;
+    for (auto &diffAllStates : this->diffAllStates) {  // clear both guide flags for every thread's state; loop variable shadows the member
+        diffAllStates->diff.guide.force_raise_exception = false;
+        diffAllStates->diff.guide.force_set_jump_target = false;
+    }
 }
 
 void
 BaseCPU::enableDiffPrint()
 {
-    diffAllStates->diff.dynamic_config.debug_difftest = true;
-    diffAllStates->proxy->update_config(&diffAllStates->diff.dynamic_config);
+    for (auto &diffAllStates : this->diffAllStates) {  // turn on debug_difftest for every thread and push the config to its proxy
+        diffAllStates->diff.dynamic_config.debug_difftest = true;
+        diffAllStates->proxy->update_config(&diffAllStates->diff.dynamic_config);  // NOTE(review): the difftest-disabled constructor path assigns no proxy - confirm this is only called with difftest enabled
+    }
 }
 
-void BaseCPU::setSCSuccess(bool success, paddr_t addr)
+void BaseCPU::setSCSuccess(bool success, paddr_t addr, ThreadID tid)
 {
+    const auto &diffAllStates = this->diffAllStates.at(tid);  // bounds-checked per-thread state; reference avoids a shared_ptr copy
     diffAllStates->diff.sync.lrscValid = success;
     diffAllStates->diff.sync.lrscAddr = addr; // used for spike diff
 }
@@ -1742,6 +1876,8 @@ void
 BaseCPU::setExceptionGuideExecInfo(uint64_t exception_num, uint64_t mtval, uint64_t stval, bool force_set_jump_target,
                                    uint64_t jump_target, ThreadID tid)
 {
+    auto diffAllStates = this->diffAllStates[tid];
+
     auto &gd = diffAllStates->diff.guide;
     gd.force_raise_exception = true;
     gd.exception_num = exception_num;
@@ -1769,7 +1905,7 @@ BaseCPU::setExceptionGuideExecInfo(uint64_t exception_num, uint64_t mtval, uint6
 void
 BaseCPU::checkL1DRefill(Addr paddr, const uint8_t* refill_data, size_t size) {
     assert(size == 64);
-    if (system->multiCore()) {
+    if (system->multiContextDifftest()) {
         uint8_t *golden_ptr = (uint8_t *)_goldenMemManager->guestToHost(paddr);
         if (memcmp(golden_ptr, refill_data, size)) {
             panic("Refill data diff with Golden addr %#lx with size %d\n", paddr, size);
diff --git a/src/cpu/base.hh b/src/cpu/base.hh
index 8fe6d55d61..21c13388db 100644
--- a/src/cpu/base.hh
+++ b/src/cpu/base.hh
@@ -42,6 +42,7 @@
 #ifndef __CPU_BASE_HH__
 #define __CPU_BASE_HH__
 
+#include <deque>
 #include <queue>
 #include <vector>
 
@@ -138,6 +139,17 @@ struct DiffAllStates
 class BaseCPU : public ClockedObject
 {
   protected:
+    struct RecentCommittedStore
+    {
+        bool valid = false;
+        Addr addr = 0;
+        size_t size = 0;
+        InstSeqNum seq = 0;
+        uint8_t data[16] = {};
+    };
+
+    std::vector<std::deque<RecentCommittedStore>> recentCommittedStores;
+    std::vector<bool> syncVisibleStoreReplayArmed;
 
     const unsigned IntRegIndexBase = 0;
     const unsigned FPRegIndexBase = 32;
@@ -693,7 +705,7 @@ class BaseCPU : public ClockedObject
     bool enableRVV{false};
     bool enableRVHDIFF{false};
     bool enableSkipCSR{false};
-    std::shared_ptr<DiffAllStates> diffAllStates{};
+    std::vector<std::shared_ptr<DiffAllStates>> diffAllStates{};
 
     enum  diffRegConfig
     {
@@ -701,7 +713,7 @@ class BaseCPU : public ClockedObject
       diffCsrNum = 36,
     };
 
-    virtual void readGem5Regs()
+    virtual void readGem5Regs(ThreadID tid)
     {
         panic("difftest:readGem5Regs() is not implemented\n");
     }
@@ -709,6 +721,7 @@ class BaseCPU : public ClockedObject
     void csrDiffMessage(uint64_t gem5_val, uint64_t ref_val, int error_num, uint64_t &error_reg, InstSeqNum seq,
                         std::string error_csr_name,int &diff_at);
     std::pair<int, bool> diffWithNEMU(ThreadID tid, InstSeqNum seq);
+    int difftestHartId(ThreadID tid) const;
 
     std::stringstream diffMsg;
     void reportDiffMismatch(ThreadID tid, InstSeqNum seq);
@@ -777,13 +790,25 @@ class BaseCPU : public ClockedObject
 
     void difftestStep(ThreadID tid, InstSeqNum seq);
 
+    void recordCommittedStore(ThreadID tid, const o3::DynInstPtr &inst);
+    void armSyncVisibleStoreReplay(ThreadID tid)
+    {
+        syncVisibleStoreReplayArmed.at(tid) = true;
+    }
+    bool consumeSyncVisibleStoreReplay(ThreadID tid)
+    {
+        bool armed = syncVisibleStoreReplayArmed.at(tid);
+        syncVisibleStoreReplayArmed.at(tid) = false;
+        return armed;
+    }
+
     inline bool difftestEnabled() const { return enableDifftest; }
 
-    void displayGem5Regs();
+    void displayGem5Regs(ThreadID tid);
 
-    void difftestRaiseIntr(uint64_t no);
+    void difftestRaiseIntr(uint64_t no, ThreadID tid = 0);
 
-    void setSCSuccess(bool success, paddr_t addr);
+    void setSCSuccess(bool success, paddr_t addr, ThreadID tid);
 
     void setExceptionGuideExecInfo(uint64_t exception_num, uint64_t mtval, uint64_t stval,
                                    // force set jump target
@@ -793,14 +818,14 @@ class BaseCPU : public ClockedObject
 
     void enableDiffPrint();
 
-    std::pair<bool, std::shared_ptr<DiffAllStates>> getDiffAllStates()
+    std::pair<bool, std::vector<std::shared_ptr<DiffAllStates>>> getDiffAllStates()
     {
         return std::make_pair(enableDifftest, diffAllStates);
     }
 
-    void takeOverDiffAllStates(std::shared_ptr<DiffAllStates> diffAllStates)
+    void takeOverDiffAllStates(std::vector<std::shared_ptr<DiffAllStates>> diffAllStates)
     {
-        this->diffAllStates = diffAllStates;
+        this->diffAllStates = std::move(diffAllStates);
     }
 
     int committedInstNum = 0;
diff --git a/src/cpu/difftest.cc b/src/cpu/difftest.cc
index 7293e51b9a..63665f194b 100644
--- a/src/cpu/difftest.cc
+++ b/src/cpu/difftest.cc
@@ -149,6 +149,12 @@ NemuProxy::NemuProxy(int coreid, const char *ref_so, bool enable_sdcard_diff, bo
 #endif
 
     multiCore = multi_core;
+    if (multiCore) {
+        nemuSetHartId = (void (*)(int))dlsym(handle, "difftest_set_mhartid");
+        assert(nemuSetHartId);
+        nemuPutGmaddr = (void (*)(uint8_t *))dlsym(handle, "difftest_put_gmaddr");
+        assert(nemuPutGmaddr);
+    }
 
     if (enable_sdcard_diff) {
         sdcard_init = (void (*)(const char *, const char *))dlsym(
@@ -168,15 +174,18 @@ void
 NemuProxy::initState(int coreid, uint8_t *golden_mem)
 {
     if (multiCore) {
-        auto nemu_difftest_set_mhartid = (void (*)(int))dlsym(handle, "difftest_set_mhartid");
         warn("Setting mhartid to %d\n", coreid);
-        assert(nemu_difftest_set_mhartid);
-        nemu_difftest_set_mhartid(coreid);
-
-        auto nemu_difftest_put_gmaddr = (void (*)(uint8_t *ptr))dlsym(handle, "difftest_put_gmaddr");
+        setHartId(coreid);
         warn("Setting gmaddr to %#lx\n", (uint64_t) golden_mem);
-        assert(nemu_difftest_put_gmaddr);
-        nemu_difftest_put_gmaddr(golden_mem);
+        nemuPutGmaddr(golden_mem);
+    }
+}
+
+void
+NemuProxy::setHartId(int coreid)
+{
+    if (multiCore) {
+        nemuSetHartId(coreid);
     }
 }
 
diff --git a/src/cpu/difftest.hh b/src/cpu/difftest.hh
index af4eee4d96..7d91201b4f 100644
--- a/src/cpu/difftest.hh
+++ b/src/cpu/difftest.hh
@@ -195,6 +195,7 @@ class RefProxy
     void (*sdcard_init)(const char *img_path,
                         const char *sd_cpt_bin_path) = nullptr;
     virtual void initState(int coreid, uint8_t *golden_mem) = 0;
+    virtual void setHartId(int coreid) = 0;
 
   protected:
     bool multiCore;
@@ -208,6 +209,11 @@ class NemuProxy : public RefProxy
     NemuProxy(int coreid, const char *ref_so, bool enable_sdcard_diff, bool enable_mem_dedup, bool multi_core);
 
     void initState(int coreid, uint8_t *golden_mem) override;
+    void setHartId(int coreid) override;
+
+  private:
+    void (*nemuSetHartId)(int) = nullptr;
+    void (*nemuPutGmaddr)(uint8_t *) = nullptr;
 };
 
 
@@ -217,6 +223,7 @@ class SpikeProxy : public RefProxy
     SpikeProxy(int coreid, const char *ref_so, bool enable_sdcard_diff);
 
     void initState(int coreid, uint8_t *golden_mem) override { panic("Not implemented\n"); }
+    void setHartId(int coreid) override { panic("Not implemented\n"); }
 };
 
 #define DIFFTEST_WIDTH 8
diff --git a/src/cpu/o3/FuncScheduler.py b/src/cpu/o3/FuncScheduler.py
index 2d088a6032..7676f6d643 100644
--- a/src/cpu/o3/FuncScheduler.py
+++ b/src/cpu/o3/FuncScheduler.py
@@ -75,6 +75,11 @@ class PAgeSelector(BaseSelector):
 
     piece = Param.Int(2, "number of instructions in a group")
 
+class SMTBasedSelector(BaseSelector):
+    type = 'SMTBasedSelector'
+    cxx_class = 'gem5::o3::SMTBasedSelector'
+    cxx_header = "cpu/o3/issue_queue.hh"
+
 class IssueQue(SimObject):
     type = 'IssueQue'
     cxx_class = 'gem5::o3::IssueQue'
@@ -85,7 +90,7 @@ class IssueQue(SimObject):
     inports = Param.Int(2, "")
     scheduleToExecDelay = Param.Cycles(2, "")
     oports = VectorParam.IssuePort("")
-    sel = Param.BaseSelector(BaseSelector(), "Selector for this IQ (default: age first)")
+    sel = Param.BaseSelector(SMTBasedSelector(), "Selector for this IQ (default: SMTBasedSelector)")
 
 class Scheduler(SimObject):
     type = 'Scheduler'
diff --git a/src/cpu/o3/SConscript b/src/cpu/o3/SConscript
index 1ee4cf9448..463a8cdfc0 100755
--- a/src/cpu/o3/SConscript
+++ b/src/cpu/o3/SConscript
@@ -32,7 +32,7 @@ Import('*')
 
 if env['CONF']['TARGET_ISA'] != 'null':
     SimObject('FuncScheduler.py', sim_objects=['FUPool', 'SpecWakeupChannel',
-              'IssuePort', 'IssueQue', 'BaseSelector', 'PAgeSelector', 'Scheduler'])
+              'IssuePort', 'IssueQue', 'BaseSelector', 'PAgeSelector', 'SMTBasedSelector', 'Scheduler'])
     SimObject('FuncUnitConfig.py', sim_objects=[])
     SimObject('BaseO3CPU.py', sim_objects=['BaseO3CPU'], enums=[
         'SMTFetchPolicy', 'SMTQueuePolicy', 'CommitPolicy', 'ROBWalkPolicy', 'ROBCompressPolicy', 'PerfRecord'])
diff --git a/src/cpu/o3/comm.hh b/src/cpu/o3/comm.hh
index cb88ad769f..ade70ed5e3 100644
--- a/src/cpu/o3/comm.hh
+++ b/src/cpu/o3/comm.hh
@@ -168,6 +168,12 @@ struct IssueStruct
     DynInstPtr insts[MaxWidth];
 };
 
+struct SquashInfo
+{
+    InstSeqNum squashSn;
+    ThreadID   squashTid;
+};
+
 struct SquashVersion
 {
     uint8_t version;
@@ -181,14 +187,23 @@ struct SquashVersion
         return (version + 1) % versionLimit;
     }
     bool largerThan(uint8_t other) const {
-        bool larger = version > other && version - other <= maxInflightSquash;
-        bool wrapped_larger =
-            version + versionLimit > other &&
-            version + versionLimit - other <= maxInflightSquash;
-        if (!(larger || wrapped_larger || (version == other))) {
+        const uint8_t distance = (version + versionLimit - other) % versionLimit;
+        if (distance == 0) {
+            return false;
+        }
+
+        if (distance <= maxInflightSquash) {
+            return true;
+        }
+
+        if (versionLimit - distance <= maxInflightSquash) {
+            return false;
+        }
+
+        if (version != other) {
             panic("SquashVersion: %d, other: %d\n", version, other);
         }
-        return larger || wrapped_larger;
+        return false;
     }
     void update(uint8_t v) {
         version = v;
@@ -199,6 +214,7 @@ struct SquashVersion
 
 struct ResolveQueueEntry
 {
+    ThreadID resolvedTid;
     uint64_t resolvedFTQId;
     std::vector<uint64_t> resolvedInstPC;
 };
@@ -246,6 +262,10 @@ struct TimeStruct
         };
         /** Resolved control-flow PCs produced this cycle (fetch buffers/merges). */
         std::vector<ResolvedCFIEntry> resolvedCFIs;  // *F
+
+        unsigned iqCount;
+        unsigned ldstqCount;
+        unsigned robCount;
     };
 
     IewComm iewInfo[MaxThreads]; // iew to rename, fetch
diff --git a/src/cpu/o3/commit.cc b/src/cpu/o3/commit.cc
index c06acc1221..c7b69d656c 100644
--- a/src/cpu/o3/commit.cc
+++ b/src/cpu/o3/commit.cc
@@ -42,6 +42,7 @@
 #include "cpu/o3/commit.hh"
 
 #include <algorithm>
+#include <array>
 #include <cstring>
 #include <set>
 #include <string>
@@ -104,32 +105,35 @@ Commit::Commit(CPU *_cpu, branch_prediction::BPredUnit *_bp, const BaseO3CPUPara
     : commitPolicy(params.smtCommitPolicy),
       stuckCheckEvent([this]() {
         static std::vector<DynInstPtr> debug_insts;
-        if (cpu->curCycle() - this->lastCommitCycle > 40000) {
-            if (traceMaybeExitOnPipelineDrainFromStuckCheck()) {
-                return;
-            }
 
-            if (auto inst = rob->readHeadInst(0)) {
-                warn("can't commit inst %s\n", inst->genDisassembly());
-                debug_insts.insert(
-                    debug_insts.begin(), rob->getInstList(0).begin(),
-                    rob->getInstList(0).end());
-                warn("dump rob front 10 insts\n");
-                int i = 0;
-                for (auto inst = debug_insts.begin();
-                     inst != debug_insts.end() && i < 10; inst++, i++) {
-                    warn("%s\n", (*inst)->genDisassembly());
+        for (ThreadID tid = 0; tid < numThreads; tid++) {
+            if (cpu->curCycle() - this->lastCommitCycle[tid] > 40000) {
+                if (traceMaybeExitOnPipelineDrainFromStuckCheck()) {
+                    return;
                 }
-            } else {
-                warn("rob was empty, may be fetch or rename stuck\n");
+                // Inspect the stalled thread's own ROB head, not thread 0's.
+                if (auto inst = rob->readHeadInst(tid)) {
+                    warn("can't commit inst %s\n", inst->genDisassembly());
+                    debug_insts.insert(
+                        debug_insts.begin(), rob->getInstList(tid).begin(),
+                        rob->getInstList(tid).end());
+                    warn("dump rob front 10 insts\n");
+                    int i = 0;
+                    for (auto inst = debug_insts.begin();
+                        inst != debug_insts.end() && i < 10; inst++, i++) {
+                        warn("%s\n", (*inst)->genDisassembly());
+                    }
+                } else {
+                    warn("rob was empty, may be fetch or rename stuck\n");
+                }
+                panic(
+                    "Commit stage is stucked for more than 40,000 cycles!\n"
+                    "Thread: %d Last commit cycle: %lu, current cycle: %lu, suggested "
+                    "--debug-start=%llu --debug-end=%llu\n", tid,
+                    lastCommitCycle[tid], cpu->curCycle(),
+                    cpu->cyclesToTicks(Cycles(lastCommitCycle[tid] - 200)),
+                    cpu->cyclesToTicks(Cycles(lastCommitCycle[tid] + 200)));
             }
-            panic(
-                "Commit stage is stucked for more than 40,000 cycles!\n"
-                "Last commit cycle: %lu, current cycle: %lu, suggested "
-                "--debug-start=%llu --debug-end=%llu\n",
-                lastCommitCycle, cpu->curCycle(),
-                cpu->cyclesToTicks(Cycles(lastCommitCycle - 200)),
-                cpu->cyclesToTicks(Cycles(lastCommitCycle + 200)));
         }
         cpu->schedule(this->stuckCheckEvent, cpu->clockEdge(Cycles(40010)));
       }, "CommitStuckCheckEvent"),
@@ -1184,342 +1188,391 @@ Commit::commitInsts()
     DPRINTF(Commit, "Trying to commit instructions in the ROB.\n");
 
     unsigned num_committed = 0;
+    std::array<unsigned, MaxThreads> num_committed_per_thread = {};
+    std::array<unsigned, MaxThreads> commit_width_per_thread = {};
 
     DynInstPtr head_inst;
 
-    int commit_width = rob->countInstsOfGroups(commitWidth);
+    int commit_width = 0;
+    for (ThreadID tid : *activeThreads) {
+        commit_width_per_thread[tid] =
+            rob->countInstsOfGroups(tid, commitWidth);
+        commit_width += commit_width_per_thread[tid];
+    }
 
     if (commit_width >= 0) {
         cpu->activityThisCycle();
     }
 
-    // Commit as many instructions as possible until the commit bandwidth
-    // limit is reached, or it becomes impossible to commit any more.
-    while (num_committed < commit_width) {
-        // hardware transactionally memory
-        // If executing within a transaction,
-        // need to handle interrupts specially
-
-        ThreadID commit_thread = getCommittingThread();
-
-        // Check for any interrupt that we've already squashed for
-        // and start processing it.
-        if (interrupt != NoFault) {
-            // If inside a transaction, postpone interrupts
-            if (executingHtmTransaction(commit_thread)) {
-                cpu->clearInterrupts(0);
-                toIEW->commitInfo[0].clearInterrupt = true;
-                interrupt = NoFault;
-                avoidQuiesceLiveLock = true;
-            } else {
-                handleInterrupt();
-            }
+    // Commit each thread independently for up to its local commit window.
+    for (ThreadID commit_thread : *activeThreads) {
+        if (commitStatus[commit_thread] != Running &&
+            commitStatus[commit_thread] != Idle &&
+            commitStatus[commit_thread] != FetchTrapPending) {
+            continue;
         }
 
-        // ThreadID commit_thread = getCommittingThread();
-
-        if (commit_thread == -1)
-            break;
-
-        head_inst = rob->readHeadInst(commit_thread);
-
-        if (!rob->isHeadGroupReady(commit_thread)) {
-            if (debug::Commit && head_inst->readyToCommit()) {
-                InstSeqNum seqnum = rob->getHeadGroupLastDoneSeq(commit_thread);
-                DPRINTF(
-                    Commit,
-                    "[sn:%llu] Head is ready to commit, but the group is not all ready, last done inst [sn:%llu]\n",
-                    head_inst->seqNum, seqnum);
+            while (num_committed < commit_width &&
+                num_committed_per_thread[commit_thread] <
+                    commit_width_per_thread[commit_thread]) {
+            // hardware transactional memory
+            // If executing within a transaction,
+            // need to handle interrupts specially
+
+            // Check for any interrupt that we've already squashed for
+            // and start processing it.
+            if (interrupt != NoFault) {
+                // If inside a transaction, postpone interrupts
+                if (executingHtmTransaction(commit_thread)) {
+                    cpu->clearInterrupts(0);
+                    toIEW->commitInfo[0].clearInterrupt = true;
+                    interrupt = NoFault;
+                    avoidQuiesceLiveLock = true;
+                } else {
+                    handleInterrupt();
+                }
             }
-            break;
-        }
 
-        ThreadID tid = head_inst->threadNumber;
-
-        assert(tid == commit_thread);
-
-        DPRINTF(Commit,
-                "Trying to commit head instruction, [tid:%i] [sn:%llu]\n",
-                tid, head_inst->seqNum);
+            head_inst = rob->readHeadInst(commit_thread);
+
+            if (!rob->isHeadGroupReady(commit_thread)) {
+                if (debug::Commit && head_inst->readyToCommit()) {
+                    InstSeqNum seqnum =
+                        rob->getHeadGroupLastDoneSeq(commit_thread);
+                    DPRINTF(
+                        Commit,
+                        "[sn:%llu] Head is ready to commit, but the group "
+                        "is not all ready, last done inst [sn:%llu]\n",
+                        head_inst->seqNum, seqnum);
+                }
+                break;
+            }
 
-        // If the head instruction is squashed, it is ready to retire
-        // (be removed from the ROB) at any time.
-        if (head_inst->isSquashed()) {
+            ThreadID tid = head_inst->threadNumber;
 
-            DPRINTF(Commit, "Retiring squashed instruction from "
-                    "ROB.\n");
+            assert(tid == commit_thread);
 
-            rob->retireHead(commit_thread);
+            DPRINTF(Commit,
+                    "Trying to commit head instruction, [tid:%i] [sn:%llu]\n",
+                    tid, head_inst->seqNum);
 
-            ++stats.commitSquashedInsts;
-            // Notify potential listeners that this instruction is squashed
-            ppSquash->notify(head_inst);
+            // If the head instruction is squashed, it is ready to retire
+            // (be removed from the ROB) at any time.
+            if (head_inst->isSquashed()) {
 
-            // Record that the number of ROB entries has changed.
-            changedROBNumEntries[tid] = true;
-        } else {
-            set(pc[tid], head_inst->pcState());
-            traceMaybeInjectCtrlFlowChangeFault(tid, head_inst);
+                DPRINTF(Commit, "Retiring squashed instruction from "
+                        "ROB.\n");
 
-            // Try to commit the head instruction.
-            bool commit_success = commitHead(head_inst, num_committed);
+                rob->drainSquashedHead(commit_thread);
 
-            if (commit_success) {
-                cpu->perfCCT->updateInstPos(head_inst->seqNum, PerfRecord::AtCommit);
-                auto res = head_inst->getResult();
-                if (res.is<RegVal>()) {
-                    cpu->perfCCT->updateInstMeta(head_inst->seqNum, InstDetail::Result, res.as<RegVal>());
-                }
-                cpu->perfCCT->commitMeta(head_inst->seqNum);
+                ++stats.commitSquashedInsts;
+                // Notify potential listeners that this instruction is squashed
+                ppSquash->notify(head_inst);
 
-                DPRINTF(CommitTrace, "CT: %s\n", head_inst->genDisassembly());
+                // Record that the number of ROB entries has changed.
+                changedROBNumEntries[tid] = true;
+            } else {
+                set(pc[tid], head_inst->pcState());
+                traceMaybeInjectCtrlFlowChangeFault(tid, head_inst);
+
+                // Try to commit the head instruction.
+                bool commit_success = commitHead(head_inst,
+                                                num_committed_per_thread[tid]);
+
+                if (commit_success) {
+                    cpu->perfCCT->updateInstPos(head_inst->seqNum,
+                                                PerfRecord::AtCommit);
+                    auto res = head_inst->getResult();
+                    if (res.is<RegVal>()) {
+                        cpu->perfCCT->updateInstMeta(
+                            head_inst->seqNum, InstDetail::Result,
+                            res.as<RegVal>());
+                    }
+                    cpu->perfCCT->commitMeta(head_inst->seqNum);
 
-                if (ismispred) {
-                    ismispred = false;
-                    stats.recovery_bubble += (cpu->curCycle() - lastCommitCycle) * renameWidth;
-                }
-                if (head_inst->mispredicted()) {
-                    ismispred = true;
-                }
+                    DPRINTF(CommitTrace, "CT [tid:%d]: %s\n",
+                            head_inst->threadNumber,
+                            head_inst->genDisassembly());
 
-                lastCommitCycle = cpu->curCycle();
-                const auto &head_rv_pc = head_inst->pcState().as<RiscvISA::PCState>();
-                if (bp->isBTB()) {
-                    auto dbbtb = dynamic_cast<branch_prediction::btb_pred::DecoupledBPUWithBTB*>(bp);
-                    bool miss = head_inst->mispredicted();
-                    if (head_inst->isReturn()) {
-                        DPRINTF(RAS, "commit inst PC %x miss %d real target %x pred target %x\n",
-                                head_inst->pcState().instAddr(), miss,
-                                head_rv_pc.npc(), *(head_inst->predPC));
+                    if (ismispred) {
+                        ismispred = false;
+                        stats.recovery_bubble +=
+                            (cpu->curCycle() - lastCommitCycle[tid]) *
+                            renameWidth;
+                    }
+                    if (head_inst->mispredicted()) {
+                        ismispred = true;
                     }
 
-                    // FIXME: ignore mret/sret/uret in correspond with RTL
-                    if (!head_inst->isNonSpeculative() && head_inst->isControl()) {
-                        dbbtb->commitBranch(head_inst, miss);
-                        if (!head_inst->isReturn() && head_inst->isIndirectCtrl() && miss) {
-                            misPredIndirect[head_inst->pcState().instAddr()]++;
+                    lastCommitCycle[tid] = cpu->curCycle();
+                    const auto &head_rv_pc =
+                        head_inst->pcState().as<RiscvISA::PCState>();
+                    if (bp->isBTB()) {
+                        auto dbbtb = dynamic_cast<
+                            branch_prediction::btb_pred::
+                                DecoupledBPUWithBTB *>(bp);
+                        bool miss = head_inst->mispredicted();
+                        if (head_inst->isReturn()) {
+                            DPRINTF(RAS, "commit inst PC %x miss %d real target %x pred target %x\n",
+                                    head_inst->pcState().instAddr(), miss,
+                                    head_rv_pc.npc(), *(head_inst->predPC));
                         }
-                    }
-                    dbbtb->notifyInstCommit(head_inst);
-                }
-                    if (traceMaybeExitOnLastTraceInst(head_inst)) {
-                        return;
-                    }
 
-                if (head_inst->isUpdateVsstatusSd()) {
-                    auto v = cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VIRMODE, tid);
-                    RiscvISA::HSTATUS hstatus = cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_HSTATUS, tid);
-                    RiscvISA::VSSTATUS vsstatus =
-                        cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, tid);
-                    RiscvISA::VSSTATUS32 vsstatus32 =
-                        cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, tid);
-
-                    if (v) {
-                        if (hstatus.vsxl ==1) {
-                            vsstatus32.sd = (vsstatus32.fs == 3) || (vsstatus32.vs == 3);
-                            cpu->setMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, (RegVal)vsstatus32, tid);
-                        } else {
-                            vsstatus.sd = (vsstatus.fs == 3) || (vsstatus.vs == 3);
-                            cpu->setMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, (RegVal)vsstatus, tid);
+                        // FIXME: ignore mret/sret/uret to correspond with RTL
+                        if (!head_inst->isNonSpeculative() && head_inst->isControl()) {
+                            dbbtb->commitBranch(head_inst, miss);
+                            if (!head_inst->isReturn() &&
+                                head_inst->isIndirectCtrl() && miss) {
+                                misPredIndirect[head_inst->pcState().instAddr()]++;
+                            }
                         }
+                        dbbtb->notifyInstCommit(head_inst);
                     }
+                        if (traceMaybeExitOnLastTraceInst(head_inst)) {
+                            return;
+                        }
 
-                }
-                if (head_inst->isUpdateMstatusSd()) {
-                    updateMstatusSd(tid);
-                }
+                    if (head_inst->isUpdateVsstatusSd()) {
+                        auto v = cpu->readMiscRegNoEffect(
+                            RiscvISA::MiscRegIndex::MISCREG_VIRMODE, tid);
+                        RiscvISA::HSTATUS hstatus =
+                            cpu->readMiscRegNoEffect(
+                                RiscvISA::MiscRegIndex::MISCREG_HSTATUS, tid);
+                        RiscvISA::VSSTATUS vsstatus =
+                            cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, tid);
+                        RiscvISA::VSSTATUS32 vsstatus32 =
+                            cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, tid);
+
+                        if (v) {
+                            if (hstatus.vsxl ==1) {
+                                vsstatus32.sd = (vsstatus32.fs == 3) || (vsstatus32.vs == 3);
+                                cpu->setMiscRegNoEffect(
+                                    RiscvISA::MiscRegIndex::MISCREG_VSSTATUS,
+                                    (RegVal)vsstatus32, tid);
+                            } else {
+                                vsstatus.sd = (vsstatus.fs == 3) || (vsstatus.vs == 3);
+                                cpu->setMiscRegNoEffect(
+                                    RiscvISA::MiscRegIndex::MISCREG_VSSTATUS,
+                                    (RegVal)vsstatus, tid);
+                            }
+                        }
 
-                ++num_committed;
-                stats.committedInstType[tid][head_inst->opClass()]++;
-                ppCommit->notify(head_inst);
+                    }
+                    if (head_inst->isUpdateMstatusSd()) {
+                        updateMstatusSd(tid);
+                    }
 
-                // hardware transactional memory
+                    ++num_committed;
+                    ++num_committed_per_thread[tid];
+                    stats.committedInstType[tid][head_inst->opClass()]++;
+                    ppCommit->notify(head_inst);
 
-                // update nesting depth
-                if (head_inst->isHtmStart())
-                    htmStarts[tid]++;
+                    // hardware transactional memory
 
-                // sanity check
-                if (head_inst->inHtmTransactionalState()) {
-                    assert(executingHtmTransaction(tid));
-                } else {
-                    assert(!executingHtmTransaction(tid));
-                }
+                    // update nesting depth
+                    if (head_inst->isHtmStart())
+                        htmStarts[tid]++;
 
-                // update nesting depth
-                if (head_inst->isHtmStop())
-                    htmStops[tid]++;
+                    // sanity check
+                    if (head_inst->inHtmTransactionalState()) {
+                        assert(executingHtmTransaction(tid));
+                    } else {
+                        assert(!executingHtmTransaction(tid));
+                    }
 
-                changedROBNumEntries[tid] = true;
+                    // update nesting depth
+                    if (head_inst->isHtmStop())
+                        htmStops[tid]++;
 
-                // Set the doneSeqNum to the youngest committed instruction.
-                toIEW->commitInfo[tid].doneSeqNum = head_inst->seqNum;
+                    changedROBNumEntries[tid] = true;
 
-                if (head_inst->getFtqId() > 1) {
-                    toIEW->commitInfo[tid].doneFtqId = head_inst->getFtqId() - 1;
-                }
-                committedTargetId = head_inst->getFtqId();
-                committedLoopIter = head_inst->getLoopIteration();
-
-                if (tid == 0)
-                    canHandleInterrupts = !head_inst->isDelayedCommit();
-
-                // at this point store conditionals should either have
-                // been completed or predicated false
-                assert(!head_inst->isStoreConditional() ||
-                       head_inst->isCompleted() ||
-                       !head_inst->readPredicate());
-
-                // Updates misc. registers.
-                head_inst->updateMiscRegs();
-                if (head_inst->staticInst->isVectorConfig()) {
-                    auto vset = static_cast<RiscvISA::VConfOp*>(head_inst->staticInst.get());
-                    if (!(vset->vtypeIsImm)) {
-                        auto tc = head_inst->tcBase();
-                        RiscvISA::VTYPE new_vtype = head_inst->readMiscReg(RiscvISA::MISCREG_VTYPE);
-                        tc->getDecoderPtr()->as<RiscvISA::Decoder>().setVtype(new_vtype);
+                    // Set the doneSeqNum to the youngest committed instruction.
+                    toIEW->commitInfo[tid].doneSeqNum = head_inst->seqNum;
+
+                    if (head_inst->getFtqId() > 1) {
+                        toIEW->commitInfo[tid].doneFtqId = head_inst->getFtqId() - 1;
                     }
-                }
-                if (head_inst->isFloating() && head_inst->isLoad()){
-                    RiscvISA::STATUS status = cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_STATUS, tid);
-                    status.sd = 1;
-                    status.fs = 3;
-                    cpu->setMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_STATUS, (RegVal)status, tid);
-                }
-                if (head_inst->isUpdateVsstatusSd()) {
-                    auto v = cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VIRMODE, tid);
-                    RiscvISA::HSTATUS hstatus = cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_HSTATUS, tid);
-                    RiscvISA::VSSTATUS vsstatus =
-                        cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, tid);
-                    RiscvISA::VSSTATUS32 vsstatus32 =
-                        cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, tid);
-
-                    if (v) {
-                        if (hstatus.vsxl ==1) {
-                            vsstatus32.sd = (vsstatus32.fs == 3) || (vsstatus.vs == 3);
-                            cpu->setMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, (RegVal)vsstatus32, tid);
-                        } else {
-                            vsstatus.sd = (vsstatus.fs == 3) || (vsstatus.vs == 3);
-                            cpu->setMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, (RegVal)vsstatus, tid);
+                    committedTargetId = head_inst->getFtqId();
+                    committedLoopIter = head_inst->getLoopIteration();
+
+                    if (tid == 0)
+                        canHandleInterrupts = !head_inst->isDelayedCommit();
+
+                    // at this point store conditionals should either have
+                    // been completed or predicated false
+                    assert(!head_inst->isStoreConditional() ||
+                        head_inst->isCompleted() ||
+                        !head_inst->readPredicate());
+
+                    // Updates misc. registers.
+                    head_inst->updateMiscRegs();
+                    if (head_inst->staticInst->isVectorConfig()) {
+                        auto vset = static_cast<RiscvISA::VConfOp *>(
+                            head_inst->staticInst.get());
+                        if (!(vset->vtypeIsImm)) {
+                            auto tc = head_inst->tcBase();
+                            RiscvISA::VTYPE new_vtype =
+                                head_inst->readMiscReg(
+                                    RiscvISA::MISCREG_VTYPE);
+                            tc->getDecoderPtr()->as<RiscvISA::Decoder>().setVtype(new_vtype);
                         }
                     }
-
-                }
-
-                if (cpu->difftestEnabled()) {
-                    diffInst(tid, head_inst);
-                }
-
-                if (head_inst->isLoad()) {
-                    Addr load_pc = head_inst->pcState().instAddr();
-                    Addr load_addr = head_inst->physEffAddr;
-                    char buffer[8] = {0};
-                    if (head_inst->memData) {
-                        std::memcpy(buffer, head_inst->memData,
-                                    std::min<size_t>(head_inst->effSize,
-                                                     sizeof(buffer)));
+                    if (head_inst->isFloating() && head_inst->isLoad()) {
+                        RiscvISA::STATUS status = cpu->readMiscRegNoEffect(
+                            RiscvISA::MiscRegIndex::MISCREG_STATUS, tid);
+                        status.sd = 1;
+                        status.fs = 3;
+                        cpu->setMiscRegNoEffect(
+                            RiscvISA::MiscRegIndex::MISCREG_STATUS,
+                            (RegVal)status, tid);
                     }
-                    Addr load_value = *((uint64_t *)buffer);
-                    bool hit = loadTripleCounter.update(load_pc, load_addr, load_value);
-                    if (hit) {
-                        // same PC && same addr && same value
-                        stats.loadTriple++;
+                    if (head_inst->isUpdateVsstatusSd()) {
+                        auto v = cpu->readMiscRegNoEffect(
+                            RiscvISA::MiscRegIndex::MISCREG_VIRMODE, tid);
+                        RiscvISA::HSTATUS hstatus =
+                            cpu->readMiscRegNoEffect(
+                                RiscvISA::MiscRegIndex::MISCREG_HSTATUS, tid);
+                        RiscvISA::VSSTATUS vsstatus =
+                            cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, tid);
+                        RiscvISA::VSSTATUS32 vsstatus32 =
+                            cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, tid);
+
+                        if (v) {
+                            if (hstatus.vsxl ==1) {
+                                vsstatus32.sd = (vsstatus32.fs == 3) || (vsstatus.vs == 3);
+                                cpu->setMiscRegNoEffect(
+                                    RiscvISA::MiscRegIndex::MISCREG_VSSTATUS,
+                                    (RegVal)vsstatus32, tid);
+                            } else {
+                                vsstatus.sd = (vsstatus.fs == 3) || (vsstatus.vs == 3);
+                                cpu->setMiscRegNoEffect(
+                                    RiscvISA::MiscRegIndex::MISCREG_VSSTATUS,
+                                    (RegVal)vsstatus, tid);
+                            }
+                        }
+
                     }
-                    // EA reuse: compare to last committed EA of same static load
-                    auto itEA = lastLoadEA.find(load_pc);
-                    if (itEA != lastLoadEA.end() && itEA->second == load_addr) {
-                        stats.loadEAReused++;
+
+                    if (head_inst->isReadBarrier() ||
+                        head_inst->isWriteBarrier()) {
+                        cpu->armSyncVisibleStoreReplay(tid);
                     }
-                    lastLoadEA[load_pc] = load_addr;
-                    // Producer stability: only if this load had a forwarding producer
-                    if (head_inst->hasProducerStorePC()) {
-                        stats.loadsWithProducer++;
-                        const Addr prodPC = head_inst->producerStorePC();
-                        auto itP = lastLoadProducerStorePC.find(load_pc);
-                        if (itP != lastLoadProducerStorePC.end() && itP->second == prodPC) {
-                            stats.producerStable++;
-                        }
-                        lastLoadProducerStorePC[load_pc] = prodPC;
 
-                    // optional: clear after use to avoid confusing later stages
-                    head_inst->clearProducerStorePC();
+                    if (cpu->difftestEnabled()) {
+                        diffInst(tid, head_inst);
                     }
-                }
 
+                    if (head_inst->isLoad()) {
+                        Addr load_pc = head_inst->pcState().instAddr();
+                        Addr load_addr = head_inst->physEffAddr;
+                        char buffer[8] = {0};
+                        if (head_inst->memData) {
+                            std::memcpy(buffer, head_inst->memData,
+                                        std::min<size_t>(head_inst->effSize,
+                                                        sizeof(buffer)));
+                        }
+                        Addr load_value = *((uint64_t *)buffer);
+                        bool hit = loadTripleCounter.update(load_pc, load_addr, load_value);
+                        if (hit) {
+                            // same PC && same addr && same value
+                            stats.loadTriple++;
+                        }
+                        // EA reuse: compare to last committed EA of same static load
+                        auto itEA = lastLoadEA.find(load_pc);
+                        if (itEA != lastLoadEA.end() && itEA->second == load_addr) {
+                            stats.loadEAReused++;
+                        }
+                        lastLoadEA[load_pc] = load_addr;
+                        // Producer stability: only if this load had a forwarding producer
+                        if (head_inst->hasProducerStorePC()) {
+                            stats.loadsWithProducer++;
+                            const Addr prodPC = head_inst->producerStorePC();
+                            auto itP = lastLoadProducerStorePC.find(load_pc);
+                            if (itP != lastLoadProducerStorePC.end() && itP->second == prodPC) {
+                                stats.producerStable++;
+                            }
+                            lastLoadProducerStorePC[load_pc] = prodPC;
+
+                            // optional: clear after use to avoid confusing later stages
+                            head_inst->clearProducerStorePC();
+                        }
+                    }
 
-                // Check instruction execution if it successfully commits and
-                // is not carrying a fault.
-                if (cpu->checker) {
-                    cpu->checker->verify(head_inst);
-                }
 
-                cpu->traceFunctions(pc[tid]->instAddr());
-                traceOnCommit(tid, head_inst);
+                    // Check instruction execution if it successfully commits and
+                    // is not carrying a fault.
+                    if (cpu->checker) {
+                        cpu->checker->verify(head_inst);
+                    }
 
-                head_inst->staticInst->advancePC(*pc[tid]);
+                    cpu->traceFunctions(pc[tid]->instAddr());
+                    traceOnCommit(tid, head_inst);
 
-                // Keep track of the last sequence number commited
-                lastCommitedSeqNum[tid] = head_inst->seqNum;
+                    head_inst->staticInst->advancePC(*pc[tid]);
 
-                // If this is an instruction that doesn't play nicely with
-                // others squash everything and restart fetch
-                if (head_inst->isSquashAfter())
-                    squashAfter(tid, head_inst);
+                    // Keep track of the last sequence number committed
+                    lastCommitedSeqNum[tid] = head_inst->seqNum;
 
-                if (drainPending) {
-                    if (pc[tid]->microPC() == 0 && interrupt == NoFault &&
-                        !thread[tid]->trapPending) {
-                        // Last architectually committed instruction.
-                        // Squash the pipeline, stall fetch, and use
-                        // drainImminent to disable interrupts
-                        DPRINTF(Drain, "Draining: %i:%s\n", tid, *pc[tid]);
+                    // If this is an instruction that doesn't play nicely with
+                    // others squash everything and restart fetch
+                    if (head_inst->isSquashAfter())
                         squashAfter(tid, head_inst);
-                        cpu->commitDrained(tid);
-                        drainImminent = true;
-                    }
-                }
 
-                bool onInstBoundary = !head_inst->isMicroop() ||
-                                      head_inst->isLastMicroop() ||
-                                      !head_inst->isDelayedCommit();
-
-                if (onInstBoundary) {
-                    int count = 0;
-                    Addr oldpc;
-                    // Make sure we're not currently updating state while
-                    // handling PC events.
-                    assert(!thread[tid]->noSquashFromTC &&
-                           !thread[tid]->trapPending);
-                    do {
-                        oldpc = pc[tid]->instAddr();
-                        thread[tid]->pcEventQueue.service(
-                                oldpc, thread[tid]->getTC());
-                        count++;
-                    } while (oldpc != pc[tid]->instAddr());
-                    if (count > 1) {
-                        DPRINTF(Commit,
-                                "PC skip function event, stopping commit\n");
-                        break;
-                    }
-                        traceOnMacroCommit(tid);
+                    if (drainPending) {
+                        if (pc[tid]->microPC() == 0 && interrupt == NoFault &&
+                            !thread[tid]->trapPending) {
+                            // Last architecturally committed instruction.
+                            // Squash the pipeline, stall fetch, and use
+                            // drainImminent to disable interrupts
+                            DPRINTF(Drain, "Draining: %i:%s\n", tid, *pc[tid]);
+                            squashAfter(tid, head_inst);
+                            cpu->commitDrained(tid);
+                            drainImminent = true;
+                        }
                     }
 
-                // Check if an instruction just enabled interrupts and we've
-                // previously had an interrupt pending that was not handled
-                // because interrupts were subsequently disabled before the
-                // pipeline reached a place to handle the interrupt. In that
-                // case squash now to make sure the interrupt is handled.
-                //
-                // If we don't do this, we might end up in a live lock
-                // situation.
-                if (!interrupt && avoidQuiesceLiveLock &&
-                    onInstBoundary && cpu->checkInterrupts(0))
-                    squashAfter(tid, head_inst);
-            } else {
-                DPRINTF(Commit, "Unable to commit head instruction PC:%s "
-                        "[tid:%i] [sn:%llu].\n",
-                        head_inst->pcState(), tid ,head_inst->seqNum);
-                break;
+                    bool onInstBoundary = !head_inst->isMicroop() ||
+                                        head_inst->isLastMicroop() ||
+                                        !head_inst->isDelayedCommit();
+
+                    if (onInstBoundary) {
+                        int count = 0;
+                        Addr oldpc;
+                        // Make sure we're not currently updating state while
+                        // handling PC events.
+                        assert(!thread[tid]->noSquashFromTC &&
+                               !thread[tid]->trapPending);
+                        do {
+                            oldpc = pc[tid]->instAddr();
+                            thread[tid]->pcEventQueue.service(
+                                    oldpc, thread[tid]->getTC());
+                            count++;
+                        } while (oldpc != pc[tid]->instAddr());
+                        if (count > 1) {
+                            DPRINTF(Commit,
+                                    "PC skip function event, stopping commit\n");
+                            break;
+                        }
+                        traceOnMacroCommit(tid);
+                    }
+
+                    // Check if an instruction just enabled interrupts and we've
+                    // previously had an interrupt pending that was not handled
+                    // because interrupts were subsequently disabled before the
+                    // pipeline reached a place to handle the interrupt. In that
+                    // case squash now to make sure the interrupt is handled.
+                    //
+                    // If we don't do this, we might end up in a live lock
+                    // situation.
+                    if (!interrupt && avoidQuiesceLiveLock &&
+                        onInstBoundary && cpu->checkInterrupts(0))
+                        squashAfter(tid, head_inst);
+                } else {
+                    DPRINTF(Commit, "Unable to commit head instruction PC:%s "
+                            "[tid:%i] [sn:%llu].\n",
+                            head_inst->pcState(), tid ,head_inst->seqNum);
+                    break;
+                }
             }
         }
     }
@@ -1569,6 +1622,8 @@ Commit::diffInst(ThreadID tid, const DynInstPtr &inst) {
     cpu->diffInfo.physEffAddr = inst->physEffAddr;
     cpu->diffInfo.effSize = inst->effSize;
     cpu->diffInfo.goldenValue = inst->getGolden();
+    cpu->diffInfo.amoOldGoldenValue = inst->getAmoOldGoldenValue();
+    cpu->recordCommittedStore(tid, inst);
     cpu->difftestStep(tid, inst->seqNum);
 }
 
@@ -1599,9 +1654,12 @@ Commit::commitHead(const DynInstPtr &head_inst, unsigned inst_num)
         // Memory-ordering instructions such as sfence.vma must not execute
         // until older stores are visible; otherwise page-table updates may
         // race with the TLB invalidation.
-        if ((head_inst->isMemRef() || head_inst->isReturn() ||
-             head_inst->isReadBarrier() || head_inst->isWriteBarrier()) &&
-            (inst_num > 0 || !iewStage->flushStores(tid))) {
+        const bool needs_store_drain =
+            head_inst->isMemRef() || head_inst->isReturn() ||
+            head_inst->isReadBarrier() || head_inst->isWriteBarrier();
+        const bool stores_drained =
+            !needs_store_drain || iewStage->flushStores(tid, head_inst->seqNum);
+        if (needs_store_drain && (inst_num > 0 || !stores_drained)) {
             DPRINTF(Commit,
                     "[tid:%i] [sn:%llu] "
                     "Waiting for all stores to writeback.\n",
@@ -1655,7 +1713,7 @@ Commit::commitHead(const DynInstPtr &head_inst, unsigned inst_num)
 
     if (inst_fault != NoFault) {
         traceLogInstFault(head_inst, inst_fault);
-        if (!iewStage->flushStores(tid) || inst_num > 0) {
+        if (!iewStage->flushStores(tid, head_inst->seqNum) || inst_num > 0) {
             DPRINTF(Commit,
                     "[tid:%i] [sn:%llu] "
                     "Stores outstanding, fault must wait.\n",
@@ -1816,7 +1874,8 @@ Commit::commitHead(const DynInstPtr &head_inst, unsigned inst_num)
     if (head_inst->isStoreConditional()) {
         DPRINTF(Commit, "[tid:%i] [sn:%llu] Store Conditional success: %i\n", tid, head_inst->seqNum,
                 head_inst->lockedWriteSuccess());
-        cpu->setSCSuccess(head_inst->lockedWriteSuccess(), head_inst->physEffAddr);
+        cpu->setSCSuccess(head_inst->lockedWriteSuccess(),
+                          head_inst->physEffAddr, tid);
     }
 
     // Update the commit rename map
@@ -1962,6 +2021,13 @@ Commit::squashInflightAndUpdateVersion(ThreadID tid)
     DPRINTF(Commit, "Squashing in-flight renamed instructions\n");
     for (unsigned i_idx = 0; i_idx < fromRename->size; i_idx++) {
         const DynInstPtr &inst = fromRename->insts[i_idx];
+        if (inst->threadNumber != tid) {
+            DPRINTF(Commit,
+                    "[tid:%i] [sn:%llu] Preserving other-thread in-flight "
+                    "instruction during squash for tid %i\n",
+                    inst->threadNumber, inst->seqNum, tid);
+            continue;
+        }
         DPRINTF(Commit, "[tid:%i] [sn:%llu] Squashing in-flight "
                 "instruction PC %s\n",
                 inst->threadNumber, inst->seqNum, inst->pcState());
@@ -1970,10 +2036,10 @@ Commit::squashInflightAndUpdateVersion(ThreadID tid)
 
     fixedbuffer[tid].clear();
 
-    localSquashVer.update(localSquashVer.nextVersion());
-    toIEW->commitInfo[tid].squashVersion = localSquashVer;
+    localSquashVer[tid].update(localSquashVer[tid].nextVersion());
+    toIEW->commitInfo[tid].squashVersion = localSquashVer[tid];
     DPRINTF(Commit, "Updating squash version to %u\n",
-            localSquashVer.getVersion());
+            localSquashVer[tid].getVersion());
 }
 
 void
@@ -1994,7 +2060,9 @@ Commit::markCompletedInsts()
             fromIEW->insts[inst_num]->setCanCommit();
             auto &inst = fromIEW->insts[inst_num];
 
-            panic_if(!rob->findInst(0, inst->seqNum), "[sn:%llu] Committed instruction not found in ROB",
+            panic_if(!rob->findInst(inst->threadNumber, inst->seqNum),
+                     "[tid:%i] [sn:%llu] Committed instruction not found in ROB",
+                     inst->threadNumber,
                      inst->seqNum);
         }
     }
diff --git a/src/cpu/o3/commit.hh b/src/cpu/o3/commit.hh
index cc13cff324..418dc0b779 100644
--- a/src/cpu/o3/commit.hh
+++ b/src/cpu/o3/commit.hh
@@ -196,7 +196,7 @@ class Commit
     };
     std::list<BranchInfo> branchLog;
 
-    uint64_t lastCommitCycle = 0;
+    uint64_t lastCommitCycle[MaxThreads] = {0};
 
     EventFunctionWrapper stuckCheckEvent;
 
@@ -215,8 +215,6 @@ class Commit
     /** Returns the name of the Commit. */
     std::string name() const;
 
-    uint64_t getLastCommitCycle() const { return lastCommitCycle; }
-
     /** Registers probes. */
     void regProbePoints();
 
@@ -430,7 +428,7 @@ class Commit
     /** Wire to read information from rename queue. */
     TimeBuffer<RenameStruct>::wire fromRename;
 
-    SquashVersion localSquashVer;
+    SquashVersion localSquashVer[MaxThreads];
 
   public:
     /** ROB interface. */
diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc
index f43ae5e861..34f6844f37 100644
--- a/src/cpu/o3/cpu.cc
+++ b/src/cpu/o3/cpu.cc
@@ -134,13 +134,6 @@ CPU::CPU(const BaseO3CPUParams &params)
       cpuStats(this),
       valuePred(params.valuePred)
 {
-    fatal_if(FullSystem && params.numThreads > 1,
-            "SMT is not supported in O3 in full system mode currently.");
-
-    fatal_if(!FullSystem && params.numThreads < params.workload.size(),
-            "More workload items (%d) than threads (%d) on CPU %s.",
-            params.workload.size(), params.numThreads, name());
-
     if (!params.switched_out) {
         _status = Running;
     } else {
@@ -205,7 +198,10 @@ CPU::CPU(const BaseO3CPUParams &params)
 
     ThreadID active_threads;
     if (FullSystem) {
-        active_threads = 1;
+        // FS-SMT still uses one shared workload/system image, but the O3 core
+        // must provision per-thread architectural state for every hardware
+        // thread context exposed by the CPU.
+        active_threads = numThreads;
     } else {
         active_threads = params.workload.size();
 
@@ -282,9 +278,7 @@ CPU::CPU(const BaseO3CPUParams &params)
 
     for (ThreadID tid = 0; tid < numThreads; ++tid) {
         if (FullSystem) {
-            // SMT is not supported in FS mode yet.
-            assert(numThreads == 1);
-            thread[tid] = new ThreadState(this, 0, NULL);
+            thread[tid] = new ThreadState(this, tid, NULL);
         } else {
             if (tid < params.workload.size()) {
                 DPRINTF(O3CPU, "Workload[%i] process is %#x", tid,
@@ -1382,10 +1376,10 @@ CPU::instDone(ThreadID tid, const DynInstPtr &inst)
             cpi_r.roll(1);
         }
 
-        uint64_t committedInsts = totalInsts();
+        const uint64_t committedThreadInsts = thread[tid]->numInst;
 
         if (this->nextDumpInstCount && !dump_done
-                && committedInsts >= this->nextDumpInstCount) {
+                && committedThreadInsts >= this->nextDumpInstCount) {
             fprintf(stderr, "Will trigger stat dump and reset\n");
             statistics::schedStatEvent(true, true, curTick(), 0);
             scheduleInstStop(tid,0,"Will trigger stat dump and reset");
@@ -1399,7 +1393,8 @@ CPU::instDone(ThreadID tid, const DynInstPtr &inst)
         // Check for instruction-count-based events.
         thread[tid]->comInstEventQueue.serviceEvents(thread[tid]->numInst);
 
-        if (this->warmupInstCount && !warmup_done && committedInsts >= this->warmupInstCount) {
+        if (this->warmupInstCount && !warmup_done &&
+                committedThreadInsts >= this->warmupInstCount) {
             fprintf(stderr, "Will trigger stat dump and reset\n");
             statistics::schedStatEvent(true, true, curTick(), 0);
             scheduleInstStop(tid,0,"Will trigger stat dump and reset");
@@ -1740,12 +1735,13 @@ CPU::htmSendAbortSignal(ThreadID tid, uint64_t htm_uid,
 }
 
 void
-CPU::readGem5Regs()
+CPU::readGem5Regs(ThreadID tid)
 {
+    const auto &tidDiffState = this->diffAllStates[tid];
     for (int i = 0; i < 32; i++) {
-        diffAllStates->gem5RegFile[i] = readArchIntReg(i, 0);
-        diffAllStates->gem5RegFile[i + 32] = readArchFloatReg(i, 0);
-        readArchVecReg(i, (uint64_t*)&diffAllStates->gem5RegFile.vr[i], 0);
+        tidDiffState->gem5RegFile[i] = readArchIntReg(i, tid);
+        tidDiffState->gem5RegFile[i + 32] = readArchFloatReg(i, tid);
+        readArchVecReg(i, (uint64_t*)&tidDiffState->gem5RegFile.vr[i], tid);
     }
 }
 
diff --git a/src/cpu/o3/cpu.hh b/src/cpu/o3/cpu.hh
index 3a01e6cbbe..e49c00f5b0 100644
--- a/src/cpu/o3/cpu.hh
+++ b/src/cpu/o3/cpu.hh
@@ -739,7 +739,7 @@ class CPU : public BaseCPU
                             HtmFailureFaultCause cause) override;
 
     //difftest virtual function
-    void readGem5Regs() override;
+    void readGem5Regs(ThreadID tid) override;
 
   private:
     /** Value predictor */
diff --git a/src/cpu/o3/decode.cc b/src/cpu/o3/decode.cc
index 2b101f96c1..0901476f2b 100644
--- a/src/cpu/o3/decode.cc
+++ b/src/cpu/o3/decode.cc
@@ -72,6 +72,7 @@ Decode::Decode(CPU *_cpu, const BaseO3CPUParams &params)
       iewToDecodeDelay(params.iewToDecodeDelay),
       commitToDecodeDelay(params.commitToDecodeDelay),
       fetchToDecodeDelay(params.fetchToDecodeDelay),
+      decodeToFetchDelay(params.decodeToFetchDelay),
       decodeWidth(params.decodeWidth),
       numThreads(params.numThreads),
       enableLoadFusion(params.enable_loadFusion),
@@ -86,8 +87,15 @@ Decode::Decode(CPU *_cpu, const BaseO3CPUParams &params)
     for (int i=0;i<numThreads;i++) {
         fixedbuffer[i] = boost::circular_buffer<DynInstPtr>(decodeWidth);
     }
-    stallBuffer = boost::circular_buffer<DynInstPtr>(decodeWidth * (fetchToDecodeDelay + 1));
-    eachstallSize = boost::circular_buffer<int>(fetchToDecodeDelay + 1);
+    // This buffer preserves the fetch->decode pipeline contents when decode
+    // stalls while TimeBuffer keeps advancing. Its depth matches the original
+    // forward pipeline window; fetch is backpressured before full to absorb
+    // both the decode->fetch feedback delay and the request already issued in
+    // the current cycle before decode computes backpressure.
+    const auto stallGroupDepth = fetchToDecodeDelay + 1;
+    stallBuffer = boost::circular_buffer<DynInstPtr>(
+        decodeWidth * stallGroupDepth);
+    eachstallSize = boost::circular_buffer<int>(stallGroupDepth);
 
 
     decodeStalls.resize(decodeWidth, StallReason::NoStall);
@@ -130,8 +138,14 @@ Decode::DecodeStats::DecodeStats(CPU *cpu)
     : statistics::Group(cpu, "decode"),
       ADD_STAT(idleCycles, statistics::units::Cycle::get(),
                "Number of cycles decode is idle"),
+      ADD_STAT(smtidleCycles, statistics::units::Cycle::get(),
+               "Number of cycles decode is idle, per thread"),
       ADD_STAT(blockedCycles, statistics::units::Cycle::get(),
                "Number of cycles decode is blocked"),
+      ADD_STAT(smtblockedCycles, statistics::units::Cycle::get(),
+               "Number of cycles decode is blocked, per thread"),
+      ADD_STAT(smtnotactiveCycles, statistics::units::Cycle::get(),
+               "Number of cycles decode is not active, per thread"),
       ADD_STAT(runCycles, statistics::units::Cycle::get(),
                "Number of cycles decode is running"),
       ADD_STAT(unblockCycles, statistics::units::Cycle::get(),
@@ -171,6 +185,16 @@ Decode::DecodeStats::DecodeStats(CPU *cpu)
     mispredictedByPC.flags(statistics::total);
     mispredictedByNPC.flags(statistics::total);
     fusedInsts.init(128).flags(statistics::nozero);
+
+    smtidleCycles
+            .init(MaxThreads)
+            .flags(statistics::total);
+    smtblockedCycles
+            .init(MaxThreads)
+            .flags(statistics::total);
+    smtnotactiveCycles
+            .init(MaxThreads)
+            .flags(statistics::total);
 }
 
 void
@@ -373,6 +397,38 @@ Decode::updateActivate()
 void
 Decode::moveInstsToBuffer()
 {
+    auto tryMoveHeadGroupToFixedBuffer = [&]() -> bool {
+        if (stallBuffer.empty()) {
+            return false;
+        }
+
+        // stallbuffer moves to fixedbuffer in strict FIFO order.
+        ThreadID tid = stallBuffer.front()->threadNumber;
+        if (!fixedbuffer[tid].empty()) {
+            return false;
+        }
+
+        int insts_from_stall = eachstallSize.front();
+        eachstallSize.pop_front();
+        for (int i = 0; i < insts_from_stall; ++i) {
+            const DynInstPtr &inst = stallBuffer.front();
+            assert(tid == inst->threadNumber);
+            if (localSquashVer[tid].largerThan(inst->getVersion())) {
+                inst->setSquashed();
+            }
+            assert(!fixedbuffer[inst->threadNumber].full());
+            fixedbuffer[inst->threadNumber].push_back(inst);
+            stallBuffer.pop_front();
+        }
+
+        return true;
+    };
+
+    // Model one stage advance before latching the next cycle's input so a
+    // full stall buffer can still accept a new fetch bundle when its head
+    // group moves forward in the same cycle.
+    const bool moved_group = tryMoveHeadGroupToFixedBuffer();
+
     // do not support mixed thread instructions in one fetch group
     int insts_from_fetch = fromFetch->size;
     if (insts_from_fetch != 0) {
@@ -392,23 +448,12 @@ Decode::moveInstsToBuffer()
     if (stallBuffer.empty()) {
         return;
     }
-    // stallbuffer move to fixedbuffer
-    ThreadID tid = stallBuffer.front()->threadNumber;
-    if (!fixedbuffer[tid].empty())
-        return;
-    insts_from_fetch = eachstallSize.front();
-    eachstallSize.pop_front();
-    for (int i = 0; i < insts_from_fetch; ++i) {
-        const DynInstPtr &inst = stallBuffer.front();
-        assert(tid == inst->threadNumber);
-        if (localSquashVer.largerThan(inst->getVersion())) {
-            inst->setSquashed();
-        }
-        assert(!fixedbuffer[inst->threadNumber].full());
-        fixedbuffer[inst->threadNumber].push_back(inst);
-        stallBuffer.pop_front();
-    }
 
+    // If nothing advanced before latching new input, allow the current head
+    // (possibly the just-arrived group) to fill an empty stage this cycle.
+    if (!moved_group) {
+        tryMoveHeadGroupToFixedBuffer();
+    }
 }
 
 void
@@ -419,9 +464,10 @@ Decode::checkSquash()
             DPRINTF(Decode, "[tid:%i] Squashing instructions due to squash "
                     "from commit.\n", i);
             squash(i);
-            localSquashVer.update(fromCommit->commitInfo[i].squashVersion.getVersion());
+            localSquashVer[i].update(
+                fromCommit->commitInfo[i].squashVersion.getVersion());
             DPRINTF(Decode, "Updating squash version to %u\n",
-                    localSquashVer.getVersion());
+                    localSquashVer[i].getVersion());
         }
     }
 }
@@ -442,13 +488,36 @@ Decode::tick()
     // check threads stall & status
     ThreadID tid = InvalidThreadID;
     ThreadID blocked_tid = InvalidThreadID;
+    const bool fifoBackpressured =
+        !stallBuffer.empty() &&
+        eachstallSize.size() + decodeToFetchDelay + 1 >=
+            eachstallSize.capacity();
+    const ThreadID fifoHeadTid =
+        !stallBuffer.empty() ? stallBuffer.front()->threadNumber : InvalidThreadID;
+    const StallReason fifoBlockReason =
+        (fifoBackpressured && fifoHeadTid != InvalidThreadID &&
+         stallSig->blockDecode[fifoHeadTid]) ?
+            stallSig->decodeBlockReason[fifoHeadTid] :
+            (fifoBackpressured ? StallReason::OtherFragStall :
+                                 StallReason::NoStall);
     for (int i = 0; i < numThreads; i++) {
         bool block = stallSig->blockDecode[i];
         bool active = !block && !fixedbuffer[i].empty();
 
-        stallSig->blockFetch[i] = block;
+        if (block) {
+            ++stats.smtblockedCycles[i];
+        }
+
+        // Inactive: blocked, or no buffered instructions to decode.
+        if (!active) {
+            ++stats.smtnotactiveCycles[i];
+        }
+
+        stallSig->blockFetch[i] = block || fifoBackpressured;
         stallSig->fetchBlockReason[i] =
-            block ? stallSig->decodeBlockReason[i] : StallReason::NoStall;
+            stallSig->blockFetch[i] ?
+                (block ? stallSig->decodeBlockReason[i] : fifoBlockReason) :
+                StallReason::NoStall;
         toFetch->decodeInfo[i].blockReason = stallSig->fetchBlockReason[i];
         if (active) {
             if (tid == InvalidThreadID)
@@ -539,6 +608,7 @@ Decode::decodeInsts(ThreadID tid)
                 " early.\n",tid);
         // Should I change the status to idle?
         ++stats.idleCycles;
+        ++stats.smtidleCycles[tid];
 
         StallReason stall = StallReason::NoStall;
         for (auto iter : fromFetch->fetchStallReason) {
diff --git a/src/cpu/o3/decode.hh b/src/cpu/o3/decode.hh
index a510d8dd9d..f2e39b56a6 100644
--- a/src/cpu/o3/decode.hh
+++ b/src/cpu/o3/decode.hh
@@ -236,6 +236,9 @@ class Decode
     /** Fetch to decode delay. */
     Cycles fetchToDecodeDelay;
 
+    /** Decode to fetch feedback delay for stage backpressure. */
+    Cycles decodeToFetchDelay;
+
     /** The width of decode, in instructions. */
     unsigned decodeWidth;
 
@@ -256,8 +259,12 @@ class Decode
 
         /** Stat for total number of idle cycles. */
         statistics::Scalar idleCycles;
+
+        statistics::Vector smtidleCycles;
         /** Stat for total number of blocked cycles. */
         statistics::Scalar blockedCycles;
+        statistics::Vector smtblockedCycles;
+        statistics::Vector smtnotactiveCycles;
         /** Stat for total number of normal running cycles. */
         statistics::Scalar runCycles;
         /** Stat for total number of unblocking cycles. */
@@ -293,7 +300,7 @@ class Decode
 
     void setAllStalls(StallReason decodeStall);
 
-    SquashVersion localSquashVer;
+    SquashVersion localSquashVer[MaxThreads];
 };
 
 } // namespace o3
diff --git a/src/cpu/o3/fetch.cc b/src/cpu/o3/fetch.cc
index 21c9cec4e6..ff31aa9bb9 100644
--- a/src/cpu/o3/fetch.cc
+++ b/src/cpu/o3/fetch.cc
@@ -98,7 +98,6 @@ Fetch::Fetch(CPU *_cpu, const BaseO3CPUParams &params)
       fetchWidth(params.fetchWidth),
       decodeWidth(params.decodeWidth),
       retryPkt(),
-      retryTid(InvalidThreadID),
       cacheBlkSize(cpu->cacheLineSize()),
       fetchBufferSize(params.fetchBufferSize),
       fetchQueueSize(params.fetchQueueSize),
@@ -148,6 +147,8 @@ Fetch::Fetch(CPU *_cpu, const BaseO3CPUParams &params)
         threads[tid].data = new uint8_t[fetchBufferSize];
     }
 
+    initDecodeScheduler();
+
     // Get the size of an instruction.
     // stallReason size should be the same as decodeWidth,renameWidth,dispWidth
     stallReason.resize(decodeWidth, StallReason::NoStall);
@@ -203,8 +204,12 @@ Fetch::FetchStatGroup::FetchStatGroup(CPU *cpu, Fetch *fetch)
              "Number of cycles fetch has spent waiting for tlb"),
     ADD_STAT(idleCycles, statistics::units::Cycle::get(),
              "Number of cycles fetch was idle"),
+    ADD_STAT(smtidleCycles, statistics::units::Cycle::get(),
+             "Number of cycles fetch was idle per tid"),         
     ADD_STAT(blockedCycles, statistics::units::Cycle::get(),
              "Number of cycles fetch has spent blocked"),
+    ADD_STAT(smtblockedCycles, statistics::units::Cycle::get(),
+             "Number of cycles fetch has spent blocked per tid"),         
     ADD_STAT(miscStallCycles, statistics::units::Cycle::get(),
              "Number of cycles fetch has spent waiting on interrupts, or bad "
              "addresses, or out of MSHRs"),
@@ -240,6 +245,10 @@ Fetch::FetchStatGroup::FetchStatGroup(CPU *cpu, Fetch *fetch)
              "Distribution of fetch status"),
     ADD_STAT(decodeStalls, statistics::units::Count::get(),
              "Number of decode stalls"),
+    ADD_STAT(smtdecodeStalls, statistics::units::Count::get(),
+             "Number of decode stalls per tid"),  
+    ADD_STAT(smtftqempty, statistics::units::Count::get(),
+             "Number of ftq empty per tid"),                  
     ADD_STAT(decodeStallRate, statistics::units::Rate<
                     statistics::units::Count, statistics::units::Cycle>::get(),
              "Number of decode stalls per cycle",
@@ -335,6 +344,18 @@ Fetch::FetchStatGroup::FetchStatGroup(CPU *cpu, Fetch *fetch)
         }
         decodeStalls
             .prereq(decodeStalls);
+        smtdecodeStalls
+            .init(fetch->numThreads)
+            .flags(statistics::total);  
+        smtftqempty
+            .init(fetch->numThreads)
+            .flags(statistics::total);
+        smtidleCycles
+            .init(fetch->numThreads)
+            .flags(statistics::total);
+        smtblockedCycles
+            .init(fetch->numThreads)
+            .flags(statistics::total);     
         decodeStallRate
             .flags(statistics::total);
         fetchBubbles
@@ -372,6 +393,41 @@ Fetch::setTimeBuffer(TimeBuffer<TimeStruct> *time_buffer)
     fromCommit = timeBuffer->getWire(-commitToFetchDelay);
 }
 
+void
+Fetch::initDecodeScheduler()
+{
+    // Create per-thread LSQ/IQ/ROB counters, zero them, then pick a scheduler.
+    lsqCounter = new InstsCounter();
+    iqCounter  = new InstsCounter();
+    robCounter = new InstsCounter();
+    DPRINTF(Fetch, "Initialized SMT Decode Scheduler: 0\n");
+    // Zero every thread's counters (the "0"/"1" traces bracket this step).
+    for (ThreadID tid = 0; tid < numThreads; tid++) 
+    {
+        lsqCounter->setCounter(tid, 0);
+        iqCounter->setCounter(tid, 0);
+        robCounter->setCounter(tid, 0);
+    }
+    DPRINTF(Fetch, "Initialized SMT Decode Scheduler: 1\n");
+    // Build the scheduler named by smtDecodePolicy.
+    if (smtDecodePolicy == "icount") {
+        // The icount scheduler is given only the ROB counter.
+        decodeScheduler = new ICountScheduler(numThreads, robCounter);
+    }
+    else if (smtDecodePolicy == "delayed") {
+        decodeScheduler = new DelayedICountScheduler(numThreads, robCounter, delayedSchedulerDelay);
+    }
+    else if (smtDecodePolicy == "multi_priority") {
+        decodeScheduler = new MultiPrioritySched(numThreads, {lsqCounter, iqCounter, robCounter});
+    }
+    else {
+        // Unknown policy: delayed scheduler, numThreads in the delay slot.
+        decodeScheduler = new DelayedICountScheduler(numThreads, robCounter, numThreads);
+    }
+    // NOTE(review): no matching delete for these allocations is visible here.
+    DPRINTF(Fetch, "Initialized SMT Decode Scheduler: %s\n", smtDecodePolicy.c_str());
+}
+
 void
 Fetch::setActiveThreads(std::list<ThreadID> *at_ptr)
 {
@@ -423,6 +479,10 @@ Fetch::resetStage()
 {
     numInst = 0;
     interruptPending = false;
+    for (auto *pkt : retryPkt) {
+        delete pkt;
+    }
+    retryPkt.clear();
     cacheBlocked = false;
 
     priorityList.clear();
@@ -452,7 +512,9 @@ Fetch::resetStage()
     }
 
     assert(dbpbtb);
-    dbpbtb->resetPC(threads[0].fetchpc->instAddr());
+    for (ThreadID tid = 0; tid < numThreads; ++tid) {
+        dbpbtb->resetPC(tid, threads[tid].fetchpc->instAddr());
+    }
 }
 
 bool
@@ -550,8 +612,35 @@ Fetch::processMultiCacheLineCompletion(ThreadID tid, PacketPtr pkt)
         DPRINTF(Fetch, "[tid:%i] Waiting for remaining packets. Completed: %d, Total: %d\n",
                 tid, threads[tid].cacheReq.completedPackets, threads[tid].cacheReq.packets.size());
 
-        // Note: retry is handled completely by the standard gem5 recvReqRetry mechanism
-        // No need to handle retry here to avoid duplicate packet sending
+        bool waitingOnRetry = false;
+        for (const auto status : threads[tid].cacheReq.requestStatus) {
+            if (status == CacheWaitRetry) {
+                waitingOnRetry = true;
+                break;
+            }
+        }
+
+        if (waitingOnRetry && cacheBlocked && !retryPkt.empty()) {
+            PacketPtr queuedPkt = retryPkt.front();
+            const ThreadID queuedTid =
+                cpu->contextToThread(queuedPkt->req->contextId());
+            const bool sameThreadRetry = queuedTid == tid &&
+                threads[tid].cacheReq.findRequestIndex(queuedPkt->req) != SIZE_MAX;
+
+            if (sameThreadRetry && icachePort.sendTimingReq(queuedPkt)) {
+                DPRINTF(Fetch,
+                        "[tid:%i] Retrying matching queued I-cache packet %#lx "
+                        "after sibling response\n",
+                        tid, queuedPkt->req->getVaddr());
+                updateCacheRequestStatusByRequest(tid, queuedPkt->req,
+                                                  CacheWaitResponse);
+                ppFetchRequestSent->notify(queuedPkt->req);
+                retryPkt.erase(retryPkt.begin());
+                if (retryPkt.empty()) {
+                    cacheBlocked = false;
+                }
+            }
+        }
 
         return false;  // Return false to indicate we're still waiting
     }
@@ -619,8 +708,8 @@ Fetch::processCacheCompletion(PacketPtr pkt)
     }
 
     // Verify fetchBufferPC alignment with the supplying FSQ entry.
-    if (threads[tid].valid && dbpbtb->ftqHasFetching(0)) {
-        const auto &stream = dbpbtb->ftqFetchingTarget(0);
+    if (threads[tid].valid && dbpbtb->ftqHasFetching(tid)) {
+        const auto &stream = dbpbtb->ftqFetchingTarget(tid);
         if (threads[tid].startPC != stream.startPC) {
             panic("fetchBufferPC %#x should be aligned with FSQ startPC %#x",
                   threads[tid].startPC, stream.startPC);
@@ -650,7 +739,6 @@ Fetch::drainSanityCheck() const
 {
     assert(isDrained());
     assert(retryPkt.size() == 0);
-    assert(retryTid == InvalidThreadID);
     assert(!cacheBlocked);
     assert(!interruptPending);
 
@@ -756,7 +844,7 @@ Fetch::lookupAndUpdateNextPC(const DynInstPtr &inst, PCStateBase &next_pc)
     // Decoupled+BTB-only: compute next PC directly from the supplying FSQ entry.
     ThreadID tid = inst->threadNumber;
     assert(dbpbtb);
-    assert(dbpbtb->ftqHasFetching(0));
+    assert(dbpbtb->ftqHasFetching(tid));
     const auto &stream = dbpbtb->ftqFetchingTarget(tid);
 
     const Addr curr_pc = next_pc.instAddr();
@@ -902,6 +990,16 @@ Fetch::handleSuccessfulTranslation(ThreadID tid, const RequestPtr &mem_req, Addr
 
     fetchStats.cacheLines++;
 
+    if (cacheBlocked) {
+        DPRINTF(Fetch, "[tid:%i] I-cache port already waiting for retry, queueing %#lx\n",
+                tid, mem_req->getVaddr());
+
+        updateCacheRequestStatusByRequest(tid, mem_req, CacheWaitRetry);
+        setAllFetchStalls(StallReason::IcacheStall);
+        retryPkt.push_back(data_pkt);
+        return;
+    }
+
     // Access the cache.
     if (!icachePort.sendTimingReq(data_pkt)) {
         DPRINTF(Fetch, "[tid:%i] Out of MSHRs!\n", tid);
@@ -913,7 +1011,6 @@ Fetch::handleSuccessfulTranslation(ThreadID tid, const RequestPtr &mem_req, Addr
                 mem_req->getVaddr());
         setAllFetchStalls(StallReason::IcacheStall);
         retryPkt.push_back(data_pkt);
-        retryTid = tid;
         cacheBlocked = true;
     } else {
         DPRINTF(Fetch, "[tid:%i] Doing Icache access.\n", tid);
@@ -965,7 +1062,7 @@ Fetch::handleTranslationFault(ThreadID tid, const RequestPtr &mem_req, const Fau
     // We will use a nop in order to carry the fault.
     DynInstPtr instruction = buildInst(tid, nopStaticInstPtr, nullptr,
             fetch_pc, fetch_pc, false);
-    instruction->setVersion(localSquashVer);
+    instruction->setVersion(localSquashVer[tid]);
     instruction->setNotAnInst();
 
     instruction->setPredTarg(fetch_pc);
@@ -1073,15 +1170,17 @@ Fetch::doSquash(PCStateBase &new_pc, const DynInstPtr squashInst, const InstSeqN
     // Reset the cache request after cancelling
     threads[tid].cacheReq.reset();
 
-    // Get rid of the retrying packet if it was from this thread.
-    if (retryTid == tid) {
-        assert(cacheBlocked);
-        for (auto it : retryPkt) {
-            delete it;
+    // Drop any retry packets that belong to this squashed thread.
+    for (auto it = retryPkt.begin(); it != retryPkt.end();) {
+        if (cpu->contextToThread((*it)->req->contextId()) == tid) {
+            delete *it;
+            it = retryPkt.erase(it);
+        } else {
+            ++it;
         }
-        retryPkt.clear();
-        retryTid = InvalidThreadID;
-        cacheBlocked = false;   // clear cache blocked
+    }
+    if (retryPkt.empty()) {
+        cacheBlocked = false;
     }
 
     if (squashInst && !squashInst->isControl()) {
@@ -1285,6 +1384,32 @@ Fetch::handleInterrupts()
     }
 }
 
+ThreadID
+Fetch::selectUnstalledThread()
+{
+    // Refresh each thread's scheduler counters, then let the scheduler pick.
+    // Unblocked threads publish the LSQ/IQ/ROB counts received from IEW;
+    // blocked threads get UINT64_MAX (presumably to rank last - confirm).
+    // NOTE: a numThreads == 1 shortcut (return 0) is currently disabled.
+    for (ThreadID tid = 0; tid < numThreads; ++tid) {
+        if (!stallSig->blockFetch[tid]) {
+            lsqCounter->setCounter(tid, fromIEW->iewInfo[tid].ldstqCount);
+            iqCounter->setCounter(tid, fromIEW->iewInfo[tid].iqCount);
+            robCounter->setCounter(tid, fromIEW->iewInfo[tid].robCount);
+           
+        } else {
+            lsqCounter->setCounter(tid, UINT64_MAX);
+            iqCounter->setCounter(tid, UINT64_MAX);
+            robCounter->setCounter(tid, UINT64_MAX);
+            // (the trace below still prints the IEW counts, not UINT64_MAX)
+        }
+        DPRINTF(Fetch, "lsqCounter->setCounter: %d iqCounter->setCounter: %d robCounter->setCounter: %d\n",fromIEW->iewInfo[tid].ldstqCount,fromIEW->iewInfo[tid].iqCount,fromIEW->iewInfo[tid].robCount);
+    }
+
+    ThreadID selected = decodeScheduler->getThread();
+    return selected;
+}
+
 void
 Fetch::sendInstructionsToDecode()
 {
@@ -1296,9 +1421,12 @@ Fetch::sendInstructionsToDecode()
     for (int i = 0; i < numThreads; i++) {
         if (!stallSig->blockFetch[i]) {
             any_thread_active = true;
-            break;
+            //break;
+        }else{
+            fetchStats.smtdecodeStalls[i]++; 
         }
     }
+
     if (!any_thread_active) {
         // All threads are blocked, no instructions to send
         ThreadID blocked_tid = InvalidThreadID;
@@ -1321,7 +1449,8 @@ Fetch::sendInstructionsToDecode()
         return;
     }
 
-    ThreadID tid = 0; // TODO: smt support
+    ThreadID tid =selectUnstalledThread();
+    DPRINTF(Fetch, "select Unstalled [tid:%i]\n",tid);
 
     // fetch totally stalled
     if (stallSig->blockFetch[tid]) {
@@ -1407,6 +1536,7 @@ Fetch::measureFrontendBubbles(unsigned insts_to_decode, ThreadID tid)
 
     if (stallSig->blockFetch[tid]) {
         fetchStats.decodeStalls++;
+        //fetchStats.smtdecodeStalls[tid]++;
     }
 }
 
@@ -1459,35 +1589,42 @@ Fetch::handleIEWSignals()
         return;
     }
 
-    auto &incoming = fromIEW->iewInfo->resolvedCFIs;
     const bool had_pending_resolve = !resolveQueue.empty();
-    uint8_t enqueueSize = fromIEW->iewInfo->resolvedCFIs.size();
     uint8_t enqueueCount = 0;
+    uint8_t enqueueSize = 0;
+
+    for (ThreadID tid = 0; tid < numThreads; ++tid) {
+        enqueueSize += fromIEW->iewInfo[tid].resolvedCFIs.size();
+    }
 
     if (resolveQueueSize && resolveQueue.size() > resolveQueueSize - 4) {
         fetchStats.resolveQueueFullEvents++;
         fetchStats.resolveEnqueueFailEvent += enqueueSize;
     } else {
+        for (ThreadID tid = 0; tid < numThreads; ++tid) {
+            auto &incoming = fromIEW->iewInfo[tid].resolvedCFIs;
+            for (const auto &resolved : incoming) {
+                bool merged = false;
+                for (auto &queued : resolveQueue) {
+                    if (queued.resolvedTid == tid &&
+                        queued.resolvedFTQId == resolved.ftqId) {
+                        queued.resolvedInstPC.push_back(resolved.pc);
+                        merged = true;
+                        break;
+                    }
+                }
 
-        for (const auto &resolved : incoming) {
-            bool merged = false;
-            for (auto &queued : resolveQueue) {
-                if (queued.resolvedFTQId == resolved.ftqId) {
-                    queued.resolvedInstPC.push_back(resolved.pc);
-                    merged = true;
-                    break;
+                if (merged) {
+                    continue;
                 }
-            }
 
-            if (merged) {
-                continue;
+                ResolveQueueEntry new_entry;
+                new_entry.resolvedTid = tid;
+                new_entry.resolvedFTQId = resolved.ftqId;
+                new_entry.resolvedInstPC.push_back(resolved.pc);
+                resolveQueue.push_back(std::move(new_entry));
+                enqueueCount++;
             }
-
-            ResolveQueueEntry new_entry;
-            new_entry.resolvedFTQId = resolved.ftqId;
-            new_entry.resolvedInstPC.push_back(resolved.pc);
-            resolveQueue.push_back(std::move(new_entry));
-            enqueueCount++;
         }
         fetchStats.resolveEnqueueCount.sample(enqueueCount);
     }
@@ -1499,18 +1636,19 @@ Fetch::handleIEWSignals()
     // and fetch consuming them as predictor resolved updates.
     if (had_pending_resolve && !resolveQueue.empty()) {
         auto &entry = resolveQueue.front();
+        ThreadID tid = entry.resolvedTid;
         unsigned int stream_id = entry.resolvedFTQId;
-        dbpbtb->prepareResolveUpdateEntries(stream_id, 0);
+        dbpbtb->prepareResolveUpdateEntries(stream_id, tid);
         for (const auto resolvedInstPC : entry.resolvedInstPC) {
-            dbpbtb->markCFIResolved(stream_id, resolvedInstPC, 0);
+            dbpbtb->markCFIResolved(stream_id, resolvedInstPC, tid);
         }
-        bool success = dbpbtb->resolveUpdate(stream_id, 0);
+        bool success = dbpbtb->resolveUpdate(stream_id, tid);
         if (success) {
-            dbpbtb->notifyResolveSuccess();
+            dbpbtb->notifyResolveSuccess(tid);
             resolveQueue.pop_front();
             fetchStats.resolveDequeueCount++;
         } else {
-            dbpbtb->notifyResolveFailure();
+            dbpbtb->notifyResolveFailure(tid);
         }
     }
 }
@@ -1549,8 +1687,10 @@ Fetch::handleCommitSignals(ThreadID tid)
     squash(*fromCommit->commitInfo[tid].pc, squash_seq,
            squash_inst, tid);
 
-    localSquashVer.update(fromCommit->commitInfo[tid].squashVersion.getVersion());
-    DPRINTF(Fetch, "Updating squash version to %u\n", localSquashVer.getVersion());
+    localSquashVer[tid].update(
+        fromCommit->commitInfo[tid].squashVersion.getVersion());
+    DPRINTF(Fetch, "Updating squash version to %u\n",
+            localSquashVer[tid].getVersion());
 
     auto mispred_inst = fromCommit->commitInfo[tid].mispredictInst;
 
@@ -1658,8 +1798,8 @@ Fetch::buildInst(ThreadID tid, StaticInstPtr staticInst,
             instruction->isMov());
     assert(dbpbtb);
     DPRINTF(DecoupleBP, "Set instruction %lu with fetch id %lu\n",
-            instruction->seqNum, dbpbtb->ftqHeadId(0));
-    instruction->setFtqId(dbpbtb->ftqHeadId(0));
+            instruction->seqNum, dbpbtb->ftqHeadId(tid));
+    instruction->setFtqId(dbpbtb->ftqHeadId(tid));
 
 #if TRACING_ON
     if (trace) {
@@ -1734,6 +1874,7 @@ Fetch::prepareFetchAddress(ThreadID tid, bool &status_change)
     } else {
         if (fetchStatus[tid] == Idle) {
             ++fetchStats.idleCycles;
+            ++fetchStats.smtidleCycles[tid];
             DPRINTF(Fetch, "[tid:%i] Fetch is idle!\n", tid);
         }
         // Status is Idle, so fetch should do nothing.
@@ -1861,7 +2002,7 @@ Fetch::processSingleInstruction(ThreadID tid, PCStateBase &pc,
                 tid, waitForVsetvl);
     }
 
-    instruction->setVersion(localSquashVer);
+    instruction->setVersion(localSquashVer[tid]);
     ppFetch->notify(instruction);
     numInst++;
 
@@ -1996,6 +2137,7 @@ Fetch::sendNextCacheRequest(ThreadID tid, const PCStateBase &pc_state) {
     }
 
     if (ftqEmpty(tid)) {
+        ++fetchStats.smtftqempty[tid];
         DPRINTF(Fetch, "[tid:%i] No FSQ entry available for next fetch\n", tid);
         return;
     }
@@ -2003,8 +2145,22 @@ Fetch::sendNextCacheRequest(ThreadID tid, const PCStateBase &pc_state) {
     assert(dbpbtb);
     const auto &stream = dbpbtb->ftqFetchingTarget(tid);
     const Addr start_pc = stream.startPC;
+    const Addr current_pc = pc_state.instAddr();
     threads[tid].startPC = start_pc;
 
+    if (current_pc < stream.startPC ||
+        current_pc >= stream.predEndPC) {
+        auto &reset_pc = threads[tid].fetchpc->as<RiscvISA::PCState>();
+        reset_pc.pc(stream.startPC);
+        reset_pc.npc(stream.startPC + 4);
+        reset_pc.uReset();
+        DPRINTF(Fetch,
+                "[tid:%i] Resetting fetch PC to new FTQ stream start %s "
+                "(previous PC %#lx outside [%#lx, %#lx))\n",
+                tid, *threads[tid].fetchpc, current_pc,
+                stream.startPC, stream.predEndPC);
+    }
+
     DPRINTF(Fetch, "[tid:%i] Issuing a pipelined I-cache access for new FSQ entry, "
                   "starting at PC %#x (endPC %#x; original PC %s)\n",
             tid, start_pc, stream.predEndPC, pc_state);
@@ -2014,36 +2170,32 @@ Fetch::sendNextCacheRequest(ThreadID tid, const PCStateBase &pc_state) {
 void
 Fetch::recvReqRetry()
 {
-    if (retryPkt.size() == 0) {
-        assert(retryTid == InvalidThreadID);
+    if (retryPkt.empty()) {
         // Access has been squashed since it was sent out.  Just clear
         // the cache being blocked.
         cacheBlocked = false;
         return;
     }
     assert(cacheBlocked);
-    assert(retryTid != InvalidThreadID);
-    // Note: In multi-cacheline fetch, overall status may not be CacheWaitRetry
-    // if some requests have progressed while others still need retry.
-    // The presence of retryPkt itself indicates retry is needed.
+    retryPendingIcacheRequests();
+}
 
-    for (auto it = retryPkt.begin(); it != retryPkt.end();) {
-        if (icachePort.sendTimingReq(*it)) {
-            // Use new cache state management with specific RequestPtr
-            updateCacheRequestStatusByRequest(retryTid, (*it)->req, CacheWaitResponse);
-            // Notify Fetch Request probe when a retryPkt is successfully sent.
-            // Note that notify must be called before retryPkt is set to NULL.
-            ppFetchRequestSent->notify((*it)->req);
-            it = retryPkt.erase(it);
-        } else {
-            it++;
+void
+Fetch::retryPendingIcacheRequests()
+{
+    while (!retryPkt.empty()) {  // resend strictly in queue order
+        PacketPtr pkt = retryPkt.front();
+        if (!icachePort.sendTimingReq(pkt)) {
+            return;  // rejected again: pkt stays queued, flag untouched
         }
-    }
 
-    if (retryPkt.size() == 0) {
-        retryTid = InvalidThreadID;
-        cacheBlocked = false;
+        const ThreadID tid = cpu->contextToThread(pkt->req->contextId());
+        updateCacheRequestStatusByRequest(tid, pkt->req, CacheWaitResponse);
+        ppFetchRequestSent->notify(pkt->req);
+        retryPkt.erase(retryPkt.begin());
     }
+    // Reached only once every queued packet was accepted by the port.
+    cacheBlocked = false;
 }
 
 void
@@ -2058,6 +2210,7 @@ Fetch::profileStall(ThreadID tid)
         DPRINTF(Fetch, "Fetch has no active thread!\n");
     } else if (fetchStatus[tid] == Blocked) {
         ++fetchStats.blockedCycles;
+        ++fetchStats.smtblockedCycles[tid];
         DPRINTF(Fetch, "[tid:%i] Fetch is blocked!\n", tid);
     } else if (fetchStatus[tid] == Squashing) {
         ++fetchStats.squashCycles;
diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh
index 19091ef30e..18e6159022 100644
--- a/src/cpu/o3/fetch.hh
+++ b/src/cpu/o3/fetch.hh
@@ -65,6 +65,7 @@
 #include "mem/port.hh"
 #include "sim/eventq.hh"
 #include "sim/probe/probe.hh"
+#include "cpu/o3/smt_sched.hh"
 
 namespace gem5
 {
@@ -233,6 +234,18 @@ class Fetch
     /** To probe when a fetch request is successfully sent. */
     ProbePointArg<RequestPtr> *ppFetchRequestSent;
 
+    /** SMT decode scheduler; allocated by initDecodeScheduler(). */
+    SMTScheduler* decodeScheduler = nullptr;
+
+    /** Per-thread LSQ / IQ / ROB counters handed to the scheduler. */
+    InstsCounter* lsqCounter = nullptr;
+    InstsCounter* iqCounter = nullptr;
+    InstsCounter* robCounter = nullptr;
+    // NOTE(review): no assignment to delayedSchedulerDelay is visible; 1 is
+    // a placeholder so the "delayed" policy never reads an indeterminate int.
+    std::string smtDecodePolicy = "multi_priority";
+    int delayedSchedulerDelay = 1;
+
   public:
     /** Fetch constructor. */
     Fetch(CPU *_cpu, const BaseO3CPUParams &params);
@@ -299,10 +312,19 @@ class Fetch
 
     /** For priority-based fetch policies, need to keep update priorityList */
     void deactivateThread(ThreadID tid);
+
+    // Function to initialize scheduler
+    void initDecodeScheduler();
+
+    // Select a thread that is not fetch-blocked, using scheduler
+    ThreadID selectUnstalledThread();
   private:
     /** Reset this pipeline stage */
     void resetStage();
 
+    /** Retry queued I-cache packets once, stopping at the first new block. */
+    void retryPendingIcacheRequests();
+
     /** Changes the status of this stage to active, and indicates this
      * to the CPU.
      */
@@ -657,12 +679,9 @@ class Fetch
     /** Is the cache blocked?  If so no threads can access it. */
     bool cacheBlocked;
 
-    /** The packet that is waiting to be retried. */
+    /** Packets waiting for the next cache-issued retry callback. */
     std::vector<PacketPtr> retryPkt;
 
-    /** The thread that is waiting on the cache to tell fetch to retry. */
-    ThreadID retryTid;
-
     /** Cache block size. */
     unsigned int cacheBlkSize;
 
@@ -1035,8 +1054,12 @@ class Fetch
          * the pipeline.
          */
         statistics::Scalar idleCycles;
+
+        statistics::Vector smtidleCycles;
         /** Total number of cycles spent blocked. */
         statistics::Scalar blockedCycles;
+
+        statistics::Vector smtblockedCycles;
         /** Total number of cycles spent in any other state. */
         statistics::Scalar miscStallCycles;
         /** Total number of cycles spent in waiting for drains. */
@@ -1072,6 +1095,10 @@ class Fetch
         statistics::Vector fetchStatusDist;
         /** Number of decode stalls */
         statistics::Scalar decodeStalls;
+
+        statistics::Vector smtdecodeStalls;
+
+        statistics::Vector smtftqempty;
         /** Number of decode stalls per cycle */
         statistics::Formula decodeStallRate;
         /** Unutilized issue-pipeline slots while there is no backend-stall */
@@ -1107,7 +1134,7 @@ class Fetch
         statistics::Scalar traceMetaCleanupCommitCalls;
     } fetchStats;
 
-    SquashVersion localSquashVer;
+    SquashVersion localSquashVer[MaxThreads];
 
 public:
     const FetchStatGroup &getFetchStats() { return fetchStats; }
diff --git a/src/cpu/o3/iew.cc b/src/cpu/o3/iew.cc
index c4ffd4cb50..2d4cdb34c3 100644
--- a/src/cpu/o3/iew.cc
+++ b/src/cpu/o3/iew.cc
@@ -525,7 +525,7 @@ IEW::squash(ThreadID tid)
 
     for (auto& dp : dispQue) {
         for (auto& it : dp) {
-            if (it->seqNum > fromCommit->commitInfo[tid].doneSeqNum) {
+            if (it->seqNum > fromCommit->commitInfo[tid].doneSeqNum && (it->threadNumber == tid)) {
                 it->setSquashed();
             }
         }
@@ -799,13 +799,14 @@ IEW::checkSquash()
     for (int i = 0; i < numThreads; i++) {
         if (fromCommit->commitInfo[i].squash) {
             squash(i);
-            localSquashVer.update(fromCommit->commitInfo[i].squashVersion.getVersion());
-            DPRINTF(IEW, "Updating squash version to %u\n", localSquashVer.getVersion());
+            localSquashVer[i].update(
+                fromCommit->commitInfo[i].squashVersion.getVersion());
+            DPRINTF(IEW, "Updating squash version to %u\n",
+                    localSquashVer[i].getVersion());
 
             fetchRedirect[i] = false;
             iewStats.stallEvents[ROBWalk]++;
             setAllStalls(StallReason::CommitSquash);
-            return;
         }
 
         if (fromCommit->commitInfo[i].robSquashing) {
@@ -831,7 +832,7 @@ IEW::moveInstsToBuffer()
     for (int i = 0; i < insts_from_rename; ++i) {
         const DynInstPtr &inst = fromRename->insts[i];
         assert(inst->threadNumber == tid);
-        if (localSquashVer.largerThan(inst->getVersion())) {
+        if (localSquashVer[tid].largerThan(inst->getVersion())) {
             inst->setSquashed();
         } else {
             fixedbuffer[tid].push_back(inst);
@@ -935,9 +936,9 @@ IEW::dispatchInsts()
 
         toRename->iewInfo[tid].robHeadStallReason = checkDispatchStall(tid, NumDQ, nullptr, -1);
         toRename->iewInfo[tid].lqHeadStallReason =
-            ldstQueue.lqEmpty() ? StallReason::NoStall : checkLSQStall(tid, true);
+            ldstQueue.lqEmpty(tid) ? StallReason::NoStall : checkLSQStall(tid, true);
         toRename->iewInfo[tid].sqHeadStallReason =
-            ldstQueue.sqEmpty() ? StallReason::NoStall : checkLSQStall(tid, false);
+            ldstQueue.sqEmpty(tid) ? StallReason::NoStall : checkLSQStall(tid, false);
         toRename->iewInfo[tid].blockReason = blockReason;
     }
 }
@@ -1523,6 +1524,9 @@ IEW::executeInsts()
     while (threads != end) {
         ThreadID tid = *threads++;
         fetchRedirect[tid] = false;
+        toFetch->iewInfo[tid].ldstqCount=ldstQueue.getCount(tid);
+        toFetch->iewInfo[tid].robCount= rob->getThreadEntries(tid);
+        toFetch->iewInfo[tid].iqCount= scheduler->getIQInsts(tid);
     }
 
     // Uncomment this if you want to see all available instructions.
@@ -1533,6 +1537,7 @@ IEW::executeInsts()
     ThreadID tid = *activeThreads->begin();
     toFetch->iewInfo[tid].resolvedCFIs.clear();
 
+    
     // Execute/writeback any instructions that are available.
     int insts_to_execute = fromIssue->size;
     fromIssue->size = 0;
@@ -1548,6 +1553,11 @@ IEW::executeInsts()
         // executing
         ppExecute->notify(inst);
 
+        if (inst->isSplitStoreData() &&
+            ldstQueue.splitStoreAddrSquashed(inst)) {
+            inst->setSquashed();
+        }
+
         // Check if the instruction is squashed; if so then skip it
         if (inst->isSquashed()) {
             DPRINTF(IEW, "Execute: Instruction was squashed. PC: %s, [tid:%i]"
@@ -1682,8 +1692,8 @@ IEW::writebackInsts()
         DynInstPtr inst = toCommit->insts[inst_num];
         ThreadID tid = inst->threadNumber;
 
-        if (inst->savedRequest && inst->isLoad()) {
-            inst->pf_source = inst->savedRequest->mainReq()->getPFSource();
+        if (inst->isLoad()) {
+            inst->pf_source = ldstQueue.getLoadPFSource(inst);
         }
 
         DPRINTF(IEW, "Sending instructions to commit, [sn:%lli] PC %s.\n",
diff --git a/src/cpu/o3/iew.hh b/src/cpu/o3/iew.hh
index e63e7aff11..94dd9a0835 100644
--- a/src/cpu/o3/iew.hh
+++ b/src/cpu/o3/iew.hh
@@ -260,6 +260,10 @@ class IEW
      * the store queue or the store buffer to write back to.
      */
     bool flushStores(ThreadID tid) { return ldstQueue.flushStores(tid); }
+    bool flushStores(ThreadID tid, InstSeqNum seq_num)
+    {
+        return ldstQueue.flushStores(tid, seq_num);
+    }
 
     /** Check if we need to squash after a load/store/branch is executed. */
     void SquashCheckAfterExe(DynInstPtr inst);
@@ -405,7 +409,7 @@ class IEW
     /** Scoreboard pointer. */
     Scoreboard* scoreboard;
 
-    SquashVersion localSquashVer{0};
+    SquashVersion localSquashVer[MaxThreads];
 
     /** Value predictor */
     valuepred::VPUnit *valuePred;
diff --git a/src/cpu/o3/inst_queue.cc b/src/cpu/o3/inst_queue.cc
index 89a027c3b1..db8ec407f4 100644
--- a/src/cpu/o3/inst_queue.cc
+++ b/src/cpu/o3/inst_queue.cc
@@ -53,6 +53,7 @@
 #include "cpu/o3/dyn_inst.hh"
 #include "cpu/o3/dyn_inst_ptr.hh"
 #include "cpu/o3/fu_pool.hh"
+#include "cpu/o3/iew.hh"
 #include "cpu/o3/issue_queue.hh"
 #include "cpu/o3/limits.hh"
 #include "debug/IQ.hh"
@@ -151,7 +152,8 @@ InstructionQueue::InstructionQueue(CPU *cpu_ptr, IEW *iew_ptr,
     scheduler->setCPU(cpu_ptr, &iew_ptr->ldstQueue);
     scheduler->resetDepGraph(numPhysRegs);
     scheduler->setMemDepUnit(memDepUnit);
-
+    scheduler->initIQICountSmtScheduler(numThreads);
+    
     resetState();
 }
 
@@ -757,7 +759,7 @@ InstructionQueue::commit(const InstSeqNum &inst, ThreadID tid)
 {
     DPRINTF(IQ, "[tid:%i] Committing instructions older than [sn:%llu]\n",
             tid,inst);
-    scheduler->doCommit(inst);
+    scheduler->doCommit(inst, tid);
 }
 
 int
@@ -1121,7 +1123,9 @@ InstructionQueue::doSquash(ThreadID tid)
 
     DPRINTF(IQ, "[tid:%i] Squashing until sequence number %i!\n",
             tid, squashedSeqNum[tid]);
-    scheduler->doSquash(squashedSeqNum[tid]);
+    squashInfo.squashTid = tid;
+    squashInfo.squashSn  = squashedSeqNum[tid];
+    scheduler->doSquash(squashInfo);
 
     for (auto it = mdpAddrReplayLdInsts.begin(); it != mdpAddrReplayLdInsts.end();) {
         if (!it->inst ||
@@ -1134,7 +1138,7 @@ InstructionQueue::doSquash(ThreadID tid)
     }
 
     for (auto it = nonSpecInsts.begin(); it != nonSpecInsts.end();) {
-        if (it->first > squashedSeqNum[tid]) {
+        if (it->first > squashedSeqNum[tid]  && (it->second->threadNumber == tid)) {
             auto& squashed_inst = it->second;
             if (!squashed_inst->isIssued() ||
                 (squashed_inst->isMemRef() &&
diff --git a/src/cpu/o3/inst_queue.hh b/src/cpu/o3/inst_queue.hh
index db01710da9..f163ebb28e 100644
--- a/src/cpu/o3/inst_queue.hh
+++ b/src/cpu/o3/inst_queue.hh
@@ -427,6 +427,7 @@ class InstructionQueue
 
     /** The sequence number of the squashed instruction. */
     InstSeqNum squashedSeqNum[MaxThreads];
+    SquashInfo    squashInfo;
 
     struct IQStats : public statistics::Group
     {
diff --git a/src/cpu/o3/issue_queue.cc b/src/cpu/o3/issue_queue.cc
index 1017698961..87a3e39dd7 100644
--- a/src/cpu/o3/issue_queue.cc
+++ b/src/cpu/o3/issue_queue.cc
@@ -140,6 +140,58 @@ PAgeSelector::select(ReadyQue::iterator begin, int portid)
     }
 }
 
+void
+SMTBasedSelector::setparent(Scheduler* scheduler, IssueQue* iq)
+{
+    BaseSelector::setparent(scheduler, iq);
+    // Cache the queue's ICount scheduler; nullptr until the queue creates one.
+    smtScheduler = iq->getIndependentIQICountScheduler();
+}
+
+// Choose the ready instruction to issue, favouring the thread picked by the
+// queue's ICount scheduler. `begin` is the first candidate; the scan stops at
+// `end` (not declared in this function; presumably a BaseSelector member).
+// `portid` is not consulted by this policy. Returns `end` if the range is
+// empty, else the first instruction of the priority thread, else `begin`.
+ReadyQue::iterator
+SMTBasedSelector::select(ReadyQue::iterator begin, int portid)
+{
+    if (begin == end) {
+        return end;
+    }
+
+    // Thread 0 is the priority thread when no SMT scheduler is attached.
+    ThreadID priorityThread = 0;
+
+    if (smtScheduler) {
+        priorityThread = smtScheduler->getThread();
+        DPRINTF(Schedule,
+            "SMTBasedSelector: priority thread = %d\n",
+            priorityThread);
+    }
+
+    // First choice: the first ready instruction of the priority thread.
+    for (auto it = begin; it != end; ++it) {
+        const auto& inst = *it;
+        if (inst->threadNumber == priorityThread) {
+            DPRINTF(Schedule,
+                "[sn:%llu] selected by SMT policy (tid=%d)\n",
+                inst->seqNum, priorityThread);
+            return it;
+        }
+    }
+
+    // Reaching here means no candidate belongs to the priority thread, so
+    // the head of the non-empty range is already an instruction of another
+    // thread. Searching for "any other thread" would stop on that first
+    // element, and a "nothing available" outcome cannot occur; return it
+    // directly.
+    DPRINTF(Schedule,
+        "[sn:%llu] selected by default (tid=%d, priority=%d)\n",
+        (*begin)->seqNum, (*begin)->threadNumber, priorityThread);
+    return begin;
+}
+
 bool
 IssueQue::select_policy::operator()(const DynInstPtr& a, const DynInstPtr& b) const
 {
@@ -301,6 +353,9 @@ IssueQue::IssueQue(const IssueQueParams& params)
         if (storePipeAcc)
             numStorePipe++;
     }
+
+    //Init InstsCounter
+    instsCounter = new InstsCounter();
 }
 
 void
@@ -327,7 +382,9 @@ IssueQue::checkScoreboard(const DynInstPtr& inst)
         }
         // check bypass data ready or not
         if (!scheduler->bypassScoreboard[src->flatIndex()]) [[unlikely]] {
-            auto dst_inst = scheduler->getInstByDstReg(src->flatIndex());
+            auto dst_inst = scheduler->getInstByDstReg(src->flatIndex(),
+                                                       inst->threadNumber,
+                                                       inst->seqNum);
             assert(dst_inst);
             if (!dst_inst->isLoad()) panic("dst[sn:%llu] is not load, src[sn:%llu]", dst_inst->seqNum, inst->seqNum);
             warn_once(
@@ -350,6 +407,9 @@ IssueQue::addToFu(const DynInstPtr& inst)
     }
     inst->setIssued();
     POPINST(inst);
+    if (hasInstsCounter()) {
+        decInIQInstsCounter(inst->threadNumber);
+    }
     scheduler->addToFU(inst);
 }
 
@@ -489,14 +549,16 @@ IssueQue::wakeUpDependents(const DynInstPtr& inst, bool speculative)
         for (auto& it : depgraph) {
             int srcIdx = it.first;
             auto& consumer = it.second;
-            if (consumer->readySrcIdx(srcIdx)) {
-                continue;
-            }
-            consumer->markSrcRegReady(srcIdx);
+            if(consumer->threadNumber == inst->threadNumber){
+                if (consumer->readySrcIdx(srcIdx)) {
+                    continue;
+                }
+                consumer->markSrcRegReady(srcIdx);
 
 
-            DPRINTF(Schedule, "[sn:%llu] src%d was woken\n", consumer->seqNum, srcIdx);
-            addIfReady(consumer);
+                DPRINTF(Schedule, "[sn:%llu] src%d was woken\n", consumer->seqNum, srcIdx);
+                addIfReady(consumer);
+            }
         }
 
         if (!speculative) {
@@ -697,6 +759,9 @@ IssueQue::insert(const DynInstPtr& inst)
     selector->allocate(inst);
     inst->issueQue = this;
     instList.emplace_back(inst);
+    if (hasInstsCounter()) {
+        incInIQInstsCounter(inst->threadNumber);
+    }
     bool addToDepGraph = false;
     for (int i = 0; i < inst->numSrcRegs(); i++) {
         auto src = inst->renamedSrcIdx(i);
@@ -743,20 +808,28 @@ IssueQue::insertNonSpec(const DynInstPtr& inst)
 }
 
 void
-IssueQue::doCommit(const InstSeqNum seqNum)
+IssueQue::doCommit(const InstSeqNum seqNum, ThreadID tid)
 {
-    while (!instList.empty() && instList.front()->seqNum <= seqNum) {
-        assert(instList.front()->isIssued());
-        instList.pop_front();
+    for (auto it = instList.begin(); it != instList.end();) {
+        // Leave entries of other threads, or younger than seqNum, in place.
+        if ((*it)->threadNumber != tid || (*it)->seqNum > seqNum) {
+            ++it;
+            continue;
+        }
+        assert((*it)->isIssued());
+        it = instList.erase(it);
     }
 }
 
 void
-IssueQue::doSquash(const InstSeqNum seqNum)
+IssueQue::doSquash(SquashInfo squashInfo)
 {
     for (auto it = instList.begin(); it != instList.end();) {
-        if ((*it)->seqNum > seqNum) {
+        if (((*it)->seqNum > squashInfo.squashSn) && ((*it)->threadNumber == squashInfo.squashTid)) {
             if (!(*it)->isIssued()) {
+                if (hasInstsCounter()) {
+                    decInIQInstsCounter((*it)->threadNumber);
+                }
                 POPINST((*it));
                 (*it)->setIssued();
             }
@@ -779,7 +852,7 @@ IssueQue::doSquash(const InstSeqNum seqNum)
         int size = inflightIssues[-i].size;
         for (int j = 0; j < size; j++) {
             auto& inst = inflightIssues[-i].insts[j];
-            if (inst && inst->isSquashed()) {
+            if (inst && inst->isSquashed() && (inst->threadNumber == squashInfo.squashTid)) {
                 inst = nullptr;
             }
         }
@@ -788,7 +861,7 @@ IssueQue::doSquash(const InstSeqNum seqNum)
     // clear in depGraph
     for (auto& entrys : subDepGraph) {
         for (auto it = entrys.begin(); it != entrys.end();) {
-            if ((*it).second->isSquashed()) {
+            if ((*it).second->isSquashed() && ((*it).second->threadNumber == squashInfo.squashTid)) {
                 it = entrys.erase(it);
             } else {
                 it++;
@@ -797,6 +870,33 @@ IssueQue::doSquash(const InstSeqNum seqNum)
     }
 }
 
+void
+IssueQue::incInIQInstsCounter(ThreadID tid)
+{
+    // No-op when no per-thread counter is attached to this queue.
+    if (!instsCounter) return;
+    instsCounter->incCounter(tid);
+}
+    
+void
+IssueQue::decInIQInstsCounter(ThreadID tid)
+{
+    // No-op when no per-thread counter is attached to this queue.
+    if (!instsCounter) return;
+    instsCounter->decCounter(tid);
+}
+
+void
+IssueQue::initIndependentIQICountScheduler(int numThreads)
+{
+    // The scheduler is built from this queue's counter, so it must exist.
+    assert(instsCounter != nullptr && "InstsCounter must be set first");
+
+    independentIQICountScheduler =
+        new IndependentIQICountScheduler(numThreads, instsCounter);
+    DPRINTF(Schedule, "[%s] IndependentIQICountScheduler created.\n",iqname);
+}
+
 Scheduler::SpecWakeupCompletion::SpecWakeupCompletion(const DynInstPtr& inst, IssueQue* to,
                                                       PendingWakeEventsType* owner)
     : Event(Stat_Event_Pri, AutoDelete), inst(inst), owner(owner), to_issue_queue(to)
@@ -1143,18 +1243,28 @@ Scheduler::ready(OpClass op, int disp_seq)
 }
 
 DynInstPtr
-Scheduler::getInstByDstReg(RegIndex flatIdx)
+Scheduler::getInstByDstReg(RegIndex flatIdx, ThreadID tid,
+                           InstSeqNum consumerSeqNum)
 {
+    DynInstPtr candidate = nullptr;
+    // Find the youngest writer of flatIdx in thread tid older than the consumer.
     for (auto iq : issueQues) {
-        for (auto& inst : iq->instList) {
-            for (auto i = 0; i < inst->numDestRegs(); i++) {
-                if (inst->renamedDestIdx(i)->flatIndex() == flatIdx) {
-                    return inst;
+        for (auto &inst : iq->instList) {
+            if (inst->threadNumber != tid || inst->seqNum >= consumerSeqNum) {
+                continue;
+            }
+            for (int i = 0; i < inst->numDestRegs(); i++) {
+                if (inst->renamedDestIdx(i)->flatIndex() != flatIdx) {
+                    continue;
+                }
+                if (!candidate || inst->seqNum > candidate->seqNum) {
+                    candidate = inst;
                 }
             }
         }
     }
-    return nullptr;
+    // Still nullptr when no instruction in any queue's instList matched.
+    return candidate;
 }
 
 void
@@ -1394,12 +1504,14 @@ Scheduler::loadCancel(const DynInstPtr& inst)
                 for (auto& it : iq->subDepGraph[dst->flatIndex()]) {
                     int srcIdx = it.first;
                     auto& depInst = it.second;
-                    if (depInst->readySrcIdx(srcIdx)) {
-                        DPRINTF(Schedule, "cancel [sn:%llu], clear src p%d ready\n", depInst->seqNum,
-                                depInst->renamedSrcIdx(srcIdx)->flatIndex());
-                        depInst->issueQue->cancel(depInst);
-                        depInst->clearSrcRegReady(srcIdx);
-                        dfs.push(depInst);
+                    if(depInst->threadNumber == inst->threadNumber){
+                        if (depInst->readySrcIdx(srcIdx)) {
+                            DPRINTF(Schedule, "cancel [sn:%llu], clear src p%d ready\n", depInst->seqNum,
+                                    depInst->renamedSrcIdx(srcIdx)->flatIndex());
+                            depInst->issueQue->cancel(depInst);
+                            depInst->clearSrcRegReady(srcIdx);
+                            dfs.push(depInst);
+                        }
                     }
                 }
             }
@@ -1512,19 +1624,19 @@ Scheduler::isDrained()
 }
 
 void
-Scheduler::doCommit(const InstSeqNum seqNum)
+Scheduler::doCommit(const InstSeqNum seqNum, ThreadID tid)
 {
     for (auto it : issueQues) {
-        it->doCommit(seqNum);
+        it->doCommit(seqNum, tid);  // each queue erases only tid's entries up to seqNum
     }
 }
 
 void
-Scheduler::doSquash(const InstSeqNum seqNum)
+Scheduler::doSquash(SquashInfo squashInfo)
 {
-    DPRINTF(Schedule, "doSquash until seqNum %lu\n", seqNum);
+    DPRINTF(Schedule, "doSquash until seqNum %lu\n", squashInfo.squashSn);
     for (auto it : issueQues) {
-        it->doSquash(seqNum);
+        it->doSquash(squashInfo);  // each queue filters on squashInfo.squashTid
     }
 }
 
@@ -1538,6 +1650,17 @@ Scheduler::getIQInsts()
     return total;
 }
 
+// Sum, over all issue queues, of the per-thread counter value for `tid`.
+uint32_t
+Scheduler::getIQInsts(ThreadID tid)
+{
+    uint32_t total = 0;
+    for (auto iq : issueQues) {
+        total += iq->getInstsCounter()->getCounter(tid);
+    }
+    return total;
+}
+
 void
 Scheduler::setMainRdpOpt(bool enable)
 {
@@ -1546,5 +1669,19 @@ Scheduler::setMainRdpOpt(bool enable)
     }
 }
 
+void
+Scheduler::initIQICountSmtScheduler(int numThreads)
+{
+    DPRINTF(Schedule, "Initializing IQ SMT schedulers for %d thread.\n", numThreads);
+    // TODO: add a switch; add SMTSchedulingPolicy
+    for (auto iq : issueQues) {
+        // Checked inline so no local is left unused when asserts compile out.
+        assert(iq->getInstsCounter());
+        iq->initIndependentIQICountScheduler(numThreads);
+        // Re-bind so an SMTBasedSelector picks up the scheduler just created.
+        iq->selector->setparent(this, iq);
+    }
+}
+
 }
 }
diff --git a/src/cpu/o3/issue_queue.hh b/src/cpu/o3/issue_queue.hh
index f804595b54..bade5f78f8 100644
--- a/src/cpu/o3/issue_queue.hh
+++ b/src/cpu/o3/issue_queue.hh
@@ -16,12 +16,14 @@
 #include "cpu/inst_seq.hh"
 #include "cpu/o3/dyn_inst.hh"
 #include "cpu/o3/dyn_inst_ptr.hh"
+#include "cpu/o3/smt_sched.hh"
 #include "cpu/reg_class.hh"
 #include "cpu/timebuf.hh"
 #include "params/BaseSelector.hh"
 #include "params/IssuePort.hh"
 #include "params/IssueQue.hh"
 #include "params/PAgeSelector.hh"
+#include "params/SMTBasedSelector.hh"
 #include "params/Scheduler.hh"
 #include "params/SpecWakeupChannel.hh"
 #include "sim/sim_object.hh"
@@ -99,11 +101,25 @@ class PAgeSelector : public BaseSelector
     ReadyQue::iterator select(ReadyQue::iterator begin, int portid) override;
 };
 
+/** Selector that favours the thread chosen by the queue's ICount scheduler. */
+class SMTBasedSelector : public BaseSelector
+{
+    IndependentIQICountScheduler* smtScheduler = nullptr;
+  public:
+    SMTBasedSelector(const SMTBasedSelectorParams& params) : BaseSelector(params) {}
+    void setparent(Scheduler* scheduler, IssueQue* iq) override;
+    void allocate(const DynInstPtr& inst) override { BaseSelector::allocate(inst); }
+    void deallocate(const DynInstPtr& inst) override { BaseSelector::deallocate(inst); }
+    ReadyQue::iterator select(ReadyQue::iterator begin, int portid) override;
+};
+
 class IssueQue : public SimObject
 {
     friend class Scheduler;
     friend class BaseSelector;
     friend class PAgeSelector;
+    friend class InstsCounter;
+    friend class IndependentIQICountScheduler;
 
     std::string _name;
     const int inports;
@@ -171,6 +187,10 @@ class IssueQue : public SimObject
     Scheduler* scheduler = nullptr;
     BaseSelector* selector = nullptr;
 
+    //iq smt scheduler
+    InstsCounter* instsCounter = nullptr;
+    IndependentIQICountScheduler* independentIQICountScheduler = nullptr;
+
     struct IssueQueStats : public statistics::Group
     {
         IssueQueStats(statistics::Group* parent, IssueQue* que, std::string name);
@@ -206,6 +226,21 @@ class IssueQue : public SimObject
     void setMainRdpOpt(bool enable) { enableMainRdpOpt = enable; }
     void resetDepGraph(int numPhysRegs);
 
+    void setInstsCounter(InstsCounter* counter) { instsCounter = counter;}
+    // May return nullptr; hasInstsCounter() reports whether one is attached.
+    InstsCounter* getInstsCounter() const {return instsCounter; }
+    // Adjust the per-thread count; both do nothing without a counter.
+    void incInIQInstsCounter(ThreadID tid);
+    void decInIQInstsCounter(ThreadID tid);
+    bool hasInstsCounter() const { return instsCounter != nullptr; }
+    // Creates the ICount scheduler from instsCounter (asserted non-null).
+    void initIndependentIQICountScheduler(int numThreads);
+    // Plain accessors for the scheduler pointer (nullptr by default).
+    void setIndependentIQICountScheduler( IndependentIQICountScheduler* _independentIQICountScheduler ) {
+      independentIQICountScheduler = _independentIQICountScheduler;
+    }
+    IndependentIQICountScheduler* getIndependentIQICountScheduler() { return independentIQICountScheduler; }
+
     void tick();
     bool ready();
     int emptyEntries() const { return iqsize - instNum; }
@@ -217,8 +252,8 @@ class IssueQue : public SimObject
     void retryMem(const DynInstPtr& inst);
     bool idle();
 
-    void doCommit(const InstSeqNum inst);
-    void doSquash(const InstSeqNum seqNum);
+    void doCommit(const InstSeqNum inst, ThreadID tid);
+    void doSquash(SquashInfo squashInfo);
 
     int getIssueStages() { return scheduleToExecDelay; }
     int getId() { return IQID; }
@@ -329,12 +364,14 @@ class Scheduler : public SimObject
     void setAllScoreBoard(PhysRegIdPtr reg);
     void setMemDepUnit(MemDepUnit* memDepUnit) { this->memDepUnit = memDepUnit; }
     void setMainRdpOpt(bool enable);
+    void initIQICountSmtScheduler(int numThreads);
 
     void tick();
     void issueAndSelect();
     void lookahead(std::deque<DynInstPtr>& insts);
     bool ready(const DynInstPtr& inst, int disp_seq);
-    DynInstPtr getInstByDstReg(RegIndex flatIdx);
+    DynInstPtr getInstByDstReg(RegIndex flatIdx, ThreadID tid,
+                               InstSeqNum consumerSeqNum);
 
     void addProducer(const DynInstPtr& inst);
     // return true if insert successful
@@ -356,9 +393,10 @@ class Scheduler : public SimObject
     uint32_t getCorrectedOpLat(const DynInstPtr& inst);
     bool hasReadyInsts();
     bool isDrained();
-    void doCommit(const InstSeqNum seqNum);
-    void doSquash(const InstSeqNum seqNum);
+    void doCommit(const InstSeqNum seqNum, ThreadID tid);
+    void doSquash(SquashInfo squashInfo);
     uint32_t getIQInsts();
+    uint32_t getIQInsts(ThreadID tid);
 
     SchedulerStats& getStats() { return stats; }
 };
diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc
index 759f974cbc..1c18cf33b0 100644
--- a/src/cpu/o3/lsq.cc
+++ b/src/cpu/o3/lsq.cc
@@ -89,23 +89,96 @@ LSQ::DcachePort::DcachePort(LSQ *_lsq, CPU *_cpu) :
 
 std::list<LSQ::SingleDataRequest*> LSQ::SingleDataRequest::singleList;
 
+namespace
+{
+
+// Entry-level forwarding check: a same-thread entry must be older than the
+// load; another thread's entry needs a non-zero generation that is visible.
+bool
+storeBufferEntryEligibleForLoad(const LSQ::StoreBufferEntry *entry,
+                                ThreadID load_tid, InstSeqNum load_seq,
+                                uint64_t visible_generation)
+{
+    if (!entry) {
+        return false;
+    }
+    const bool same_thread = entry->tid == load_tid;
+    return same_thread
+        ? (entry->seqNum < load_seq)
+        : (entry->generation != 0 && entry->generation <= visible_generation);
+}
+
+// Byte-level forwarding check. A same-thread entry forwards when it is older
+// than the load. Another thread's entry forwards a byte only while `sending`
+// is set and that byte's generation is non-zero and <= visible_generation.
+bool
+storeBufferByteEligibleForLoad(const LSQ::StoreBufferEntry *entry,
+                               size_t byte_idx, ThreadID load_tid,
+                               InstSeqNum load_seq,
+                               uint64_t visible_generation)
+{
+    if (!entry) {
+        return false;
+    }
+
+    if (entry->tid == load_tid) {
+        return entry->seqNum < load_seq;
+    }
+    if (!entry->sending || byte_idx >= entry->byteGenerations.size()) {
+        return false;
+    }
+    const auto byte_gen = entry->byteGenerations[byte_idx];
+    return byte_gen != 0 && byte_gen <= visible_generation;
+}
+
+// Largest generation among `entry` and its vice that passes the entry-level
+// eligibility check for this load; 0 when entry is null or none passes.
+uint64_t
+storeBufferEligibleGeneration(const LSQ::StoreBufferEntry *entry,
+                              ThreadID load_tid, InstSeqNum load_seq,
+                              uint64_t visible_generation)
+{
+    if (!entry) {
+        return 0;
+    }
+
+    // Both checks run unconditionally; the helper rejects a null vice.
+    const bool main_ok = storeBufferEntryEligibleForLoad(
+        entry, load_tid, load_seq, visible_generation);
+    const bool vice_ok = storeBufferEntryEligibleForLoad(
+        entry->vice, load_tid, load_seq, visible_generation);
+    const uint64_t main_gen = main_ok ? entry->generation : 0;
+    const uint64_t vice_gen = vice_ok ? entry->vice->generation : 0;
+    return std::max(main_gen, vice_gen);
+}
+
+} // anonymous namespace
+
 void
-LSQ::StoreBufferEntry::reset(ThreadID tid, uint64_t block_vaddr, uint64_t block_paddr,
+LSQ::StoreBufferEntry::reset(ThreadID tid, InstSeqNum seq_num,
+                             uint64_t block_vaddr, uint64_t block_paddr,
                              uint64_t offset, uint8_t *datas, uint64_t size,
-                             const std::vector<bool> &mask)
+                             const std::vector<bool> &mask,
+                             uint64_t generation)
 {
     std::fill(validMask.begin(), validMask.begin() + offset, false);
+    std::fill(byteGenerations.begin(), byteGenerations.end(), 0);
 
     for (int i = 0; i < size; i++) {
         validMask[offset + i] = mask[i];
+        if (mask[i]) {
+            byteGenerations[offset + i] = generation;
+        }
     }
 
     std::fill(validMask.begin() + offset + size, validMask.end(), false);
     memcpy(blockDatas.data() + offset, datas, size);
 
     this->tid = tid;
+    this->seqNum = seq_num;
     this->blockVaddr = block_vaddr;
     this->blockPaddr = block_paddr;
+    this->generation = generation;
     this->sending = false;
     this->request = nullptr;
     this->vice = nullptr;
@@ -113,19 +186,23 @@ LSQ::StoreBufferEntry::reset(ThreadID tid, uint64_t block_vaddr, uint64_t block_
 
 void
 LSQ::StoreBufferEntry::merge(uint64_t offset, uint8_t *datas, uint64_t size,
-                             const std::vector<bool> &mask)
+                             const std::vector<bool> &mask,
+                             uint64_t generation)
 {
     assert(offset + size <= validMask.size());
     for (uint64_t i = 0; i < size; ++i) {
         if (mask[i]) {
             blockDatas[offset + i] = datas[i];
             validMask[offset + i] = true;
+            byteGenerations[offset + i] = generation;  // stamp only the bytes this merge writes
         }
     }
 }
 
 bool
-LSQ::StoreBufferEntry::recordForward(RequestPtr req, LSQRequest *lsqreq)
+LSQ::StoreBufferEntry::recordForward(RequestPtr req, LSQRequest *lsqreq,
+                                     ThreadID load_tid, InstSeqNum load_seq,
+                                     uint64_t visible_generation)
 {
     int offset = req->getPaddr() & (validMask.size() - 1);
     // the offset in the split request
@@ -136,13 +213,21 @@ LSQ::StoreBufferEntry::recordForward(RequestPtr req, LSQRequest *lsqreq)
     bool full_forward = true;
     for (int i = 0; i < req->getSize(); i++) {
         assert(goffset + i < lsqreq->_size);
-        if (vice && vice->validMask[offset + i]) {
+        const bool vice_eligible =
+            vice && vice->validMask[offset + i] &&
+            storeBufferByteEligibleForLoad(vice, offset + i, load_tid,
+                                           load_seq, visible_generation);
+        const bool self_eligible =
+            validMask[offset + i] &&
+            storeBufferByteEligibleForLoad(this, offset + i, load_tid,
+                                           load_seq, visible_generation);
+        if (vice_eligible) {
             // vice is newer
             assert(vice->blockVaddr == blockVaddr);
             lsqreq->SBforwardPackets.push_back(
                 LSQRequest::FWDPacket{
                     .idx = goffset + i, .byte = vice->blockDatas[offset + i]});
-        } else if (validMask[offset + i]) {
+        } else if (self_eligible) {
             lsqreq->SBforwardPackets.push_back(
                 LSQRequest::FWDPacket{
                     .idx = goffset + i, .byte = blockDatas[offset + i]});
@@ -182,6 +267,40 @@ LSQ::StoreBuffer::size() const
     return _size;
 }
 
+// Number of valid store buffer entries owned by thread `tid`. Slots whose
+// valid bit is clear, or whose entry pointer is null, are not counted.
+uint64_t
+LSQ::StoreBuffer::size(ThreadID tid) const
+{
+    uint64_t owned = 0;
+
+    for (size_t slot = 0; slot < data_vec.size(); ++slot) {
+        const auto *entry = data_vld[slot] ? data_vec[slot] : nullptr;
+        if (entry && entry->tid == tid) {
+            ++owned;
+        }
+    }
+
+    return owned;
+}
+
+// Number of valid store buffer entries owned by thread `tid` whose seqNum is
+// strictly below seq_num.
+uint64_t
+LSQ::StoreBuffer::size(ThreadID tid, InstSeqNum seq_num) const
+{
+    uint64_t older = 0;
+
+    for (size_t slot = 0; slot < data_vec.size(); ++slot) {
+        const auto *entry = data_vld[slot] ? data_vec[slot] : nullptr;
+        if (entry && entry->tid == tid && entry->seqNum < seq_num) {
+            ++older;
+        }
+    }
+
+    return older;
+}
+
 uint64_t
 LSQ::StoreBuffer::unsentSize() const
 {
@@ -243,6 +362,47 @@ LSQ::StoreBuffer::getEvict()
     return data_vec[index];
 }
 
+LSQ::StoreBufferEntry *
+LSQ::StoreBuffer::getEvict(const bool *eligible_tids, size_t num_threads)
+{
+    return getEvict(eligible_tids, nullptr, num_threads);  // nullptr: no per-thread seqNum bound
+}
+
+// Return the first entry, scanning lru_index from its back, whose owner is
+// flagged in eligible_tids (if given) and whose seqNum is below the owner's
+// eligible_seq bound (if given and not all-ones); nullptr if none qualifies.
+// With both arrays null this defers to the unfiltered getEvict().
+LSQ::StoreBufferEntry *
+LSQ::StoreBuffer::getEvict(const bool *eligible_tids,
+                           const InstSeqNum *eligible_seq,
+                           size_t num_threads)
+{
+    if (eligible_tids == nullptr && eligible_seq == nullptr) {
+        return getEvict();
+    }
+    const InstSeqNum no_bound = static_cast<InstSeqNum>(-1);
+    for (auto it = lru_index.rbegin(); it != lru_index.rend(); ++it) {
+        auto *entry = data_vec[*it];
+        // The owner must be a valid index into the caller's arrays; the
+        // sign is tested explicitly rather than via unsigned conversion.
+        if (!entry || entry->tid < 0 ||
+            static_cast<size_t>(entry->tid) >= num_threads) {
+            continue;
+        }
+        const ThreadID tid = entry->tid;
+        if ((eligible_tids && !eligible_tids[tid]) ||
+            (eligible_seq && eligible_seq[tid] != no_bound &&
+             entry->seqNum >= eligible_seq[tid])) {
+            continue;
+        }
+        // Drop the slot via the reverse iterator; no second search needed.
+        auto victim = it.base();
+        lru_index.erase(--victim);
+        return entry;
+    }
+    return nullptr;
+}
+
 LSQ::StoreBufferEntry *
 LSQ::StoreBuffer::createVice(StoreBufferEntry *entry)
 {
@@ -368,6 +528,7 @@ LSQ::LSQ(CPU *cpu_ptr, IEW *iew_ptr, const BaseO3CPUParams &params)
             params.StoreCompletionWidth);
         thread[tid].init(cpu, iew_ptr, params, this, tid);
         thread[tid].setDcachePort(&dcachePort);
+        _storeBufferFlushing[tid] = false;
     }
 
     std::vector<StoreBufferEntry *> store_buffer_entries;
@@ -637,6 +798,14 @@ LSQ::insertStore(const DynInstPtr &store_inst)
     thread[tid].insertStore(store_inst);
 }
 
+bool
+LSQ::splitStoreAddrSquashed(const DynInstPtr &inst)
+{
+    // Dispatch to the LSQ unit of the thread that owns the instruction.
+    auto &owner_unit = thread[inst->threadNumber];
+    return owner_unit.splitStoreAddrSquashed(inst);
+}
+
 void
 LSQ::issueToLoadPipe(const DynInstPtr &inst)
 {
@@ -705,18 +874,26 @@ LSQ::processWriteback()
 
 
     if (storeBufferBlocked()) {
-        // dont offload store to sbuffer when sbuffer is flushing
         DPRINTF(StoreBuffer, "Store buffer is blocking, skip SQ offload\n");
         return;
     }
+
     std::vector<uint32_t> offload_quota(numThreads, 0);
     std::vector<uint32_t> offload_demand(numThreads, 0);
     std::vector<ThreadID> requester_tids;
     requester_tids.reserve(activeThreads->size());
+
     for (ThreadID tid : *activeThreads) {
         offload_demand[tid] = thread[tid].countStoreBufferOffloadableEntries(
             maxStoreBufferEntriesAcceptedFromSQPerCycle);
-        if (offload_demand[tid] != 0) {
+        // During a global sbuffer flush, only threads that requested the
+        // flush may keep draining older committed stores from their SQ.
+        // If both SMT threads are flushing simultaneously, both must still be
+        // allowed to make forward progress, otherwise they can deadlock while
+        // waiting on each other's flush bit.
+        const bool conti =
+            !storeBufferFlushing() || storeBufferFlushing(tid);
+        if (conti && offload_demand[tid] != 0) {
             requester_tids.push_back(tid);
         }
     }
@@ -760,17 +937,23 @@ LSQ::processWriteback()
         ThreadID tid = *threads++;
         thread[tid].offloadToStoreBuffer(offload_quota[tid]);
     }
+
+    // A fence/flush only waits for the requesting thread's sbuffer domain.
+    for (ThreadID tid = 0; tid < numThreads; ++tid) {
+        if (!storeBufferFlushing(tid) ||
+            !storeBufferEmpty(tid, _storeBufferFlushBeforeSeq[tid])) {
+            continue;
+        }
+
+        clearStoreBufferFlushing(tid);
+        cpu->activityThisCycle();
+    }
 }
 
 void
 LSQ::storeBufferWriteback()
 {
     bool can_evict = true;
-    if (storeBufferFlushing() && storeBuffer.size() == 0) [[unlikely]] {
-        assert(storeBuffer.unsentSize() == 0);
-        clearStoreBufferFlushing();
-        cpu->activityThisCycle();
-    }
 
     // write request will stall one cycle
     // so 2 cycle send one write request
@@ -810,12 +993,23 @@ LSQ::storeBufferWriteback()
         }
 
         if (cause) {
-            StoreBufferEntry *entry = storeBuffer.getEvict();
+            StoreBufferEntry *entry = nullptr;
+            if (*cause == StoreBufferEvictCause::Flush) {
+                entry = storeBuffer.getEvict(
+                    _storeBufferFlushing, _storeBufferFlushBeforeSeq,
+                    numThreads);
+            } else {
+                entry = storeBuffer.getEvict();
+            }
+            if (!entry) {
+                /* Disabled with the broad sbuffer watchdog above. */
+                return;
+            }
+            /* Disabled with the broad sbuffer watchdog above. */
             auto &owner_unit = thread[entry->tid];
             recordStoreBufferEviction(*cause);
             DPRINTF(StoreBuffer, "Evicting sbuffer entry[%#x]\n",
                     entry->blockPaddr);
-
             if (debug::StoreBuffer) {
                 DPRINTFR(StoreBuffer, "Dumping sbuffer entry data\n");
                 for (int i = 0; i < owner_unit.cacheLineSize(); i++) {
@@ -901,6 +1095,20 @@ void
 LSQ::completeSbufferEvict(PacketPtr pkt)
 {
     auto request = dynamic_cast<SbufferRequest *>(pkt->senderState);
+    const Addr block_paddr = request->sbuffer_entry->blockPaddr;
+    invalidateOtherThreadStoreBufferBytes(request->sbuffer_entry->tid,
+                                          request->mainReq()->getPaddr(),
+                                          request->mainReq()->getByteEnable(),
+                                          request->sbuffer_entry->generation);
+    markStoreBufferBlockVisible(block_paddr,
+                                request->sbuffer_entry->generation);
+    const bool replay_executed_loads =
+        cpu->consumeSyncVisibleStoreReplay(request->sbuffer_entry->tid);
+    notifyOtherThreadsStoreVisible(request->sbuffer_entry->tid,
+                                   request->mainReq()->getPaddr(),
+                                   request->mainReq()->getByteEnable(),
+                                   request->sbuffer_entry->seqNum,
+                                   replay_executed_loads);
     if (cpu->goldenMemManager() &&
         cpu->goldenMemManager()->inPmem(request->mainReq()->getPaddr())) {
         Addr paddr = request->mainReq()->getPaddr();
@@ -912,6 +1120,7 @@ LSQ::completeSbufferEvict(PacketPtr pkt)
     }
 
     storeBuffer.release(request->sbuffer_entry);
+    reclaimStoreBufferBlockMetadata(block_paddr);
     DPRINTF(StoreBuffer,
             "finish entry[%#x] evict to cache, sbuffer size: %d, "
             "unsentsize: %d\n",
@@ -1074,7 +1283,6 @@ LSQ::recvTimingResp(PacketPtr pkt)
     LSQRequest *request = dynamic_cast<LSQRequest*>(pkt->senderState);
     panic_if(!request, "Got packet back with unknown sender state\n");
 
-
     thread[request->_port.lsqID].recvTimingResp(pkt);
 
     if (pkt->isInvalidate()) {
@@ -1337,6 +1545,12 @@ LSQ::lqEmpty() const
     return true;
 }
 
+bool
+LSQ::lqEmpty(ThreadID tid) const
+{
+    return thread[tid].lqEmpty();  // asks only thread tid's LSQ unit
+}
+
 bool
 LSQ::sqEmpty() const
 {
@@ -1353,6 +1567,12 @@ LSQ::sqEmpty() const
     return true;
 }
 
+bool
+LSQ::sqEmpty(ThreadID tid) const
+{
+    return thread[tid].sqEmpty();  // asks only thread tid's LSQ unit
+}
+
 bool
 LSQ::lqFull()
 {
@@ -1419,6 +1639,29 @@ LSQ::getLSQHeadInst(ThreadID tid, bool isLoad)
     }
 }
 
+// Best-effort lookup of the prefetch source recorded on a load's request.
+// Returns -1 when inst is null, is not a load, has a negative lqIdx, or has
+// no request / no sub-request left to query.
+int
+LSQ::getLoadPFSource(const DynInstPtr &inst) const
+{
+    const bool is_tracked_load = inst && inst->isLoad() && inst->lqIdx >= 0;
+    if (!is_tracked_load) {
+        return -1;
+    }
+
+    const auto &lq_entry = thread[inst->threadNumber].loadQueue[inst->lqIdx];
+    auto *lsq_req = lq_entry.request();
+
+    // A load can retire through a split request or after replay/discard has
+    // detached some request state, so only query a sub-request that exists.
+    if (!lsq_req || lsq_req->numReqs() == 0) {
+        return -1;
+    }
+
+    return lsq_req->req()->getPFSource();
+}
+
 bool
 LSQ::isStalled()
 {
@@ -1466,12 +1709,245 @@ LSQ::hasStoresToWB(ThreadID tid)
     return thread.at(tid).hasStoresToWB();
 }
 
-bool LSQ::flushStores(ThreadID tid)
+bool
+LSQ::hasStoresToWBBefore(ThreadID tid, InstSeqNum seq_num)
+{
+    return thread.at(tid).hasStoresToWBBefore(seq_num);  // ask tid's LSQUnit
+}
+
+bool
+LSQ::flushStores(ThreadID tid)
+{
+    // No caller-supplied bound: use the largest InstSeqNum as the limit.
+    constexpr InstSeqNum no_bound = static_cast<InstSeqNum>(-1);
+    _storeBufferFlushing[tid] = true;
+    _storeBufferFlushBeforeSeq[tid] = no_bound;
+
+    const bool sq_pending = hasStoresToWB(tid);
+    const bool sbuffer_drained = storeBufferEmpty(tid, no_bound);
+    const bool flushed = !sq_pending && sbuffer_drained;
+    if (flushed)
+        clearStoreBufferFlushing(tid);
+    return flushed;
+}
+
+bool
+LSQ::flushStores(ThreadID tid, InstSeqNum seq_num)
+{
+    // Record the flush request and its sequence bound for this thread.
+    _storeBufferFlushing[tid] = true;
+    _storeBufferFlushBeforeSeq[tid] = seq_num;
+
+    const bool sq_pending = hasStoresToWBBefore(tid, seq_num);
+    const bool sbuffer_drained = storeBufferEmpty(tid, seq_num);
+    const bool flushed = !sq_pending && sbuffer_drained;
+    if (flushed)
+        clearStoreBufferFlushing(tid);  // nothing left to wait for
+    return flushed;
+}
+
+void
+LSQ::requestGlobalStoreBufferFlush()
 {
-    _storeBufferFlushing = true;
-    // TODO：high performance shared SMT storebuffer flushing
-    bool t = !hasStoresToWB(tid) && storeBufferEmpty();
-    return t;
+    for (ThreadID tid = 0; tid < numThreads; ++tid) {  // active or not
+        _storeBufferFlushing[tid] = true;
+        _storeBufferFlushBeforeSeq[tid] = static_cast<InstSeqNum>(-1);
+    }
+}
+
+bool
+LSQ::storeBufferHasConflict(ThreadID tid, Addr block_paddr) const
+{
+    // A conflict is any store-buffer entry for this block that is held
+    // under a thread id other than `tid`.
+    ThreadID owner = 0;
+    while (owner < numThreads) {
+        const bool foreign = owner != tid;
+        if (foreign && storeBuffer.get(owner, block_paddr)) {
+            return true;
+        }
+        ++owner;
+    }
+    return false;
+}
+
+uint64_t
+LSQ::bumpStoreBufferBlockVersion(Addr block_paddr)
+{
+    // operator[] creates the counter at 0 on first use, so the first bump
+    // yields 1. Version 0 is skipped on wrap-around as well.
+    uint64_t &counter = storeBufferBlockVersion[block_paddr];
+    if (++counter == 0)
+        counter = 1;
+    return counter;
+}
+
+uint64_t
+LSQ::currentStoreBufferBlockVersion(Addr block_paddr) const
+{
+    const auto hit = storeBufferBlockVersion.find(block_paddr);  // 0 if none
+    return hit != storeBufferBlockVersion.end() ? hit->second : 0;
+}
+
+void
+LSQ::markStoreBufferBlockVisible(Addr block_paddr, uint64_t generation)
+{
+    uint64_t &seen = storeBufferVisibleVersion[block_paddr];
+    seen = seen < generation ? generation : seen;  // never moves backwards
+    reclaimStoreBufferBlockMetadata(block_paddr);  // may erase the record
+}
+
+uint64_t
+LSQ::currentStoreBufferVisibleVersion(Addr block_paddr) const
+{
+    const auto hit = storeBufferVisibleVersion.find(block_paddr);  // 0 if none
+    return hit != storeBufferVisibleVersion.end() ? hit->second : 0;
+}
+
+LSQ::StoreBufferEntry *
+LSQ::findForwardingStoreBufferEntry(Addr block_paddr, ThreadID load_tid,
+                                    InstSeqNum load_seq) const
+{
+    // Scan every thread's entry for this block and keep the one with the
+    // highest eligible generation. Entries whose eligible generation is 0
+    // are skipped; on a tie the lowest thread id wins.
+    const uint64_t visible_generation =
+        currentStoreBufferVisibleVersion(block_paddr);
+    StoreBufferEntry *winner = nullptr;
+    uint64_t winner_generation = 0;
+
+    for (ThreadID tid = 0; tid < numThreads; ++tid) {
+        StoreBufferEntry *candidate = storeBuffer.get(tid, block_paddr);
+        if (candidate == nullptr) {
+            continue;
+        }
+
+        const uint64_t generation = storeBufferEligibleGeneration(
+            candidate, load_tid, load_seq, visible_generation);
+        const bool eligible = generation != 0;
+        const bool better = !winner || generation > winner_generation;
+        if (eligible && better) {
+            winner = candidate;
+            winner_generation = generation;
+        }
+    }
+
+    return winner;
+}
+
+bool
+LSQ::hasLiveStoreBufferBlock(Addr block_paddr) const
+{
+    // Live means: some thread still has a store-buffer entry for the block.
+    bool live = false;
+    for (ThreadID tid = 0; !live && tid < numThreads; ++tid) {
+        live = storeBuffer.get(tid, block_paddr) != nullptr;
+    }
+    return live;
+}
+
+void
+LSQ::reclaimStoreBufferBlockMetadata(Addr block_paddr)
+{
+    // Nothing is reclaimed while any thread still holds an entry.
+    if (hasLiveStoreBufferBlock(block_paddr)) {
+        return;
+    }
+
+    const auto version_it = storeBufferBlockVersion.find(block_paddr);
+    const auto visible_it = storeBufferVisibleVersion.find(block_paddr);
+    const bool has_visible = visible_it != storeBufferVisibleVersion.end();
+
+    if (version_it != storeBufferBlockVersion.end()) {
+        // Keep both records until the visible generation has caught up
+        // with the block's current version.
+        const uint64_t visible_generation =
+            has_visible ? visible_it->second : 0;
+        if (visible_generation < version_it->second) {
+            return;
+        }
+        storeBufferBlockVersion.erase(version_it);
+    }
+    if (has_visible)
+        storeBufferVisibleVersion.erase(visible_it);
+}
+
+void
+LSQ::invalidateOtherThreadStoreBufferBytes(
+    ThreadID tid, Addr paddr, const std::vector<bool> &mask,
+    uint64_t generation)
+{
+    // Split the address into its cache-block base and in-block offset.
+    const Addr line_size = static_cast<Addr>(cpu->cacheLineSize());
+    const Addr offset = paddr & (line_size - 1);
+    const Addr block_paddr = paddr - offset;
+
+    // Clears the valid bit of every masked byte whose recorded generation
+    // is non-zero and not newer than `generation`. Only entries that are
+    // currently `sending` and large enough for the masked span are touched.
+    auto drop_stale_bytes = [&](StoreBufferEntry *target) {
+        const bool usable = target && target->sending &&
+            offset + mask.size() <= target->validMask.size();
+        if (!usable) {
+            return;
+        }
+        for (size_t byte = 0; byte < mask.size(); ++byte) {
+            if (!mask[byte]) {
+                continue;
+            }
+            const uint64_t byte_generation =
+                target->byteGenerations[offset + byte];
+            if (byte_generation != 0 && byte_generation <= generation) {
+                target->validMask[offset + byte] = false;
+            }
+        }
+    };
+
+    for (ThreadID other_tid = 0; other_tid < numThreads; ++other_tid) {
+        StoreBufferEntry *primary = other_tid == tid
+            ? nullptr : storeBuffer.get(other_tid, block_paddr);
+        if (primary == nullptr) {
+            continue;
+        }
+        drop_stale_bytes(primary);
+        drop_stale_bytes(primary->vice);  // paired same-address entry, if any
+    }
+}
+
+void
+LSQ::notifyOtherThreadsStoreVisible(ThreadID tid, Addr store_paddr,
+                                    const std::vector<bool> &byte_enable,
+                                    InstSeqNum store_seq,
+                                    bool replay_executed_loads)
+{
+    if (numThreads <= 1) {  // single-threaded: no other thread to notify
+        return;
+    }
+
+    Request::Flags flags;
+    const Addr cache_block_mask =
+        ~((static_cast<Addr>(cpu->cacheLineSize())) - 1);
+    RequestPtr req = std::make_shared<Request>(
+        store_paddr & cache_block_mask, cpu->cacheLineSize(), flags,
+        cpu->dataRequestorId());
+    Packet pkt(req, MemCmd::InvalidateReq);  // block-sized, local to this call
+
+    for (ThreadID context_id = 0; context_id < numThreads; ++context_id) {
+        gem5::ThreadContext *tc = cpu->getContext(context_id);  // tid too
+        bool no_squash = cpu->thread[context_id]->noSquashFromTC;
+        cpu->thread[context_id]->noSquashFromTC = true;
+        tc->getIsaPtr()->handleLockedSnoop(&pkt, cache_block_mask);
+        cpu->thread[context_id]->noSquashFromTC = no_squash;  // restore
+    }
+
+    for (ThreadID other_tid = 0; other_tid < numThreads; ++other_tid) {
+        if (other_tid == tid) {  // the storing thread is not re-checked
+            continue;
+        }
+        thread[other_tid].checkLocalStoreVisible(store_paddr, byte_enable,
+                                                 store_seq,
+                                                 replay_executed_loads);
+    }
 }
 
 int
@@ -1529,6 +2005,48 @@ LSQ::dumpInsts(ThreadID tid) const
     thread.at(tid).dumpInsts();
 }
 
+void
+LSQ::dumpStoreBufferState(ThreadID tid, InstSeqNum seq_num) const
+{
+    cprintf("Store buffer state for tid %i:\n", tid);
+    cprintf("  flushing=%d flushBeforeSeq=%llu\n",
+            _storeBufferFlushing[tid],
+            static_cast<unsigned long long>(_storeBufferFlushBeforeSeq[tid]));
+    const auto &unit = thread.at(tid);  // per-thread LSQ unit
+    cprintf("  storesToWB=%d hasStoresToWBBefore=%d\n",
+            unit.numStoresToSbuffer(), unit.hasStoresToWBBefore(seq_num));
+    cprintf("  sbufferSize(tid)=%llu sbufferSizeBeforeSeq=%llu\n",
+            static_cast<unsigned long long>(storeBuffer.size(tid)),
+            static_cast<unsigned long long>(storeBuffer.size(tid, seq_num)));
+}
+
+void
+LSQ::dumpStoreBuffer(ThreadID tid) const
+{
+    // Debug aid: print one line per valid store-buffer slot whose entry
+    // belongs to `tid`.
+    cprintf("Store buffer entries for tid %i:\n", tid);
+    const auto &slots = storeBuffer.entries();
+    for (size_t slot = 0; slot < slots.size(); ++slot) {
+        // The slot pointer is only read once the valid bit is confirmed.
+        auto *entry = storeBuffer.valid(slot) ? slots[slot] : nullptr;
+        const bool mine = entry != nullptr && entry->tid == tid;
+        if (!mine) {
+            continue;
+        }
+
+        cprintf("  idx:%d seq:%llu paddr:%#lx vaddr:%#lx sending=%d vice=%d generation=%llu request=%p\n",
+                entry->index,
+                static_cast<unsigned long long>(entry->seqNum),
+                entry->blockPaddr,
+                entry->blockVaddr,
+                entry->sending,
+                entry->vice != nullptr,
+                static_cast<unsigned long long>(entry->generation),
+                entry->request);
+    }
+}
+
 bool
 LSQ::isMisaligned(const DynInstPtr& inst, Addr vaddr, int size)
 {
@@ -1820,6 +2338,12 @@ LSQ::SplitDataRequest::mainReq()
     return _mainReq;
 }
 
+RequestPtr
+LSQ::SplitDataRequest::mainReq() const
+{
+    return _mainReq;  // const overload; same member as the non-const one
+}
+
 void
 LSQ::SplitDataRequest::initiateTranslation()
 {
@@ -2028,14 +2552,47 @@ LSQ::LSQRequest::forward()
     }
 }
 
-LSQ::LSQRequest::~LSQRequest()
+void
+LSQ::LSQRequest::detachLSQEntry()
 {
-    if (isAnyOutstandingRequest()) {
-        warn("numInTranslationFragments = %u, _numOutstandingPackets = %u\n",
-             numInTranslationFragments, _numOutstandingPackets);
-        std::raise(SIGINT);
+    if (!_inst) {  // without an instruction there is no slot index to check
+        return;
     }
+
+    if (isLoad() && _inst->lqIdx >= 0 &&
+        _port.loadQueue[_inst->lqIdx].request() == this) {  // slot still ours
+        DPRINTF(LSQ, "inst [sn:%llu] Detach LSQRequest from LQ entry\n",
+                _inst->seqNum);
+        _port.loadQueue[_inst->lqIdx].setRequest(nullptr);
+    } else if ((isAtomic() || _inst->isStore()) && _inst->sqIdx >= 0 &&
+               _port.storeQueue[_inst->sqIdx].request() == this) {  // ditto
+        DPRINTF(LSQ, "inst [sn:%llu] Detach LSQRequest from SQ entry\n",
+                _inst->seqNum);
+        _port.storeQueue[_inst->sqIdx].setRequest(nullptr);
+    }
+}
+
+void
+LSQ::LSQRequest::detachInflightLoad()
+{
+    // Non-load requests are never searched for in the tracker.
+    if (isLoad()) {
+        auto &tracker = _port.inflightLoads;
+        auto pos = std::find(tracker.begin(), tracker.end(), this);
+        if (pos != tracker.end()) {
+            DPRINTF(LSQ,
+                    "inst [sn:%llu] Detach LSQRequest from inflightLoads\n",
+                    _inst ? _inst->seqNum : 0);
+            tracker.erase(pos);  // first match only
+        }
+    }
+}
+
+LSQ::LSQRequest::~LSQRequest()
+{
     assert(!isAnyOutstandingRequest());
+    detachLSQEntry();
+    detachInflightLoad();
     if (_inst && _inst->savedRequest == this) {
         DPRINTF(LSQ, "inst [sn:%llu] Deleting LSQRequest, savedRequest\n", _inst->seqNum);
          _inst->savedRequest = nullptr;
@@ -2125,7 +2682,6 @@ LSQ::SingleDataRequest::recvTimingResp(PacketPtr pkt)
                     mainReq()->isUncacheable(), cacheHit, *((uint64_t*)buffer));
     }
 
-
     if (isLoad()) {
         auto it = std::find(lsqUnit()->inflightLoads.begin(), lsqUnit()->inflightLoads.end(), this);
         if (it != lsqUnit()->inflightLoads.end()) {
diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh
index 788ff0ae29..6ebbe9d5dd 100644
--- a/src/cpu/o3/lsq.hh
+++ b/src/cpu/o3/lsq.hh
@@ -64,6 +64,7 @@
 #include "cpu/inst_seq.hh"
 #include "cpu/o3/dyn_inst_ptr.hh"
 #include "cpu/o3/dyn_inst_xsmeta.hh"
+#include "cpu/o3/limits.hh"
 #include "cpu/utils.hh"
 #include "enums/SMTQueuePolicy.hh"
 #include "mem/packet.hh"
@@ -146,10 +147,13 @@ class LSQ
       public:
         const int index;
         ThreadID tid;
+        InstSeqNum seqNum = 0;
         Addr blockVaddr;
         Addr blockPaddr;
         std::vector<uint8_t> blockDatas;
         std::vector<bool> validMask;
+        std::vector<uint64_t> byteGenerations;
+        uint64_t generation = 0;
         bool sending;
         // the another same addr entry when sending
         // another cannot sending until self sending finished
@@ -161,16 +165,20 @@ class LSQ
         {
             blockDatas.resize(size, 0);
             validMask.resize(size, false);
+            byteGenerations.resize(size, 0);
         }
 
-        void reset(ThreadID tid, uint64_t block_vaddr, uint64_t block_paddr,
-                   uint64_t offset, uint8_t *datas, uint64_t size,
-                   const std::vector<bool> &mask);
+        void reset(ThreadID tid, InstSeqNum seq_num, uint64_t block_vaddr,
+                   uint64_t block_paddr, uint64_t offset, uint8_t *datas,
+                   uint64_t size, const std::vector<bool> &mask,
+                   uint64_t generation);
 
         void merge(uint64_t offset, uint8_t *datas, uint64_t size,
-                   const std::vector<bool> &mask);
+                   const std::vector<bool> &mask, uint64_t generation);
 
-        bool recordForward(RequestPtr req, LSQRequest *lsqreq);
+        bool recordForward(RequestPtr req, LSQRequest *lsqreq,
+                           ThreadID load_tid, InstSeqNum load_seq,
+                           uint64_t visible_generation);
     };
 
     class StoreBuffer
@@ -197,12 +205,21 @@ class LSQ
         void setData(std::vector<StoreBufferEntry *> &data_vec);
         bool full() const;
         uint64_t size() const;
+        uint64_t size(ThreadID tid) const;
+        uint64_t size(ThreadID tid, InstSeqNum seq_num) const;
         uint64_t unsentSize() const;
+        const std::vector<StoreBufferEntry *> &entries() const { return data_vec; }
+        bool valid(size_t index) const { return data_vld.at(index); }
         StoreBufferEntry *getEmpty();
         void insert(StoreBufferEntry *entry);
         StoreBufferEntry *get(ThreadID tid, uint64_t addr) const;
         void update(int index);
         StoreBufferEntry *getEvict();
+        StoreBufferEntry *getEvict(const bool *eligible_tids,
+                                   size_t num_threads);
+        StoreBufferEntry *getEvict(const bool *eligible_tids,
+                                   const InstSeqNum *eligible_seq,
+                                   size_t num_threads);
         StoreBufferEntry *createVice(StoreBufferEntry *entry);
         void release(StoreBufferEntry *entry);
     };
@@ -350,6 +367,8 @@ class LSQ
         AtomicOpFunctorPtr _amo_op;
         bool _hasStaleTranslation;
         bool _sbufferBypass;
+        bool _goldenSnapshotCaptured = false;
+        uint64_t _storeBufferGeneration = 0;
 
         struct FWDPacket
         {
@@ -370,6 +389,14 @@ class LSQ
         /** Install the request in the LQ/SQ. */
         void install();
 
+        /** If the request is still installed in the current LQ/SQ slot,
+         * detach that slot so later scans do not observe a discarded or
+         * deleted request through the queue entry. */
+        void detachLSQEntry();
+
+        /** Remove the request from the in-flight load tracker if present. */
+        void detachInflightLoad();
+
         bool squashed() const override;
 
 
@@ -476,6 +503,7 @@ class LSQ
 
         RequestPtr req(int idx = 0) { return _reqs.at(idx); }
         const RequestPtr req(int idx = 0) const { return _reqs.at(idx); }
+        size_t numReqs() const { return _reqs.size(); }
 
         Addr getVaddr(int idx = 0) const { return req(idx)->getVaddr(); }
         virtual void initiateTranslation() = 0;
@@ -496,6 +524,13 @@ class LSQ
             return req();
         }
 
+        virtual RequestPtr
+        mainReq() const
+        {
+            assert (_reqs.size() == 1);  // valid only with exactly one request
+            return req();
+        }
+
         /**
          * Test if there is any in-flight translation or mem access request
          */
@@ -635,6 +670,8 @@ class LSQ
         void
         discard()
         {
+            detachLSQEntry();      // drop the LQ/SQ slot link if still ours
+            detachInflightLoad();  // and the inflightLoads record, if any
             release(Flag::Discarded);
         }
 
@@ -766,6 +803,7 @@ class LSQ
         virtual bool isCacheBlockHit(Addr blockAddr, Addr cacheBlockMask);
 
         virtual RequestPtr mainReq();
+        virtual RequestPtr mainReq() const;
         virtual PacketPtr mainPacket();
         virtual std::string name() const { return "SplitDataRequest"; }
     };
@@ -823,6 +861,7 @@ class LSQ
     void insertLoad(const DynInstPtr &load_inst);
     /** Inserts a store into the LSQ. */
     void insertStore(const DynInstPtr &store_inst);
+    bool splitStoreAddrSquashed(const DynInstPtr &inst);
 
     /** Executes an amo inst. */
     Fault executeAmo(const DynInstPtr &inst);
@@ -938,8 +977,12 @@ class LSQ
     bool isEmpty() const;
     /** Returns if all of the LQs are empty. */
     bool lqEmpty() const;
+    /** Returns if the LQ of a given thread is empty. */
+    bool lqEmpty(ThreadID tid) const;
     /** Returns if all of the SQs are empty. */
     bool sqEmpty() const;
+    /** Returns if the SQ of a given thread is empty. */
+    bool sqEmpty(ThreadID tid) const;
 
     /** Returns if any of the LQs are full. */
     bool lqFull();
@@ -954,6 +997,8 @@ class LSQ
     /** Returns whether the head instruction of sq has completed*/
     const DynInstPtr& getLSQHeadInst(ThreadID tid, bool isLoad);
 
+    int getLoadPFSource(const DynInstPtr &inst) const;
+
     /**
      * Returns if the LSQ is stalled due to a memory operation that must be
      * replayed.
@@ -972,9 +1017,29 @@ class LSQ
      * to memory.
      */
     bool hasStoresToWB(ThreadID tid);
+    bool hasStoresToWBBefore(ThreadID tid, InstSeqNum seq_num);
 
     // true if all stores are flushed
     bool flushStores(ThreadID tid);
+    bool flushStores(ThreadID tid, InstSeqNum seq_num);
+    void requestGlobalStoreBufferFlush();
+    bool storeBufferHasConflict(ThreadID tid, Addr block_paddr) const;
+    uint64_t bumpStoreBufferBlockVersion(Addr block_paddr);
+    uint64_t currentStoreBufferBlockVersion(Addr block_paddr) const;
+    void markStoreBufferBlockVisible(Addr block_paddr, uint64_t generation);
+    uint64_t currentStoreBufferVisibleVersion(Addr block_paddr) const;
+    StoreBufferEntry *findForwardingStoreBufferEntry(Addr block_paddr,
+                                                     ThreadID load_tid,
+                                                     InstSeqNum load_seq) const;
+    bool hasLiveStoreBufferBlock(Addr block_paddr) const;
+    void reclaimStoreBufferBlockMetadata(Addr block_paddr);
+    void invalidateOtherThreadStoreBufferBytes(
+        ThreadID tid, Addr paddr, const std::vector<bool> &mask,
+        uint64_t generation);
+    void notifyOtherThreadsStoreVisible(ThreadID tid, Addr store_paddr,
+                                        const std::vector<bool> &byte_enable,
+                                        InstSeqNum store_seq,
+                                        bool replay_executed_loads);
 
     /** Returns the number of stores a specific thread has to write back. */
     int numStoresToSbuffer(ThreadID tid);
@@ -990,6 +1055,10 @@ class LSQ
     void dumpInsts() const;
     /** Debugging function to print out instructions from a specific thread. */
     void dumpInsts(ThreadID tid) const;
+    /** Debugging function to print store-buffer flush state for a thread. */
+    void dumpStoreBufferState(ThreadID tid, InstSeqNum seq_num) const;
+    /** Debugging function to print store-buffer entries for a thread. */
+    void dumpStoreBuffer(ThreadID tid) const;
 
     bool isMisaligned(const DynInstPtr& inst, Addr vaddr, int size);
 
@@ -1077,8 +1146,34 @@ class LSQ
     bool getDcacheWriteStall() { return dcacheWriteStall; }
     StoreBuffer &getStoreBuffer() { return storeBuffer; }
     bool storeBufferEmpty() const { return storeBuffer.size() == 0; }
-    bool storeBufferFlushing() const { return _storeBufferFlushing; }
-    void clearStoreBufferFlushing() { _storeBufferFlushing = false; }
+    bool storeBufferEmpty(ThreadID tid) const
+    {
+        return storeBuffer.size(tid) == 0;  // per-thread StoreBuffer::size
+    }
+    bool storeBufferEmpty(ThreadID tid, InstSeqNum seq_num) const
+    {
+        return storeBuffer.size(tid, seq_num) == 0;  // (tid, seq) overload
+    }
+    bool storeBufferFlushing(ThreadID tid) const { return _storeBufferFlushing[tid]; }
+    bool storeBufferFlushing() const
+    {
+        for (auto tid : *activeThreads) {  // non-active threads are ignored
+            if (_storeBufferFlushing[tid])
+                return true;
+        }
+        return false;
+    }
+    void clearStoreBufferFlushing(ThreadID tid)
+    {
+        _storeBufferFlushing[tid] = false;
+        _storeBufferFlushBeforeSeq[tid] = static_cast<InstSeqNum>(-1);  // max
+    }
+    void clearStoreBufferFlushing() {
+        // Reset every slot, not just active threads:
+        // requestGlobalStoreBufferFlush() raises the flag for every tid
+        // below numThreads, whether or not that thread is active.
+        for (ThreadID tid = 0; tid < MaxThreads; ++tid)
+            clearStoreBufferFlushing(tid);
+    }
     uint32_t getSbufferEvictThreshold() const { return sbufferEvictThreshold; }
     uint32_t getSbufferEntries() const { return sbufferEntries; }
     uint64_t getStoreBufferInactiveCycles() const
@@ -1139,7 +1234,6 @@ class LSQ
     std::vector<uint32_t> dcacheRefillDataRead;
     std::vector<uint32_t> dcacheRefillDataWrite;
     std::vector<uint32_t> dcacheRefillTagWrite;
-
     bool isDcacheRefillTagWrite() const
     {
         for (auto stage : dcacheRefillTagWrite) {
@@ -1168,7 +1262,12 @@ class LSQ
     const uint64_t storeBufferInactiveThreshold;
     const uint32_t maxStoreBufferEntriesAcceptedFromSQPerCycle = 2;
     StoreBuffer storeBuffer;
-    bool _storeBufferFlushing = false;
+    std::unordered_map<Addr, uint64_t> storeBufferBlockVersion;
+    std::unordered_map<Addr, uint64_t> storeBufferVisibleVersion;
+    bool _storeBufferFlushing[MaxThreads] = {false};
+    InstSeqNum _storeBufferFlushBeforeSeq[MaxThreads] = {
+        static_cast<InstSeqNum>(-1)
+    };  // NOTE(review): only element 0 starts at -1; the rest start at 0
     uint64_t storeBufferWritebackInactive = 0;
     StoreBufferEntry *blockedSbufferEntry = nullptr;
     ThreadID nextStoreBufferOffloadTid = InvalidThreadID;
diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc
index a358b9df19..f027cdb7db 100644
--- a/src/cpu/o3/lsq_unit.cc
+++ b/src/cpu/o3/lsq_unit.cc
@@ -182,10 +182,10 @@ LSQUnit::SQEntry::setStatus(SplitStoreStatus status)
 LSQUnit::WritebackRegEvent::WritebackRegEvent(const DynInstPtr &_inst,
         PacketPtr _pkt, LSQUnit *lsq_ptr)
     : Event(Default_Pri, AutoDelete),
-      inst(_inst), pkt(_pkt), lsqPtr(lsq_ptr)
+      inst(_inst), request(_inst->savedRequest), pkt(_pkt), lsqPtr(lsq_ptr)
 {
-    assert(_inst->savedRequest);
-    _inst->savedRequest->writebackScheduled();
+    assert(request);  // request: _inst->savedRequest as of construction
+    request->writebackScheduled();
 }
 
 void
@@ -195,8 +195,8 @@ LSQUnit::WritebackRegEvent::process()
 
     lsqPtr->writebackReg(inst, pkt);
 
-    assert(inst->savedRequest);
-    inst->savedRequest->writebackDone();
+    assert(request);
+    request->writebackDone();
     delete pkt;
 }
 
@@ -349,19 +349,49 @@ LSQUnit::completeDataAccess(PacketPtr pkt)
         if (inst->isLoad() || inst->isAtomic()) {
             Addr addr = pkt->getAddr();
             auto [enable_diff, diff_all_states] = cpu->getDiffAllStates();
-            if (system->multiCore() && enable_diff && !request->_sbufferBypass &&
+            if (system->multiContextDifftest() && enable_diff &&
+                request->_sbufferBypass &&
+                inst->isLoad() &&
+                cpu->goldenMemManager()->inPmem(addr)) {
+                // A store-forwarded load may legitimately observe a value that
+                // is newer than the current shared golden memory snapshot.
+                // Keep the observed value on the instruction so difftest can
+                // repair the reference state for this hart if needed.
+                inst->setGolden(pkt->getPtr<uint8_t>());
+            }
+            if (system->multiContextDifftest() && enable_diff &&
+                !request->_sbufferBypass &&
                 cpu->goldenMemManager()->inPmem(addr)) {
-                // check data with golden mem
-                uint8_t *golden_data = (uint8_t *)cpu->goldenMemManager()->guestToHost(addr);
                 uint8_t *loaded_data = pkt->getPtr<uint8_t>();
                 size_t size = pkt->getSize();
-                if (memcmp(golden_data, loaded_data, size) == 0) {
-                    assert(size == inst->effSize);
-                    inst->setGolden(golden_data);
+                assert(size == inst->effSize);
+
+                if (inst->isAtomic()) {
+                    uint8_t current_golden[8] = {};
+                    panic_if(size > sizeof(current_golden),
+                             "Unexpected AMO size %u at addr %#lx\n",
+                             size, addr);
+                    cpu->goldenMemManager()->readGoldenMem(addr, current_golden,
+                                                           size);
+
+                    // Preserve the DUT-observed old value until completeStore()
+                    // derives the post-AMO memory image. The golden old-value
+                    // snapshot used by difftest is captured when the request
+                    // is first sent, before later concurrent updates can
+                    // advance shared memory.
+                    inst->setGolden(loaded_data);
                 } else {
-                    panic("Data error at addr %#lx, size %d. %s\n",
-                        addr, size,
-                        goldenDiffStr(loaded_data, golden_data, size).c_str());
+                    // check data with golden mem
+                    uint8_t *golden_data =
+                        (uint8_t *)cpu->goldenMemManager()->guestToHost(addr);
+                    if (memcmp(golden_data, loaded_data, size) != 0) {
+                        DPRINTF(Diff,
+                                "[tid:%d] [sn:%llu] Load sees value different from "
+                                "current golden memory at addr %#lx, size %d. "
+                                "Treating as concurrent update window. %s\n",
+                                inst->threadNumber, inst->seqNum, addr, size,
+                                goldenDiffStr(loaded_data, golden_data, size).c_str());
+                    }
                 }
             }
         }
@@ -737,6 +767,44 @@ LSQUnit::insertStore(const DynInstPtr& store_inst)
     storeQueue.back().set(store_inst);
 }
 
+LSQUnit::LSQRequest *
+LSQUnit::currentLoadRequest(const DynInstPtr &inst)
+{
+    const bool has_slot = inst && inst->lqIdx >= 0;  // else: nullptr
+    return has_slot ? loadQueue[inst->lqIdx].request() : nullptr;
+}
+
+LSQUnit::LSQRequest *
+LSQUnit::currentStoreRequest(const DynInstPtr &inst)
+{
+    const bool has_slot = inst && inst->sqIdx >= 0;  // else: nullptr
+    return has_slot ? storeQueue[inst->sqIdx].request() : nullptr;
+}
+
+bool
+LSQUnit::splitStoreAddrSquashed(const DynInstPtr &inst)
+{
+    // Instructions that are not split-store-data never report true.
+    if (!inst->isSplitStoreData()) {
+        return false;
+    }
+
+    // Report true when the SQ slot at inst->sqIdx is out of range or not
+    // valid, when it holds no instruction or one with another sequence
+    // number, or when the instruction it holds is squashed.
+    if (!storeQueue.isValidIdx(inst->sqIdx)) {
+        return true;
+    }
+    auto slot = storeQueue.getIterator(inst->sqIdx);
+    if (!slot->valid()) {
+        return true;
+    }
+
+    const auto &occupant = slot->instruction();
+    const bool same_inst = occupant && occupant->seqNum == inst->seqNum;
+    return !same_inst || occupant->isSquashed();
+}
+
 bool
 LSQUnit::pipeLineNukeCheck(const DynInstPtr &load_inst, const DynInstPtr &store_inst)
 {
@@ -746,9 +814,10 @@ LSQUnit::pipeLineNukeCheck(const DynInstPtr &load_inst, const DynInstPtr &store_
     Addr store_eff_addr1 = store_inst->physEffAddr >> depCheckShift;
     Addr store_eff_addr2 = (store_inst->physEffAddr + store_inst->effSize - 1) >> depCheckShift;
 
-    LSQRequest* store_req = store_inst->savedRequest;
+    LSQRequest* store_req = currentStoreRequest(store_inst);
+    LSQRequest* load_req = currentLoadRequest(load_inst);
     // Dont perform pipe line nuke check for split load
-    bool load_is_splited = load_inst->savedRequest && load_inst->savedRequest->isSplit();
+    bool load_is_splited = load_req && load_req->isSplit();
     bool load_need_check = !load_is_splited && load_inst->effAddrValid() &&
                             (load_inst->lqIt >= store_inst->lqIt);
     bool store_need_check = store_req && store_req->isTranslationComplete() &&
@@ -828,7 +897,7 @@ LSQUnit::checkSnoop(PacketPtr pkt)
 
     DynInstPtr ld_inst = iter->instruction();
     assert(ld_inst);
-    LSQRequest *request = ld_inst->savedRequest;
+    LSQRequest *request = iter->request();
 
     // Check that this snoop didn't just invalidate our lock flag
     if (ld_inst->effAddrValid() && request &&
@@ -842,7 +911,7 @@ LSQUnit::checkSnoop(PacketPtr pkt)
     while (++iter != loadQueue.end()) {
         ld_inst = iter->instruction();
         assert(ld_inst);
-        request = ld_inst->savedRequest;// iter->request();
+        request = iter->request();
         if (!ld_inst->effAddrValid() || ld_inst->strictlyOrdered())
             continue;
 
@@ -886,12 +955,131 @@ LSQUnit::checkSnoop(PacketPtr pkt)
     return;
 }
 
+namespace
+{
+
+bool
+overlapsVisibleStore(const o3::LSQ::LSQRequest *load_req, Addr store_paddr,
+                     const std::vector<bool> &store_byte_enable)
+{
+    // True when at least one enabled store byte lands inside the physical
+    // range [paddr, paddr + size) of any sub-request that has a paddr.
+    if (load_req == nullptr) {
+        return false;
+    }
+
+    const size_t num_store_bytes = store_byte_enable.size();
+    for (size_t idx = 0; idx < load_req->numReqs(); ++idx) {
+        const auto sub_req = load_req->req(idx);
+        if (!sub_req->hasPaddr()) {
+            continue;  // no physical address to compare against
+        }
+
+        const Addr range_begin = sub_req->getPaddr();
+        const Addr range_end = range_begin + sub_req->getSize();
+        // Byte-granular check: disabled store bytes never count as overlap.
+        for (size_t byte = 0; byte < num_store_bytes; ++byte) {
+            const Addr byte_paddr = store_paddr + byte;
+            const bool inside =
+                byte_paddr >= range_begin && byte_paddr < range_end;
+            if (store_byte_enable[byte] && inside) {
+                return true;
+            }
+        }
+    }
+
+    return false;
+}
+
+} // anonymous namespace
+
+void
+LSQUnit::checkLocalStoreVisible(Addr store_paddr,
+                                const std::vector<bool> &store_byte_enable,
+                                InstSeqNum store_seq,
+                                bool replay_executed_loads)
+{
+    [[maybe_unused]] const InstSeqNum visible_store_seq = store_seq;
+    [[maybe_unused]] const bool replay_visible_loads = replay_executed_loads;
+
+    if (loadQueue.empty()) {
+        return;
+    }
+
+    const Addr block_addr = store_paddr & cacheBlockMask;
+    DynInstPtr oldest_violator = memDepViolator;  // never reassigned below
+
+    for (auto it = loadQueue.begin(); it != loadQueue.end(); ++it) {
+        DynInstPtr ld_inst = it->instruction();
+        if (!ld_inst || ld_inst->isSquashed() || ld_inst->needReplay() ||
+            !ld_inst->effAddrValid() || ld_inst->strictlyOrdered()) {
+            continue;
+        }
+
+        LSQRequest *request = it->request();
+        // Replay/cancel paths can leave the dyninst carrying a stale
+        // savedRequest pointer after the active LQ request has been replaced
+        // or dropped. Only the current queue entry request is safe here.
+        if (!request || !request->isCacheBlockHit(block_addr, cacheBlockMask)) {
+            continue;
+        }
+        if (!overlapsVisibleStore(request, store_paddr, store_byte_enable)) {
+            continue;
+        }
+        if (ld_inst->memReqFlags & Request::LLSC) {
+            ld_inst->tcBase()->getIsaPtr()->handleLockedSnoopHit(ld_inst.get());
+        }
+
+        if (ld_inst->isExecuted()) {  // replay_executed_loads is not consulted
+            DPRINTF(LSQUnit,
+                    "Local visible store ignores already executed load "
+                    "[sn:%lli] on addr %#x\n",
+                    ld_inst->seqNum, store_paddr);
+            continue;
+        }
+
+        ld_inst->hitExternalSnoop(true);
+        ld_inst->possibleLoadViolation(true);
+        DPRINTF(LSQUnit,
+                "Local visible store replays not-yet-executed load [sn:%lli] "
+                "on addr %#x\n",
+                ld_inst->seqNum, store_paddr);
+        ld_inst->setNukeReplay();
+        loadSetReplay(ld_inst, request, true);
+    }
+
+    if (oldest_violator &&  // true only if memDepViolator changed above
+        (!memDepViolator || oldest_violator->seqNum < memDepViolator->seqNum)) {
+        memDepViolator = oldest_violator;
+        cpu->activityThisCycle();
+        iewStage->SquashCheckAfterExe(oldest_violator);
+    }
+}
+
 Fault
 LSQUnit::checkViolations(typename LoadQueue::iterator& loadIt,
         const DynInstPtr& inst)
 {
+    LSQRequest *request = nullptr;
+    if (inst->isLoad()) {
+        if (inst->lqIdx >= 0) {
+            request = loadQueue[inst->lqIdx].request();
+        }
+    } else if (inst->isStore() || inst->isAtomic()) {
+        if (inst->sqIdx >= 0) {
+            request = storeQueue[inst->sqIdx].request();
+        }
+    }
+
+    // Replay/cancel paths can drop the active LSQ request before the
+    // instruction is retried. In that window the dyninst may still carry a
+    // stale savedRequest pointer, so only the current LSQ entry request is
+    // safe to inspect here.
+    if (!request) {
+        return NoFault;
+    }
+
     auto saved_it = loadIt;
-    for (auto req0 : inst->savedRequest->_reqs) {
+    for (auto req0 : request->_reqs) {
         Addr inst_eff_addr1 = req0->getPaddr() >> depCheckShift;
         Addr inst_eff_addr2 = (req0->getPaddr() + req0->getSize() - 1) >> depCheckShift;
 
@@ -1000,13 +1188,11 @@ LSQUnit::loadSetReplay(DynInstPtr inst, LSQRequest* request, bool dropReqNow)
     // Reset DTB translation state
     inst->translationStarted(false);
     inst->translationCompleted(false);
+    inst->savedRequest = nullptr;
     // clear request in loadQueue
     loadQueue[inst->lqIdx].setRequest(nullptr);
     if (dropReqNow) {
-        // discard this request
         request->discard();
-        // TODO: is this essential?
-        inst->savedRequest = nullptr;
     }
 
     DPRINTF(LoadPipeline, "Load [sn:%ld] set replay, dropReqNow: %d\n", inst->seqNum, dropReqNow);
@@ -1058,8 +1244,9 @@ LSQUnit::loadDoTranslate(const DynInstPtr &inst)
         DPRINTF(LoadPipeline, "Load [sn:%llu] setTLBMissReplay\n", inst->seqNum);
     }
 
-    if (inst->savedRequest && inst->savedRequest->isTranslationComplete()) {
-        inst->setNormalLd(inst->savedRequest->isNormalLd());
+    if (auto *request = currentLoadRequest(inst);
+        request && request->isTranslationComplete()) {
+        inst->setNormalLd(request->isNormalLd());
 
         cpu->perfCCT->updateInstMeta(inst->seqNum, InstDetail::VAddress, inst->effAddr);
         cpu->perfCCT->updateInstMeta(inst->seqNum, InstDetail::PAddress, inst->physEffAddr);
@@ -1074,7 +1261,7 @@ LSQUnit::loadDoSendRequest(const DynInstPtr &inst)
     DPRINTF(LoadPipeline, "loadDoSendRequest: load [sn:%lli]\n", inst->seqNum);
     assert(!inst->isSquashed());
     Fault load_fault = inst->getFault();
-    LSQRequest* request = inst->savedRequest;
+    LSQRequest* request = currentLoadRequest(inst);
 
     if (inst->effAddrValid()) {
         for (int i = 0; i < storePipeSx[1]->size; i++) {
@@ -1120,9 +1307,9 @@ LSQUnit::loadDoSendRequest(const DynInstPtr &inst)
     }
 
     if (load_fault != NoFault && inst->translationCompleted() &&
-            inst->savedRequest->isPartialFault()
-            && !inst->savedRequest->isComplete()) {
-        assert(inst->savedRequest->isSplit());
+            request && request->isPartialFault()
+            && !request->isComplete()) {
+        assert(request->isSplit());
         // If we have a partial fault where the mem access is not complete yet
         // then the cache must have been blocked. This load will be re-executed
         // when the cache gets unblocked. We will handle the fault when the
@@ -1165,7 +1352,7 @@ LSQUnit::loadDoRecvData(const DynInstPtr &inst)
     DPRINTF(LoadPipeline, "loadDoRecvData: load [sn:%lli]\n", inst->seqNum);
 
     assert(!inst->isSquashed());
-    LSQRequest* request = inst->savedRequest;
+    LSQRequest* request = currentLoadRequest(inst);
 
     if (inst->wakeUpEarly()) {
         auto& bus = getLsq()->bus;
@@ -1257,7 +1444,7 @@ LSQUnit::loadDoRecvData(const DynInstPtr &inst)
 
     // No nuke happens, prepare the inst data
     // assert(request->isNormalLd() ? !request->isAnyOutstandingRequest() : true);
-    request = inst->savedRequest;
+    request = currentLoadRequest(inst);
     if (inst->fullForward()) {
         DPRINTF(LoadPipeline, "Load [sn:%llu] fullForward\n", inst->seqNum);
         assert(request);
@@ -1314,13 +1501,25 @@ LSQUnit::executeLoadPipeSx()
                     case 0:
                         fault = loadDoTranslate(inst);
                         break;
-                    case 1:
-                        iewStage->getScheduler()->specWakeUpFromLoadPipe(inst);
-                        // Loads will mark themselves as executed, and their writeback
-                        // event adds the instruction to the queue to commit
+                    case 1: {
                         fault = loadDoSendRequest(inst);
+                        auto *request = currentLoadRequest(inst);
+                        if (fault == NoFault &&
+                            !inst->replayOrSkipFollowingPipe() &&
+                            inst->readPredicate() &&
+                            inst->readMemAccPredicate() &&
+                            request &&
+                            request->isTranslationComplete() &&
+                            request->isMemAccessRequired()) {
+                            iewStage->getScheduler()->specWakeUpFromLoadPipe(
+                                inst);
+                        }
+                        // Loads will mark themselves as executed, and their
+                        // writeback event adds the instruction to the queue
+                        // to commit.
                         iewStage->SquashCheckAfterExe(inst);
                         break;
+                    }
                     case 2:
                         fault = loadDoRecvData(inst);
 
@@ -1377,10 +1576,12 @@ LSQUnit::executeLoadPipeSx()
                 else if (inst->needCacheMissReplay()) iewStage->cacheMissLdReplay(inst);
                 else if (inst->needMdpAddrReplay()) iewStage->mdpAddrReplayPipeDone(inst);
                 else if (inst->needNukeReplay()) {
-                    if (inst->cacheHit()) {
-                        loadSetReplay(inst, inst->savedRequest, true);
-                    } else if (inst->hasPendingCacheReq()) {
-                        loadSetReplay(inst, inst->savedRequest, false);
+                    if (auto *request = currentLoadRequest(inst); request) {
+                        if (inst->cacheHit()) {
+                            loadSetReplay(inst, request, true);
+                        } else if (inst->hasPendingCacheReq()) {
+                            loadSetReplay(inst, request, false);
+                        }
                     }
                     inst->issueQue->retryMem(inst);
                 }
@@ -1410,7 +1611,10 @@ LSQUnit::executeLoadPipeSx()
             }
 
             if (i == loadPipeStages - 1 && !inst->needReplay()) {
-                if (inst->isNormalLd() || !inst->readMemAccPredicate()) iewStage->readyToFinish(inst);
+                if (inst->isExecuted() &&
+                    (inst->isNormalLd() || !inst->readMemAccPredicate())) {
+                    iewStage->readyToFinish(inst);
+                }
                 iewStage->activityThisCycle();
                 inst->endPipelining();
                 DPRINTF(LoadPipeline, "Load [sn:%llu] ready to finish\n",
@@ -1538,6 +1742,10 @@ LSQUnit::executeStorePipeSx()
                 continue;
             }
 
+            if (splitStoreAddrSquashed(inst)) {
+                inst->setSquashed();
+            }
+
             if (inst->isSquashed()) {
                 DPRINTF(StorePipeline, "Execute: Instruction was squashed. PC: %s, [tid:%i]"
                     " [sn:%llu]\n", inst->pcState(), inst->threadNumber,
@@ -1752,7 +1960,31 @@ LSQUnit::commitStores(InstSeqNum &youngest_inst)
             if (x.instruction()->seqNum > youngest_inst) {
                 break;
             }
-            assert(x.instruction()->isSplitStoreAddr() ? x.splitStoreFinish() : true);
+            // Commit can publish a new squash to IEW one cycle after IEW has
+            // already received an older doneMemSeqNum. If that stale
+            // doneMemSeqNum reaches here in the same cycle that ROB marks this
+            // store squashed, do not advance SQ writeback state past the
+            // squashed entry; IEW's next-cycle squash will remove it.
+            if (x.instruction()->isSquashed()) {
+                break;
+            }
+            if (x.instruction()->isSplitStoreAddr() && !x.splitStoreFinish()) {
+                panic("Split store reached commitStores unfinished: tid=%d "
+                      "seq=%llu pc=%#lx youngest=%llu canCommit=%d "
+                      "executed=%d squashed=%d addrReady=%d dataReady=%d "
+                      "staFinish=%d stdFinish=%d canWB=%d completed=%d\n",
+                      x.instruction()->threadNumber,
+                      static_cast<unsigned long long>(
+                          x.instruction()->seqNum),
+                      x.instruction()->pcState().instAddr(),
+                      static_cast<unsigned long long>(youngest_inst),
+                      x.instruction()->readyToCommit(),
+                      x.instruction()->isExecuted(),
+                      x.instruction()->isSquashed(),
+                      x.addrReady(), x.dataReady(),
+                      x.staFinish(), x.stdFinish(),
+                      x.canWB(), x.completed());
+            }
             DPRINTF(LSQUnit, "Marking store as able to write back, PC "
                     "%s [sn:%lli]\n",
                     x.instruction()->pcState(),
@@ -1765,6 +1997,31 @@ LSQUnit::commitStores(InstSeqNum &youngest_inst)
     }
 }
 
+bool
+LSQUnit::hasStoresToWBBefore(InstSeqNum seq_num) const
+{
+    // Fast path: no store in this unit is marked for writeback at all.
+    if (storesToWB == 0) {
+        return false;
+    }
+
+    // Scan the SQ in iteration order, stopping at the first live entry that
+    // is not older than seq_num (assumes entries are in seqNum order).
+    for (auto sq = storeQueue.begin(); sq != storeQueue.end(); ++sq) {
+        if (!sq->valid() || !sq->instruction()) {
+            continue;
+        }
+        if (sq->instruction()->seqNum >= seq_num) {
+            break;
+        }
+        // Marked able to write back but not yet completed: still pending.
+        if (sq->canWB() && !sq->completed()) {
+            return true;
+        }
+    }
+    return false;
+}
+
 bool
 LSQUnit::writebackBlockedStore()
 {
@@ -1772,8 +2029,25 @@ LSQUnit::writebackBlockedStore()
         return false;
     }
 
-    storeWBIt->request()->sendPacketToCache();
-    if (storeWBIt->request()->isSent()) {
+    auto *request = storeWBIt->request();
+    const auto &inst = storeWBIt->instruction();
+
+    if (request->mainReq()->hasPaddr() &&
+        system->multiContextDifftest() && inst->isAtomic() &&
+        cpu->goldenMemManager() &&
+        cpu->goldenMemManager()->inPmem(request->mainReq()->getPaddr())) {
+        uint8_t issue_golden[8] = {};
+        panic_if(request->_size > sizeof(issue_golden),
+                 "Unexpected AMO size %u at addr %#lx\n",
+                 request->_size, request->mainReq()->getPaddr());
+        cpu->goldenMemManager()->readGoldenMem(
+            request->mainReq()->getPaddr(), issue_golden, request->_size);
+        std::memcpy(inst->getAmoOldGoldenValuePtr(), issue_golden,
+                    request->_size);
+    }
+
+    request->sendPacketToCache();
+    if (request->isSent()) {
         storePostSend();
     }
     return isStoreBlocked;
@@ -1784,6 +2058,7 @@ LSQUnit::directStoreToCache()
 {
     DynInstPtr inst = storeWBIt->instruction();
     LSQRequest* request = storeWBIt->request();
+
     if ((request->mainReq()->isLLSC() || request->mainReq()->isRelease()) && (storeWBIt.idx() != storeQueue.head())) {
         DPRINTF(LSQUnit,
                 "Store idx:%i PC:%s to Addr:%#x "
@@ -1832,6 +2107,28 @@ LSQUnit::directStoreToCache()
         }
     }
 
+    if (request->mainReq()->hasPaddr()) {
+        if (request->_storeBufferGeneration == 0) {
+            const Addr block_paddr =
+                request->mainReq()->getPaddr() & cacheBlockMask;
+            request->_storeBufferGeneration =
+                lsq->bumpStoreBufferBlockVersion(block_paddr);
+        }
+
+        if (system->multiContextDifftest() && inst->isAtomic() &&
+            cpu->goldenMemManager() &&
+            cpu->goldenMemManager()->inPmem(request->mainReq()->getPaddr())) {
+            uint8_t issue_golden[8] = {};
+            panic_if(request->_size > sizeof(issue_golden),
+                     "Unexpected AMO size %u at addr %#lx\n",
+                     request->_size, request->mainReq()->getPaddr());
+            cpu->goldenMemManager()->readGoldenMem(
+                request->mainReq()->getPaddr(), issue_golden, request->_size);
+            std::memcpy(inst->getAmoOldGoldenValuePtr(), issue_golden,
+                        request->_size);
+        }
+    }
+
     if (request->mainReq()->isLocalAccess()) {
         assert(!inst->isStoreConditional());
         assert(!inst->inHtmTransactionalState());
@@ -1893,6 +2190,7 @@ LSQUnit::offloadToStoreBuffer(uint32_t max_entries)
 {
     assert(!lsq->storeBufferBlocked());
     if (isStoreBlocked) return;
+    if (max_entries == 0) return;
 
     uint32_t accepted_entries = 0;
     while (storesToWB > 0 &&
@@ -1923,17 +2221,20 @@ LSQUnit::offloadToStoreBuffer(uint32_t max_entries)
             request->mainReq()->isRelease() ||
             request->mainReq()->isStrictlyOrdered() ||
             inst->isStoreConditional()) {
-            DPRINTF(StoreBuffer, "Find atomic/SC store [sn:%llu]\n", storeWBIt->instruction()->seqNum);
             if (!(storeWBIt.idx() == storeQueue.head())) {
-                DPRINTF(StoreBuffer, "atomic/SC store waiting\n");
                 break;
             }
-            if (!storeBufferEmpty()) {
-                DPRINTF(StoreBuffer, "sbuffer need flush\n");
+            if (request->mainReq()->hasPaddr()) {
+                const Addr block_paddr =
+                    request->mainReq()->getPaddr() & cacheBlockMask;
+                if (lsq->storeBufferHasConflict(lsqID, block_paddr)) {
+                    lsq->requestGlobalStoreBufferFlush();
+                    break;
+                }
+            }
+            if (!storeBufferEmpty(lsqID)) {
                 lsq->flushStores(lsqID);
                 break;
-            } else {
-                DPRINTF(StoreBuffer, "sbuffer finishing flushed\n");
             }
             bool contin = directStoreToCache();
             if (isStoreBlocked) {
@@ -1956,8 +2257,9 @@ LSQUnit::offloadToStoreBuffer(uint32_t max_entries)
                 uint64_t offset = vaddr - vbase;
                 DPRINTF(LSQUnit, "Spilt store idx %d [sn:%lli] insert into sbuffer\n", i, inst->seqNum);
                 assert(offset + req->getSize() <= storeWBIt->size());
-                bool success = insertStoreBuffer(vaddr, paddr, (uint8_t *)storeWBIt->data() + offset, req->getSize(),
-                                                 req->getByteEnable());
+                bool success = insertStoreBuffer(
+                    vaddr, paddr, (uint8_t *)storeWBIt->data() + offset,
+                    req->getSize(), req->getByteEnable(), inst->seqNum);
                 if (success) {
                     request->_numOutstandingPackets++;
                 } else {
@@ -1977,8 +2279,9 @@ LSQUnit::offloadToStoreBuffer(uint32_t max_entries)
             Addr vaddr = request->getVaddr();
             Addr paddr = request->mainReq()->getPaddr();
             DPRINTF(LSQUnit, "Store [sn:%lli] insert into sbuffer\n", inst->seqNum);
-            bool success = insertStoreBuffer(vaddr, paddr, (uint8_t *)storeWBIt->data(), request->_size,
-                                             request->mainReq()->getByteEnable());
+            bool success = insertStoreBuffer(
+                vaddr, paddr, (uint8_t *)storeWBIt->data(), request->_size,
+                request->mainReq()->getByteEnable(), inst->seqNum);
             if (!success) {
                 break;
             }
@@ -1990,7 +2293,10 @@ LSQUnit::offloadToStoreBuffer(uint32_t max_entries)
     }
 }
 
-bool LSQUnit::insertStoreBuffer(Addr vaddr, Addr paddr, uint8_t* datas, uint64_t size, const std::vector<bool>& mask)
+bool
+LSQUnit::insertStoreBuffer(Addr vaddr, Addr paddr, uint8_t* datas,
+                           uint64_t size, const std::vector<bool>& mask,
+                           InstSeqNum store_seq)
 {
     auto &storeBuffer = lsq->getStoreBuffer();
     // access range must in a cache block
@@ -1998,14 +2304,18 @@ bool LSQUnit::insertStoreBuffer(Addr vaddr, Addr paddr, uint8_t* datas, uint64_t
     Addr blockVaddr = vaddr & cacheBlockMask;
     Addr blockPaddr = paddr & cacheBlockMask;
     Addr offset = paddr & ~cacheBlockMask;
+
     // check request is not already in the storebuffer
     auto entry = storeBuffer.get(lsqID, blockPaddr);
+    const auto generation = lsq->bumpStoreBufferBlockVersion(blockPaddr);
+
     if (entry) {
         if (entry->sending) {
             if (entry->vice) {
                 // merge into vice
                 entry = entry->vice;
-                entry->merge(offset, datas, size, mask);
+                entry->merge(offset, datas, size, mask, generation);
+                entry->generation = generation;
                 DPRINTF(StoreBuffer, "Merging vice entry[%#x] for addr %#x\n",
                         blockPaddr, paddr);
             } else {
@@ -2017,14 +2327,18 @@ bool LSQUnit::insertStoreBuffer(Addr vaddr, Addr paddr, uint8_t* datas, uint64_t
                 }
                 stats.sbufferCreateVice++;
                 auto vice = storeBuffer.createVice(entry);
-                vice->reset(lsqID, blockVaddr, blockPaddr, offset, datas, size, mask);
+                vice->reset(lsqID, store_seq, blockVaddr, blockPaddr, offset,
+                            datas, size, mask, generation);
+                vice->generation = generation;
                 DPRINTF(StoreBuffer, "Create new vice entry[%#x] for addr %#x\n",
                         blockPaddr, paddr);
             }
         } else {
             // merge into unsent
             storeBuffer.update(entry->index);
-            entry->merge(offset, datas, size, mask);
+            entry->merge(offset, datas, size, mask, generation);
+            entry->seqNum = std::max(entry->seqNum, store_seq);
+            entry->generation = generation;
             DPRINTF(StoreBuffer, "Merging entry[%#x] for addr %#x\n",
                     blockPaddr, paddr);
         }
@@ -2037,7 +2351,9 @@ bool LSQUnit::insertStoreBuffer(Addr vaddr, Addr paddr, uint8_t* datas, uint64_t
         }
         // insert
         auto entry = storeBuffer.getEmpty();
-        entry->reset(lsqID, blockVaddr, blockPaddr, offset, datas, size, mask);
+        entry->reset(lsqID, store_seq, blockVaddr, blockPaddr, offset, datas,
+                     size, mask, generation);
+        entry->generation = generation;
         storeBuffer.insert(entry);
         DPRINTF(StoreBuffer, "Create new entry[%#x] for addr %#x\n",
                 blockPaddr, paddr);
@@ -2256,6 +2572,7 @@ LSQUnit::squash(const InstSeqNum &squashed_num)
             break;
         }
     }
+
 }
 
 uint64_t
@@ -2337,7 +2654,8 @@ LSQUnit::writebackReg(const DynInstPtr &inst, PacketPtr pkt)
 
             if (!htm_fault) {
                 assert(dynamic_cast<ReExec*>(inst->fault.get()) != nullptr ||
-                       inst->savedRequest->isPartialFault());
+                       (currentLoadRequest(inst) &&
+                        currentLoadRequest(inst)->isPartialFault()));
 
             } else if (!pkt->htmTransactionFailedInCache()) {
                 // Situation in which the instruction has a hardware
@@ -2358,8 +2676,12 @@ LSQUnit::writebackReg(const DynInstPtr &inst, PacketPtr pkt)
         }
     }
 
-    if (!inst->savedRequest->isNormalLd()) {
-        // Need to insert instruction into queue to commit
+    const bool finish_after_writeback =
+        !inst->isNormalLd() || !inst->inPipe();
+    if (finish_after_writeback) {
+        // Normal loads usually wait for the last pipe stage to enqueue commit.
+        // If the response arrives after the load has already drained from the
+        // pipe, writeback must finish the instruction here.
         iewStage->readyToFinish(inst);
         iewStage->activityThisCycle();
     }
@@ -2383,14 +2705,51 @@ LSQUnit::completeStore(typename StoreQueue::iterator store_idx, bool from_sbuffe
      * store queue. */
     DynInstPtr store_inst = store_idx->instruction();
     auto request = store_idx->request();
-
+    // Predicated-off or zero-sized stores can legitimately reach completion
+    // without ever materializing a backing memory request.
+    const bool has_main_request =
+        request && request->numReqs() > 0;
+    const bool has_paddr =
+        has_main_request && request->mainReq()->hasPaddr();
     DPRINTF(LSQUnit, "Completing store [sn:%lli], idx:%i, store head "
             "idx:%i\n",
             store_inst->seqNum, store_idx.idx() - 1, storeQueue.head() - 1);
 
+    if (!from_sbuffer &&
+        (!store_inst->isStoreConditional() || store_inst->lockedWriteSuccess()) &&
+        has_paddr) {
+        const Addr block_paddr = request->mainReq()->getPaddr() & cacheBlockMask;
+        auto generation = request->_storeBufferGeneration;
+        const bool replay_executed_loads =
+            store_inst->isAtomic() || cpu->consumeSyncVisibleStoreReplay(lsqID);
+        if (generation == 0) {
+            generation = lsq->bumpStoreBufferBlockVersion(block_paddr);
+        }
+        lsq->invalidateOtherThreadStoreBufferBytes(
+            lsqID, request->mainReq()->getPaddr(),
+            request->mainReq()->getByteEnable(), generation);
+        lsq->markStoreBufferBlockVisible(block_paddr, generation);
+        lsq->notifyOtherThreadsStoreVisible(lsqID,
+            request->mainReq()->getPaddr(),
+            request->mainReq()->getByteEnable(), store_inst->seqNum,
+            replay_executed_loads);
+    }
+
+    if (from_sbuffer &&
+        (!store_inst->isStoreConditional() || store_inst->lockedWriteSuccess()) &&
+        has_paddr) {
+        auto generation = request->_storeBufferGeneration;
+        if (generation == 0) {
+            generation = lsq->bumpStoreBufferBlockVersion(
+                request->mainReq()->getPaddr() & cacheBlockMask);
+            request->_storeBufferGeneration = generation;
+        }
+    }
+
     if (!from_sbuffer &&
         (!store_inst->isStoreConditional() || store_inst->lockedWriteSuccess()) &&
         cpu->goldenMemManager() &&
+        has_paddr &&
         cpu->goldenMemManager()->inPmem(request->mainReq()->getPaddr())) {
         Addr paddr = request->mainReq()->getPaddr();
         if (!store_inst->isAtomic()) {
@@ -2400,23 +2759,22 @@ LSQUnit::completeStore(typename StoreQueue::iterator store_idx, bool from_sbuffe
                                                      request->_size);
         } else {
             uint8_t tmp_data[8];
-            memset(tmp_data, 0, 8);
-            memcpy(tmp_data, store_inst->memData, request->_size);
+            memset(tmp_data, 0, sizeof(tmp_data));
             assert(request->req()->getAtomicOpFunctor());
 
-            // read golden memory to get the global latest value before this AMO is executed for further compare
-            cpu->goldenMemManager()->readGoldenMem(paddr,
-                                                   store_inst->getAmoOldGoldenValuePtr(), request->_size);
-            cpu->diffInfo.amoOldGoldenValue = store_inst->getAmoOldGoldenValue();
+            // The AMO response returns the old memory value. Capture it on the
+            // instruction so commit/difftest can use a per-inst golden copy
+            // under SMT, but derive the new memory image from the DUT-observed
+            // old value captured in goldenData.
+            memcpy(tmp_data, store_inst->getGolden(), request->_size);
 
-            // before amo operate on golden memory
             (*(request->req()->getAtomicOpFunctor()))(tmp_data);
-            // after amo operate on golden memory
 
             DPRINTF(LSQUnit, "AMO writing to golden memory at addr %#x, data %#lx, mask %#x, size %d\n",
                     paddr, *((uint64_t *)(tmp_data)), 0xff, request->_size);
             cpu->goldenMemManager()->updateGoldenMem(paddr, tmp_data, 0xff,
                                                      request->_size);
+            store_inst->setGolden(tmp_data);
         }
     }
 
@@ -2522,11 +2880,15 @@ LSQUnit::trySendPacket(bool isLoad, PacketPtr data_pkt, bool &bank_conflict, boo
         request->packetSent();
 
         if (isLoad) {
-            auto &storeBuffer = lsq->getStoreBuffer();
-            auto entry = storeBuffer.get(lsqID, pkt->getAddr() & cacheBlockMask);
+            const Addr block_addr = pkt->getAddr() & cacheBlockMask;
+            auto entry = lsq->findForwardingStoreBufferEntry(
+                block_addr, lsqID, request->instruction()->seqNum);
             if (entry) {
                 DPRINTF(StoreBuffer, "sbuffer entry[%#x] coverage %s\n", entry->blockPaddr, pkt->print());
-                if (entry->recordForward(pkt->req, request)) {
+                if (entry->recordForward(
+                        pkt->req, request, lsqID,
+                        request->instruction()->seqNum,
+                        lsq->currentStoreBufferVisibleVersion(block_addr))) {
                     assert(request->isSplit()); // here must be split request
                     stats.sbufferFullForward++;
                 } else if (!request->SBforwardPackets.empty()) {
@@ -2697,8 +3059,12 @@ LSQUnit::dumpInsts() const
     for (auto it = storeQueue.begin(); it != storeQueue.end(); ++it) {
         if (it->valid()) {
             const DynInstPtr &inst(it->instruction());
-            cprintf("idx:%d %s.[sn:%llu] %s\n", it.idx(), inst->pcState(), inst->seqNum,
-                    it->addrReady() ? "AddrReady" : "Not AddrReady");
+            cprintf("idx:%d %s.[sn:%llu] %s squashed=%d canWB=%d completed=%d "
+                    "dataReady=%d staFinish=%d stdFinish=%d\n",
+                    it.idx(), inst->pcState(), inst->seqNum,
+                    it->addrReady() ? "AddrReady" : "Not AddrReady",
+                    inst->isSquashed(), it->canWB(), it->completed(),
+                    it->dataReady(), it->staFinish(), it->stdFinish());
         }
     }
 
@@ -2930,19 +3296,37 @@ LSQUnit::read(LSQRequest *request, ssize_t load_idx)
     }
 
     if (request) {
+        request->SBforwardPackets.clear();
         request->SQforwardPackets.clear();
+        request->_sbufferBypass = false;
+        if (!load_inst->hasPendingCacheReq()) {
+            request->_goldenSnapshotCaptured = false;
+        }
     }
 
     // Check the SQ for any previous stores that might lead to forwarding
     auto store_it = load_inst->sqIt;
-    panic_if(store_it < storeWBIt, "[sn:%llu] Load instruction's store index is younger than store writeback index",
-             load_inst->seqNum);
-    // End once we've reached the top of the LSQ
-    while (store_it != storeWBIt && !load_inst->isDataPrefetch()) {
+    if (storeWBIt.dereferenceable()) {
+        panic_if(store_it < storeWBIt,
+                 "[sn:%llu] Load instruction's store index is younger than "
+                 "store writeback index",
+                 load_inst->seqNum);
+    }
+    // End once we've reached the top of the LSQ. If storeWBIt is end(), there
+    // is no outstanding SQ forwarding window to scan.
+    while (storeWBIt.dereferenceable() &&
+           store_it != storeWBIt &&
+           !load_inst->isDataPrefetch()) {
         // Move the index to one younger
         store_it--;
         assert(store_it->valid());
         assert(store_it->instruction()->seqNum < load_inst->seqNum);
+        auto store_req = store_it->request();
+
+        if (store_it->completed()) {
+            continue;
+        }
+
         int store_size = store_it->size();
 
         // Cache maintenance instructions go down via the store
@@ -3077,9 +3461,6 @@ LSQUnit::read(LSQRequest *request, ssize_t load_idx)
                             "addr %#x, data: %#lx\n", store_it->instruction()->seqNum, load_inst->seqNum,
                             request->mainReq()->getPaddr(), *((uint64_t*)buffer));
                 }
-
-
-
                 load_inst->setFullForward();
 
                 // Don't need to do anything special for split loads.
@@ -3131,11 +3512,13 @@ LSQUnit::read(LSQRequest *request, ssize_t load_idx)
     // sbuffer forward
     if (!load_inst->isDataPrefetch() && !request->isSplit()) {
         Addr blk_addr = request->mainReq()->getPaddr() & cacheBlockMask;
-        int offset = request->mainReq()->getPaddr() & ~cacheBlockMask;
-        auto &storeBuffer = lsq->getStoreBuffer();
-        auto entry = storeBuffer.get(lsqID, blk_addr);
+        auto entry = lsq->findForwardingStoreBufferEntry(
+            blk_addr, lsqID, load_inst->seqNum);
         if (entry) {
-            if (entry->recordForward(request->mainReq(), request)) {
+            if (entry->recordForward(request->mainReq(), request, lsqID,
+                                     load_inst->seqNum,
+                                     lsq->currentStoreBufferVisibleVersion(
+                                         blk_addr))) {
                 // full forward
                 // no need to send to cache
                 stats.sbufferFullForward++;
@@ -3150,7 +3533,6 @@ LSQUnit::read(LSQRequest *request, ssize_t load_idx)
                     DPRINTF(LoadPipeline, "Load [sn:%llu] forward from sbuffer, data: %lx\n",
                             load_inst->seqNum, *((uint64_t*)buffer));
                 }
-
                 return NoFault;
             }
         }
@@ -3196,9 +3578,21 @@ LSQUnit::read(LSQRequest *request, ssize_t load_idx)
     } else {
         DPRINTF(LoadPipeline, "Load [sn:%llu] sendPacketToCache\n", load_inst->seqNum);
         // if cannot forward from bus, do real cache access
+        bool should_capture_golden =
+            system->multiContextDifftest() &&
+            cpu->goldenMemManager() &&
+            cpu->goldenMemManager()->inPmem(request->mainReq()->getPaddr()) &&
+            !request->_goldenSnapshotCaptured;
         request->buildPackets();
         // if the cache is not blocked, do cache access
         request->sendPacketToCache();
+        if (request->isSent() && should_capture_golden) {
+            uint8_t *issue_golden =
+                (uint8_t *)cpu->goldenMemManager()->guestToHost(
+                    request->mainReq()->getPaddr());
+            load_inst->setGolden(issue_golden);
+            request->_goldenSnapshotCaptured = true;
+        }
         if (!request->isSent() && !load_inst->needBankConflictReplay() && !load_inst->needMshrArbFailReplay() &&
             !load_inst->needMshrAliasFailReplay() &&!load_inst->needHitInWriteBufferReplay()) {
             iewStage->blockMemInst(load_inst);
diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh
index b4e604310d..2e950ce1ce 100644
--- a/src/cpu/o3/lsq_unit.hh
+++ b/src/cpu/o3/lsq_unit.hh
@@ -154,6 +154,7 @@ class LSQUnit
         }
 
         LSQRequest* request() { return _request; }
+        const LSQRequest* request() const { return _request; }
         void setRequest(LSQRequest* r) { _request = r; }
         bool hasRequest() { return _request != nullptr; }
         /** Member accessors. */
@@ -212,6 +213,8 @@ class LSQUnit
 
         bool addrReady() const { return _addrReady; }
         bool dataReady() const { return _dataReady; }
+        bool staFinish() const { return _staFinish; }
+        bool stdFinish() const { return _stdFinish; }
         bool canForwardToLoad() const { return _addrReady && _dataReady; }
         bool splitStoreFinish() const { return _staFinish && _stdFinish; }
 
@@ -302,6 +305,7 @@ class LSQUnit
     void insertLoad(const DynInstPtr &load_inst);
     /** Inserts a store instruction. */
     void insertStore(const DynInstPtr &store_inst);
+    bool splitStoreAddrSquashed(const DynInstPtr &inst);
 
     /** Check for ordering violations in the LSQ. For a store squash if we
      * ever find a conflicting load. For a load, only squash if we
@@ -326,6 +330,10 @@ class LSQUnit
      * of the intermediate invalidate.
      */
     void checkSnoop(PacketPtr pkt);
+    void checkLocalStoreVisible(Addr store_paddr,
+                                const std::vector<bool> &store_byte_enable,
+                                InstSeqNum store_seq,
+                                bool replay_executed_loads);
 
     /** Iq issues a load to load pipeline. */
     void issueToLoadPipe(const DynInstPtr &inst);
@@ -353,9 +361,12 @@ class LSQUnit
     /** Writes back stores. */
     void offloadToStoreBuffer(uint32_t max_entries);
 
-    bool insertStoreBuffer(Addr vaddr, Addr paddr, uint8_t* datas, uint64_t size, const std::vector<bool>& mask);
+    bool insertStoreBuffer(Addr vaddr, Addr paddr, uint8_t* datas,
+                           uint64_t size, const std::vector<bool>& mask,
+                           InstSeqNum store_seq);
 
     bool storeBufferEmpty() { return lsq->storeBufferEmpty(); }
+    bool storeBufferEmpty(ThreadID tid) { return lsq->storeBufferEmpty(tid); }
     bool storeBufferSQWillFull() const
     {
         return storeQueue.size() > sqFullUpperLimit;
@@ -380,6 +391,12 @@ class LSQUnit
     /** Check if there exists raw nuke between load and store. */
     bool pipeLineNukeCheck(const DynInstPtr &load_inst, const DynInstPtr &store_inst);
 
+    /** Returns the current request attached to an active LQ entry. */
+    LSQRequest *currentLoadRequest(const DynInstPtr &inst);
+
+    /** Returns the current request attached to an active SQ entry. */
+    LSQRequest *currentStoreRequest(const DynInstPtr &inst);
+
     /** Returns the number of free LQ entries. */
     unsigned numFreeLoadEntries();
 
@@ -438,8 +455,11 @@ class LSQUnit
     /** Returns if there are any stores to writeback. */
     bool hasStoresToWB() { return storesToWB > 0; }
 
+    /** Returns if there are older stores/atomics still pending writeback. */
+    bool hasStoresToWBBefore(InstSeqNum seq_num) const;
+
     /** Returns the number of stores to writeback. */
-    int numStoresToSbuffer() { return storesToWB; }
+    int numStoresToSbuffer() const { return storesToWB; }
 
     /** Update loadCompletedIdx and storeCompletedIdx */
     void updateCompletedIdx();
@@ -570,6 +590,9 @@ class LSQUnit
         /** Instruction whose results are being written back. */
         DynInstPtr inst;
 
+        /** Request that owns the delayed writeback lifecycle. */
+        LSQRequest *request;
+
         /** The packet that would have been sent to memory. */
         PacketPtr pkt;
 
diff --git a/src/cpu/o3/rename.cc b/src/cpu/o3/rename.cc
index d3e51e2c86..0be9a0906e 100644
--- a/src/cpu/o3/rename.cc
+++ b/src/cpu/o3/rename.cc
@@ -78,6 +78,8 @@ Rename::Rename(CPU *_cpu, const BaseO3CPUParams &params)
         fixedbuffer[tid] = boost::circular_buffer<DynInstPtr>(renameWidth);
         renameMap[tid] = nullptr;
         stalls[tid] = {false, false};
+        finalCommitSeq[tid] = 0;
+        releaseSeq[tid] = 0;
     }
 
     assert(decodeToRenameDelay == 1);
@@ -260,6 +262,8 @@ Rename::resetStage()
     for (ThreadID tid = 0; tid < numThreads; tid++) {
 
         stalls[tid].iew = false;
+        finalCommitSeq[tid] = 0;
+        releaseSeq[tid] = 0;
     }
 }
 
@@ -415,7 +419,15 @@ Rename::tick()
 
     updateActivate();
 
-    if (wroteToTimeBuffer || releaseSeq < finalCommitSeq) {
+    bool release_pending = false;
+    for (ThreadID tid = 0; tid < numThreads; ++tid) {
+        if (releaseSeq[tid] < finalCommitSeq[tid]) {
+            release_pending = true;
+            break;
+        }
+    }
+
+    if (wroteToTimeBuffer || release_pending) {
         DPRINTF(Activity, "Activity this cycle.\n");
         cpu->activityThisCycle();
     }
@@ -426,21 +438,26 @@ Rename::releasePhysRegs()
 {
     // Release physical registers up to releaseWidth
     auto threads = activeThreads->begin();
-    if (releaseSeq + releaseWidth < finalCommitSeq) {
-        releaseSeq += releaseWidth;
-    } else {
-        releaseSeq = finalCommitSeq;
-    }
     while (threads != activeThreads->end()) {
         ThreadID tid = *threads++;
 
-        removeFromHistory(releaseSeq, tid);
-        // If we committed this cycle then doneSeqNum will be > 0
+        if (releaseSeq[tid] + releaseWidth < finalCommitSeq[tid]) {
+            releaseSeq[tid] += releaseWidth;
+        } else {
+            releaseSeq[tid] = finalCommitSeq[tid];
+        }
+
+        removeFromHistory(releaseSeq[tid], tid);
+        // doneSeqNum is also reused as a squash-progress marker while the
+        // ROB is walking younger entries. Only real commit progress should
+        // release physical registers.
         if (fromCommit->commitInfo[tid].doneSeqNum != 0 &&
-            !fromCommit->commitInfo[tid].squash) {
+            !fromCommit->commitInfo[tid].squash &&
+            !fromCommit->commitInfo[tid].robSquashing) {
 
-            finalCommitSeq = fromCommit->commitInfo[tid].doneSeqNum;
-            releaseSeq = historyBuffer->empty() ? 0 : historyBuffer[tid].back().instSeqNum;
+            finalCommitSeq[tid] = fromCommit->commitInfo[tid].doneSeqNum;
+            releaseSeq[tid] =
+                historyBuffer[tid].empty() ? 0 : historyBuffer[tid].back().instSeqNum;
         }
     }
 }
@@ -600,7 +617,7 @@ Rename::moveInstsToBuffer()
     for (int i = 0; i < insts_from_decode; ++i) {
         const DynInstPtr &inst = fromDecode->insts[i];
         assert(inst->threadNumber == tid);
-        if (localSquashVer.largerThan(inst->getVersion())) {
+        if (localSquashVer[tid].largerThan(inst->getVersion())) {
             inst->setSquashed();
         } else {
             assert(!fixedbuffer[tid].full());
@@ -625,9 +642,10 @@ Rename::checkSquash()
 
             squash(fromCommit->commitInfo[i].doneSeqNum, i);
 
-            localSquashVer.update(fromCommit->commitInfo[i].squashVersion.getVersion());
+            localSquashVer[i].update(
+                fromCommit->commitInfo[i].squashVersion.getVersion());
             DPRINTF(Rename, "Updating squash version to %u\n",
-                    localSquashVer.getVersion());
+                    localSquashVer[i].getVersion());
         }
     }
 }
diff --git a/src/cpu/o3/rename.hh b/src/cpu/o3/rename.hh
index 50c566b31a..ed03f62d8e 100644
--- a/src/cpu/o3/rename.hh
+++ b/src/cpu/o3/rename.hh
@@ -277,9 +277,9 @@ class Rename
      */
     std::list<RenameHistory> historyBuffer[MaxThreads];
 
-    InstSeqNum finalCommitSeq = 0;
+    InstSeqNum finalCommitSeq[MaxThreads] = {};
 
-    InstSeqNum releaseSeq = 0;
+    InstSeqNum releaseSeq[MaxThreads] = {};
 
     void tryFreePReg(PhysRegIdPtr phys_reg);
 
@@ -451,7 +451,7 @@ class Rename
 
     StallReason checkRenameStallFromIEW(ThreadID tid);
 
-    SquashVersion localSquashVer;
+    SquashVersion localSquashVer[MaxThreads];
 
     /** Value predictor */
     valuepred::VPUnit *valuePred;
diff --git a/src/cpu/o3/rob.cc b/src/cpu/o3/rob.cc
index 4e007804c2..d57ea8b0df 100644
--- a/src/cpu/o3/rob.cc
+++ b/src/cpu/o3/rob.cc
@@ -297,15 +297,23 @@ ROB::countInsts(ThreadID tid)
     return instList[tid].size();
 }
 
+uint32_t
+ROB::countInstsOfGroups(ThreadID tid, int groups)
+{
+    const auto &sizes = threadGroups[tid];  // this thread's group entries
+    int total = 0;  // sum of at most `groups` leading entries
+    for (auto g = sizes.begin(); groups > 0 && g != sizes.end(); ++g, --groups) {
+        total += *g;
+    }
+    return total;
+}
+
 uint32_t
 ROB::countInstsOfGroups(int groups)
 {
     int sum = 0;
     for (ThreadID tid = 0; tid < numThreads; tid++) {
-        auto it = threadGroups[tid].begin();
-        for (int i = 0; i < groups && it != threadGroups[tid].end(); i++, it++) {
-            sum += *it;
-        }
+        sum += countInstsOfGroups(tid, groups);
     }
     return sum;
 }
@@ -420,6 +428,36 @@ ROB::retireHead(ThreadID tid)
     cpu->removeFrontInst(head_inst);
 }
 
+void
+ROB::drainSquashedHead(ThreadID tid)
+{
+    // numInstsInROB is not indexed by tid, so it cannot prove that this
+    // thread's list is non-empty; check the list before using begin().
+    assert(numInstsInROB > 0);
+    assert(!instList[tid].empty());
+
+    stats.writes++;
+
+    // Take ownership of the head entry and unlink it from the thread list.
+    InstIt head_it = instList[tid].begin();
+    DynInstPtr head_inst = std::move(*head_it);
+    instList[tid].erase(head_it);
+
+    assert(head_inst->readyToCommit());
+    assert(head_inst->isSquashed());
+
+    DPRINTF(ROB, "[tid:%i] Draining squashed head instruction, "
+            "instruction PC %s, [sn:%llu]\n", tid, head_inst->pcState(),
+            head_inst->seqNum);
+
+    --numInstsInROB;
+    commitGroup(head_inst, tid);
+    head_inst->clearInROB();
+    updateHead();
+
+    cpu->removeFrontInst(head_inst);
+}
+
 bool
 ROB::isHeadGroupReady(ThreadID tid)
 {
diff --git a/src/cpu/o3/rob.hh b/src/cpu/o3/rob.hh
index d9b3e9999b..94b93d2593 100644
--- a/src/cpu/o3/rob.hh
+++ b/src/cpu/o3/rob.hh
@@ -164,6 +164,11 @@ class ROB
      */
     void retireHead(ThreadID tid);
 
+    /** Drains a squashed head instruction from a specific thread without
+     *  marking it committed.
+     */
+    void drainSquashedHead(ThreadID tid);
+
     /** Is the oldest instruction across all threads ready. */
 //    bool isHeadReady();
 
@@ -256,6 +261,7 @@ class ROB
         return sum;
     }
 
+    uint32_t countInstsOfGroups(ThreadID tid, int groups);
     uint32_t countInstsOfGroups(int groups);
 
     bool (ROB::*allocateNewGroup)(const DynInstPtr inst, ThreadID tid);
diff --git a/src/cpu/o3/smt_sched.hh b/src/cpu/o3/smt_sched.hh
index d5222e758d..74198c44fd 100644
--- a/src/cpu/o3/smt_sched.hh
+++ b/src/cpu/o3/smt_sched.hh
@@ -28,6 +28,8 @@ class InstsCounter
 
     uint64_t getCounter(ThreadID tid) { return counter[tid]; }
     void setCounter(ThreadID tid, uint64_t value) { counter[tid] = value; }
+    void incCounter(ThreadID tid, uint64_t value = 1) { counter[tid] += value; }
+    void decCounter(ThreadID tid, uint64_t value = 1) { counter[tid] -= value; }
 };
 
 class SMTScheduler
@@ -36,7 +38,8 @@ class SMTScheduler
     int numThreads;
   public:
     SMTScheduler(int numThreads) : numThreads(numThreads) {}
-    virtual ThreadID getThread();
+    virtual ~SMTScheduler() = default;
+    virtual ThreadID getThread() = 0;
 };
 
 
@@ -124,7 +127,28 @@ class MultiPrioritySched : public SMTScheduler
     }
 };
 
+class IndependentIQICountScheduler : public SMTScheduler {
+  private:
+    InstsCounter* counter;  // Counter for this IQ only
 
+  public:
+    IndependentIQICountScheduler(int numThreads, InstsCounter* counter)
+        : SMTScheduler(numThreads), counter(counter) {}
+
+    /**
+     * Returns the thread with the smallest counter value. Thread 0 is the
+     * initial candidate and ties keep the lowest-numbered thread.
+     */
+    ThreadID getThread() override {
+        ThreadID best = 0;
+        for (ThreadID tid = 1; tid < numThreads; ++tid) {
+            if (counter->getCounter(tid) < counter->getCounter(best)) {
+                best = tid;
+            }
+        }
+        return best;
+    }
+};
 
 }}
 #endif
diff --git a/src/cpu/pred/BranchPredictor.py b/src/cpu/pred/BranchPredictor.py
index 402d5a84aa..044ff2885e 100644
--- a/src/cpu/pred/BranchPredictor.py
+++ b/src/cpu/pred/BranchPredictor.py
@@ -1021,6 +1021,7 @@ class BTBRAS(TimedBaseBTBPredictor):
     cxx_class = 'gem5::branch_prediction::btb_pred::BTBRAS'
     cxx_header = 'cpu/pred/btb/ras.hh'
 
+    numThreads = Param.Unsigned(Parent.numThreads, "Number of threads")
     numEntries = Param.Unsigned(32, "Number of entries in the RAS")
     ctrWidth = Param.Unsigned(8, "Width of the counter")
     numInflightEntries = Param.Unsigned(384, "Number of inflight entries")
diff --git a/src/cpu/pred/btb/abtb.cc b/src/cpu/pred/btb/abtb.cc
index c4876e8158..8013900e83 100644
--- a/src/cpu/pred/btb/abtb.cc
+++ b/src/cpu/pred/btb/abtb.cc
@@ -166,28 +166,42 @@ AheadBTB::setTrace()
 std::vector<AheadBTB::TickedBTBEntry>
 AheadBTB::processEntries(const std::vector<TickedBTBEntry>& entries, Addr startAddr)
 {
-    int hitNum = entries.size();
-    bool hit = hitNum > 0;
+    auto processed_entries = entries;
     
+    // Sort by instruction order
+    std::sort(processed_entries.begin(), processed_entries.end(), 
+             [](const BTBEntry &a, const BTBEntry &b) {
+                 return a.pc < b.pc;
+             });
+
+    auto it = std::remove_if(processed_entries.begin(), processed_entries.end(),
+                           [startAddr](const BTBEntry &e) {
+                               return e.pc < startAddr;
+                           });
+    processed_entries.erase(it, processed_entries.end());
+
+    Addr abtb_end = (startAddr + predictWidth) &
+                    ~mask(floorLog2(predictWidth) - 1);
+    it = std::remove_if(processed_entries.begin(), processed_entries.end(),
+                        [abtb_end](const BTBEntry &e) {
+                            return e.pc >= abtb_end;
+                        });
+    processed_entries.erase(it, processed_entries.end());
+
+    int hitNum = processed_entries.size();
+    bool hit = hitNum > 0;
+
     // Update prediction statistics
     if (hit) {
         DPRINTF(ABTB, "BTB: lookup hit, dumping hit entry\n");
         btbStats.predHit += hitNum;
-        for (auto &entry: entries) {
+        for (auto &entry: processed_entries) {
             printTickedBTBEntry(entry);
         }
     } else {
         btbStats.predMiss++;
         DPRINTF(ABTB, "BTB: lookup miss\n");
     }
-
-    auto processed_entries = entries;
-    
-    // Sort by instruction order
-    std::sort(processed_entries.begin(), processed_entries.end(), 
-             [](const BTBEntry &a, const BTBEntry &b) {
-                 return a.pc < b.pc;
-             });
     return processed_entries;
 }
 
@@ -299,12 +313,13 @@ AheadBTB::putPCHistory(Addr startAddr,
                          std::vector<FullBTBPrediction> &stagePreds)
 {
     meta = std::make_shared<BTBMeta>();
+    const uint8_t asidHash = stagePreds.empty() ? 0 : stagePreds.front().asidHash;
     // Lookup all matching entries in BTB
-    auto find_entries = lookup(startAddr);
-    
+    auto find_entries = lookup(startAddr, asidHash);
+
     // Process BTB entries
     auto processed_entries = processEntries(find_entries, startAddr);
-    
+
     // Fill predictions for each pipeline stage
     fillStagePredictions(processed_entries, stagePreds);
     
@@ -313,8 +328,9 @@ AheadBTB::putPCHistory(Addr startAddr,
 }
 
 std::shared_ptr<void>
-AheadBTB::getPredictionMeta()
+AheadBTB::getPredictionMeta(ThreadID tid)
 {
+    (void)tid;
     // Lazy-initialize meta so callers never observe a null pointer
     // This avoids early-cycle crashes when prediction hasn't populated meta yet
     if (!meta) {
@@ -342,13 +358,13 @@ AheadBTB::recoverHist(const boost::dynamic_bitset<> &history, const FetchTarget
  * @return Vector of matching BTB entries
  */
 std::vector<AheadBTB::TickedBTBEntry>
-AheadBTB::lookupSingleBlock(Addr block_pc)
+AheadBTB::lookupSingleBlock(Addr block_pc, uint8_t asidHash)
 {
     std::vector<TickedBTBEntry> res;
     if (block_pc & 0x1) {
         return res; // ignore false hit when lowest bit is 1
     }
-    Addr btb_idx = getIndex(block_pc);
+    Addr btb_idx = getIndex(block_pc, asidHash);
     auto btb_set = btb[btb_idx];
     assert(btb_idx < numSets);
     // AheadBTB always uses ahead-pipelined implementation:
@@ -356,7 +372,7 @@ AheadBTB::lookupSingleBlock(Addr block_pc)
     DPRINTF(AheadPipeline, "AheadBTB: pushing set for ahead-pipelined stages, idx %ld\n", btb_idx);
     aheadReadBtbEntries.push(std::make_tuple(block_pc, btb_idx, btb_set));
 
-    Addr tag_curStartpc = getTag(block_pc);// abtb uses current FB pc to get tag
+    Addr tag_curStartpc = getTag(block_pc, asidHash);// abtb uses current FB pc to get tag
     Addr pc = 0;
     Addr idx_prvStartpc = 0;// abtb uses previous FB pc to get index
     BTBSet set;
@@ -391,7 +407,7 @@ AheadBTB::lookupSingleBlock(Addr block_pc)
 }
 
 std::vector<AheadBTB::TickedBTBEntry>
-AheadBTB::lookup(Addr block_pc)
+AheadBTB::lookup(Addr block_pc, uint8_t asidHash)
 {
     std::vector<TickedBTBEntry> res;
     if (block_pc & 0x1) {
@@ -399,7 +415,7 @@ AheadBTB::lookup(Addr block_pc)
     }
 
     // AheadBTB always uses single block lookup
-    res = lookupSingleBlock(block_pc);
+    res = lookupSingleBlock(block_pc, asidHash);
     return res;
 }
 
@@ -593,12 +609,12 @@ AheadBTB::updateUsingS3Pred(FullBTBPrediction &s3Pred, const Addr previousPC)
 
     for (auto &entry : entries_to_update) {
         Addr startPC = s3Pred.bbStart;
-        Addr btb_tag = getTag(startPC);  // use last pc to get tag
+        Addr btb_tag = getTag(startPC, s3Pred.asidHash);  // use last pc to get tag
         if (previousPC == 0) {
             DPRINTF(ABTB, "AheadBTB: no previous PC, skipping update\n");
             return;
         }
-        Addr btb_idx = getIndex(previousPC);  // use last pc to get idx
+        Addr btb_idx = getIndex(previousPC, s3Pred.asidHash);  // use last pc to get idx
         BranchInfo takenbranchinfo;
         takenbranchinfo.pc = s3Pred.getTakenEntry().pc;
         takenbranchinfo.target = s3Pred.getTakenEntry().target;
@@ -669,7 +685,7 @@ AheadBTB::update(const FetchTarget &stream)
     // 4. Update BTB entries - each entry uses its own PC to calculate index and tag
     for (auto &entry : entries_to_update) {
         Addr startPC = stream.getRealStartPC();
-        Addr btb_tag = getTag(startPC);  // use current pc to get tag
+        Addr btb_tag = getTag(startPC, stream.asidHash);  // use current pc to get tag
 
         // AheadBTB always uses ahead-pipelined update logic
         Addr previousPC = getPreviousPC(stream);
@@ -677,7 +693,7 @@ AheadBTB::update(const FetchTarget &stream)
             DPRINTF(ABTB, "AheadBTB: no previous PC, skipping update\n");
             return;
         }
-        Addr btb_idx = getIndex(previousPC);  // use last pc to get idx
+        Addr btb_idx = getIndex(previousPC, stream.asidHash);  // use last pc to get idx
         entry.source = getComponentIdx(); // mark the entry source as AheadBTB
         updateBTBEntry(btb_idx, btb_tag, entry, stream.exeBranchInfo, stream.exeTaken);
     }
diff --git a/src/cpu/pred/btb/abtb.hh b/src/cpu/pred/btb/abtb.hh
index 9e7abc6260..e5e29f7ffd 100644
--- a/src/cpu/pred/btb/abtb.hh
+++ b/src/cpu/pred/btb/abtb.hh
@@ -147,7 +147,7 @@ class AheadBTB : public TimedBaseBTBPredictor
     /** Get prediction BTBMeta
      *  @return Returns the prediction meta
      */
-    std::shared_ptr<void> getPredictionMeta() override;
+    std::shared_ptr<void> getPredictionMeta(ThreadID tid = 0) override;
 
     // not used
     void specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) override;
@@ -224,8 +224,9 @@ class AheadBTB : public TimedBaseBTBPredictor
      *  @param inst_PC The branch to look up.
      *  @return Returns the index into the BTB.
      */
-    inline Addr getIndex(Addr instPC) {
-        return (instPC >> idxShiftAmt) & idxMask;
+    inline Addr getIndex(Addr instPC, uint8_t asidHash) {
+        Addr baseIndex = (instPC >> idxShiftAmt) & idxMask;
+        return xorAsidHashIntoIndex(baseIndex, floorLog2(numSets), asidHash);
     }
 
     /** Returns the tag bits of a given address.
@@ -234,8 +235,9 @@ class AheadBTB : public TimedBaseBTBPredictor
      *  @param inst_PC The branch's address.
      *  @return Returns the tag bits.
      */
-    inline Addr getTag(Addr instPC) {
-        return (instPC >> tagShiftAmt) & tagMask;
+    inline Addr getTag(Addr instPC, uint8_t asidHash) {
+        Addr baseTag = (instPC >> tagShiftAmt) & tagMask;
+        return injectAsidHashIntoTag(baseTag, tagBits, asidHash);
     }
 
 
@@ -365,13 +367,13 @@ class AheadBTB : public TimedBaseBTBPredictor
      *  @param inst_PC The address of the block to look up.
      *  @return Returns all hit BTB entries.
      */
-    std::vector<TickedBTBEntry> lookup(Addr block_pc);
+    std::vector<TickedBTBEntry> lookup(Addr block_pc, uint8_t asidHash);
 
     /** Helper function to lookup entries in a single block
      * @param block_pc The aligned PC to lookup
      * @return Vector of matching BTB entries
      */
-    std::vector<TickedBTBEntry> lookupSingleBlock(Addr block_pc);
+    std::vector<TickedBTBEntry> lookupSingleBlock(Addr block_pc, uint8_t asidHash);
 
     /** The BTB structure:
      *  - Organized as numSets sets
diff --git a/src/cpu/pred/btb/btb_ittage.cc b/src/cpu/pred/btb/btb_ittage.cc
index 58828467cd..2f3ca7fe9d 100644
--- a/src/cpu/pred/btb/btb_ittage.cc
+++ b/src/cpu/pred/btb/btb_ittage.cc
@@ -38,6 +38,8 @@ ittageStats(this, p.numPredictors)
     tableIndexMasks.resize(numPredictors);
     tableTagBits.resize(numPredictors);
     tableTagMasks.resize(numPredictors);
+    threadHistory.resize(MaxThreads);
+    threadMeta.resize(MaxThreads);
     for (unsigned int i = 0; i < p.numPredictors; ++i) {
         //initialize ittage predictor
         assert(tableSizes.size() >= numPredictors);
@@ -53,9 +55,15 @@ ittageStats(this, p.numPredictors)
 
         assert(tablePcShifts.size() >= numPredictors);
 
-        tagFoldedHist.push_back(PathFoldedHist((int)histLengths[i], (int)tableTagBits[i], (int)16));
-        altTagFoldedHist.push_back(PathFoldedHist((int)histLengths[i], (int)tableTagBits[i]-1, (int)16));
-        indexFoldedHist.push_back(PathFoldedHist((int)histLengths[i], (int)tableIndexBits[i], (int)16));
+        for (ThreadID tid = 0; tid < MaxThreads; ++tid) {
+            auto &state = threadHistory[tid];
+            state.tagFoldedHist.emplace_back(
+                (int)histLengths[i], (int)tableTagBits[i], (int)16);
+            state.altTagFoldedHist.emplace_back(
+                (int)histLengths[i], (int)tableTagBits[i] - 1, (int)16);
+            state.indexFoldedHist.emplace_back(
+                (int)histLengths[i], (int)tableIndexBits[i], (int)16);
+        }
     }
     // useAlt.resize(128);
     // for (unsigned i = 0; i < useAlt.size(); ++i) {
@@ -64,6 +72,27 @@ ittageStats(this, p.numPredictors)
     usefulResetCnt = 0;
 }
 
+ThreadID
+BTBITTAGE::predictorTid(const std::vector<FullBTBPrediction> &stagePreds) const
+{
+    assert(!stagePreds.empty());    // front() below needs at least one stage
+    return stagePreds.front().tid;  // thread id is read from the first stage
+}
+
+BTBITTAGE::ThreadHistoryState &
+BTBITTAGE::historyState(ThreadID tid)
+{
+    assert(tid < threadHistory.size());  // guard the index into threadHistory
+    return threadHistory[tid];           // mutable folded-history state of tid
+}
+
+const BTBITTAGE::ThreadHistoryState &
+BTBITTAGE::historyState(ThreadID tid) const
+{
+    assert(tid < threadHistory.size());  // guard the index into threadHistory
+    return threadHistory[tid];           // read-only folded-history state of tid
+}
+
 void
 BTBITTAGE::tickStart()
 {
@@ -73,8 +102,10 @@ void
 BTBITTAGE::tick() {}
 
 void
-BTBITTAGE::lookupHelper(Addr startAddr, const std::vector<BTBEntry> &btbEntries, IndirectTargets& results)
+BTBITTAGE::lookupHelper(Addr startAddr, const std::vector<BTBEntry> &btbEntries,
+                        IndirectTargets& results, ThreadID tid, uint8_t asidHash)
 {
+    (void)asidHash;
     DPRINTF(ITTAGE, "lookupHelper startAddr: %#lx\n", startAddr);
     std::vector<TagePrediction> preds;
     for (auto &btb_entry : btbEntries) {
@@ -150,7 +181,7 @@ BTBITTAGE::lookupHelper(Addr startAddr, const std::vector<BTBEntry> &btbEntries,
             }
             // Note: predTargetHit will be updated in the update phase when we know the actual target
             TagePrediction pred(btb_entry.pc, main_info, alt_info, use_alt, main_target);
-            meta->preds[btb_entry.pc] = pred;
+            threadMeta[tid]->preds[btb_entry.pc] = pred;
         }
     }
 }
@@ -162,17 +193,20 @@ BTBITTAGE::dryRunCycle(Addr startPC) {
 
 void
 BTBITTAGE::putPCHistory(Addr stream_start, const bitset &history, std::vector<FullBTBPrediction> &stagePreds) {
+    const ThreadID tid = predictorTid(stagePreds);
+    const uint8_t asidHash = stagePreds.empty() ? 0 : stagePreds.front().asidHash;
+    const auto &state = historyState(tid);
     if (debugPC == stream_start) {
         debugFlag = true;
     }
     DPRINTF(ITTAGE, "putPCHistory startAddr: %#lx\n", stream_start);
 
     // clear old metas
-    meta = std::make_shared<TageMeta>();
+    threadMeta[tid] = std::make_shared<TageMeta>();
     // assign history for meta
-    meta->tagFoldedHist = tagFoldedHist;
-    meta->altTagFoldedHist = altTagFoldedHist;
-    meta->indexFoldedHist = indexFoldedHist;
+    threadMeta[tid]->tagFoldedHist = state.tagFoldedHist;
+    threadMeta[tid]->altTagFoldedHist = state.altTagFoldedHist;
+    threadMeta[tid]->indexFoldedHist = state.indexFoldedHist;
 
     lookupEntries.clear();
     lookupIndices.clear();
@@ -181,8 +215,9 @@ BTBITTAGE::putPCHistory(Addr stream_start, const bitset &history, std::vector<Fu
     // all btb entries should use the same lookup result
     // but each btb entry can use prediction from different tables
     for (int i = 0; i < numPredictors; ++i) {
-        Addr index = getTageIndex(stream_start, i);
-        Addr tag = getTageTag(stream_start, i);
+        Addr index = getTageIndex(stream_start, i, state.indexFoldedHist[i].get(), asidHash);
+        Addr tag = getTageTag(stream_start, i, state.tagFoldedHist[i].get(),
+                              state.altTagFoldedHist[i].get(), asidHash);
         auto &entry = tageTable[i][index];
         lookupEntries.push_back(entry);
         lookupIndices.push_back(index);
@@ -191,20 +226,24 @@ BTBITTAGE::putPCHistory(Addr stream_start, const bitset &history, std::vector<Fu
         DPRINTF(ITTAGE, "lookup table %d[%d]: valid %d, tag %d, ctr %d, useful %d\n",
             i, index, entry.valid, entry.tag, entry.counter, entry.useful);
     }
-    meta->usefulMask = std::move(useful_mask);
+    threadMeta[tid]->usefulMask = std::move(useful_mask);
 
     for (int s = getDelay(); s < stagePreds.size(); s++) {
         auto &stage_pred = stagePreds[s];
         stage_pred.indirectTargets.clear();
-        lookupHelper(stream_start, stage_pred.btbEntries, stage_pred.indirectTargets);
+        lookupHelper(stream_start, stage_pred.btbEntries,
+                     stage_pred.indirectTargets, tid, asidHash);
     }
     DPRINTF(ITTAGE, "putPCHistory end\n");
     debugFlag = false;
 }
 
 std::shared_ptr<void>
-BTBITTAGE::getPredictionMeta() {
-    return meta;
+BTBITTAGE::getPredictionMeta(ThreadID tid) {
+    if (tid >= threadMeta.size()) {
+        return nullptr;
+    }
+    return threadMeta[tid];
 }
 
 void
@@ -367,8 +406,9 @@ BTBITTAGE::update(const FetchTarget &stream)
                 unsigned startTable = main_found ? main_info.table + 1 : 0;
 
                 for (int ti = startTable; ti < numPredictors; ti++) {
-                    Addr newIndex = getTageIndex(startAddr, ti, updateIndexFoldedHist[ti].get());
-                    Addr newTag = getTageTag(startAddr, ti, updateTagFoldedHist[ti].get(), updateAltTagFoldedHist[ti].get());
+                    Addr newIndex = getTageIndex(startAddr, ti, updateIndexFoldedHist[ti].get(), stream.asidHash);
+                    Addr newTag = getTageTag(startAddr, ti, updateTagFoldedHist[ti].get(),
+                                             updateAltTagFoldedHist[ti].get(), stream.asidHash);
                     assert(newIndex < tageTable[ti].size());
                     auto &newEntry = tageTable[ti][newIndex];
 
@@ -402,7 +442,8 @@ BTBITTAGE::updateCounter(bool taken, unsigned width, short &counter) {
 }
 
 Addr
-BTBITTAGE::getTageTag(Addr pc, int t, uint64_t foldedHist, uint64_t altFoldedHist)
+BTBITTAGE::getTageTag(Addr pc, int t, uint64_t foldedHist, uint64_t altFoldedHist,
+                      uint8_t asidHash)
 {
     // Create mask for tableTagBits[t]
     uint64_t mask = ((1ULL << tableTagBits[t]) - 1);
@@ -414,30 +455,33 @@ BTBITTAGE::getTageTag(Addr pc, int t, uint64_t foldedHist, uint64_t altFoldedHis
     uint64_t altTagBits = (altFoldedHist << 1);
 
     // XOR all components
-    return (pcBits ^ foldedHist ^ altTagBits) & mask;
+    return injectAsidHashIntoTag((pcBits ^ foldedHist ^ altTagBits) & mask,
+                                 tableTagBits[t], asidHash);
 }
 
 Addr
-BTBITTAGE::getTageTag(Addr pc, int t)
+BTBITTAGE::getTageTag(Addr pc, int t, uint8_t asidHash)
 {
-    return getTageTag(pc, t, tagFoldedHist[t].get(), altTagFoldedHist[t].get());
+    const auto &state = historyState(0);
+    return getTageTag(pc, t, state.tagFoldedHist[t].get(),
+                      state.altTagFoldedHist[t].get(), asidHash);
 }
 
 Addr
-BTBITTAGE::getTageIndex(Addr pc, int t, uint64_t foldedHist)
+BTBITTAGE::getTageIndex(Addr pc, int t, uint64_t foldedHist, uint8_t asidHash)
 {
     // Create mask for tableIndexBits[t]
     uint64_t mask = ((1ULL << tableIndexBits[t]) - 1);
 
     // Extract lower bits of PC and XOR with folded history
     uint64_t pcBits = (pc >> floorLog2(blockSize));
-    return (pcBits ^ foldedHist) & mask;
+    return xorAsidHashIntoIndex((pcBits ^ foldedHist) & mask, tableIndexBits[t], asidHash);
 }
 
 Addr
-BTBITTAGE::getTageIndex(Addr pc, int t)
+BTBITTAGE::getTageIndex(Addr pc, int t, uint8_t asidHash)
 {
-    return getTageIndex(pc, t, indexFoldedHist[t].get());
+    return getTageIndex(pc, t, historyState(0).indexFoldedHist[t].get(), asidHash);
 }
 
 bool
@@ -478,8 +522,10 @@ BTBITTAGE::satDecrement(int min, short &counter)
  * @param target The target address of the branch
  */
 void
-BTBITTAGE::doUpdateHist(const boost::dynamic_bitset<> &history, bool taken, Addr pc, Addr target)
+BTBITTAGE::doUpdateHist(const boost::dynamic_bitset<> &history, bool taken,
+                        Addr pc, Addr target, ThreadID tid)
 {
+    auto &state = historyState(tid);
     if (debug::ITTAGEHistory) {  // if debug flag is off, do not use to_string since it's too slow
         std::string buf;
         boost::to_string(history, buf);
@@ -492,7 +538,9 @@ BTBITTAGE::doUpdateHist(const boost::dynamic_bitset<> &history, bool taken, Addr
 
     for (int t = 0; t < numPredictors; t++) {
         for (int type = 0; type < 3; type++) {
-            auto &foldedHist = type == 0 ? indexFoldedHist[t] : type == 1 ? tagFoldedHist[t] : altTagFoldedHist[t];
+            auto &foldedHist = type == 0 ? state.indexFoldedHist[t]
+                                         : type == 1 ? state.tagFoldedHist[t]
+                                                     : state.altTagFoldedHist[t];
             // since we have folded path history, we can put arbitrary shamt here, and it wouldn't make a difference
             foldedHist.update(history, 2, taken, pc, target);
             DPRINTF(ITTAGEHistory, "t: %d, type: %d, foldedHist _folded 0x%lx\n", t, type, foldedHist.get());
@@ -503,7 +551,7 @@ BTBITTAGE::doUpdateHist(const boost::dynamic_bitset<> &history, bool taken, Addr
 bool
 BTBITTAGE::tageHit()
 {
-    auto meta = getPredictionMeta();
+    auto meta = getPredictionMeta(0);
     auto preds = std::static_pointer_cast<TageMeta>(meta)->preds;
     bool hit = false;
     for (auto & [pc, pred] : preds) {
@@ -531,7 +579,7 @@ void
 BTBITTAGE::specUpdatePHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred)
 {
     auto [pc, target, taken] = pred.getPHistInfo();
-    doUpdateHist(history, taken, pc, target);
+    doUpdateHist(history, taken, pc, target, pred.tid);
 }
 
 /**
@@ -550,18 +598,28 @@ BTBITTAGE::specUpdatePHist(const boost::dynamic_bitset<> &history, FullBTBPredic
 void
 BTBITTAGE::recoverPHist(const boost::dynamic_bitset<> &history, const FetchTarget &entry, int shamt, bool cond_taken)
 {
+    auto &state = historyState(entry.tid);
     std::shared_ptr<TageMeta> predMeta = std::static_pointer_cast<TageMeta>(entry.predMetas[getComponentIdx()]);
     for (int i = 0; i < numPredictors; i++) {
-        tagFoldedHist[i].recover(predMeta->tagFoldedHist[i]);
-        altTagFoldedHist[i].recover(predMeta->altTagFoldedHist[i]);
-        indexFoldedHist[i].recover(predMeta->indexFoldedHist[i]);
+        state.tagFoldedHist[i].recover(predMeta->tagFoldedHist[i]);
+        state.altTagFoldedHist[i].recover(predMeta->altTagFoldedHist[i]);
+        state.indexFoldedHist[i].recover(predMeta->indexFoldedHist[i]);
     }
-    doUpdateHist(history, cond_taken, entry.getControlPC(), entry.getTakenTarget());
+    doUpdateHist(history, cond_taken, entry.getControlPC(),
+                 entry.getTakenTarget(), entry.tid);
 }
 
 void
 BTBITTAGE::checkFoldedHist(const boost::dynamic_bitset<> &hist, const char * when)
 {
+    checkFoldedHist(hist, 0, when);
+}
+
+void
+BTBITTAGE::checkFoldedHist(const boost::dynamic_bitset<> &hist, ThreadID tid,
+                           const char * when)
+{
+    auto &state = historyState(tid);
     if (debugFlag) {
         DPRINTF(ITTAGE, "checking folded history when %s\n", when);
         std::string hist_str;
@@ -572,7 +630,9 @@ BTBITTAGE::checkFoldedHist(const boost::dynamic_bitset<> &hist, const char * whe
         for (int type = 0; type < 2; type++) {
             DPRINTF(ITTAGE, "t: %d, type: %d\n", t, type);
             std::string buf2, buf3;
-            auto &foldedHist = type == 0 ? indexFoldedHist[t] : type == 1 ? tagFoldedHist[t] : altTagFoldedHist[t];
+            auto &foldedHist = type == 0 ? state.indexFoldedHist[t]
+                                         : type == 1 ? state.tagFoldedHist[t]
+                                                     : state.altTagFoldedHist[t];
             foldedHist.check(hist);
         }
     }
diff --git a/src/cpu/pred/btb/btb_ittage.hh b/src/cpu/pred/btb/btb_ittage.hh
index e86b45817b..7db7e39350 100644
--- a/src/cpu/pred/btb/btb_ittage.hh
+++ b/src/cpu/pred/btb/btb_ittage.hh
@@ -3,6 +3,7 @@
 
 #include <deque>
 #include <map>
+#include <memory>
 #include <utility>
 #include <vector>
 
@@ -10,6 +11,7 @@
 #include "base/statistics.hh"
 #include "base/types.hh"
 #include "cpu/inst_seq.hh"
+#include "cpu/o3/limits.hh"
 #include "cpu/pred/btb/common.hh"
 #include "cpu/pred/btb/folded_hist.hh"
 #include "cpu/pred/btb/timed_base_pred.hh"
@@ -30,6 +32,7 @@ class BTBITTAGE : public TimedBaseBTBPredictor
 {
     using defer = std::shared_ptr<void>;
     using bitset = boost::dynamic_bitset<>;
+    static constexpr unsigned MaxThreads = o3::MaxThreads;
   public:
     typedef BTBITTAGEParams Params;
 
@@ -99,7 +102,7 @@ class BTBITTAGE : public TimedBaseBTBPredictor
                       const boost::dynamic_bitset<> &history,
                       std::vector<FullBTBPrediction> &stagePreds) override;
 
-    std::shared_ptr<void> getPredictionMeta() override;
+    std::shared_ptr<void> getPredictionMeta(ThreadID tid = 0) override;
 
     // speculative update 3 folded history, according history and pred.taken
     // the other specUpdateHist methods are left blank
@@ -116,30 +119,34 @@ class BTBITTAGE : public TimedBaseBTBPredictor
 
     // check folded hists after speculative update and recover
     void checkFoldedHist(const bitset &history, const char *when);
+    void checkFoldedHist(const bitset &history, ThreadID tid, const char *when);
 
   private:
 
     // return provided
-    void lookupHelper(Addr stream_start, const std::vector<BTBEntry> &btbEntries, IndirectTargets& results);
+    void lookupHelper(Addr stream_start, const std::vector<BTBEntry> &btbEntries,
+                      IndirectTargets& results, ThreadID tid, uint8_t asidHash);
 
     // use blockPC
-    Addr getTageIndex(Addr pc, int table);
+    Addr getTageIndex(Addr pc, int table, uint8_t asidHash = 0);
 
     // use blockPC (uint64_t version for performance)
-    Addr getTageIndex(Addr pc, int table, uint64_t foldedHist);
+    Addr getTageIndex(Addr pc, int table, uint64_t foldedHist, uint8_t asidHash = 0);
 
     // use blockPC
-    Addr getTageTag(Addr pc, int table);
+    Addr getTageTag(Addr pc, int table, uint8_t asidHash = 0);
 
     // use blockPC (uint64_t version for performance)
-    Addr getTageTag(Addr pc, int table, uint64_t foldedHist, uint64_t altFoldedHist);
+    Addr getTageTag(Addr pc, int table, uint64_t foldedHist, uint64_t altFoldedHist,
+                    uint8_t asidHash = 0);
 
     Addr getOffset(Addr pc) {
         return (pc & (blockSize - 1)) >> 1;
     }
 
     // Update branch history
-    void doUpdateHist(const bitset &history, bool taken, Addr pc, Addr target);
+    void doUpdateHist(const bitset &history, bool taken, Addr pc, Addr target,
+                      ThreadID tid);
 
     const unsigned numPredictors;
 
@@ -151,9 +158,14 @@ class BTBITTAGE : public TimedBaseBTBPredictor
     std::vector<bitset> tableTagMasks;
     std::vector<unsigned> tablePcShifts;
     std::vector<unsigned> histLengths;
-    std::vector<PathFoldedHist> tagFoldedHist;
-    std::vector<PathFoldedHist> altTagFoldedHist;
-    std::vector<PathFoldedHist> indexFoldedHist;
+    struct ThreadHistoryState  // one thread's folded path histories
+    {
+        std::vector<PathFoldedHist> tagFoldedHist;  // indexed by predictor table (i < numPredictors)
+        std::vector<PathFoldedHist> altTagFoldedHist;  // indexed by predictor table
+        std::vector<PathFoldedHist> indexFoldedHist;  // indexed by predictor table
+    };
+
+    std::vector<ThreadHistoryState> threadHistory;
 
     LFSR64 allocLFSR;
 
@@ -261,7 +273,10 @@ class BTBITTAGE : public TimedBaseBTBPredictor
         }
     } TageMeta;
 
-    std::shared_ptr<TageMeta> meta;
+    std::vector<std::shared_ptr<TageMeta>> threadMeta;
+    ThreadID predictorTid(const std::vector<FullBTBPrediction> &stagePreds) const;
+    ThreadHistoryState &historyState(ThreadID tid);
+    const ThreadHistoryState &historyState(ThreadID tid) const;
 
 public:
 
diff --git a/src/cpu/pred/btb/btb_mgsc.cc b/src/cpu/pred/btb/btb_mgsc.cc
index 9011dbbce6..8586816f02 100755
--- a/src/cpu/pred/btb/btb_mgsc.cc
+++ b/src/cpu/pred/btb/btb_mgsc.cc
@@ -60,41 +60,64 @@ BTBMGSC::initStorage()
     assert(isPowerOf2(numCtrsPerLine));
     numCtrsPerLineBits = log2i(numCtrsPerLine);
 
+    threadHistory.resize(MaxThreads);
+    threadMeta.resize(MaxThreads);
+
     auto bwTableSize = allocPredTable(bwTable, bwTableNum, bwTableIdxWidth);
-    for (unsigned int i = 0; i < bwTableNum; ++i) {
-        indexBwFoldedHist.push_back(GlobalBwFoldedHist(bwHistLen[i], bwTableIdxWidth - numCtrsPerLineBits, 16));
+    for (ThreadID tid = 0; tid < MaxThreads; ++tid) {
+        auto &state = threadHistory[tid];
+        for (unsigned int i = 0; i < bwTableNum; ++i) {
+            state.indexBwFoldedHist.emplace_back(
+                bwHistLen[i], bwTableIdxWidth - numCtrsPerLineBits, 16);
+        }
     }
     bwIndex.resize(bwTableNum);
 
     auto lTableSize = allocPredTable(lTable, lTableNum, lTableIdxWidth);
-    indexLFoldedHist.resize(numEntriesFirstLocalHistories);
-    for (unsigned int i = 0; i < lTableNum; ++i) {
-        for (unsigned int k = 0; k < numEntriesFirstLocalHistories; ++k) {
-            indexLFoldedHist[k].push_back(LocalFoldedHist(lHistLen[i], lTableIdxWidth - numCtrsPerLineBits, 16));
+    for (ThreadID tid = 0; tid < MaxThreads; ++tid) {
+        auto &state = threadHistory[tid];
+        state.indexLFoldedHist.resize(numEntriesFirstLocalHistories);
+        for (unsigned int i = 0; i < lTableNum; ++i) {
+            for (unsigned int k = 0; k < numEntriesFirstLocalHistories; ++k) {
+                state.indexLFoldedHist[k].push_back(LocalFoldedHist(
+                    lHistLen[i], lTableIdxWidth - numCtrsPerLineBits, 16));
+            }
         }
     }
     lIndex.resize(lTableNum);
 
     auto iTableSize = allocPredTable(iTable, iTableNum, iTableIdxWidth);
-    for (unsigned int i = 0; i < iTableNum; ++i) {
-        assert(iHistLen[i] >= 0);
-        assert(static_cast<unsigned>(iHistLen[i]) < 63);
-        assert(pow2(static_cast<unsigned>(iHistLen[i])) <= iTableSize);
-        indexIFoldedHist.push_back(ImliFoldedHist(iHistLen[i], iTableIdxWidth - numCtrsPerLineBits, 16));
+    for (ThreadID tid = 0; tid < MaxThreads; ++tid) {
+        auto &state = threadHistory[tid];
+        for (unsigned int i = 0; i < iTableNum; ++i) {
+            assert(iHistLen[i] >= 0);
+            assert(static_cast<unsigned>(iHistLen[i]) < 63);
+            assert(pow2(static_cast<unsigned>(iHistLen[i])) <= iTableSize);
+            state.indexIFoldedHist.emplace_back(
+                iHistLen[i], iTableIdxWidth - numCtrsPerLineBits, 16);
+        }
     }
     iIndex.resize(iTableNum);
 
     auto gTableSize = allocPredTable(gTable, gTableNum, gTableIdxWidth);
-    for (unsigned int i = 0; i < gTableNum; ++i) {
-        assert(gTable.size() >= gTableNum);
-        indexGFoldedHist.push_back(GlobalFoldedHist(gHistLen[i], gTableIdxWidth - numCtrsPerLineBits, 16));
+    for (ThreadID tid = 0; tid < MaxThreads; ++tid) {
+        auto &state = threadHistory[tid];
+        for (unsigned int i = 0; i < gTableNum; ++i) {
+            assert(gTable.size() >= gTableNum);
+            state.indexGFoldedHist.emplace_back(
+                gHistLen[i], gTableIdxWidth - numCtrsPerLineBits, 16);
+        }
     }
     gIndex.resize(gTableNum);
 
     auto pTableSize = allocPredTable(pTable, pTableNum, pTableIdxWidth);
-    for (unsigned int i = 0; i < pTableNum; ++i) {
-        assert(pTable.size() >= pTableNum);
-        indexPFoldedHist.push_back(PathFoldedHist(pHistLen[i], pTableIdxWidth - numCtrsPerLineBits, 2));
+    for (ThreadID tid = 0; tid < MaxThreads; ++tid) {
+        auto &state = threadHistory[tid];
+        for (unsigned int i = 0; i < pTableNum; ++i) {
+            assert(pTable.size() >= pTableNum);
+            state.indexPFoldedHist.emplace_back(
+                pHistLen[i], pTableIdxWidth - numCtrsPerLineBits, 2);
+        }
     }
     pIndex.resize(pTableNum);
 
@@ -219,6 +242,27 @@ BTBMGSC::BTBMGSC(const Params &p)
 #endif
 BTBMGSC::~BTBMGSC() {}
 
+ThreadID  // thread ID carried by a batch of stage predictions
+BTBMGSC::predictorTid(const std::vector<FullBTBPrediction> &stagePreds) const
+{
+    assert(!stagePreds.empty());  // front() below needs at least one element
+    return stagePreds.front().tid;  // only the first stage's tid is consulted
+}
+
+BTBMGSC::ThreadHistoryState &  // mutable folded-history state for thread tid
+BTBMGSC::historyState(ThreadID tid)
+{
+    assert(tid < threadHistory.size());  // threadHistory is sized to MaxThreads in initStorage()
+    return threadHistory[tid];
+}
+
+const BTBMGSC::ThreadHistoryState &  // read-only folded-history state for thread tid
+BTBMGSC::historyState(ThreadID tid) const
+{
+    assert(tid < threadHistory.size());  // same bounds check as the non-const overload
+    return threadHistory[tid];
+}
+
 // Set up tracing for debugging
 void
 BTBMGSC::setTrace()
@@ -347,34 +391,41 @@ BTBMGSC::calculateWeightScaleDiff(int total_sum, int scale_percsum, int percsum)
  * @return TagePrediction containing main and alternative predictions
  */
 BTBMGSC::MgscPrediction
-BTBMGSC::generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC, const TageInfoForMGSC &tage_info)
+BTBMGSC::generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC,
+                                  const TageInfoForMGSC &tage_info,
+                                  ThreadID tid)
 {
     DPRINTF(MGSC, "generateSinglePrediction for btbEntry: %#lx, always taken %d\n", btb_entry.pc,
             btb_entry.alwaysTaken);
+    const auto &state = historyState(tid);
 
     // Calculate indices for all tables
     for (unsigned int i = 0; i < bwTableNum; ++i) {
-        bwIndex[i] = getHistIndex(startPC, bwTableIdxWidth - numCtrsPerLineBits, indexBwFoldedHist[i].get());
+        bwIndex[i] = getHistIndex(startPC, bwTableIdxWidth - numCtrsPerLineBits,
+                                  state.indexBwFoldedHist[i].get());
     }
 
     for (unsigned int i = 0; i < lTableNum; ++i) {
         lIndex[i] = getHistIndex(startPC, lTableIdxWidth - numCtrsPerLineBits,
-                                 indexLFoldedHist[getPcIndex(startPC, log2(numEntriesFirstLocalHistories))][i].get());
+                                 state.indexLFoldedHist[getPcIndex(startPC, log2(numEntriesFirstLocalHistories))][i].get());
     }
     // std::string buf;
     // boost::to_string(indexLFoldedHist[getPcIndex(startPC, log2(numEntriesFirstLocalHistories))][0].getAsBitset(), buf);
     // DPRINTF(MGSC, "startPC: %#lx, local index: %d, local_folded_hist: %s\n", startPC, lIndex[0], buf.c_str());
 
     for (unsigned int i = 0; i < iTableNum; ++i) {
-        iIndex[i] = getHistIndex(startPC, iTableIdxWidth - numCtrsPerLineBits, indexIFoldedHist[i].get());
+        iIndex[i] = getHistIndex(startPC, iTableIdxWidth - numCtrsPerLineBits,
+                                 state.indexIFoldedHist[i].get());
     }
 
     for (unsigned int i = 0; i < gTableNum; ++i) {
-        gIndex[i] = getHistIndex(startPC, gTableIdxWidth - numCtrsPerLineBits, indexGFoldedHist[i].get());
+        gIndex[i] = getHistIndex(startPC, gTableIdxWidth - numCtrsPerLineBits,
+                                 state.indexGFoldedHist[i].get());
     }
 
     for (unsigned int i = 0; i < pTableNum; ++i) {
-        pIndex[i] = getHistIndex(startPC, pTableIdxWidth - numCtrsPerLineBits, indexPFoldedHist[i].get());
+        pIndex[i] = getHistIndex(startPC, pTableIdxWidth - numCtrsPerLineBits,
+                                 state.indexPFoldedHist[i].get());
     }
 
     for (unsigned int i = 0; i < biasTableNum; ++i) {
@@ -468,7 +519,8 @@ BTBMGSC::generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC
  */
 void
 BTBMGSC::lookupHelper(const Addr &startPC, const std::vector<BTBEntry> &btbEntries,
-                      const std::unordered_map<Addr, TageInfoForMGSC> &tageInfoForMgscs, CondTakens &results)
+                      const std::unordered_map<Addr, TageInfoForMGSC> &tageInfoForMgscs,
+                      CondTakens &results, ThreadID tid)
 {
     DPRINTF(MGSC, "lookupHelper startAddr: %#lx\n", startPC);
 
@@ -478,8 +530,9 @@ BTBMGSC::lookupHelper(const Addr &startPC, const std::vector<BTBEntry> &btbEntri
         if (btb_entry.isCond && btb_entry.valid) {
             auto tage_info = tageInfoForMgscs.find(btb_entry.pc);
             if (tage_info != tageInfoForMgscs.end()) {
-                auto pred = generateSinglePrediction(btb_entry, startPC, tage_info->second);
-                meta->preds[btb_entry.pc] = pred;
+                auto pred = generateSinglePrediction(btb_entry, startPC,
+                                                     tage_info->second, tid);
+                threadMeta[tid]->preds[btb_entry.pc] = pred;
                 results.push_back({btb_entry.pc, pred.taken || btb_entry.alwaysTaken});
             } else {
                 assert(false);
@@ -504,6 +557,8 @@ void
 BTBMGSC::putPCHistory(Addr stream_start, const boost::dynamic_bitset<> &history,
                       std::vector<FullBTBPrediction> &stagePreds)
 {
+    const ThreadID tid = predictorTid(stagePreds);
+    const auto &state = historyState(tid);
     DPRINTF(MGSC, "putPCHistory startAddr: %#lx\n", stream_start);
 
     // IMPORTANT: when this function is called,
@@ -515,25 +570,29 @@ BTBMGSC::putPCHistory(Addr stream_start, const boost::dynamic_bitset<> &history,
     }
 
     // Clear old prediction metadata and save current history state
-    meta = std::make_shared<MgscMeta>();
-    meta->indexBwFoldedHist = indexBwFoldedHist;
-    meta->indexLFoldedHist = indexLFoldedHist;
-    meta->indexIFoldedHist = indexIFoldedHist;
-    meta->indexGFoldedHist = indexGFoldedHist;
-    meta->indexPFoldedHist = indexPFoldedHist;
+    threadMeta[tid] = std::make_shared<MgscMeta>();
+    threadMeta[tid]->indexBwFoldedHist = state.indexBwFoldedHist;
+    threadMeta[tid]->indexLFoldedHist = state.indexLFoldedHist;
+    threadMeta[tid]->indexIFoldedHist = state.indexIFoldedHist;
+    threadMeta[tid]->indexGFoldedHist = state.indexGFoldedHist;
+    threadMeta[tid]->indexPFoldedHist = state.indexPFoldedHist;
 
     for (int s = getDelay(); s < stagePreds.size(); s++) {
         // TODO: only lookup once for one btb entry in different stages
         auto &stage_pred = stagePreds[s];
         stage_pred.condTakens.clear();
-        lookupHelper(stream_start, stage_pred.btbEntries, stage_pred.tageInfoForMgscs, stage_pred.condTakens);
+        lookupHelper(stream_start, stage_pred.btbEntries,
+                     stage_pred.tageInfoForMgscs, stage_pred.condTakens, tid);
     }
 }
 
 std::shared_ptr<void>
-BTBMGSC::getPredictionMeta()
+BTBMGSC::getPredictionMeta(ThreadID tid)  // meta object created for tid in putPCHistory()
 {
-    return meta;
+    if (tid >= threadMeta.size()) {  // out-of-range tid yields a null pointer rather than asserting
+        return nullptr;
+    }
+    return threadMeta[tid];  // null until putPCHistory() has run for this tid
 }
 
 /**
@@ -1068,10 +1127,11 @@ BTBMGSC::doUpdateHist(const boost::dynamic_bitset<> &history, int shamt, bool ta
 void
 BTBMGSC::specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred)
 {
+    auto &state = historyState(pred.tid);  // folded histories selected by pred.tid
     int shamt;
     bool cond_taken;
     std::tie(shamt, cond_taken) = pred.getHistInfo();
-    doUpdateHist(history, shamt, cond_taken, indexGFoldedHist);  // use global history to update G folded history
+    doUpdateHist(history, shamt, cond_taken, state.indexGFoldedHist);  // use global history to update G folded history
 }
 
 /**
@@ -1089,8 +1149,9 @@ BTBMGSC::specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPredictio
 void
 BTBMGSC::specUpdatePHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred)
 {
+    auto &state = historyState(pred.tid);  // folded histories selected by pred.tid
     auto [pc, target, taken] = pred.getPHistInfo();
-    doUpdateHist(history, 2, taken, indexPFoldedHist, pc, target);  // only path history needs pc!
+    doUpdateHist(history, 2, taken, state.indexPFoldedHist, pc, target);  // only path history needs pc!
 }
 
 
@@ -1109,10 +1170,11 @@ BTBMGSC::specUpdatePHist(const boost::dynamic_bitset<> &history, FullBTBPredicti
 void
 BTBMGSC::specUpdateBwHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred)
 {
+    auto &state = historyState(pred.tid);  // folded histories selected by pred.tid
     int shamt;
     bool cond_taken;
     std::tie(shamt, cond_taken) = pred.getBwHistInfo();
-    doUpdateHist(history, shamt, cond_taken, indexBwFoldedHist);
+    doUpdateHist(history, shamt, cond_taken, state.indexBwFoldedHist);  // updates only that thread's BW folded histories
 }
 
 /**
@@ -1130,12 +1192,13 @@ BTBMGSC::specUpdateBwHist(const boost::dynamic_bitset<> &history, FullBTBPredict
 void
 BTBMGSC::specUpdateIHist(FullBTBPrediction &pred)
 {
+    auto &state = historyState(pred.tid);  // folded histories selected by pred.tid
     int shamt;
     bool cond_taken;
     std::tie(shamt, cond_taken) = pred.getBwHistInfo();
     // IMLI uses counter only, pass empty bitset (not used by ImliFoldedHist::update)
     boost::dynamic_bitset<> dummy;
-    doUpdateHist(dummy, shamt, cond_taken, indexIFoldedHist);
+    doUpdateHist(dummy, shamt, cond_taken, state.indexIFoldedHist);  // updates only that thread's IMLI folded histories
 }
 
 /**
@@ -1153,11 +1216,12 @@ BTBMGSC::specUpdateIHist(FullBTBPrediction &pred)
 void
 BTBMGSC::specUpdateLHist(const std::vector<boost::dynamic_bitset<>> &history, FullBTBPrediction &pred)
 {
+    auto &state = historyState(pred.tid);  // folded histories selected by pred.tid
     int shamt;
     bool cond_taken;
     std::tie(shamt, cond_taken) = pred.getHistInfo();
     doUpdateHist(history[getPcIndex(pred.bbStart, log2(numEntriesFirstLocalHistories))], shamt, cond_taken,
-                 indexLFoldedHist[getPcIndex(pred.bbStart, log2(numEntriesFirstLocalHistories))]);
+                 state.indexLFoldedHist[getPcIndex(pred.bbStart, log2(numEntriesFirstLocalHistories))]);  // same slot index as history[] above
 }
 
 /**
@@ -1179,11 +1243,12 @@ BTBMGSC::recoverHist(const boost::dynamic_bitset<> &history, const FetchTarget &
     if (!isEnabled()) {
         return;  // No recover when disabled
     }
+    auto &state = historyState(entry.tid);
     std::shared_ptr<MgscMeta> predMeta = std::static_pointer_cast<MgscMeta>(entry.predMetas[getComponentIdx()]);
     for (int i = 0; i < gTableNum; i++) {
-        indexGFoldedHist[i].recover(predMeta->indexGFoldedHist[i]);
+        state.indexGFoldedHist[i].recover(predMeta->indexGFoldedHist[i]);
     }
-    doUpdateHist(history, shamt, cond_taken, indexGFoldedHist);
+    doUpdateHist(history, shamt, cond_taken, state.indexGFoldedHist);
 }
 
 /**
@@ -1205,11 +1270,13 @@ BTBMGSC::recoverPHist(const boost::dynamic_bitset<> &history, const FetchTarget
     if (!isEnabled()) {
         return;  // No recover when disabled
     }
+    auto &state = historyState(entry.tid);
     std::shared_ptr<MgscMeta> predMeta = std::static_pointer_cast<MgscMeta>(entry.predMetas[getComponentIdx()]);
     for (int i = 0; i < pTableNum; i++) {
-        indexPFoldedHist[i].recover(predMeta->indexPFoldedHist[i]);
+        state.indexPFoldedHist[i].recover(predMeta->indexPFoldedHist[i]);
     }
-    doUpdateHist(history, 2, cond_taken, indexPFoldedHist, entry.getControlPC(), entry.getTakenTarget());
+    doUpdateHist(history, 2, cond_taken, state.indexPFoldedHist,
+                 entry.getControlPC(), entry.getTakenTarget());
 }
 
 /**
@@ -1231,11 +1298,12 @@ BTBMGSC::recoverBwHist(const boost::dynamic_bitset<> &history, const FetchTarget
     if (!isEnabled()) {
         return;  // No recover when disabled
     }
+    auto &state = historyState(entry.tid);
     std::shared_ptr<MgscMeta> predMeta = std::static_pointer_cast<MgscMeta>(entry.predMetas[getComponentIdx()]);
     for (int i = 0; i < bwTableNum; i++) {
-        indexBwFoldedHist[i].recover(predMeta->indexBwFoldedHist[i]);
+        state.indexBwFoldedHist[i].recover(predMeta->indexBwFoldedHist[i]);
     }
-    doUpdateHist(history, shamt, cond_taken, indexBwFoldedHist);
+    doUpdateHist(history, shamt, cond_taken, state.indexBwFoldedHist);
 }
 
 /**
@@ -1257,13 +1325,14 @@ BTBMGSC::recoverIHist(const FetchTarget &entry, int shamt, bool cond_taken)
     if (!isEnabled()) {
         return;  // No recover when disabled
     }
+    auto &state = historyState(entry.tid);
     std::shared_ptr<MgscMeta> predMeta = std::static_pointer_cast<MgscMeta>(entry.predMetas[getComponentIdx()]);
     for (int i = 0; i < iTableNum; i++) {
-        indexIFoldedHist[i].recover(predMeta->indexIFoldedHist[i]);
+        state.indexIFoldedHist[i].recover(predMeta->indexIFoldedHist[i]);
     }
     // IMLI uses counter only, pass empty bitset (not used by ImliFoldedHist::update)
     boost::dynamic_bitset<> dummy;
-    doUpdateHist(dummy, shamt, cond_taken, indexIFoldedHist);
+    doUpdateHist(dummy, shamt, cond_taken, state.indexIFoldedHist);
 }
 
 /**
@@ -1286,14 +1355,15 @@ BTBMGSC::recoverLHist(const std::vector<boost::dynamic_bitset<>> &history, const
     if (!isEnabled()) {
         return;  // No recover when disabled
     }
+    auto &state = historyState(entry.tid);
     std::shared_ptr<MgscMeta> predMeta = std::static_pointer_cast<MgscMeta>(entry.predMetas[getComponentIdx()]);
     for (unsigned int k = 0; k < numEntriesFirstLocalHistories; ++k) {
         for (int i = 0; i < lTableNum; i++) {
-            indexLFoldedHist[k][i].recover(predMeta->indexLFoldedHist[k][i]);
+            state.indexLFoldedHist[k][i].recover(predMeta->indexLFoldedHist[k][i]);
         }
     }
             doUpdateHist(history[getPcIndex(entry.startPC, log2(numEntriesFirstLocalHistories))], shamt, cond_taken,
-                         indexLFoldedHist[getPcIndex(entry.startPC, log2(numEntriesFirstLocalHistories))]);
+                         state.indexLFoldedHist[getPcIndex(entry.startPC, log2(numEntriesFirstLocalHistories))]);
         }
 
 #ifndef UNIT_TEST
@@ -1414,6 +1484,15 @@ void
 BTBMGSC::checkFoldedHist(const boost::dynamic_bitset<> &Ghistory, const boost::dynamic_bitset<> &PHistory,
                          const std::vector<boost::dynamic_bitset<>> &LHistory, const char *when)
 {
+    checkFoldedHist(Ghistory, PHistory, LHistory, 0, when);
+}
+
+void
+BTBMGSC::checkFoldedHist(const boost::dynamic_bitset<> &Ghistory, const boost::dynamic_bitset<> &PHistory,
+                         const std::vector<boost::dynamic_bitset<>> &LHistory,
+                         ThreadID tid, const char *when)
+{
+    auto &state = historyState(tid);
     DPRINTF(MGSC, "checking folded history when %s\n", when);
     if (debug::MGSC) {
         std::string hist_str;
@@ -1421,17 +1500,17 @@ BTBMGSC::checkFoldedHist(const boost::dynamic_bitset<> &Ghistory, const boost::d
         DPRINTF(MGSC, "history:\t%s\n", hist_str.c_str());
     }
     for (int t = 0; t < gTableNum; t++) {
-        auto &foldedHist = indexGFoldedHist[t];
+        auto &foldedHist = state.indexGFoldedHist[t];
         foldedHist.check(Ghistory);
     }
     for (int t = 0; t < pTableNum; t++) {
-        auto &foldedHist = indexPFoldedHist[t];
+        auto &foldedHist = state.indexPFoldedHist[t];
         foldedHist.check(PHistory);
     }
     for (int t = 0; t < lTableNum; t++) {
-        assert(LHistory.size() == indexLFoldedHist.size());
+        assert(LHistory.size() == state.indexLFoldedHist.size());
         for (int i = 0; i < LHistory.size(); i++) {
-            auto &foldedHist = indexLFoldedHist[i][t];
+            auto &foldedHist = state.indexLFoldedHist[i][t];
             foldedHist.check(LHistory[i]);
         }
     }
diff --git a/src/cpu/pred/btb/btb_mgsc.hh b/src/cpu/pred/btb/btb_mgsc.hh
index 100fc639a4..6ff29b13c8 100755
--- a/src/cpu/pred/btb/btb_mgsc.hh
+++ b/src/cpu/pred/btb/btb_mgsc.hh
@@ -14,6 +14,7 @@
 
 #include "base/sat_counter.hh"
 #include "base/types.hh"
+#include "cpu/o3/limits.hh"
 #include "cpu/pred/btb/common.hh"
 #include "cpu/pred/btb/folded_hist.hh"
 #include "cpu/pred/btb/timed_base_pred.hh"
@@ -39,6 +40,7 @@ namespace test {
 
 class BTBMGSC : public TimedBaseBTBPredictor
 {
+    static constexpr unsigned MaxThreads = o3::MaxThreads;
   public:
 #ifdef UNIT_TEST
     BTBMGSC();
@@ -157,7 +159,7 @@ class BTBMGSC : public TimedBaseBTBPredictor
     void putPCHistory(Addr startAddr, const boost::dynamic_bitset<> &history,
                       std::vector<FullBTBPrediction> &stagePreds) override;
 
-    std::shared_ptr<void> getPredictionMeta() override;
+    std::shared_ptr<void> getPredictionMeta(ThreadID tid = 0) override;
 
     // speculative update all folded history, according history and pred.taken
     void specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) override;
@@ -191,6 +193,9 @@ class BTBMGSC : public TimedBaseBTBPredictor
     // check folded hists after speculative update and recover
     void checkFoldedHist(const boost::dynamic_bitset<> &Ghistory, const boost::dynamic_bitset<> &PHistory,
                          const std::vector<boost::dynamic_bitset<>> &LHistory, const char *when);  // Check GHR folded
+    void checkFoldedHist(const boost::dynamic_bitset<> &Ghistory, const boost::dynamic_bitset<> &PHistory,
+                         const std::vector<boost::dynamic_bitset<>> &LHistory,
+                         ThreadID tid, const char *when);  // Check GHR folded
 
     // Calculate MGSC weight index
     Addr getPcIndex(Addr pc, unsigned tableIndexBits);
@@ -247,7 +252,8 @@ class BTBMGSC : public TimedBaseBTBPredictor
 
     // Look up predictions in MGSC tables for a stream of instructions
     void lookupHelper(const Addr &stream_start, const std::vector<BTBEntry> &btbEntries,
-                      const std::unordered_map<Addr, TageInfoForMGSC> &tageInfoForMgscs, CondTakens &results);
+                      const std::unordered_map<Addr, TageInfoForMGSC> &tageInfoForMgscs,
+                      CondTakens &results, ThreadID tid);
 
     // Calculate MGSC history index with folded history
     Addr getHistIndex(Addr pc, unsigned tableIndexBits, uint64_t foldedHist);
@@ -277,7 +283,8 @@ class BTBMGSC : public TimedBaseBTBPredictor
 
     // Helper method to generate prediction for a single BTB entry
     MgscPrediction generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC,
-                                            const TageInfoForMGSC &tage_info);
+                                            const TageInfoForMGSC &tage_info,
+                                            ThreadID tid);
 
     // Helper method to prepare BTB entries for update
     std::vector<BTBEntry> prepareUpdateEntries(const FetchTarget &stream);
@@ -353,12 +360,16 @@ class BTBMGSC : public TimedBaseBTBPredictor
     bool enablePCThreshold;
     Addr focusBranchPC;
 
-    // Folded history for index calculation
-    std::vector<GlobalBwFoldedHist> indexBwFoldedHist;
-    std::vector<std::vector<LocalFoldedHist>> indexLFoldedHist;
-    std::vector<ImliFoldedHist> indexIFoldedHist;
-    std::vector<GlobalFoldedHist> indexGFoldedHist;
-    std::vector<PathFoldedHist> indexPFoldedHist;
+    struct ThreadHistoryState  // one thread's folded histories used for index calculation
+    {
+        std::vector<GlobalBwFoldedHist> indexBwFoldedHist;  // one entry per BW table (bwTableNum)
+        std::vector<std::vector<LocalFoldedHist>> indexLFoldedHist;  // [local-history slot][L table]
+        std::vector<ImliFoldedHist> indexIFoldedHist;  // one entry per I table (iTableNum)
+        std::vector<GlobalFoldedHist> indexGFoldedHist;  // one entry per G table (gTableNum)
+        std::vector<PathFoldedHist> indexPFoldedHist;  // one entry per P table (pTableNum)
+    };
+
+    std::vector<ThreadHistoryState> threadHistory;
 
     // The actual MGSC prediction tables (table x index x line)
     std::vector<std::vector<std::vector<int16_t>>> bwTable;
@@ -552,8 +563,9 @@ class BTBMGSC : public TimedBaseBTBPredictor
 
         static const std::unordered_map<Addr, MgscPrediction> &preds(const BTBMGSC &mgsc)
         {
-            assert(mgsc.meta);
-            return mgsc.meta->preds;
+            assert(!mgsc.threadMeta.empty());
+            assert(mgsc.threadMeta[0]);
+            return mgsc.threadMeta[0]->preds;
         }
     };
 #endif
@@ -594,7 +606,10 @@ class BTBMGSC : public TimedBaseBTBPredictor
         }
     } MgscMeta;
 
-    std::shared_ptr<MgscMeta> meta;
+    std::vector<std::shared_ptr<MgscMeta>> threadMeta;
+    ThreadID predictorTid(const std::vector<FullBTBPrediction> &stagePreds) const;
+    ThreadHistoryState &historyState(ThreadID tid);
+    const ThreadHistoryState &historyState(ThreadID tid) const;
 };
 
 // Close conditional namespace wrapper for testing
diff --git a/src/cpu/pred/btb/btb_tage.cc b/src/cpu/pred/btb/btb_tage.cc
index 201f79c6a6..672b729a1a 100644
--- a/src/cpu/pred/btb/btb_tage.cc
+++ b/src/cpu/pred/btb/btb_tage.cc
@@ -104,6 +104,9 @@ tageStats(this, p.numPredictors, p.numBanks)
     tableTagBits.resize(numPredictors);
     tableTagMasks.resize(numPredictors);
 
+    threadHistory.resize(MaxThreads);
+    threadMeta.resize(MaxThreads);
+
     for (unsigned int i = 0; i < numPredictors; ++i) {
         //initialize ittage predictor
         assert(tableSizes.size() >= numPredictors);
@@ -121,10 +124,15 @@ tageStats(this, p.numPredictors, p.numBanks)
         tableTagMasks[i].resize(tableTagBits[i], true);
 
         assert(tablePcShifts.size() >= numPredictors);
-
-        tagFoldedHist.push_back(PathFoldedHist((int)histLengths[i], (int)tableTagBits[i], 16));
-        altTagFoldedHist.push_back(PathFoldedHist((int)histLengths[i], (int)tableTagBits[i]-1, 16));
-        indexFoldedHist.push_back(PathFoldedHist((int)histLengths[i], (int)tableIndexBits[i], 16));
+        for (ThreadID tid = 0; tid < MaxThreads; ++tid) {
+            auto &state = threadHistory[tid];
+            state.tagFoldedHist.emplace_back(
+                (int)histLengths[i], (int)tableTagBits[i], 16);
+            state.altTagFoldedHist.emplace_back(
+                (int)histLengths[i], (int)tableTagBits[i] - 1, 16);
+            state.indexFoldedHist.emplace_back(
+                (int)histLengths[i], (int)tableIndexBits[i], 16);
+        }
     }
     usefulResetCnt = 0;
 
@@ -143,6 +151,27 @@ BTBTAGE::~BTBTAGE()
 {
 }
 
+ThreadID  // thread ID carried by a batch of stage predictions
+BTBTAGE::predictorTid(const std::vector<FullBTBPrediction> &stagePreds) const
+{
+    assert(!stagePreds.empty());  // front() below needs at least one element
+    return stagePreds.front().tid;  // only the first stage's tid is consulted
+}
+
+BTBTAGE::ThreadHistoryState &  // mutable folded-history state for thread tid
+BTBTAGE::historyState(ThreadID tid)
+{
+    assert(tid < threadHistory.size());  // threadHistory is sized to MaxThreads in the constructor
+    return threadHistory[tid];
+}
+
+const BTBTAGE::ThreadHistoryState &  // read-only folded-history state for thread tid
+BTBTAGE::historyState(ThreadID tid) const
+{
+    assert(tid < threadHistory.size());  // same bounds check as the non-const overload
+    return threadHistory[tid];
+}
+
 // Set up tracing for debugging
 void
 BTBTAGE::setTrace()
@@ -197,8 +226,11 @@ BTBTAGE::tickStart() {}
 BTBTAGE::TagePrediction
 BTBTAGE::generateSinglePrediction(const BTBEntry &btb_entry,
                                  const Addr &startPC,
-                                 std::shared_ptr<TageMeta> predMeta) {
+                                 std::shared_ptr<TageMeta> predMeta,
+                                 ThreadID tid,
+                                 uint8_t asidHash) {
     DPRINTF(TAGE, "generateSinglePrediction for btbEntry: %#lx\n", btb_entry.pc);
+    const auto &state = historyState(tid);
 
     // Find main and alternative predictions
     bool provided = false;
@@ -212,11 +244,13 @@ BTBTAGE::generateSinglePrediction(const BTBEntry &btb_entry,
     for (int i = numPredictors - 1; i >= 0; --i) {
         // Calculate index and tag: use snapshot if provided, otherwise use current folded history
         // Tag includes position XOR (like RTL: tag = tempTag ^ cfiPosition)
-        Addr index = predMeta ? getTageIndex(startPC, i, predMeta->indexFoldedHist[i].get())
-                          : getTageIndex(startPC, i);
+        Addr index = predMeta ? getTageIndex(startPC, i, predMeta->indexFoldedHist[i].get(), asidHash)
+                          : getTageIndex(startPC, i, state.indexFoldedHist[i].get(), asidHash);
         Addr tag = predMeta ? getTageTag(startPC, i,
-                            predMeta->tagFoldedHist[i].get(), predMeta->altTagFoldedHist[i].get(), position)
-                        : getTageTag(startPC, i, position);
+                            predMeta->tagFoldedHist[i].get(), predMeta->altTagFoldedHist[i].get(),
+                            position, asidHash)
+                        : getTageTag(startPC, i, state.tagFoldedHist[i].get(),
+                                     state.altTagFoldedHist[i].get(), position, asidHash);
 
         bool match = false; // for each table, only one way can be matched
         TageEntry matching_entry;
@@ -295,7 +329,8 @@ BTBTAGE::generateSinglePrediction(const BTBEntry &btb_entry,
  */
 void
 BTBTAGE::lookupHelper(const Addr &startPC, const std::vector<BTBEntry> &btbEntries,
-                      std::unordered_map<Addr, TageInfoForMGSC> &tageInfoForMgscs, CondTakens& results)
+                      std::unordered_map<Addr, TageInfoForMGSC> &tageInfoForMgscs,
+                      CondTakens& results, ThreadID tid, uint8_t asidHash)
 {
     DPRINTF(TAGE, "lookupHelper startAddr: %#lx\n", startPC);
 
@@ -303,8 +338,8 @@ BTBTAGE::lookupHelper(const Addr &startPC, const std::vector<BTBEntry> &btbEntri
     for (auto &btb_entry : btbEntries) {
         // Only predict for valid conditional branches
         if (btb_entry.isCond && btb_entry.valid) {
-            auto pred = generateSinglePrediction(btb_entry, startPC);
-            meta->preds[btb_entry.pc] = pred;
+            auto pred = generateSinglePrediction(btb_entry, startPC, nullptr, tid, asidHash);
+            threadMeta[tid]->preds[btb_entry.pc] = pred;
             tageStats.updateStatsWithTagePrediction(pred, true);
             results.push_back({btb_entry.pc, pred.taken || btb_entry.alwaysTaken});
             tageInfoForMgscs[btb_entry.pc].tage_pred_taken = pred.taken;
@@ -346,6 +381,9 @@ BTBTAGE::dryRunCycle(Addr startPC) {
  */
 void
 BTBTAGE::putPCHistory(Addr startPC, const bitset &history, std::vector<FullBTBPrediction> &stagePreds) {
+    const ThreadID tid = predictorTid(stagePreds);
+    const uint8_t asidHash = stagePreds.empty() ? 0 : stagePreds.front().asidHash;
+    const auto &state = historyState(tid);
     // Record prediction bank for next tick's conflict detection
     lastPredBankId = getBankId(startPC);
     predBankValid = true;
@@ -363,24 +401,28 @@ BTBTAGE::putPCHistory(Addr startPC, const bitset &history, std::vector<FullBTBPr
     // get prediction and save it
 
     // Clear old prediction metadata and save current history state
-    meta = std::make_shared<TageMeta>();
-    meta->tagFoldedHist = tagFoldedHist;
-    meta->altTagFoldedHist = altTagFoldedHist;
-    meta->indexFoldedHist = indexFoldedHist;
-    meta->history = history;
+    threadMeta[tid] = std::make_shared<TageMeta>();
+    threadMeta[tid]->tagFoldedHist = state.tagFoldedHist;
+    threadMeta[tid]->altTagFoldedHist = state.altTagFoldedHist;
+    threadMeta[tid]->indexFoldedHist = state.indexFoldedHist;
+    threadMeta[tid]->history = history;
 
     for (int s = getDelay(); s < stagePreds.size(); s++) {
         // TODO: only lookup once for one btb entry in different stages
         auto &stage_pred = stagePreds[s];
         stage_pred.condTakens.clear();
-        lookupHelper(startPC, stage_pred.btbEntries, stage_pred.tageInfoForMgscs, stage_pred.condTakens);
+        lookupHelper(startPC, stage_pred.btbEntries, stage_pred.tageInfoForMgscs,
+                     stage_pred.condTakens, tid, asidHash);
     }
 
 }
 
 std::shared_ptr<void>
-BTBTAGE::getPredictionMeta() {
-    return meta;
+BTBTAGE::getPredictionMeta(ThreadID tid) {
+    if (tid >= threadMeta.size()) { // no metadata slot for this tid
+        return nullptr;
+    }
+    return threadMeta[tid]; // snapshot stored by putPCHistory() for tid
 }
 
 /**
@@ -561,6 +603,7 @@ BTBTAGE::handleNewEntryAllocation(const Addr &startPC,
                                  bool actual_taken,
                                  unsigned start_table,
                                  std::shared_ptr<TageMeta> meta,
+                                 uint8_t asidHash,
                                  uint64_t &allocated_table,
                                  uint64_t &allocated_index,
                                  uint64_t &allocated_way) {
@@ -573,9 +616,9 @@ BTBTAGE::handleNewEntryAllocation(const Addr &startPC,
     unsigned position = getBranchIndexInBlock(entry.pc, startPC);
 
     for (unsigned ti = start_table; ti < numPredictors; ++ti) {
-        Addr newIndex = getTageIndex(startPC, ti, meta->indexFoldedHist[ti].get());
+        Addr newIndex = getTageIndex(startPC, ti, meta->indexFoldedHist[ti].get(), asidHash);
         Addr newTag = getTageTag(startPC, ti,
-            meta->tagFoldedHist[ti].get(), meta->altTagFoldedHist[ti].get(), position);
+            meta->tagFoldedHist[ti].get(), meta->altTagFoldedHist[ti].get(), position, asidHash);
 
         auto &set = tageTable[ti][newIndex];
 
@@ -704,7 +747,8 @@ BTBTAGE::update(const FetchTarget &stream) {
         TagePrediction recomputed;
         if (updateOnRead) { // if update on read is enabled, re-read providers using snapshot
             // Re-read providers using snapshot (do not rely on prediction-time main/alt)
-            recomputed = generateSinglePrediction(btb_entry, startAddr, predMeta);
+            recomputed = generateSinglePrediction(btb_entry, startAddr, predMeta,
+                                                 stream.tid, stream.asidHash);
             // Track differences for statistics
             auto it = predMeta->preds.find(btb_entry.pc);
             if (it != predMeta->preds.end() && recomputed.taken != it->second.taken) {
@@ -734,7 +778,8 @@ BTBTAGE::update(const FetchTarget &stream) {
                 start_table = main_info.table + 1; // start from the table after the main prediction table
             }
             alloc_success = handleNewEntryAllocation(startAddr, btb_entry, actual_taken,
-                                   start_table, predMeta, allocated_table, allocated_index, allocated_way);
+                                   start_table, predMeta, stream.asidHash,
+                                   allocated_table, allocated_index, allocated_way);
         }
 
 #ifndef UNIT_TEST
@@ -817,7 +862,8 @@ BTBTAGE::updateCounter(bool taken, unsigned width, short &counter) {
 
 // Calculate TAGE tag with folded history - optimized version using bitwise operations
 Addr
-BTBTAGE::getTageTag(Addr pc, int t, uint64_t foldedHist, uint64_t altFoldedHist, Addr position)
+BTBTAGE::getTageTag(Addr pc, int t, uint64_t foldedHist, uint64_t altFoldedHist,
+                    Addr position, uint8_t asidHash)
 {
     // Create mask for tableTagBits[t] to limit result size
     Addr mask = (1ULL << tableTagBits[t]) - 1;
@@ -833,17 +879,20 @@ BTBTAGE::getTageTag(Addr pc, int t, uint64_t foldedHist, uint64_t altFoldedHist,
     Addr altTagBits = (altFoldedHist << 1) & mask;
 
     // XOR all components together, including position (like RTL)
-    return pcBits ^ foldedBits ^ altTagBits ^ position;
+    return injectAsidHashIntoTag(pcBits ^ foldedBits ^ altTagBits ^ position,
+                                 tableTagBits[t], asidHash);
 }
 
 Addr
-BTBTAGE::getTageTag(Addr pc, int t, Addr position)
+BTBTAGE::getTageTag(Addr pc, int t, Addr position, uint8_t asidHash)
 {
-    return getTageTag(pc, t, tagFoldedHist[t].get(), altTagFoldedHist[t].get(), position);
+    const auto &state = historyState(0); // NOTE(review): thread 0 only
+    return getTageTag(pc, t, state.tagFoldedHist[t].get(),
+                      state.altTagFoldedHist[t].get(), position, asidHash);
 }
 
 Addr
-BTBTAGE::getTageIndex(Addr pc, int t, uint64_t foldedHist)
+BTBTAGE::getTageIndex(Addr pc, int t, uint64_t foldedHist, uint8_t asidHash)
 {
     // Create mask for tableIndexBits[t] to limit result size
     Addr mask = (1ULL << tableIndexBits[t]) - 1;
@@ -852,13 +901,13 @@ BTBTAGE::getTageIndex(Addr pc, int t, uint64_t foldedHist)
     Addr pcBits = (pc >> pcShift) & mask;
     Addr foldedBits = foldedHist & mask;
 
-    return pcBits ^ foldedBits;
+    return xorAsidHashIntoIndex(pcBits ^ foldedBits, tableIndexBits[t], asidHash);
 }
 
 Addr
-BTBTAGE::getTageIndex(Addr pc, int t)
+BTBTAGE::getTageIndex(Addr pc, int t, uint8_t asidHash) // thread 0 history
 {
-    return getTageIndex(pc, t, indexFoldedHist[t].get());
+    return getTageIndex(pc, t, historyState(0).indexFoldedHist[t].get(), asidHash);
 }
 
 bool
@@ -920,8 +969,10 @@ BTBTAGE::getBankId(Addr pc) const
  * @param taken Whether the branch was taken
  */
 void
-BTBTAGE::doUpdateHist(const boost::dynamic_bitset<> &history, bool taken, Addr pc, Addr target)
+BTBTAGE::doUpdateHist(const boost::dynamic_bitset<> &history, bool taken,
+                      Addr pc, Addr target, ThreadID tid)
 {
+    auto &state = historyState(tid);
     if (debug::TAGEHistory) {   // if debug flag is off, do not use to_string since it's too slow
         std::string buf;
         boost::to_string(history, buf);
@@ -934,7 +985,9 @@ BTBTAGE::doUpdateHist(const boost::dynamic_bitset<> &history, bool taken, Addr p
 
     for (int t = 0; t < numPredictors; t++) {
         for (int type = 0; type < 3; type++) {
-            auto &foldedHist = type == 0 ? indexFoldedHist[t] : type == 1 ? tagFoldedHist[t] : altTagFoldedHist[t];
+            auto &foldedHist = type == 0 ? state.indexFoldedHist[t]
+                                         : type == 1 ? state.tagFoldedHist[t]
+                                                     : state.altTagFoldedHist[t];
             // since we have folded path history, we can put arbitrary shamt here, and it wouldn't make a difference
             foldedHist.update(history, 2, taken, pc, target);
             DPRINTF(TAGEHistory, "t: %d, type: %d, foldedHist _folded 0x%lx\n", t, type, foldedHist.get());
@@ -958,7 +1011,7 @@ void
 BTBTAGE::specUpdatePHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred)
 {
     auto [pc, target, taken] = pred.getPHistInfo();
-    doUpdateHist(history, taken, pc, target);
+    doUpdateHist(history, taken, pc, target, pred.tid);
 }
 
 /**
@@ -978,19 +1031,29 @@ void
 BTBTAGE::recoverPHist(const boost::dynamic_bitset<> &history,
     const FetchTarget &entry, int shamt, bool cond_taken)
 {
+    auto &state = historyState(entry.tid);
     std::shared_ptr<TageMeta> predMeta = std::static_pointer_cast<TageMeta>(entry.predMetas[getComponentIdx()]);
     for (int i = 0; i < numPredictors; i++) {
-        tagFoldedHist[i].recover(predMeta->tagFoldedHist[i]);
-        altTagFoldedHist[i].recover(predMeta->altTagFoldedHist[i]);
-        indexFoldedHist[i].recover(predMeta->indexFoldedHist[i]);
+        state.tagFoldedHist[i].recover(predMeta->tagFoldedHist[i]);
+        state.altTagFoldedHist[i].recover(predMeta->altTagFoldedHist[i]);
+        state.indexFoldedHist[i].recover(predMeta->indexFoldedHist[i]);
     }
-    doUpdateHist(history, cond_taken, entry.getControlPC(), entry.getTakenTarget());
+    doUpdateHist(history, cond_taken, entry.getControlPC(),
+                 entry.getTakenTarget(), entry.tid);
 }
 
 // Check folded history after speculative update and recovery
 void
 BTBTAGE::checkFoldedHist(const boost::dynamic_bitset<> &hist, const char * when)
 {
+    checkFoldedHist(hist, 0, when);
+}
+
+void
+BTBTAGE::checkFoldedHist(const boost::dynamic_bitset<> &hist, ThreadID tid,
+                         const char * when)
+{
+    auto &state = historyState(tid);
     DPRINTF(TAGE, "checking folded history when %s\n", when);
     if (debug::TAGEHistory) {
         std::string hist_str;
@@ -1000,7 +1063,9 @@ BTBTAGE::checkFoldedHist(const boost::dynamic_bitset<> &hist, const char * when)
     for (int t = 0; t < numPredictors; t++) {
         for (int type = 0; type < 3; type++) {
             std::string buf2, buf3;
-            auto &foldedHist = type == 0 ? indexFoldedHist[t] : type == 1 ? tagFoldedHist[t] : altTagFoldedHist[t];
+            auto &foldedHist = type == 0 ? state.indexFoldedHist[t]
+                                         : type == 1 ? state.tagFoldedHist[t]
+                                                     : state.altTagFoldedHist[t];
             foldedHist.check(hist);
         }
     }
diff --git a/src/cpu/pred/btb/btb_tage.hh b/src/cpu/pred/btb/btb_tage.hh
index 969cc43e8d..c4038a1858 100644
--- a/src/cpu/pred/btb/btb_tage.hh
+++ b/src/cpu/pred/btb/btb_tage.hh
@@ -4,12 +4,14 @@
 #include <cstdint>
 #include <deque>
 #include <map>
+#include <memory>
 #include <utility>
 #include <vector>
 
 #include "base/sat_counter.hh"
 #include "base/types.hh"
 #include "cpu/inst_seq.hh"
+#include "cpu/o3/limits.hh"
 #include "cpu/pred/btb/common.hh"
 #include "cpu/pred/btb/folded_hist.hh"
 #include "cpu/pred/btb/timed_base_pred.hh"
@@ -43,6 +45,7 @@ class BTBTAGE : public TimedBaseBTBPredictor
 {
     using defer = std::shared_ptr<void>;
     using bitset = boost::dynamic_bitset<>;
+    static constexpr unsigned MaxThreads = o3::MaxThreads;
   public:
 #ifdef UNIT_TEST
     // Test constructor
@@ -125,7 +128,7 @@ class BTBTAGE : public TimedBaseBTBPredictor
                       const boost::dynamic_bitset<> &history,
                       std::vector<FullBTBPrediction> &stagePreds) override;
 
-    std::shared_ptr<void> getPredictionMeta() override;
+    std::shared_ptr<void> getPredictionMeta(ThreadID tid = 0) override;
 
     // speculative update 3 folded history, according history and pred.taken
     // the other specUpdateHist methods are left blank
@@ -163,6 +166,7 @@ class BTBTAGE : public TimedBaseBTBPredictor
 
     // check folded hists after speculative update and recover
     virtual void checkFoldedHist(const bitset &history, const char *when);
+    void checkFoldedHist(const bitset &history, ThreadID tid, const char *when);
 
 #ifndef UNIT_TEST
   protected:
@@ -170,21 +174,23 @@ class BTBTAGE : public TimedBaseBTBPredictor
 
     // Look up predictions in TAGE tables for a stream of instructions
     void lookupHelper(const Addr &startPC, const std::vector<BTBEntry> &btbEntries,
-                    std::unordered_map<Addr, TageInfoForMGSC> &tageInfoForMgscs, CondTakens& results);
+                    std::unordered_map<Addr, TageInfoForMGSC> &tageInfoForMgscs,
+                    CondTakens& results, ThreadID tid, uint8_t asidHash);
 
     // Calculate TAGE index for a given PC and table
-    Addr getTageIndex(Addr pc, int table);
+    Addr getTageIndex(Addr pc, int table, uint8_t asidHash = 0);
 
     // Calculate TAGE index with folded history (uint64_t version for performance)
-    Addr getTageIndex(Addr pc, int table, uint64_t foldedHist);
+    Addr getTageIndex(Addr pc, int table, uint64_t foldedHist, uint8_t asidHash = 0);
 
     // Calculate TAGE tag for a given PC and table
     // position: branch position within the block (xored into tag like RTL)
-    Addr getTageTag(Addr pc, int table, Addr position = 0);
+    Addr getTageTag(Addr pc, int table, Addr position = 0, uint8_t asidHash = 0);
 
     // Calculate TAGE tag with folded history (uint64_t version for performance)
     // position: branch position within the block (xored into tag like RTL)
-    Addr getTageTag(Addr pc, int table, uint64_t foldedHist, uint64_t altFoldedHist, Addr position = 0);
+    Addr getTageTag(Addr pc, int table, uint64_t foldedHist, uint64_t altFoldedHist,
+                    Addr position = 0, uint8_t asidHash = 0);
 
     // Get offset within a block for a given PC
     Addr getOffset(Addr pc) {
@@ -199,7 +205,8 @@ class BTBTAGE : public TimedBaseBTBPredictor
     unsigned getBankId(Addr pc) const;
 
     // Update branch history
-    void doUpdateHist(const bitset &history, bool taken, Addr pc, Addr target);
+    void doUpdateHist(const bitset &history, bool taken, Addr pc, Addr target,
+                      ThreadID tid);
 
     // Number of TAGE predictor tables
     const unsigned numPredictors;
@@ -225,14 +232,14 @@ class BTBTAGE : public TimedBaseBTBPredictor
     // History lengths for each table
     std::vector<unsigned> histLengths;
 
-    // Folded history for tag calculation
-    std::vector<PathFoldedHist> tagFoldedHist;
-
-    // Folded history for alternative tag calculation
-    std::vector<PathFoldedHist> altTagFoldedHist;
+    struct ThreadHistoryState
+    {
+        std::vector<PathFoldedHist> tagFoldedHist;
+        std::vector<PathFoldedHist> altTagFoldedHist;
+        std::vector<PathFoldedHist> indexFoldedHist;
+    };
 
-    // Folded history for index calculation
-    std::vector<PathFoldedHist> indexFoldedHist;
+    std::vector<ThreadHistoryState> threadHistory;
 
     // Linear feedback shift register for allocation
     LFSR64 allocLFSR;
@@ -414,7 +421,9 @@ private:
     // If predMeta is nullptr, use current folded history (prediction path)
     TagePrediction generateSinglePrediction(const BTBEntry &btb_entry,
                                            const Addr &startPC,
-                                           const std::shared_ptr<TageMeta> predMeta = nullptr);
+                                           const std::shared_ptr<TageMeta> predMeta = nullptr,
+                                           ThreadID tid = 0,
+                                           uint8_t asidHash = 0);
 
     // Helper method to prepare BTB entries for update
     std::vector<BTBEntry> prepareUpdateEntries(const FetchTarget &stream);
@@ -431,6 +440,7 @@ private:
                                  bool actual_taken,
                                  unsigned main_table,
                                  std::shared_ptr<TageMeta> meta,
+                                 uint8_t asidHash,
                                  uint64_t &allocated_table,
                                  uint64_t &allocated_index,
                                  uint64_t &allocated_way);
@@ -440,7 +450,11 @@ private:
     void updateLRU(int table, Addr index, unsigned way);
     unsigned getLRUVictim(int table, Addr index);
 
-    std::shared_ptr<TageMeta> meta;
+    std::vector<std::shared_ptr<TageMeta>> threadMeta;
+
+    ThreadID predictorTid(const std::vector<FullBTBPrediction> &stagePreds) const;
+    ThreadHistoryState &historyState(ThreadID tid);
+    const ThreadHistoryState &historyState(ThreadID tid) const;
 };
 
 // Close conditional namespace wrapper for testing
diff --git a/src/cpu/pred/btb/btb_tage_ub.cc b/src/cpu/pred/btb/btb_tage_ub.cc
index 5ea3338aa6..1ecebb7b8d 100644
--- a/src/cpu/pred/btb/btb_tage_ub.cc
+++ b/src/cpu/pred/btb/btb_tage_ub.cc
@@ -309,8 +309,9 @@ BTBTAGEUpperBound::putPCHistory(Addr startAddr, const bitset &history,
 }
 
 std::shared_ptr<void>
-BTBTAGEUpperBound::getPredictionMeta()
+BTBTAGEUpperBound::getPredictionMeta(ThreadID tid)
 {
+    (void)tid;
     return ubMeta;
 }
 
diff --git a/src/cpu/pred/btb/btb_tage_ub.hh b/src/cpu/pred/btb/btb_tage_ub.hh
index f97792c713..b4aae9e7cc 100644
--- a/src/cpu/pred/btb/btb_tage_ub.hh
+++ b/src/cpu/pred/btb/btb_tage_ub.hh
@@ -95,7 +95,7 @@ class BTBTAGEUpperBound : public BTBTAGE
                       const boost::dynamic_bitset<> &history,
                       std::vector<FullBTBPrediction> &stagePreds) override;
 
-    std::shared_ptr<void> getPredictionMeta() override;
+    std::shared_ptr<void> getPredictionMeta(ThreadID tid = 0) override;
 
     void specUpdateHist(const boost::dynamic_bitset<> &history,
                         FullBTBPrediction &pred) override;
diff --git a/src/cpu/pred/btb/btb_ubtb.cc b/src/cpu/pred/btb/btb_ubtb.cc
index 755d8d8460..5f809713c3 100644
--- a/src/cpu/pred/btb/btb_ubtb.cc
+++ b/src/cpu/pred/btb/btb_ubtb.cc
@@ -137,7 +137,8 @@ void
 UBTB::putPCHistory(Addr startAddr, const boost::dynamic_bitset<> &history, std::vector<FullBTBPrediction> &stagePreds)
 {
     meta = std::make_shared<UBTBMeta>();
-    auto it = lookup(startAddr);
+    const uint8_t asidHash = stagePreds.empty() ? 0 : stagePreds.front().asidHash;
+    auto it = lookup(startAddr, asidHash);
     auto& entry = meta->hit_entry;
     entry = (it != ubtb.end()) ? *it : TickedUBTBEntry();
 
@@ -151,23 +152,29 @@ UBTB::putPCHistory(Addr startAddr, const boost::dynamic_bitset<> &history, std::
 }
 
 UBTB::UBTBIter
-UBTB::lookup(Addr startAddr)
+UBTB::lookup(Addr startAddr, uint8_t asidHash)
 {
     if (startAddr & 0x1) {
         return ubtb.end();  // ignore false hit when lowest bit is 1
     }
 
-    Addr current_tag = getTag(startAddr);
+    Addr current_tag = getTag(startAddr, asidHash);
+    Addr block_end = (startAddr + predictWidth) & ~mask(floorLog2(predictWidth) - 1);
 
     DPRINTF(UBTB, "UBTB: Doing tag comparison for tag %#lx\n", current_tag);
 
     auto it = std::find_if(ubtb.begin(), ubtb.end(),
-                           [current_tag](const TickedUBTBEntry &way) { return way.valid && way.tag == current_tag; });
+                           [current_tag, startAddr, block_end](const TickedUBTBEntry &way) {
+                               return way.valid && way.tag == current_tag &&
+                                      way.pc >= startAddr && way.pc < block_end;
+                           });
 
     if (it != ubtb.end()) {
         // Found a hit - verify no duplicates
-        auto duplicate = std::find_if(std::next(it), ubtb.end(), [current_tag](const TickedUBTBEntry &way) {
-            return way.valid && way.tag == current_tag;
+        auto duplicate = std::find_if(std::next(it), ubtb.end(),
+                                      [current_tag, startAddr, block_end](const TickedUBTBEntry &way) {
+            return way.valid && way.tag == current_tag &&
+                   way.pc >= startAddr && way.pc < block_end;
         });
         if (duplicate != ubtb.end()) {
             DPRINTF(UBTB, "UBTB: Multiple hits found in uBTB for the same tag %#lx\n", current_tag);
@@ -184,7 +191,8 @@ UBTB::lookup(Addr startAddr)
 
 
 void
-UBTB::replaceOldEntry(UBTBIter oldEntryIter, const BTBEntry &newTakenEntry, Addr startAddr)
+UBTB::replaceOldEntry(UBTBIter oldEntryIter, const BTBEntry &newTakenEntry,
+                      Addr startAddr, uint8_t asidHash)
 {
     assert(newTakenEntry.valid);
     TickedUBTBEntry newEntry = TickedUBTBEntry(newTakenEntry, curTick());
@@ -192,7 +200,7 @@ UBTB::replaceOldEntry(UBTBIter oldEntryIter, const BTBEntry &newTakenEntry, Addr
     newEntry.target = newTakenEntry.target;
     newEntry.ctr = 0; // have a bug here:ubtb will accept ctr from mbtb, reset it to 0 at here
     // important: update tag (mbtb and ubtb have different tags, even diffferent tag length)
-    newEntry.tag = getTag(startAddr);
+    newEntry.tag = getTag(startAddr, asidHash);
     *oldEntryIter = newEntry;
 }
 
@@ -213,13 +221,14 @@ UBTB::updateUsingS3Pred(FullBTBPrediction &s3Pred)
     auto startAddr = s3Pred.bbStart;
     UBTBIter oldEntryIter = lastPred.hit_entry;
     takenEntry.source = getComponentIdx();
-    updateNewEntry(oldEntryIter, takenEntry, startAddr);
+    updateNewEntry(oldEntryIter, takenEntry, startAddr, s3Pred.asidHash);
 
 }
 
 
 
-void UBTB::updateNewEntry(UBTBIter oldEntryIter, const BTBEntry &takenEntry, const Addr startAddr)
+void UBTB::updateNewEntry(UBTBIter oldEntryIter, const BTBEntry &takenEntry,
+                          const Addr startAddr, uint8_t asidHash)
 {
     //using the FB final taken branch to update uBTB
     if (oldEntryIter != ubtb.end()) {
@@ -259,7 +268,7 @@ void UBTB::updateNewEntry(UBTBIter oldEntryIter, const BTBEntry &takenEntry, con
             }
 
             // Replace the entry with the new prediction
-            replaceOldEntry(toBeReplacedIter, takenEntry, startAddr);
+            replaceOldEntry(toBeReplacedIter, takenEntry, startAddr, asidHash);
 
         } else if (oldEntryIter != ubtb.end() && takenEntry.valid) {
             ubtbStats.s1Hits3Taken++;
@@ -269,7 +278,7 @@ void UBTB::updateNewEntry(UBTBIter oldEntryIter, const BTBEntry &takenEntry, con
                 updateUCtr(oldEntryIter->uctr, false);
                 if (oldEntryIter->uctr == 0) {
                     // replace the old entry with the new one
-                    replaceOldEntry(oldEntryIter, takenEntry, startAddr);
+                    replaceOldEntry(oldEntryIter, takenEntry, startAddr, asidHash);
                 }
             } else {
                 // S0 and S3 predict the same (brpc and target)
@@ -294,13 +303,15 @@ UBTB::update(const FetchTarget &stream)
      // Use BTBEntry instead of BranchInfo; make it invalid when not taken
     BTBEntry takenEntry = stream.exeTaken ? BTBEntry(stream.exeBranchInfo) : BTBEntry();
     auto startAddr = stream.getRealStartPC();
-    Addr oldtag = getTag(startAddr);
+    Addr oldtag = getTag(startAddr, stream.asidHash);
+    Addr block_end = (startAddr + predictWidth) & ~mask(floorLog2(predictWidth) - 1);
 
     UBTBIter oldEntryIter = ubtb.end();
 
     oldEntryIter = meta->hit_entry.valid ?
-                    std::find_if(ubtb.begin(), ubtb.end(), [oldtag](const TickedUBTBEntry &e) {
-                        return e.valid && e.tag == oldtag;
+                    std::find_if(ubtb.begin(), ubtb.end(), [oldtag, startAddr, block_end](const TickedUBTBEntry &e) {
+                        return e.valid && e.tag == oldtag &&
+                               e.pc >= startAddr && e.pc < block_end;
                     }) : ubtb.end();
 
     if (stream.exeTaken) {
@@ -315,7 +326,7 @@ UBTB::update(const FetchTarget &stream)
     // Verify uBTB state
     assert(ubtb.size() <= numEntries);
     if (!usingS3Pred) {
-        updateNewEntry(oldEntryIter, takenEntry, startAddr);
+        updateNewEntry(oldEntryIter, takenEntry, startAddr, stream.asidHash);
     }
 }
 
diff --git a/src/cpu/pred/btb/btb_ubtb.hh b/src/cpu/pred/btb/btb_ubtb.hh
index 5c394ac9cc..4898cec009 100644
--- a/src/cpu/pred/btb/btb_ubtb.hh
+++ b/src/cpu/pred/btb/btb_ubtb.hh
@@ -141,7 +141,7 @@ class UBTB : public TimedBaseBTBPredictor
     /** Get prediction BTBMeta
      *  @return Returns the prediction meta
      */
-    std::shared_ptr<void> getPredictionMeta() override
+    std::shared_ptr<void> getPredictionMeta(ThreadID tid = 0) override
     {
         return meta;
     }
@@ -218,8 +218,9 @@ class UBTB : public TimedBaseBTBPredictor
      *  @param startPC The start address of the fetch block
      *  @return Returns the tag bits.
      */
-    inline Addr getTag(Addr startPC) {
-        return (startPC >> 1) & tagMask;
+    inline Addr getTag(Addr startPC, uint8_t asidHash) {
+        Addr baseTag = (startPC >> 1) & tagMask;
+        return injectAsidHashIntoTag(baseTag, tagBits, asidHash);
     }
 
     void updateUCtr(unsigned &ctr, bool inc) {
@@ -231,7 +232,7 @@ class UBTB : public TimedBaseBTBPredictor
      * @param startAddr The FB start address to look up
      * @return Iterator to the matching entry if found, or ubtb.end() if not found
      */
-    UBTBIter lookup(Addr startAddr);
+    UBTBIter lookup(Addr startAddr, uint8_t asidHash);
 
     /** helper method called by putPCHistory: Check uBTB entry pc range and update statistics
      * @param entry The uBTB entry to check
@@ -251,10 +252,12 @@ class UBTB : public TimedBaseBTBPredictor
      * @param oldEntry Iterator to the entry to replace
      * @param newPrediction The new prediction to store
      */
-    void replaceOldEntry(UBTBIter oldEntryIter, const BTBEntry &newTakenEntry, Addr startAddr);
+    void replaceOldEntry(UBTBIter oldEntryIter, const BTBEntry &newTakenEntry,
+                         Addr startAddr, uint8_t asidHash);
 
     //using the FB final taken branch to update uBTB
-    void updateNewEntry(UBTBIter oldEntryIter, const BTBEntry &takenEntry, const Addr startAddr);
+    void updateNewEntry(UBTBIter oldEntryIter, const BTBEntry &takenEntry,
+                        const Addr startAddr, uint8_t asidHash);
 
 
     /** The uBTB structure:
diff --git a/src/cpu/pred/btb/common.hh b/src/cpu/pred/btb/common.hh
index e00e7fbcf7..9c2cdf2c8f 100644
--- a/src/cpu/pred/btb/common.hh
+++ b/src/cpu/pred/btb/common.hh
@@ -1,6 +1,7 @@
 #ifndef __CPU_PRED_BTB_STREAM_STRUCT_HH__
 #define __CPU_PRED_BTB_STREAM_STRUCT_HH__
 
+#include <algorithm>
 #include <queue>
 #include <string>
 
@@ -18,6 +19,97 @@ namespace branch_prediction {
 
 namespace btb_pred {
 
+/**
+ * Fold a 16-bit ASID into 4 bits by XOR-ing its four nibbles.
+ *
+ * @param asid 16-bit address-space identifier.
+ * @return XOR of the four nibbles of asid (value in 0..15).
+ */
+inline uint8_t
+foldAsidHash16To4(uint16_t asid)
+{
+    // Fold the upper byte onto the lower byte, then the upper nibble of
+    // that onto the lower nibble; XOR is associative so the result equals
+    // nibble0 ^ nibble1 ^ nibble2 ^ nibble3.
+    const unsigned byte_folded = asid ^ (asid >> 8);
+    const unsigned nibble_folded = byte_folded ^ (byte_folded >> 4);
+    return static_cast<uint8_t>(nibble_folded & 0xf);
+}
+
+/**
+ * Tile an ASID hash across a field of the requested width.
+ *
+ * Copies of asid_hash are OR-ed in at bit offsets 0, 4, 8, ... below
+ * bits, and the result is truncated to the low bits bits.
+ *
+ * @param asid_hash Hash value to replicate.
+ * @param bits Width of the produced pattern in bits.
+ * @return The replicated pattern, or 0 when bits is 0.
+ */
+inline Addr
+expandAsidHash(uint8_t asid_hash, unsigned bits)
+{
+    Addr pattern = 0;
+    if (bits != 0) {
+        const Addr hash = asid_hash;
+        unsigned offset = 0;
+        while (offset < bits) {
+            pattern |= hash << offset;
+            offset += 4;
+        }
+        pattern &= mask(bits);
+    }
+    return pattern;
+}
+
+/**
+ * Overwrite the low bits of a tag with the ASID hash.
+ *
+ * The lowest min(4, tag_bits) bits of base_tag are replaced by the same
+ * bits of asid_hash; all higher bits of base_tag pass through unchanged.
+ *
+ * @param base_tag Tag computed without ASID information.
+ * @param tag_bits Width of the tag in bits.
+ * @param asid_hash Hash to place in the low bits.
+ * @return The combined tag, or 0 when tag_bits is 0.
+ */
+inline Addr
+injectAsidHashIntoTag(Addr base_tag, unsigned tag_bits, uint8_t asid_hash)
+{
+    if (tag_bits == 0) {
+        return 0;
+    }
+
+    // At most four bits are replaced; fewer when the tag is narrower.
+    const Addr low_mask = mask(tag_bits < 4 ? tag_bits : 4);
+    const Addr kept_high = base_tag & ~low_mask;
+    const Addr hash_low = static_cast<Addr>(asid_hash) & low_mask;
+    return kept_high | hash_low;
+}
+
+/**
+ * Mix the ASID hash into a table index.
+ *
+ * XORs base_index with expandAsidHash(asid_hash, index_bits) and keeps
+ * the low index_bits bits of the result.
+ *
+ * @param base_index Index computed without ASID information.
+ * @param index_bits Width of the index in bits.
+ * @param asid_hash Hash to mix in.
+ * @return The mixed index, or 0 when index_bits is 0.
+ */
+inline Addr
+xorAsidHashIntoIndex(Addr base_index, unsigned index_bits, uint8_t asid_hash)
+{
+    if (index_bits == 0) {
+        return 0;
+    }
+
+    const Addr spread = expandAsidHash(asid_hash, index_bits);
+    const Addr mixed = base_index ^ spread;
+    return mixed & mask(index_bits);
+}
+
 enum EndType
 {
     END_CALL=0,
@@ -276,6 +320,7 @@ using IndirectTargets = std::vector<std::pair<Addr, Addr>>;
 struct FetchTarget
 {
     ThreadID tid;
+    uint8_t asidHash;
     Addr startPC;       // start pc of the stream
     bool predTaken;     // whether the FetchTarget has taken branch
     Addr predEndPC;     // predicted stream end pc (fall through pc)
@@ -323,7 +368,9 @@ struct FetchTarget
     int s3Source; // which stage the prediction comes from
 
    FetchTarget()
-       : startPC(0),
+       : tid(0),
+         asidHash(0),
+         startPC(0),
          predTaken(false),
          predEndPC(0),
          predBranchInfo(BranchInfo()),
@@ -452,6 +499,7 @@ struct FetchTarget
 struct FullBTBPrediction
 {
     ThreadID tid;
+    uint8_t asidHash;
     Addr bbStart;
     std::vector<BTBEntry> btbEntries; // for BTB, only assigned when hit, sorted by inst order
     // for conditional branch predictors, mapped with lowest bits of branches
@@ -472,6 +520,8 @@ struct FullBTBPrediction
     int s3Source;
 
     FullBTBPrediction() :
+        tid(0),
+        asidHash(0),
         bbStart(0),
         btbEntries(),
         condTakens(),
diff --git a/src/cpu/pred/btb/decoupled_bpred.cc b/src/cpu/pred/btb/decoupled_bpred.cc
index 37cf705814..e5fd8c68a0 100644
--- a/src/cpu/pred/btb/decoupled_bpred.cc
+++ b/src/cpu/pred/btb/decoupled_bpred.cc
@@ -2,11 +2,13 @@
 
 #include <array>
 
+#include "arch/riscv/regs/misc.hh"
 #include "base/debug_helper.hh"
 #include "base/output.hh"
 #include "cpu/o3/cpu.hh"
 #include "cpu/o3/dyn_inst.hh"
 #include "cpu/pred/btb/folded_hist.hh"
+#include "cpu/thread_context.hh"
 #include "debug/BTB.hh"
 #include "debug/DecoupleBPHist.hh"
 #include "debug/DecoupleBPVerbose.hh"
@@ -21,6 +23,28 @@ namespace branch_prediction
 namespace btb_pred
 {
 
+/**
+ * Compute the folded ASID hash for a hardware thread.
+ *
+ * Reads MISCREG_SATP for the thread via readMiscRegNoEffect(), takes the
+ * 16 bits starting at bit 44 as the ASID and folds them with
+ * foldAsidHash16To4().
+ *
+ * @param tid Thread whose SATP is read.
+ * @return Folded ASID, or 0 when no CPU is attached.
+ */
+uint8_t
+DecoupledBPUWithBTB::getThreadAsidHash(ThreadID tid) const
+{
+    uint8_t folded = 0;
+    if (cpu) {
+        const RegVal satp_value = cpu->readMiscRegNoEffect(
+            RiscvISA::MiscRegIndex::MISCREG_SATP, tid);
+        const uint16_t asid_field = (satp_value >> 44) & mask(16);
+        folded = foldAsidHash16To4(asid_field);
+    }
+    return folded;
+}
+
 void
 DecoupledBPUWithBTB::consumeFetchTarget(unsigned fetched_inst_num, ThreadID tid)
 {
@@ -45,8 +60,7 @@ DecoupledBPUWithBTB::DecoupledBPUWithBTB(const DecoupledBPUWithBTBParams &p)
       // uras(p.uras),
       bpDBSwitches(p.bpDBSwitches),
       numStages(p.numStages),
-      ftq(2, p.ftq_size),
-      historyManager(16), // TODO: fix this
+      ftq(p.numThreads, p.ftq_size),
       resolveBlockThreshold(p.resolveBlockThreshold),
       dbpBtbStats(this, p.numStages, p.fsq_size, maxInstsNum)
 {
@@ -86,6 +100,12 @@ DecoupledBPUWithBTB::DecoupledBPUWithBTB(const DecoupledBPUWithBTBParams &p)
         printf("\n");
     }
 
+    historyManagers.reserve(numThreads);
+    resolveDequeueFailCounters.assign(numThreads, 0);
+    for (ThreadID tid = 0; tid < numThreads; ++tid) {
+        historyManagers.emplace_back(16);
+    }
+
     for (int tid=0;tid<numThreads; tid++) {
         auto& thread = threads[tid];
 
@@ -115,6 +135,26 @@ DecoupledBPUWithBTB::DecoupledBPUWithBTB(const DecoupledBPUWithBTBParams &p)
     });
 }
 
+ThreadID
+DecoupledBPUWithBTB::scheduleThread()
+{  // Round-robin choice of the thread to predict for, from nextPredictTid.
+    for (ThreadID offset = 0; offset < numThreads; ++offset) {
+        const ThreadID tid = (nextPredictTid + offset) % numThreads;
+
+        if (cpu) {  // with no CPU attached, the status check is skipped
+            auto *tc = cpu->getContext(tid);
+            if (!tc || tc->status() != gem5::ThreadContext::Active) {
+                continue;  // skip missing or non-Active thread contexts
+            }
+        }
+
+        nextPredictTid = (tid + 1) % numThreads;  // next scan starts after tid
+        return tid;
+    }
+
+    return InvalidThreadID;  // every candidate was skipped
+}
+
 
 void
 DecoupledBPUWithBTB::tick()
@@ -122,6 +162,9 @@ DecoupledBPUWithBTB::tick()
     DPRINTF(Override, "DecoupledBPUWithBTB::tick()\n");
 
     ThreadID curTid = scheduleThread();
+    if (curTid == InvalidThreadID) {
+        return;
+    }
 
     // On squash, reset state if there was a valid prediction.
     bool squashOccurred = false;
@@ -180,14 +223,17 @@ DecoupledBPUWithBTB::requestNewPrediction(ThreadID tid)
 {
     auto& thread = threads[tid];
     auto& predsOfEachStage = threads[tid].predsOfEachStage;
+    const uint8_t asid_hash = getThreadAsidHash(tid);
 
     DPRINTF(Override, "Requesting new prediction for PC %#lx\n", thread.s0PC);
 
-
-    // Initialize prediction state for each stage
+    // Reset all stage-local prediction fields before components fill them.
+    clearPreds(tid);
     for (int i = 0; i < numStages; i++) {
         predsOfEachStage[i].tid = tid;
+        predsOfEachStage[i].asidHash = asid_hash;
         predsOfEachStage[i].bbStart = thread.s0PC;
+        predsOfEachStage[i].predSource = i;
     }
 
     // Query each predictor component with current PC and history
@@ -300,7 +346,7 @@ DecoupledBPUWithBTB::generateFinalPredAndCreateBubbles(ThreadID tid)
         if (ubtb->isEnabled()) {
             ubtb->updateUsingS3Pred(predsOfEachStage[numStages - 1]);
         }
-        if (abtb->isEnabled() && ftq.backId(tid)) {
+        if (abtb->isEnabled() && !ftq.empty(tid)) {
             auto previous_block_startpc = ftq.back(tid).startPC;
             abtb->updateUsingS3Pred(predsOfEachStage[numStages - 1], previous_block_startpc);
         } else if (abtb->isEnabled()) {
@@ -428,8 +474,14 @@ DecoupledBPUWithBTB::handleSquash(ThreadID tid, unsigned target_id,
 
     // Find the target being squashed
     if (!ftq.hasTarget(target_id, tid)) {
-        assert(!ftq.empty(tid));
-        DPRINTF(DecoupleBP, "The squashing target is insane, ignore squash on it");
+        DPRINTF(DecoupleBP,
+                "Ignore squash for tid %u on missing FTQ target %u; "
+                "recovering predictor state from redirect PC %#lx\n",
+                tid, target_id, redirect_pc);
+        ftq.clear(tid);
+        clearPreds(tid);
+        threads[tid].validprediction = false;
+        threads[tid].s0PC = redirect_pc;
         return;
     }
 
@@ -577,7 +629,7 @@ DecoupledBPUWithBTB::commit(unsigned target_id, ThreadID tid)
     if (!ftq.empty(tid))
         printTarget(ftq.front(tid));
 
-    historyManager.commit(target_id);
+    historyManagers[tid].commit(target_id);
 }
 
 bool
@@ -615,26 +667,26 @@ DecoupledBPUWithBTB::resolveUpdate(unsigned &target_id, ThreadID tid)
 }
 
 void
-DecoupledBPUWithBTB::notifyResolveSuccess()
+DecoupledBPUWithBTB::notifyResolveSuccess(ThreadID tid)
 {
-    resolveDequeueFailCounter = 0;
+    resolveDequeueFailCounters[tid] = 0;  // ends tid's failure streak
 }
 
 void
-DecoupledBPUWithBTB::notifyResolveFailure()
+DecoupledBPUWithBTB::notifyResolveFailure(ThreadID tid)
 {
-    resolveDequeueFailCounter++;
-    if (resolveDequeueFailCounter >= resolveBlockThreshold) {
-        blockPredictionOnce();
-        resolveDequeueFailCounter = 0;
+    auto &failCounter = resolveDequeueFailCounters[tid];  // tid's own streak
+    failCounter++;
+    if (failCounter >= resolveBlockThreshold) {
+        blockPredictionOnce(tid);  // threshold hit: flag this thread only
+        failCounter = 0;  // start counting afresh after requesting a block
     }
 }
 
 void
-DecoupledBPUWithBTB::blockPredictionOnce()
+DecoupledBPUWithBTB::blockPredictionOnce(ThreadID tid)
 {
-    // smtTODO
-    threads[0].blockPredictionPending = true;
+    threads[tid].blockPredictionPending = true;  // only tid's flag is raised
 }
 
 void
@@ -745,6 +797,7 @@ DecoupledBPUWithBTB::createFetchTargetEntry(ThreadID tid)
     // Create a new fetch target entry
     FetchTarget entry;
     entry.tid = tid;
+    entry.asidHash = finalPred.asidHash;
     entry.startPC = s0PC;
 
     // Extract branch prediction information
@@ -779,7 +832,7 @@ DecoupledBPUWithBTB::createFetchTargetEntry(ThreadID tid)
 
     // Save predictors' metadata
     for (int i = 0; i < numComponents; i++) {
-        entry.predMetas[i] = components[i]->getPredictionMeta();
+        entry.predMetas[i] = components[i]->getPredictionMeta(tid);
     }
 
     // Initialize default resolution state
@@ -814,7 +867,8 @@ DecoupledBPUWithBTB::fillAheadPipeline(FetchTarget &entry)
 }
 
 void
-DecoupledBPUWithBTB::checkHistory(const boost::dynamic_bitset<> &history)
+DecoupledBPUWithBTB::checkHistory(const boost::dynamic_bitset<> &history,
+                                  ThreadID tid)
 {
     // This function performs a crucial validation of branch history consistency
     // It rebuilds the "ideal" history from HistoryManager's records and compares
@@ -825,7 +879,7 @@ DecoupledBPUWithBTB::checkHistory(const boost::dynamic_bitset<> &history)
     boost::dynamic_bitset<> ideal_hash_hist(historyBits, 0);
 
     // Iterate through all speculative history entries stored in HistoryManager
-    for (const auto entry: historyManager.getSpeculativeHist()) {
+    for (const auto entry: historyManagers[tid].getSpeculativeHist()) {
         // Only process entries that have non-zero shift amount (actual branches)
         if (entry.shamt != 0) {
             // Accumulate total history bits
@@ -868,6 +922,12 @@ DecoupledBPUWithBTB::resetPC(Addr new_pc)
         threads[i].s0PC = new_pc;
 }
 
+void
+DecoupledBPUWithBTB::resetPC(ThreadID tid, Addr new_pc)
+{  // Per-thread variant: only tid's s0PC is redirected to new_pc.
+    threads[tid].s0PC = new_pc;
+}
+
 Addr
 DecoupledBPUWithBTB::getPreservedReturnAddr(const DynInstPtr &dynInst)
 {
@@ -915,7 +975,7 @@ DecoupledBPUWithBTB::updateHistoryForPrediction(FetchTarget &entry)
     histShiftIn(shamt, taken, s0History);
 
     // Update history manager and verify TAGE folded history
-    historyManager.addSpeculativeHist(
+    historyManagers[tid].addSpeculativeHist(
         entry.startPC, shamt, taken, entry.predBranchInfo, ftq.backId(tid) + 1);
 
     // Get prediction information for global backward history updates
@@ -938,16 +998,17 @@ DecoupledBPUWithBTB::updateHistoryForPrediction(FetchTarget &entry)
 
 #ifndef NDEBUG
     if (tage->isEnabled()) {
-        tage->checkFoldedHist(s0PHistory, "speculative update");
+        tage->checkFoldedHist(s0PHistory, tid, "speculative update");
     }
     if (ittage->isEnabled()) {
-        ittage->checkFoldedHist(s0PHistory, "speculative update");
+        ittage->checkFoldedHist(s0PHistory, tid, "speculative update");
     }
     if (microtage->isEnabled()) {
-        microtage->checkFoldedHist(s0PHistory, "speculative update");
+        microtage->checkFoldedHist(s0PHistory, tid, "speculative update");
     }
     if (mgsc->isEnabled()) {
-        mgsc->checkFoldedHist(s0History, s0PHistory, s0LHistory, "speculative update");
+        mgsc->checkFoldedHist(s0History, s0PHistory, s0LHistory, tid,
+                              "speculative update");
     }
 #endif
 }
@@ -1023,31 +1084,33 @@ DecoupledBPUWithBTB::recoverHistoryForSquash(
 
     // Update history manager with appropriate branch info
     if (squash_type == SQUASH_CTRL) {
-        historyManager.squash(target_id, real_shamt, real_taken, target.exeBranchInfo);
+        historyManagers[tid].squash(target_id, real_shamt, real_taken,
+                                    target.exeBranchInfo);
     } else {
-        historyManager.squash(target_id, real_shamt, real_taken, BranchInfo());
+        historyManagers[tid].squash(target_id, real_shamt, real_taken,
+                                    BranchInfo());
     }
 
     // Perform history consistency checks when not a fast build variant
 #ifndef NDEBUG
-    checkHistory(s0History);
+    checkHistory(s0History, tid);
     if (tage->isEnabled()) {
-        tage->checkFoldedHist(s0PHistory,
+        tage->checkFoldedHist(s0PHistory, tid,
             squash_type == SQUASH_CTRL ? "control squash" :
             squash_type == SQUASH_OTHER ? "non control squash" : "trap squash");
     }
     if (ittage->isEnabled()) {
-        ittage->checkFoldedHist(s0PHistory,
+        ittage->checkFoldedHist(s0PHistory, tid,
             squash_type == SQUASH_CTRL ? "control squash" :
             squash_type == SQUASH_OTHER ? "non control squash" : "trap squash");
     }
     if (microtage->isEnabled()) {
-        microtage->checkFoldedHist(s0PHistory,
+        microtage->checkFoldedHist(s0PHistory, tid,
             squash_type == SQUASH_CTRL ? "control squash" :
             squash_type == SQUASH_OTHER ? "non control squash" : "trap squash");
     }
     if (mgsc->isEnabled()) {
-        mgsc->checkFoldedHist(s0History, s0PHistory, s0LHistory,
+        mgsc->checkFoldedHist(s0History, s0PHistory, s0LHistory, tid,
             squash_type == SQUASH_CTRL ? "control squash" :
             squash_type == SQUASH_OTHER ? "non control squash" : "trap squash");
     }
diff --git a/src/cpu/pred/btb/decoupled_bpred.hh b/src/cpu/pred/btb/decoupled_bpred.hh
index 288450001f..0a46c1a4e5 100644
--- a/src/cpu/pred/btb/decoupled_bpred.hh
+++ b/src/cpu/pred/btb/decoupled_bpred.hh
@@ -75,8 +75,7 @@ class DecoupledBPUWithBTB : public BPredUnit
     // FetchTargetId fetchHeadFtqId{1}; // next FSQ id to be consumed by fetch
 
     CPU *cpu;
-
-    const int numThreads = 2;
+    ThreadID nextPredictTid = 0;
     unsigned predictWidth;  // max predict width, default 64
     unsigned maxInstsNum;
 
@@ -141,11 +140,11 @@ class DecoupledBPUWithBTB : public BPredUnit
         bool blockPredictionPending{false};
     } threads[MaxThreads];
 
-    HistoryManager historyManager;
-    unsigned resolveDequeueFailCounter{0};
+    std::vector<HistoryManager> historyManagers;
+    std::vector<unsigned> resolveDequeueFailCounters;
     const unsigned resolveBlockThreshold;
 
-    ThreadID scheduleThread() { return 0; }
+    ThreadID scheduleThread();
 
     void processNewPrediction(ThreadID tid);
 
@@ -188,10 +187,9 @@ class DecoupledBPUWithBTB : public BPredUnit
     void generateFinalPredAndCreateBubbles(ThreadID tid);
 
     void clearPreds(ThreadID tid) {
-        for (auto &stagePred : threads[tid].predsOfEachStage) {
-            stagePred.condTakens.clear();
-            stagePred.indirectTargets.clear();
-            stagePred.btbEntries.clear();
+        for (int i = 0; i < threads[tid].predsOfEachStage.size(); ++i) {
+            threads[tid].predsOfEachStage[i] = FullBTBPrediction();  // full reset
+            threads[tid].predsOfEachStage[i].predSource = i;  // own stage index
         }
     }
 
@@ -332,6 +330,7 @@ class DecoupledBPUWithBTB : public BPredUnit
     }
 
     void setCpu(CPU *_cpu) { cpu = _cpu; }
+    uint8_t getThreadAsidHash(ThreadID tid) const;
 
     void consumeFetchTarget(unsigned fetched_inst_num, ThreadID tid);
 
@@ -425,7 +424,7 @@ class DecoupledBPUWithBTB : public BPredUnit
 
     void overrideStats(OverrideReason overrideReason);
 
-    void checkHistory(const boost::dynamic_bitset<> &history);
+    void checkHistory(const boost::dynamic_bitset<> &history, ThreadID tid);
 
     Addr getPreservedReturnAddr(const DynInstPtr &dynInst);
 
@@ -704,6 +703,7 @@ class DecoupledBPUWithBTB : public BPredUnit
                       unsigned control_inst_size = 0);
 
     void resetPC(Addr new_pc);
+    void resetPC(ThreadID tid, Addr new_pc);
 
     // Helper functions for update
     bool resolveUpdate(unsigned &target_id, ThreadID tid);
@@ -711,9 +711,9 @@ class DecoupledBPUWithBTB : public BPredUnit
     void markCFIResolved(unsigned &target, uint64_t resolvedInstPC, ThreadID tid);
     void updatePredictorComponents(FetchTarget &target);
     void updateStatistics(const FetchTarget &target);
-    void notifyResolveSuccess();
-    void notifyResolveFailure();
-    void blockPredictionOnce();
+    void notifyResolveSuccess(ThreadID tid);
+    void notifyResolveFailure(ThreadID tid);
+    void blockPredictionOnce(ThreadID tid);
 
     /**
      * @brief Types of control flow instructions for misprediction tracking
diff --git a/src/cpu/pred/btb/ftq.cc b/src/cpu/pred/btb/ftq.cc
index 3642ef7162..b8abfe7996 100644
--- a/src/cpu/pred/btb/ftq.cc
+++ b/src/cpu/pred/btb/ftq.cc
@@ -1,3 +1,5 @@
+#include <algorithm>
+
 #include "ftq.hh"
 
 namespace gem5
@@ -53,6 +55,19 @@ FetchTargetQueue::squashAfter(FetchTargetId squashId, ThreadID tid)
     queue[tid].fetchptr = squashId + 1;
 }
 
+void
+FetchTargetQueue::clear(ThreadID tid)
+{  // Empties tid's queue and moves both id cursors forward, never backward.
+    const FetchTargetId nextTargetId = std::max(
+        queue[tid].fetchptr,
+        queue[tid].baseTargetId +  // base id plus stored-entry count
+            static_cast<FetchTargetId>(queue[tid].cap.size()));
+
+    queue[tid].cap.clear();  // entries dropped; ids restart at nextTargetId
+    queue[tid].baseTargetId = nextTargetId;
+    queue[tid].fetchptr = nextTargetId;
+}
+
 
 }
 }
diff --git a/src/cpu/pred/btb/ftq.hh b/src/cpu/pred/btb/ftq.hh
index c43d071447..c762cd0b83 100644
--- a/src/cpu/pred/btb/ftq.hh
+++ b/src/cpu/pred/btb/ftq.hh
@@ -80,6 +80,7 @@ public:
     void finishTarget(ThreadID tid);
     void commitTarget(ThreadID tid);
     void squashAfter(FetchTargetId targetId, ThreadID tid);
+    void clear(ThreadID tid);
 };
 
 }
diff --git a/src/cpu/pred/btb/mbtb.cc b/src/cpu/pred/btb/mbtb.cc
index 4ab8445677..de1e764fce 100644
--- a/src/cpu/pred/btb/mbtb.cc
+++ b/src/cpu/pred/btb/mbtb.cc
@@ -299,8 +299,9 @@ MBTB::putPCHistory(Addr startAddr,
                          std::vector<FullBTBPrediction> &stagePreds)
 {
     meta = std::make_shared<BTBMeta>();
+    const uint8_t asidHash = stagePreds.empty() ? 0 : stagePreds.front().asidHash;
     // Lookup all matching entries in BTB
-    auto find_entries = lookup(startAddr, meta);
+    auto find_entries = lookup(startAddr, asidHash, meta);
 
     // Process BTB entries
     auto processed_entries = processEntries(find_entries, startAddr);
@@ -313,8 +314,9 @@ MBTB::putPCHistory(Addr startAddr,
 }
 
 std::shared_ptr<void>
-MBTB::getPredictionMeta()
+MBTB::getPredictionMeta(ThreadID tid)
 {
+    (void)tid;  // intentionally unused: the single meta member is returned
     return meta;
 }
 
@@ -334,7 +336,7 @@ MBTB::recoverHist(const boost::dynamic_bitset<> &history, const FetchTarget &ent
  * @return Vector of matching BTB entries
  */
 std::vector<MBTB::TickedBTBEntry>
-MBTB::lookupSingleBlock(Addr block_pc)
+MBTB::lookupSingleBlock(Addr block_pc, uint8_t asidHash)
 {
     std::vector<TickedBTBEntry> res;
     if (block_pc & 0x1) {
@@ -345,11 +347,11 @@ MBTB::lookupSingleBlock(Addr block_pc)
     auto& target_sram = (sram_id == 0) ? sram0 : sram1;
     auto& target_mru = (sram_id == 0) ? mru0 : mru1;
     
-    Addr btb_idx = getIndex(block_pc);
+    Addr btb_idx = getIndex(block_pc, asidHash);
     auto& btb_set = target_sram[btb_idx];
     assert(btb_idx < numSets);
 
-    Addr current_tag = getTag(block_pc);
+    Addr current_tag = getTag(block_pc, asidHash);
     DPRINTF(BTB, "BTB: Doing tag comparison for SRAM%d index 0x%lx tag %#lx\n",
         sram_id, btb_idx, current_tag);
         
@@ -364,7 +366,7 @@ MBTB::lookupSingleBlock(Addr block_pc)
 }
 
 std::vector<MBTB::TickedBTBEntry>
-MBTB::lookup(Addr block_pc, std::shared_ptr<BTBMeta> meta)
+MBTB::lookup(Addr block_pc, uint8_t asidHash, std::shared_ptr<BTBMeta> meta)
 {
     std::vector<TickedBTBEntry> res;
     if (block_pc & 0x1) {
@@ -375,15 +377,15 @@ MBTB::lookup(Addr block_pc, std::shared_ptr<BTBMeta> meta)
     // Calculate 32B aligned address
     Addr alignedPC = block_pc & ~(blockSize - 1);
     // Lookup first 32B block
-    res = lookupSingleBlock(alignedPC);
+    res = lookupSingleBlock(alignedPC, asidHash);
     // Lookup next 32B block
-    auto nextBlockRes = lookupSingleBlock(alignedPC + blockSize);
+    auto nextBlockRes = lookupSingleBlock(alignedPC + blockSize, asidHash);
     // Merge results
     res.insert(res.end(), nextBlockRes.begin(), nextBlockRes.end());
 
     // lookup victim cache if victim cache is enabled
     if (victimCacheSize > 0) {
-        auto victimResults = lookupVictimCache(block_pc);
+        auto victimResults = lookupVictimCache(block_pc, asidHash);
         if (!victimResults.empty()) {
             DPRINTF(BTB, "Victim cache hit for lookup at %#lx\n", block_pc);
             btbStats.victimCacheHit++;
@@ -459,7 +461,7 @@ MBTB::getAndSetNewBTBEntry(FetchTarget &stream)
     }
 
     // Set tag and update stream metadata for use in update()
-    entry_to_write.tag = getTag(entry_to_write.pc);
+    entry_to_write.tag = getTag(entry_to_write.pc, stream.asidHash);
     stream.updateNewBTBEntry = entry_to_write;
     stream.updateIsOldEntry = is_old_entry;
 }
@@ -507,7 +509,7 @@ MBTB::updateBTBEntry(const BTBEntry& entry, const FetchTarget &stream)
     auto& target_mru = (sram_id == 0) ? mru0 : mru1;
     
     // Calculate index and tag for this entry
-    Addr btb_idx = getIndex(entry.pc);
+    Addr btb_idx = getIndex(entry.pc, stream.asidHash);
 
     // Look for matching entry in the target SRAM
     bool found = false;
@@ -563,7 +565,7 @@ MBTB::buildUpdatedEntry(const BTBEntry& req_entry,
                               ? BTBEntry(*existing_entry)
                               : req_entry;
     // Always recalculate tag based on the actual PC being written
-    entry_to_write.tag = getTag(entry_to_write.pc);
+    entry_to_write.tag = getTag(entry_to_write.pc, stream.asidHash);
     entry_to_write.resolved = false; // reset resolved status
 
     // Update saturating counter and alwaysTaken
@@ -722,7 +724,7 @@ MBTB::prepareUpdateEntries(const FetchTarget &stream) {
  * Victim cache operations implementation
  */
 std::vector<MBTB::TickedBTBEntry>
-MBTB::lookupVictimCache(Addr block_pc)
+MBTB::lookupVictimCache(Addr block_pc, uint8_t asidHash)
 {
     std::vector<TickedBTBEntry> results;
     Addr alignedPC = block_pc & ~(blockSize - 1);
@@ -734,7 +736,7 @@ MBTB::lookupVictimCache(Addr block_pc)
         Addr entryAlignedPC = entry.pc & ~(blockSize - 1);
         // Check if this entry is in either of the two 32B blocks we're looking for
         if (entryAlignedPC == alignedPC || entryAlignedPC == (alignedPC + blockSize)) {
-            Addr current_tag = getTag(entry.pc);
+            Addr current_tag = getTag(entry.pc, asidHash);
             if (entry.tag == current_tag) {
                 results.push_back(entry);
                 DPRINTF(BTB, "Victim cache hit for pc %#lx\n", entry.pc);
diff --git a/src/cpu/pred/btb/mbtb.hh b/src/cpu/pred/btb/mbtb.hh
index d736d0f55c..3b2ec76fe4 100644
--- a/src/cpu/pred/btb/mbtb.hh
+++ b/src/cpu/pred/btb/mbtb.hh
@@ -147,7 +147,7 @@ class MBTB : public TimedBaseBTBPredictor
     /** Get prediction BTBMeta
      *  @return Returns the prediction meta
      */
-    std::shared_ptr<void> getPredictionMeta() override;
+    std::shared_ptr<void> getPredictionMeta(ThreadID tid = 0) override;
 
     // not used
     void specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) override;
@@ -215,8 +215,9 @@ class MBTB : public TimedBaseBTBPredictor
      *  @param inst_PC The branch to look up.
      *  @return Returns the index into the BTB.
      */
-    inline Addr getIndex(Addr instPC) {
-        return (instPC >> idxShiftAmt) & idxMask;
+    inline Addr getIndex(Addr instPC, uint8_t asidHash) {
+        Addr baseIndex = (instPC >> idxShiftAmt) & idxMask;  // before ASID mixing
+        return xorAsidHashIntoIndex(baseIndex, floorLog2(numSets), asidHash);
     }
 
     /** Returns the tag bits of a given address.
@@ -225,8 +226,9 @@ class MBTB : public TimedBaseBTBPredictor
      *  @param inst_PC The branch's address.
      *  @return Returns the tag bits.
      */
-    inline Addr getTag(Addr instPC) {
-        return (instPC >> tagShiftAmt) & tagMask;
+    inline Addr getTag(Addr instPC, uint8_t asidHash) {
+        Addr baseTag = (instPC >> tagShiftAmt) & tagMask;  // before ASID mixing
+        return injectAsidHashIntoTag(baseTag, tagBits, asidHash);
     }
 
     /** Update the 2-bit saturating counter for conditional branches
@@ -340,16 +342,16 @@ class MBTB : public TimedBaseBTBPredictor
      *  @param inst_PC The address of the block to look up.
      *  @return Returns all hit BTB entries.
      */
-    std::vector<TickedBTBEntry> lookup(Addr block_pc, std::shared_ptr<BTBMeta> meta);
+    std::vector<TickedBTBEntry> lookup(Addr block_pc, uint8_t asidHash, std::shared_ptr<BTBMeta> meta);
 
     /** Helper function to lookup entries in a single block
      * @param block_pc The aligned PC to lookup
      * @return Vector of matching BTB entries
      */
-    std::vector<TickedBTBEntry> lookupSingleBlock(Addr block_pc);
+    std::vector<TickedBTBEntry> lookupSingleBlock(Addr block_pc, uint8_t asidHash);
 
     /** Victim cache operations */
-    std::vector<TickedBTBEntry> lookupVictimCache(Addr block_pc);
+    std::vector<TickedBTBEntry> lookupVictimCache(Addr block_pc, uint8_t asidHash);
     void insertVictimCache(const TickedBTBEntry& evicted_entry);
     bool eraseFromVictimCacheByPC(Addr pc);
 
diff --git a/src/cpu/pred/btb/microtage.cc b/src/cpu/pred/btb/microtage.cc
index bcc57db899..ae291fd3eb 100644
--- a/src/cpu/pred/btb/microtage.cc
+++ b/src/cpu/pred/btb/microtage.cc
@@ -97,6 +97,9 @@ tageStats(this, p.numPredictors, p.numBanks)
     }
 
     // Initialize base table for fallback predictions
+    threadHistory.resize(MaxThreads);
+    threadMeta.resize(MaxThreads);
+
     for (unsigned int i = 0; i < numPredictors; ++i) {
         //initialize ittage predictor
         assert(tableSizes.size() >= numPredictors);
@@ -113,9 +116,15 @@ tageStats(this, p.numPredictors, p.numBanks)
 
         assert(tablePcShifts.size() >= numPredictors);
 
-        tagFoldedHist.push_back(PathFoldedHist((int)histLengths[i], (int)tableTagBits[i], 16));
-        altTagFoldedHist.push_back(PathFoldedHist((int)histLengths[i], (int)tableTagBits[i]-1, 16));
-        indexFoldedHist.push_back(PathFoldedHist((int)histLengths[i], (int)tableIndexBits[i], 16));
+        for (ThreadID tid = 0; tid < MaxThreads; ++tid) {
+            auto &state = threadHistory[tid];
+            state.tagFoldedHist.emplace_back(
+                (int)histLengths[i], (int)tableTagBits[i], 16);
+            state.altTagFoldedHist.emplace_back(
+                (int)histLengths[i], (int)tableTagBits[i] - 1, 16);
+            state.indexFoldedHist.emplace_back(
+                (int)histLengths[i], (int)tableIndexBits[i], 16);
+        }
     }
     usefulResetCnt = 0;
 
@@ -129,6 +138,27 @@ MicroTAGE::~MicroTAGE()
 {
 }
 
+ThreadID
+MicroTAGE::predictorTid(const std::vector<FullBTBPrediction> &stagePreds) const
+{  // Thread id of a prediction request, taken from its first stage entry.
+    assert(!stagePreds.empty());  // front() below needs at least one stage
+    return stagePreds.front().tid;
+}
+
+MicroTAGE::ThreadHistoryState &
+MicroTAGE::historyState(ThreadID tid)
+{  // Mutable access to thread tid's folded-history state.
+    assert(tid < threadHistory.size());  // debug-build bounds check only
+    return threadHistory[tid];
+}
+
+const MicroTAGE::ThreadHistoryState &
+MicroTAGE::historyState(ThreadID tid) const
+{  // Read-only access to thread tid's folded-history state.
+    assert(tid < threadHistory.size());  // debug-build bounds check only
+    return threadHistory[tid];
+}
+
 // Set up tracing for debugging
 void
 MicroTAGE::setTrace()
@@ -183,8 +213,11 @@ MicroTAGE::tickStart() {}
 MicroTAGE::TagePrediction
 MicroTAGE::generateSinglePrediction(const BTBEntry &btb_entry,
                                  const Addr &startPC,
-                                 std::shared_ptr<TageMeta> predMeta) {
+                                 std::shared_ptr<TageMeta> predMeta,
+                                 ThreadID tid,
+                                 uint8_t asidHash) {
     DPRINTF(UTAGE, "generateSinglePrediction for btbEntry: %#lx\n", btb_entry.pc);
+    const auto &state = historyState(tid);
 
     bool provided = false;
     TageTableInfo main_info;
@@ -197,11 +230,13 @@ MicroTAGE::generateSinglePrediction(const BTBEntry &btb_entry,
         // Calculate index and tag: use snapshot if provided, otherwise use current folded history
         // Tag includes position XOR (like RTL: tag = tempTag ^ cfiPosition)
         Addr index = predMeta ? getTageIndex(startPC, i,
-                            predMeta->indexFoldedHist[i].get())
-                          : getTageIndex(startPC, i);
+                            predMeta->indexFoldedHist[i].get(), asidHash)
+                          : getTageIndex(startPC, i, state.indexFoldedHist[i].get(), asidHash);
         Addr tag = predMeta ? getTageTag(startPC, i,
-                            predMeta->tagFoldedHist[i].get(),predMeta->altTagFoldedHist[i].get(), position)
-                        : getTageTag(startPC, i, tagFoldedHist[i].get(),altTagFoldedHist[i].get(), position);
+                            predMeta->tagFoldedHist[i].get(),predMeta->altTagFoldedHist[i].get(),
+                            position, asidHash)
+                        : getTageTag(startPC, i, state.tagFoldedHist[i].get(),
+                                     state.altTagFoldedHist[i].get(), position, asidHash);
 
         bool match = false; // for each table, only one way can be matched
         TageEntry matching_entry;
@@ -257,7 +292,8 @@ MicroTAGE::generateSinglePrediction(const BTBEntry &btb_entry,
  * @return Map of branch PC addresses to their predicted outcomes
  */
 void
-MicroTAGE::lookupHelper(const Addr &startPC, const std::vector<BTBEntry> &btbEntries, CondTakens& results)
+MicroTAGE::lookupHelper(const Addr &startPC, const std::vector<BTBEntry> &btbEntries,
+                        CondTakens& results, ThreadID tid, uint8_t asidHash)
 {
     DPRINTF(UTAGE, "lookupHelper startAddr: %#lx\n", startPC);
 
@@ -265,8 +301,9 @@ MicroTAGE::lookupHelper(const Addr &startPC, const std::vector<BTBEntry> &btbEnt
     for (auto &btb_entry : btbEntries) {
         // Only predict for valid conditional branches
         if (btb_entry.isCond && btb_entry.valid) {
-            auto pred = generateSinglePrediction(btb_entry, startPC);
-            meta->preds[btb_entry.pc] = pred;
+            auto pred = generateSinglePrediction(btb_entry, startPC, nullptr,
+                                                 tid, asidHash);
+            threadMeta[tid]->preds[btb_entry.pc] = pred;
             tageStats.updateStatsWithTagePrediction(pred, true);
             results.push_back({btb_entry.pc, pred.taken || btb_entry.alwaysTaken});
         }
@@ -297,6 +334,9 @@ MicroTAGE::dryRunCycle(Addr startPC) {
  */
 void
 MicroTAGE::putPCHistory(Addr startPC, const bitset &history, std::vector<FullBTBPrediction> &stagePreds) {
+    const ThreadID tid = predictorTid(stagePreds);
+    const uint8_t asidHash = stagePreds.empty() ? 0 : stagePreds.front().asidHash;
+    const auto &state = historyState(tid);
     // Record prediction bank for next tick's conflict detection
     lastPredBankId = getBankId(startPC);
     predBankValid = true;
@@ -314,30 +354,36 @@ MicroTAGE::putPCHistory(Addr startPC, const bitset &history, std::vector<FullBTB
     // get prediction and save it
 
     // Clear old prediction metadata and save current history state
-    meta = std::make_shared<TageMeta>();
-    meta->tagFoldedHist = tagFoldedHist;
-    meta->altTagFoldedHist = altTagFoldedHist;
-    meta->indexFoldedHist = indexFoldedHist;
-    meta->aheadIndexFoldedHistValid = !aheadindexFoldedHist.empty();
-    if (meta->aheadIndexFoldedHistValid) {
-        meta->aheadIndexFoldedHist = aheadindexFoldedHist.front();
+    threadMeta[tid] = std::make_shared<TageMeta>();
+    threadMeta[tid]->tagFoldedHist = state.tagFoldedHist;
+    threadMeta[tid]->altTagFoldedHist = state.altTagFoldedHist;
+    threadMeta[tid]->indexFoldedHist = state.indexFoldedHist;
+    threadMeta[tid]->aheadIndexFoldedHistValid =
+        !state.aheadIndexFoldedHist.empty();
+    if (threadMeta[tid]->aheadIndexFoldedHistValid) {
+        threadMeta[tid]->aheadIndexFoldedHist =
+            state.aheadIndexFoldedHist.front();
     } else {
-        meta->aheadIndexFoldedHist.clear();
+        threadMeta[tid]->aheadIndexFoldedHist.clear();
     }
-    meta->history = history;
+    threadMeta[tid]->history = history;
 
     for (int s = getDelay(); s < stagePreds.size(); s++) {
         // TODO: only lookup once for one btb entry in different stages
         auto &stage_pred = stagePreds[s];
         stage_pred.condTakens.clear();
-        lookupHelper(startPC, stage_pred.btbEntries, stage_pred.condTakens);
+        lookupHelper(startPC, stage_pred.btbEntries, stage_pred.condTakens,
+                     tid, asidHash);
     }
 
 }
 
 std::shared_ptr<void>
-MicroTAGE::getPredictionMeta() {
-    return meta;
+MicroTAGE::getPredictionMeta(ThreadID tid) {
+    if (tid >= threadMeta.size()) {  // out-of-range tid yields no metadata
+        return nullptr;
+    }
+    return threadMeta[tid];  // null until putPCHistory has run for tid
 }
 
 /**
@@ -495,6 +541,7 @@ MicroTAGE::handleNewEntryAllocation(const Addr &startPC,
                                  bool actual_taken,
                                  unsigned start_table,
                                  std::shared_ptr<TageMeta> meta,
+                                 uint8_t asidHash,
                                  uint64_t &allocated_table,
                                  uint64_t &allocated_index,
                                  uint64_t &allocated_way) {
@@ -508,9 +555,10 @@ MicroTAGE::handleNewEntryAllocation(const Addr &startPC,
 
     for (unsigned ti = start_table; ti < numPredictors; ++ti) {
         Addr newIndex = getTageIndex(startPC, ti,
-            meta->indexFoldedHist[ti].get());
+            meta->indexFoldedHist[ti].get(), asidHash);
         Addr newTag = getTageTag(startPC, ti,
-            meta->tagFoldedHist[ti].get(), meta->altTagFoldedHist[ti].get(), position);
+            meta->tagFoldedHist[ti].get(), meta->altTagFoldedHist[ti].get(),
+            position, asidHash);
 
         auto &set = tageTable[ti][newIndex];
 
@@ -638,7 +686,8 @@ MicroTAGE::update(const FetchTarget &stream) {
         TagePrediction recomputed;
         if (updateOnRead) { // if update on read is enabled, re-read providers using snapshot
             // Re-read providers using snapshot (do not rely on prediction-time main/alt)
-            recomputed = generateSinglePrediction(btb_entry, startAddr, predMeta);
+            recomputed = generateSinglePrediction(btb_entry, startAddr, predMeta,
+                                                 stream.tid, stream.asidHash);
         } else { // otherwise, use the prediction from the prediction-time main/alt
             auto pred_it = predMeta->preds.find(btb_entry.pc);
             if (pred_it != predMeta->preds.end()) {
@@ -646,7 +695,8 @@ MicroTAGE::update(const FetchTarget &stream) {
             } else {
                 DPRINTF(UTAGE, "update: missing predMeta entry for pc %#lx, recompute with snapshot\n",
                         btb_entry.pc);
-                recomputed = generateSinglePrediction(btb_entry, startAddr, predMeta);
+                recomputed = generateSinglePrediction(btb_entry, startAddr, predMeta,
+                                                     stream.tid, stream.asidHash);
             }
         }
         if (recomputed.mainprovided) {
@@ -669,7 +719,8 @@ MicroTAGE::update(const FetchTarget &stream) {
                 start_table = main_info.table + 1; // start from the table after the main prediction table
             }
             alloc_success = handleNewEntryAllocation(startAddr, btb_entry, actual_taken,
-                                   start_table, predMeta, allocated_table, allocated_index, allocated_way);
+                                   start_table, predMeta, stream.asidHash,
+                                   allocated_table, allocated_index, allocated_way);
         }
 
 #ifndef UNIT_TEST
@@ -751,7 +802,8 @@ MicroTAGE::updateCounter(bool taken, unsigned width, short &counter) {
 
 // Calculate TAGE tag with folded history - optimized version using bitwise operations
 Addr
-MicroTAGE::getTageTag(Addr pc, int t, uint64_t foldedHist, uint64_t altFoldedHist, Addr position)
+MicroTAGE::getTageTag(Addr pc, int t, uint64_t foldedHist, uint64_t altFoldedHist,
+                      Addr position, uint8_t asidHash)
 {
     // Create mask for tableTagBits[t] to limit result size
     Addr mask = (1ULL << tableTagBits[t]) - 1;
@@ -766,11 +818,12 @@ MicroTAGE::getTageTag(Addr pc, int t, uint64_t foldedHist, uint64_t altFoldedHis
     Addr altTagBits = (altFoldedHist << 1) & mask;
 
     // XOR all components together, including position (like RTL)
-    return pcBits ^ foldedBits ^ position ^ altTagBits;
+    return injectAsidHashIntoTag(pcBits ^ foldedBits ^ position ^ altTagBits,
+                                 tableTagBits[t], asidHash);
 }
 
 Addr
-MicroTAGE::getTageIndex(Addr pc, int t, uint64_t foldedHist)
+MicroTAGE::getTageIndex(Addr pc, int t, uint64_t foldedHist, uint8_t asidHash)
 {
     // Create mask for tableIndexBits[t] to limit result size
     Addr mask = (1ULL << tableIndexBits[t]) - 1;
@@ -779,13 +832,13 @@ MicroTAGE::getTageIndex(Addr pc, int t, uint64_t foldedHist)
     Addr pcBits = (pc >> pcShift) & mask;
     Addr foldedBits = foldedHist & mask;
 
-    return pcBits ^ foldedBits;
+    return xorAsidHashIntoIndex(pcBits ^ foldedBits, tableIndexBits[t], asidHash);
 }
 
 Addr
-MicroTAGE::getTageIndex(Addr pc, int t)
+MicroTAGE::getTageIndex(Addr pc, int t, uint8_t asidHash)  // overload without explicit history
 {
-    return getTageIndex(pc, t, indexFoldedHist[t].get());
+    return getTageIndex(pc, t, historyState(0).indexFoldedHist[t].get(), asidHash);  // NOTE(review): always thread 0's folded index history
 }
 
 bool
@@ -851,23 +904,26 @@ MicroTAGE::getBankId(Addr pc) const
  * @param taken Whether the branch was taken
  */
 void
-MicroTAGE::doUpdateHist(const boost::dynamic_bitset<> &history, bool taken, Addr pc, Addr target)
+MicroTAGE::doUpdateHist(const boost::dynamic_bitset<> &history, bool taken,
+                        Addr pc, Addr target, ThreadID tid)
 {
+    auto &state = historyState(tid);
     if (debug::TAGEHistory) {   // if debug flag is off, do not use to_string since it's too slow
         std::string buf;
         boost::to_string(history, buf);
         DPRINTF(TAGEHistory, "in doUpdateHist, taken %d, pc %#lx, history %s\n", taken, pc, buf.c_str());
     }
 
-    if (!aheadindexFoldedHist.empty()) {
-        indexFoldedHist = aheadindexFoldedHist.front();
+    if (!state.aheadIndexFoldedHist.empty()) {
+        state.indexFoldedHist = state.aheadIndexFoldedHist.front();
     }
 
     if (!taken) {
-        if (debug::TAGEHistory && !aheadindexFoldedHist.empty()) {
+        if (debug::TAGEHistory && !state.aheadIndexFoldedHist.empty()) {
             bool mismatch = false;
             for (int t = 0; t < numPredictors; t++) {
-                if (indexFoldedHist[t].get() != aheadindexFoldedHist.front()[t].get()) {
+                if (state.indexFoldedHist[t].get() !=
+                    state.aheadIndexFoldedHist.front()[t].get()) {
                     mismatch = true;
                     break;
                 }
@@ -883,22 +939,23 @@ MicroTAGE::doUpdateHist(const boost::dynamic_bitset<> &history, bool taken, Addr
 
     for (int t = 0; t < numPredictors; t++) {
         // Update tag folded history immediately so tag calculation always sees current history.
-        tagFoldedHist[t].update(history, 2, taken, pc, target);
-        altTagFoldedHist[t].update(history, 2, taken, pc, target);
+        state.tagFoldedHist[t].update(history, 2, taken, pc, target);
+        state.altTagFoldedHist[t].update(history, 2, taken, pc, target);
         DPRINTF(TAGEHistory, "t: %d, tag 0x%lx, altTag 0x%lx\n",
-                t, tagFoldedHist[t].get(), altTagFoldedHist[t].get());
+                t, state.tagFoldedHist[t].get(),
+                state.altTagFoldedHist[t].get());
     }
 
     // Prepare next-cycle index folded history and delay its visibility by one cycle.
-    auto nextIndexFoldedHist = indexFoldedHist;
+    auto nextIndexFoldedHist = state.indexFoldedHist;
     for (int t = 0; t < numPredictors; t++) {
         nextIndexFoldedHist[t].update(history, 2, taken, pc, target);
         DPRINTF(TAGEHistory, "t: %d, index foldedHist(next) _folded 0x%lx\n",
                 t, nextIndexFoldedHist[t].get());
     }
-    aheadindexFoldedHist.push(nextIndexFoldedHist);
-    if (aheadindexFoldedHist.size() > 1) {
-        aheadindexFoldedHist.pop();
+    state.aheadIndexFoldedHist.push(nextIndexFoldedHist);
+    if (state.aheadIndexFoldedHist.size() > 1) {
+        state.aheadIndexFoldedHist.pop();
     }
 }
 
@@ -918,7 +975,7 @@ void
 MicroTAGE::specUpdatePHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred)
 {
     auto [pc, target, taken] = pred.getPHistInfo();
-    doUpdateHist(history, taken, pc, target);
+    doUpdateHist(history, taken, pc, target, pred.tid);
 }
 
 /**
@@ -938,6 +995,7 @@ void
 MicroTAGE::recoverPHist(const boost::dynamic_bitset<> &history,
     const FetchTarget &entry, int shamt, bool cond_taken)
 {
+    auto &state = historyState(entry.tid);
     std::shared_ptr<TageMeta> predMeta = std::static_pointer_cast<TageMeta>(entry.predMetas[getComponentIdx()]);
     if (!predMeta) {
         DPRINTF(UTAGE, "recoverPHist: no prediction metadata, cannot recover\n");
@@ -945,21 +1003,22 @@ MicroTAGE::recoverPHist(const boost::dynamic_bitset<> &history,
     }
     // Restore current folded index history exactly to prediction-time state.
     for (int i = 0; i < numPredictors; i++) {
-        indexFoldedHist[i].recover(predMeta->indexFoldedHist[i]);
+        state.indexFoldedHist[i].recover(predMeta->indexFoldedHist[i]);
     }
 
     // Restore delayed index folded history slot exactly to prediction-time state.
-    while (!aheadindexFoldedHist.empty()) {
-        aheadindexFoldedHist.pop();
+    while (!state.aheadIndexFoldedHist.empty()) {
+        state.aheadIndexFoldedHist.pop();
     }
     if (predMeta->aheadIndexFoldedHistValid) {
         assert(predMeta->aheadIndexFoldedHist.size() == numPredictors);
-        aheadindexFoldedHist.push(predMeta->aheadIndexFoldedHist);
+        state.aheadIndexFoldedHist.push(predMeta->aheadIndexFoldedHist);
     }
 
     if (debug::TAGEHistory) {
         bool queue_valid_mismatch =
-            (predMeta->aheadIndexFoldedHistValid != !aheadindexFoldedHist.empty());
+            (predMeta->aheadIndexFoldedHistValid !=
+             !state.aheadIndexFoldedHist.empty());
         if (queue_valid_mismatch) {
             DPRINTF(TAGEHistory,
                     "recoverPHist: ahead queue valid mismatch after restore, cond_taken %d\n",
@@ -968,16 +1027,25 @@ MicroTAGE::recoverPHist(const boost::dynamic_bitset<> &history,
     }
 
     for (int i = 0; i < numPredictors; i++) {
-        altTagFoldedHist[i].recover(predMeta->altTagFoldedHist[i]);
-        tagFoldedHist[i].recover(predMeta->tagFoldedHist[i]);
+        state.altTagFoldedHist[i].recover(predMeta->altTagFoldedHist[i]);
+        state.tagFoldedHist[i].recover(predMeta->tagFoldedHist[i]);
     }
-    doUpdateHist(history, cond_taken, entry.getControlPC(), entry.getTakenTarget());
+    doUpdateHist(history, cond_taken, entry.getControlPC(),
+                 entry.getTakenTarget(), entry.tid);
 }
 
 // Check folded history after speculative update and recovery
 void
 MicroTAGE::checkFoldedHist(const boost::dynamic_bitset<> &hist, const char * when)
 {
+    checkFoldedHist(hist, 0, when);  // forwards to the per-thread overload with tid 0
+}
+
+void
+MicroTAGE::checkFoldedHist(const boost::dynamic_bitset<> &hist, ThreadID tid,
+                           const char * when)
+{
+    auto &state = historyState(tid);
     DPRINTF(UTAGE, "checking folded history when %s\n", when);
     if (debug::TAGEHistory) {
         std::string hist_str;
@@ -989,13 +1057,13 @@ MicroTAGE::checkFoldedHist(const boost::dynamic_bitset<> &hist, const char * whe
         // aheadindexFoldedHist in doUpdateHist(). During consistency checks
         // right after speculative/recovery updates, compare against the staged
         // next-cycle value when available.
-        if (!aheadindexFoldedHist.empty()) {
-            aheadindexFoldedHist.front()[t].check(hist);
+        if (!state.aheadIndexFoldedHist.empty()) {
+            state.aheadIndexFoldedHist.front()[t].check(hist);
         } else {
-            indexFoldedHist[t].check(hist);
+            state.indexFoldedHist[t].check(hist);
         }
-        tagFoldedHist[t].check(hist);
-        altTagFoldedHist[t].check(hist);
+        state.tagFoldedHist[t].check(hist);
+        state.altTagFoldedHist[t].check(hist);
     }
 }
 
diff --git a/src/cpu/pred/btb/microtage.hh b/src/cpu/pred/btb/microtage.hh
index da593f6787..c181117bca 100644
--- a/src/cpu/pred/btb/microtage.hh
+++ b/src/cpu/pred/btb/microtage.hh
@@ -4,6 +4,7 @@
 #include <cstdint>
 #include <deque>
 #include <map>
+#include <memory>
 #include <queue>
 #include <utility>
 #include <vector>
@@ -11,6 +12,7 @@
 #include "base/sat_counter.hh"
 #include "base/types.hh"
 #include "cpu/inst_seq.hh"
+#include "cpu/o3/limits.hh"
 #include "cpu/pred/btb/folded_hist.hh"
 #include "cpu/pred/btb/timed_base_pred.hh"
 
@@ -42,6 +44,7 @@ namespace test {
 class MicroTAGE : public TimedBaseBTBPredictor
 {
     using bitset = boost::dynamic_bitset<>;
+    static constexpr unsigned MaxThreads = o3::MaxThreads;
   public:
 #ifdef UNIT_TEST
     // Test constructor
@@ -121,7 +124,7 @@ class MicroTAGE : public TimedBaseBTBPredictor
                       const boost::dynamic_bitset<> &history,
                       std::vector<FullBTBPrediction> &stagePreds) override;
 
-    std::shared_ptr<void> getPredictionMeta() override;
+    std::shared_ptr<void> getPredictionMeta(ThreadID tid = 0) override;
 
     // speculative update 3 folded history, according history and pred.taken
     // the other specUpdateHist methods are left blank
@@ -159,23 +162,26 @@ class MicroTAGE : public TimedBaseBTBPredictor
 
     // check folded hists after speculative update and recover
     void checkFoldedHist(const bitset &history, const char *when);
+    void checkFoldedHist(const bitset &history, ThreadID tid, const char *when);
 
 #ifndef UNIT_TEST
   private:
 #endif
 
     // Look up predictions in TAGE tables for a stream of instructions
-    void lookupHelper(const Addr &startPC, const std::vector<BTBEntry> &btbEntries, CondTakens& results);
+    void lookupHelper(const Addr &startPC, const std::vector<BTBEntry> &btbEntries,
+                      CondTakens& results, ThreadID tid, uint8_t asidHash);
 
     // Calculate TAGE index for a given PC and table
-    Addr getTageIndex(Addr pc, int table);
+    Addr getTageIndex(Addr pc, int table, uint8_t asidHash = 0);
 
     // Calculate TAGE index with folded history (uint64_t version for performance)
-    Addr getTageIndex(Addr pc, int table, uint64_t foldedHist);
+    Addr getTageIndex(Addr pc, int table, uint64_t foldedHist, uint8_t asidHash = 0);
 
     // Calculate TAGE tag with folded history (uint64_t version for performance)
     // position: branch position within the block (xored into tag like RTL)
-    Addr getTageTag(Addr pc, int table, uint64_t foldedHist, uint64_t altFoldedHist, Addr position = 0);
+    Addr getTageTag(Addr pc, int table, uint64_t foldedHist, uint64_t altFoldedHist,
+                    Addr position = 0, uint8_t asidHash = 0);
 
     // Get branch index within a prediction block
     unsigned getBranchIndexInBlock(Addr branchPC, Addr startPC);
@@ -185,7 +191,8 @@ class MicroTAGE : public TimedBaseBTBPredictor
     unsigned getBankId(Addr pc) const;
 
     // Update branch history
-    void doUpdateHist(const bitset &history, bool taken, Addr pc, Addr target);
+    void doUpdateHist(const bitset &history, bool taken, Addr pc, Addr target,
+                      ThreadID tid);
 
     // Number of TAGE predictor tables
     const unsigned numPredictors;
@@ -205,14 +212,15 @@ class MicroTAGE : public TimedBaseBTBPredictor
     // History lengths for each table
     std::vector<unsigned> histLengths;
 
-    // Folded history for tag calculation
-    std::vector<PathFoldedHist> tagFoldedHist;
-
-    // Folded history for alternative tag calculation
-    std::vector<PathFoldedHist> altTagFoldedHist;
+    struct ThreadHistoryState  // folded-history state kept once per thread
+    {
+        std::vector<PathFoldedHist> tagFoldedHist;  // folded history for tag calculation
+        std::vector<PathFoldedHist> altTagFoldedHist;  // folded history for alternative tag calculation
+        std::vector<PathFoldedHist> indexFoldedHist;  // folded history for index calculation
+        std::queue<std::vector<PathFoldedHist>> aheadIndexFoldedHist;  // staged next-cycle index history
+    };
 
-    // Folded history for index calculation
-    std::vector<PathFoldedHist> indexFoldedHist;
+    std::vector<ThreadHistoryState> threadHistory;
 
     // Maximum history length, not used
     unsigned maxHistLen;
@@ -259,8 +267,6 @@ class MicroTAGE : public TimedBaseBTBPredictor
     unsigned lastPredBankId;         // Bank ID of last prediction
     bool predBankValid;              // Whether lastPredBankId is valid
 
-    std::queue<std::vector<PathFoldedHist>> aheadindexFoldedHist;
-
 #ifdef UNIT_TEST
     typedef uint64_t Scalar;
 #else
@@ -351,7 +357,9 @@ private:
     // If predMeta is nullptr, use current folded history (prediction path)
     TagePrediction generateSinglePrediction(const BTBEntry &btb_entry,
                                            const Addr &startPC,
-                                           const std::shared_ptr<TageMeta> predMeta = nullptr);
+                                           const std::shared_ptr<TageMeta> predMeta = nullptr,
+                                           ThreadID tid = 0,
+                                           uint8_t asidHash = 0);
 
     // Helper method to prepare BTB entries for update
     std::vector<BTBEntry> prepareUpdateEntries(const FetchTarget &stream);
@@ -368,11 +376,15 @@ private:
                                  bool actual_taken,
                                  unsigned main_table,
                                  std::shared_ptr<TageMeta> meta,
+                                 uint8_t asidHash,
                                  uint64_t &allocated_table,
                                  uint64_t &allocated_index,
                                  uint64_t &allocated_way);
 
-    std::shared_ptr<TageMeta> meta;
+    std::vector<std::shared_ptr<TageMeta>> threadMeta;
+    ThreadID predictorTid(const std::vector<FullBTBPrediction> &stagePreds) const;
+    ThreadHistoryState &historyState(ThreadID tid);
+    const ThreadHistoryState &historyState(ThreadID tid) const;
 };
 
 // Close conditional namespace wrapper for testing
diff --git a/src/cpu/pred/btb/ras.cc b/src/cpu/pred/btb/ras.cc
index 4dabf6dabf..8dd5b80aea 100644
--- a/src/cpu/pred/btb/ras.cc
+++ b/src/cpu/pred/btb/ras.cc
@@ -21,28 +21,13 @@ namespace btb_pred {
             : TimedBaseBTBPredictor(),
               numEntries(numEntries),
               ctrWidth(ctrWidth),
-              numInflightEntries(numInflightEntries)
+              numInflightEntries(numInflightEntries),
+              maxCtr((1 << ctrWidth) - 1),
+              numThreads(1),
+              threadStates(numThreads)
         {
-            // Initialize RAS state
-            ssp = 0;
-            nsp = 0;
-            sctr = 0;
-            stack.resize(numEntries);
-            maxCtr = (1 << ctrWidth) - 1;
-            TOSW = 0;
-            TOSR = 0;
-            inflightPtrDec(TOSR);
-            BOS = 0;
-            inflightStack.resize(numInflightEntries);
-
-            // Initialize stack entries
-            for (auto &entry : stack) {
-                entry.data.ctr = 0;
-                entry.data.retAddr = 0x80000000L;
-            }
-            for (auto &entry : inflightStack) {
-                entry.data.ctr = 0;
-                entry.data.retAddr = 0x80000000L;
+            for (auto &state : threadStates) {
+                initThreadState(state);
             }
         }
 #else
@@ -51,49 +36,61 @@ namespace btb_pred {
         : TimedBaseBTBPredictor(p),
           numEntries(p.numEntries),
           ctrWidth(p.ctrWidth),
-        numInflightEntries(p.numInflightEntries),
-        rasStats(this)
+          numInflightEntries(p.numInflightEntries),
+          maxCtr((1 << ctrWidth) - 1),
+          numThreads(p.numThreads),
+          threadStates(numThreads),
+          rasStats(this)
     {
-        // Initialize RAS state
-        ssp = 0;
-        nsp = 0;
-        sctr = 0;
-        stack.resize(numEntries);
-        maxCtr = (1 << ctrWidth) - 1;
-        TOSW = 0;
-        TOSR = 0;
-        inflightPtrDec(TOSR);
-        BOS = 0;
-        inflightStack.resize(numInflightEntries);
-
-        // Initialize stack entries
-        for (auto &entry : stack) {
-            entry.data.ctr = 0;
-            entry.data.retAddr = 0x80000000L;
-        }
-        for (auto &entry : inflightStack) {
-            entry.data.ctr = 0;
-            entry.data.retAddr = 0x80000000L;
+        for (auto &state : threadStates) {
+            initThreadState(state);
         }
     }
 #endif
 
 void
-BTBRAS::checkCorrectness() {
+BTBRAS::initThreadState(ThreadRASState &state)  // reset one thread's pointers, counters, meta and stacks
+{
+    state.TOSW = 0;
+    state.TOSR = 0;
+    inflightPtrDec(state.TOSR);  // presumably wraps TOSR to the last inflight slot -- confirm
+    state.BOS = 0;
+    state.ssp = 0;
+    state.nsp = 0;
+    state.sctr = 0;
+    state.meta.reset();  // drop any previously held prediction meta
+
+    state.stack.resize(numEntries);
+    state.inflightStack.resize(numInflightEntries);
+
+    for (auto &entry : state.stack) {
+        entry.data.ctr = 0;
+        entry.data.retAddr = 0x80000000L;  // initial placeholder return address
+    }
+    for (auto &entry : state.inflightStack) {
+        entry.data.ctr = 0;
+        entry.data.retAddr = 0x80000000L;  // same placeholder as the committed stack
+        entry.nos = 0;  // the removed constructor code did not assign nos
+    }
+}
+
+void
+BTBRAS::checkCorrectness(ThreadID tid) {  // consistency check is disabled: body is commented out
+    auto &state = threadStates[tid];  // only referenced by the commented-out check below
     /*
-    auto tosr = TOSR;
-    int checkssp = ssp;
-    while (inflightInRange(tosr)) {
-        if (!inflightStack[tosr].data.ctr) {
+    auto tosr = state.TOSR;
+    int checkssp = state.ssp;
+    while (inflightInRange(state, tosr)) {
+        if (!state.inflightStack[tosr].data.ctr) {
             checkssp = (checkssp - 1 + numEntries) % numEntries;
         } else {
             // just dec sctr, fixme here
         }
-        tosr = inflightStack[tosr].nos;
+        tosr = state.inflightStack[tosr].nos;
     }
-    if (checkssp != (nsp + numEntries - 1) % numEntries) {
+    if (checkssp != (state.nsp + numEntries - 1) % numEntries) {
         DPRINTF(RAS, "NSP and SSP check failed\n");
-        printStack("checkCorrectness");
+        printStack("checkCorrectness", tid);
     }*/
 }
 
@@ -102,28 +99,39 @@ BTBRAS::putPCHistory(Addr startAddr, const boost::dynamic_bitset<> &history,
                   std::vector<FullBTBPrediction> &stagePreds)
 {
     assert(getDelay() < stagePreds.size());
-    meta = std::make_shared<RASMeta>();
+    const ThreadID tid = stagePreds.back().tid;
+    assert(tid < numThreads);
+    auto &state = threadStates[tid];
+    state.meta = std::make_shared<RASMeta>();
     DPRINTFR(RAS, "putPC startAddr %lx", startAddr);
-    // checkCorrectness();
+    // checkCorrectness(tid);
+    auto top = getTop_meta(tid);
     for (int i = getDelay(); i < stagePreds.size(); i++) {
-        stagePreds[i].returnTarget = getTop_meta().retAddr; // stack[sp].retAddr;
+        stagePreds[i].returnTarget = top.retAddr;
     }
     /*
     if (stagePreds.back().btbEntry.slots[0].isCall || stagePreds.back().btbEntry.slots[0].isReturn || stagePreds.back().btbEntry.slots[1].isCall || stagePreds.back().btbEntry.slots[1].isReturn) {
-        printStack("putPCHistory");
+        printStack("putPCHistory", tid);
     }
     */
 }
 
 std::shared_ptr<void>
-BTBRAS::getPredictionMeta()
+BTBRAS::getPredictionMeta(ThreadID tid)  // returns the meta stored in this thread's state
 {
-    return meta;
+    if (tid >= threadStates.size()) {  // out-of-range tid yields no meta rather than asserting
+        return nullptr;
+    }
+    return threadStates[tid].meta;  // set by putPCHistory; cleared by initThreadState
 }
 
 void
 BTBRAS::specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred)
 {
+    const ThreadID tid = pred.tid;
+    assert(tid < numThreads);
+    auto &state = threadStates[tid];
+    assert(state.meta);
     // do push & pops on prediction
     // pred.returnTarget = stack[sp].retAddr;
     auto takenEntry = pred.getTakenEntry();
@@ -131,11 +139,11 @@ BTBRAS::specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction
 
     if (takenEntry.isCall) {
         Addr retAddr = takenEntry.pc + takenEntry.size;
-        push(retAddr);
+        push(tid, retAddr);
     }
     if (takenEntry.isReturn) {
         // do pop
-        pop();
+        pop(tid);
     }
     if (takenEntry.isCall) {
         DPRINTFR(RAS, "IsCall spec PC %lx\n", takenEntry.pc);
@@ -145,36 +153,39 @@ BTBRAS::specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction
     }
     
     if (takenEntry.isCall || takenEntry.isReturn)
-        printStack("after specUpdateHist");
-    DPRINTFR(RAS, "meta TOSR %d TOSW %d\n", meta->TOSR, meta->TOSW);
+        printStack("after specUpdateHist", tid);
+    DPRINTFR(RAS, "meta TOSR %d TOSW %d\n", state.meta->TOSR, state.meta->TOSW);
 }
 
 void
 BTBRAS::recoverHist(const boost::dynamic_bitset<> &history, const FetchTarget &entry, int shamt, bool cond_taken)
 {
+    const ThreadID tid = entry.tid;
+    assert(tid < numThreads);
+    auto &state = threadStates[tid];
     auto takenEntry = entry.exeBranchInfo;
     /*
     if (takenEntry.isCall || takenEntry.isReturn) {
-        printStack("before recoverHist");
+        printStack("before recoverHist", tid);
     }*/
     // recover sp and tos first
     auto meta_ptr = std::static_pointer_cast<RASMeta>(entry.predMetas[getComponentIdx()]);
     DPRINTF(RAS, "recover called, meta TOSR %d TOSW %d ssp %d sctr %u entry PC %lx end PC %lx\n",
         meta_ptr->TOSR, meta_ptr->TOSW, meta_ptr->ssp, meta_ptr->sctr, entry.startPC, entry.predEndPC);
 
-    TOSR = meta_ptr->TOSR;
-    TOSW = meta_ptr->TOSW;
-    ssp = meta_ptr->ssp;
-    sctr = meta_ptr->sctr;
+    state.TOSR = meta_ptr->TOSR;
+    state.TOSW = meta_ptr->TOSW;
+    state.ssp = meta_ptr->ssp;
+    state.sctr = meta_ptr->sctr;
     Addr retAddr = takenEntry.pc + takenEntry.size;
 
     // do push & pops on control squash
     if (entry.exeTaken) {
         if (takenEntry.isCall) {
-            push(retAddr);
+            push(tid, retAddr);
         }
         if (takenEntry.isReturn) {
-            pop();
+            pop(tid);
             //TOSW = (TOSR + 1) % numInflightEntries;
         }
     }
@@ -186,7 +197,7 @@ BTBRAS::recoverHist(const boost::dynamic_bitset<> &history, const FetchTarget &e
             DPRINTF(RAS, "IsRet expect target %lx, preded %lx, pred taken %d pred target %lx\n",
                 takenEntry.target, meta_ptr->target, entry.predTaken, entry.predBranchInfo.target);
         }
-        printStack("after recoverHist");
+        printStack("after recoverHist", tid);
     }
     
 }
@@ -194,83 +205,89 @@ BTBRAS::recoverHist(const boost::dynamic_bitset<> &history, const FetchTarget &e
 void
 BTBRAS::update(const FetchTarget &entry)
 {
+    const ThreadID tid = entry.tid;  // update only the state of the entry's own thread
+    assert(tid < numThreads);  // threadStates is indexed by tid below
+    auto &state = threadStates[tid];
     auto meta_ptr = std::static_pointer_cast<RASMeta>(entry.predMetas[getComponentIdx()]);
     auto takenEntry = entry.exeBranchInfo;
     if (entry.exeTaken) {
-        if (meta_ptr->ssp != nsp || meta_ptr->sctr != stack[nsp].data.ctr) {
+        if (meta_ptr->ssp != state.nsp || meta_ptr->sctr != state.stack[state.nsp].data.ctr) {
             DPRINTF(RAS, "ssp and nsp mismatch, recovering, ssp = %d, sctr = %d, nsp = %d, nctr = %d\n",
-                meta_ptr->ssp, meta_ptr->sctr, nsp, stack[nsp].data.ctr);
-            nsp = meta_ptr->ssp;
+                meta_ptr->ssp, meta_ptr->sctr, state.nsp, state.stack[state.nsp].data.ctr);
+            state.nsp = meta_ptr->ssp;  // resync nsp to the ssp recorded in the meta
         } else
             DPRINTF(RAS, "ssp and nsp match, ssp = %d, sctr = %d, nsp = %d, nctr = %d\n",
-                meta_ptr->ssp, meta_ptr->sctr, nsp, stack[nsp].data.ctr);
+                meta_ptr->ssp, meta_ptr->sctr, state.nsp, state.stack[state.nsp].data.ctr);
         if (takenEntry.isCall) {
             DPRINTF(RAS, "real update call BTB hit %d meta TOSR %d TOSW %d\n entry PC %lx",
                 entry.isHit, meta_ptr->TOSR, meta_ptr->TOSW, entry.startPC);
             Addr retAddr = takenEntry.pc + takenEntry.size;
-            push_stack(retAddr);
-            BOS = inflightPtrPlus1(meta_ptr->TOSW);
+            push_stack(tid, retAddr);  // push onto the stack indexed by nsp
+            state.BOS = inflightPtrPlus1(meta_ptr->TOSW);  // presumably retires this call's inflight slot -- confirm
         }
         if (takenEntry.isReturn) {
             DPRINTF(RAS, "update ret entry PC %lx\n", entry.startPC);
-            pop_stack();
+            pop_stack(tid);  // pop from the stack indexed by nsp
         }
     }
     if (takenEntry.isCall || takenEntry.isReturn) {
-        printStack("after update(commit)");
+        printStack("after update(commit)", tid);
     }
 }
 
 void
-BTBRAS::push_stack(Addr retAddr)
+BTBRAS::push_stack(ThreadID tid, Addr retAddr)  // push retAddr onto thread tid's stack at nsp
 {
-    auto tos = stack[nsp];
+    auto &state = threadStates[tid];
+    auto tos = state.stack[state.nsp];  // copy of the current top entry
     if (tos.data.retAddr == retAddr && tos.data.ctr < maxCtr) {
-        stack[nsp].data.ctr++;
+        state.stack[state.nsp].data.ctr++;  // same address, counter not saturated: count it instead of pushing
     } else {
         // push new entry
-        ptrInc(nsp);
-        stack[nsp].data.retAddr = retAddr;
-        stack[nsp].data.ctr = 0;
+        ptrInc(state.nsp);
+        state.stack[state.nsp].data.retAddr = retAddr;
+        state.stack[state.nsp].data.ctr = 0;
     }
     // ++ndepth;
 }
 
 void
-BTBRAS::push(Addr retAddr)
+BTBRAS::push(ThreadID tid, Addr retAddr)  // updates ssp/sctr and records retAddr in the inflight queue
 {
+    auto &state = threadStates[tid];
     rasStats.Pushes++;
     DPRINTF(RAS, "doing push ");
     // update ssp and sctr first
     // meta has recorded their old value
-    auto topAddr = getTop();
-    if (retAddr == topAddr.retAddr && sctr < maxCtr) {
-        sctr++;
+    auto topAddr = getTop(tid);
+    if (retAddr == topAddr.retAddr && state.sctr < maxCtr) {
+        state.sctr++;  // same address as current top: count the repeat
     } else {
-        ptrInc(ssp);
-        sctr = 0;
+        ptrInc(state.ssp);
+        state.sctr = 0;
         // do not update non-spec stack here
     }
 
     // push will always enter inflight queue
     RASInflightEntry t;
     t.data.retAddr = retAddr;
-    t.data.ctr = sctr;
-    t.nos = TOSR;
-    inflightStack[TOSW] = t;
-    TOSR = TOSW;
-    inflightPtrInc(TOSW);
+    t.data.ctr = state.sctr;
+    t.nos = state.TOSR;  // link back to the previous TOSR
+    state.inflightStack[state.TOSW] = t;
+    state.TOSR = state.TOSW;  // the slot just written becomes the new TOSR
+    inflightPtrInc(state.TOSW);  // presumably advances TOSW to the next slot with wrap -- confirm
 }
 
 void
-BTBRAS::pop_stack()
+BTBRAS::pop_stack(ThreadID tid)
 {
+    auto &state = threadStates[tid];
     //if (ndepth) {
-    auto tos = stack[nsp];
+    auto tos = state.stack[state.nsp];
     if (tos.data.ctr > 0) {
-        stack[nsp].data.ctr--;
+        state.stack[state.nsp].data.ctr--;
     } else {
-        ptrDec(nsp);
+        ptrDec(state.nsp);
     }
     //--ndepth;
     //} else {
@@ -280,30 +297,31 @@ BTBRAS::pop_stack()
 }
 
 void
-BTBRAS::pop()
+BTBRAS::pop(ThreadID tid)
 {
+    auto &state = threadStates[tid];
     // DPRINTFR(RAS, "doing pop ndepth = %d", ndepth);
     rasStats.Pops++;
     // pop may need to deal with committed stack
-    if (inflightInRange(TOSR)) {
-        DPRINTF(RAS, "Select from inflight, addr %lx\n", inflightStack[TOSR].data.retAddr);
-        TOSR = inflightStack[TOSR].nos;
-        if (sctr > 0) {
-            sctr--; 
+    if (inflightInRange(state, state.TOSR)) {
+        DPRINTF(RAS, "Select from inflight, addr %lx\n", state.inflightStack[state.TOSR].data.retAddr);
+        state.TOSR = state.inflightStack[state.TOSR].nos;
+        if (state.sctr > 0) {
+            state.sctr--;
         } else {
-            ptrDec(ssp);
-            auto newTop = getTop();
-            sctr = newTop.ctr;
+            ptrDec(state.ssp);
+            auto newTop = getTop(tid);
+            state.sctr = newTop.ctr;
         }
     } else /*if (ndepth)*/ {
         // TOSR not valid, operate on committed stack
         DPRINTF(RAS, "in committed range\n");
-        if (sctr > 0) {
-            sctr--;
+        if (state.sctr > 0) {
+            state.sctr--;
         } else {
-            ptrDec(ssp);
-            auto newTop = getTop();
-            sctr = newTop.ctr;
+            ptrDec(state.ssp);
+            auto newTop = getTop(tid);
+            state.sctr = newTop.ctr;
         }
     }
     //else {
@@ -351,12 +369,12 @@ BTBRAS::inflightPtrPlus1(int ptr) {
 }
 
 bool
-BTBRAS::inflightInRange(int &ptr)
+BTBRAS::inflightInRange(const ThreadRASState &state, int ptr)
 {
-    if (TOSW > BOS) {
-        return ptr >= BOS && ptr < TOSW;
-    } else if (TOSW < BOS) {
-        return ptr < TOSW || ptr >= BOS;
+    if (state.TOSW > state.BOS) {
+        return ptr >= state.BOS && ptr < state.TOSW;
+    } else if (state.TOSW < state.BOS) {
+        return ptr < state.TOSW || ptr >= state.BOS;
     } else {
         // empty inflight queue
         return false;
@@ -364,64 +382,79 @@ BTBRAS::inflightInRange(int &ptr)
 }
 
 BTBRAS::RASEssential
-BTBRAS::getTop()
+BTBRAS::getTop(ThreadID tid)
 {
+    auto &state = threadStates[tid];
     // results may come from two sources: inflight queue and committed stack
-    if (inflightInRange(TOSR)) {
+    if (inflightInRange(state, state.TOSR)) {
         // result come from inflight queue
-        DPRINTF(RAS, "Select from inflight, addr %lx\n", inflightStack[TOSR].data.retAddr);
+        DPRINTF(RAS, "Select from inflight, addr %lx\n",
+                state.inflightStack[state.TOSR].data.retAddr);
         // additional check: if nos is out of bound, check if commit stack top == inflight[nos]
         /*
-        if (!inflightInRange(inflightStack[TOSR].nos)) {
-            auto top = stack[nsp];
-            if (top.data.retAddr != inflightStack[inflightStack[TOSR].nos].data.retAddr || top.data.ctr != inflightStack[inflightStack[TOSR].nos].data.ctr) {
+        if (!inflightInRange(state, state.inflightStack[state.TOSR].nos)) {
+            auto top = state.stack[state.nsp];
+            if (top.data.retAddr !=
+                    state.inflightStack[
+                        state.inflightStack[state.TOSR].nos].data.retAddr ||
+                top.data.ctr !=
+                    state.inflightStack[
+                        state.inflightStack[state.TOSR].nos].data.ctr) {
                 // inflight[nos] is not the same as stack[nsp]
                 DPRINTF(RAS, "Error: inflight[nos] is not the same as stack[nsp]\n");
-                printStack("Error case stack dump");
+                printStack("Error case stack dump", tid);
             }
         }*/
 
-        return inflightStack[TOSR].data;
+        return state.inflightStack[state.TOSR].data;
     } else {
         // result come from commit queue
-        DPRINTF(RAS, "Select from stack, addr %lx\n", stack[ssp].data.retAddr);
-        return stack[ssp].data;
+        DPRINTF(RAS, "Select from stack, addr %lx\n", state.stack[state.ssp].data.retAddr);
+        return state.stack[state.ssp].data;
     }
 }
 
 BTBRAS::RASEssential
-BTBRAS::getTop_meta() {
+BTBRAS::getTop_meta(ThreadID tid) {
+    auto &state = threadStates[tid];
+    assert(state.meta);
     // results may come from two sources: inflight queue and committed stack
-    if (inflightInRange(TOSR)) {
+    if (inflightInRange(state, state.TOSR)) {
         // result come from inflight queue
-        DPRINTF(RAS, "Select from inflight, addr %lx\n", inflightStack[TOSR].data.retAddr);
-        meta->ssp = ssp;
-        meta->sctr = sctr;
-        meta->TOSR = TOSR;
-        meta->TOSW = TOSW;
-        meta->target = inflightStack[TOSR].data.retAddr;
+        DPRINTF(RAS, "Select from inflight, addr %lx\n",
+                state.inflightStack[state.TOSR].data.retAddr);
+        state.meta->ssp = state.ssp;
+        state.meta->sctr = state.sctr;
+        state.meta->TOSR = state.TOSR;
+        state.meta->TOSW = state.TOSW;
+        state.meta->target = state.inflightStack[state.TOSR].data.retAddr;
 
         // additional check: if nos is out of bound, check if commit stack top == inflight[nos]
         /*
-        if (!inflightInRange(inflightStack[TOSR].nos)) {
-            auto top = stack[nsp];
-            if (top.data.retAddr != inflightStack[inflightStack[TOSR].nos].data.retAddr || top.data.ctr != inflightStack[inflightStack[TOSR].nos].data.ctr) {
+        if (!inflightInRange(state, state.inflightStack[state.TOSR].nos)) {
+            auto top = state.stack[state.nsp];
+            if (top.data.retAddr !=
+                    state.inflightStack[
+                        state.inflightStack[state.TOSR].nos].data.retAddr ||
+                top.data.ctr !=
+                    state.inflightStack[
+                        state.inflightStack[state.TOSR].nos].data.ctr) {
                 // inflight[nos] is not the same as stack[nsp]
                 DPRINTF(RAS, "Error: inflight[nos] is not the same as stack[nsp]\n");
-                printStack("Error case stack dump");
+                printStack("Error case stack dump", tid);
             }
         }*/
 
-        return inflightStack[TOSR].data;
+        return state.inflightStack[state.TOSR].data;
     } else {
         // result come from commit queue
-        meta->ssp = ssp;
-        meta->sctr = sctr;
-        meta->TOSR = TOSR;
-        meta->TOSW = TOSW;
-        meta->target = stack[ssp].data.retAddr;
-        DPRINTF(RAS, "Select from stack, addr %lx\n", stack[ssp].data.retAddr);
-        return stack[ssp].data;
+        state.meta->ssp = state.ssp;
+        state.meta->sctr = state.sctr;
+        state.meta->TOSR = state.TOSR;
+        state.meta->TOSW = state.TOSW;
+        state.meta->target = state.stack[state.ssp].data.retAddr;
+        DPRINTF(RAS, "Select from stack, addr %lx\n", state.stack[state.ssp].data.retAddr);
+        return state.stack[state.ssp].data;
     }
 }
 
diff --git a/src/cpu/pred/btb/ras.hh b/src/cpu/pred/btb/ras.hh
index 0055446013..19bb1f0e15 100644
--- a/src/cpu/pred/btb/ras.hh
+++ b/src/cpu/pred/btb/ras.hh
@@ -94,7 +94,7 @@ namespace btb_pred {
         void putPCHistory(Addr startAddr, const boost::dynamic_bitset<> &history,
                           std::vector<FullBTBPrediction> &stagePreds) override;
         
-        std::shared_ptr<void> getPredictionMeta() override;
+        std::shared_ptr<void> getPredictionMeta(ThreadID tid = 0) override;
 
         void specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) override;
 
@@ -112,14 +112,28 @@ namespace btb_pred {
         Addr getTopAddrFromMetas(const FetchTarget &stream);
 
     private:
+        struct ThreadRASState // RAS state kept separately per thread
+        {
+            int TOSW = 0; // inflight pointer to the write top of stack
+            int TOSR = 0; // inflight pointer to the read top of stack
+            int BOS = 0;  // inflight pointer to the bottom of stack
+            int ssp = 0;  // speculative stack pointer
+            int nsp = 0;  // committed stack pointer
+            int sctr = 0; // pop() decrements it or reloads the new top's ctr
+            std::vector<RASEntry> stack; // committed stack, read at ssp
+            std::vector<RASInflightEntry> inflightStack; // inflight queue
+            std::shared_ptr<RASMeta> meta; // written by getTop_meta()
+        };
 
-        void push(Addr retAddr);
+        void initThreadState(ThreadRASState &state);
 
-        void pop();
+        void push(ThreadID tid, Addr retAddr);
 
-        void push_stack(Addr retAddr);
-        
-        void pop_stack();
+        void pop(ThreadID tid);
+
+        void push_stack(ThreadID tid, Addr retAddr);
+
+        void pop_stack(ThreadID tid);
 
         void ptrInc(int &ptr);
 
@@ -129,38 +143,43 @@ namespace btb_pred {
         
         void inflightPtrDec(int &ptr);
 
-        bool inflightInRange(int &ptr);
+        bool inflightInRange(const ThreadRASState &state, int ptr);
 
         int inflightPtrPlus1(int ptr);
 
-        void checkCorrectness();
+        void checkCorrectness(ThreadID tid);
 
-        RASEssential getTop();
+        RASEssential getTop(ThreadID tid);
 
-        RASEssential getTop_meta();
+        RASEssential getTop_meta(ThreadID tid);
 
-        void printStack(const char *when) {
-            DPRINTF(RAS, "printStack when %s: \n", when);
+        void printStack(const char *when, ThreadID tid) {
+            auto &state = threadStates[tid];
+            DPRINTF(RAS, "[tid:%u] printStack when %s: \n", tid, when);
             for (int i = 0; i < numEntries; i++) {
-                DPRINTFR(RAS, "entry [%d], retAddr %#lx, ctr %d", i, stack[i].data.retAddr, stack[i].data.ctr);
-                if (ssp == i) {
+                DPRINTFR(RAS, "entry [%d], retAddr %#lx, ctr %d", i,
+                         state.stack[i].data.retAddr, state.stack[i].data.ctr);
+                if (state.ssp == i) {
                     DPRINTFR(RAS, " <-- SSP");
                 }
-                if (nsp == i) {
+                if (state.nsp == i) {
                     DPRINTFR(RAS, " <-- NSP");
                 }
                 DPRINTFR(RAS, "\n");
             }
             DPRINTFR(RAS, "non-volatile stack:\n");
             for (int i = 0; i < numInflightEntries; i++) {
-                DPRINTFR(RAS, "entry [%d] retAddr %#lx, ctr %u nos %d", i, inflightStack[i].data.retAddr, inflightStack[i].data.ctr, inflightStack[i].nos);
-                if (TOSW == i) {
+                DPRINTFR(RAS, "entry [%d] retAddr %#lx, ctr %u nos %d", i,
+                         state.inflightStack[i].data.retAddr,
+                         state.inflightStack[i].data.ctr,
+                         state.inflightStack[i].nos);
+                if (state.TOSW == i) {
                     DPRINTFR(RAS, " <-- TOSW");
                 }
-                if (TOSR == i) {
+                if (state.TOSR == i) {
                     DPRINTFR(RAS, " <-- TOSR");
                 }
-                if (BOS == i) {
+                if (state.BOS == i) {
                     DPRINTFR(RAS, " <-- BOS");
                 }
                 DPRINTFR(RAS, "\n");
@@ -190,27 +209,11 @@ namespace btb_pred {
 
         unsigned numInflightEntries;
 
-        int TOSW; // inflight pointer to the write top of stack
-
-        int TOSR; // inflight pointer to the read top of stack
-
-        int BOS; // inflight pointer to the bottom of stack
-
         int maxCtr;
 
-        int ssp; // spec sp
-        
-        int nsp; // non-spec sp
-
-        int sctr;
-
-        //int ndepth;
-
-        std::vector<RASEntry> stack;
-        
-        std::vector<RASInflightEntry> inflightStack;
+        unsigned numThreads;
 
-        std::shared_ptr<RASMeta> meta;
+        std::vector<ThreadRASState> threadStates;
 
 #ifdef UNIT_TEST
     typedef uint64_t Scalar;
diff --git a/src/cpu/pred/btb/test/btb_tage.test.cc b/src/cpu/pred/btb/test/btb_tage.test.cc
index 6b2eddc1ef..e0b75613fe 100644
--- a/src/cpu/pred/btb/test/btb_tage.test.cc
+++ b/src/cpu/pred/btb/test/btb_tage.test.cc
@@ -314,7 +314,7 @@ TEST_F(BTBTAGETest, HistoryUpdate) {
 
     // Test case 1: Update with taken branch (PHR shifts in 2 bits from PC hash)
     // Correct order: first update folded histories with pre-update PHR, then mutate PHR
-    tage->doUpdateHist(history, true, pc, target);
+    tage->doUpdateHist(history, true, pc, target, 0);
     applyPathHistoryTaken(history, pc, target);
 
     // Verify folded history matches the ideal fold of the updated PHR
@@ -322,7 +322,7 @@ TEST_F(BTBTAGETest, HistoryUpdate) {
 
     // Test case 2: Update with not-taken branch (PHR unchanged, folded update is no-op)
     boost::dynamic_bitset<> before_not_taken = history;
-    tage->doUpdateHist(history, false, pc, target);
+    tage->doUpdateHist(history, false, pc, target, 0);
 
     // Verify folded history remains consistent
     tage->checkFoldedHist(history, "not-taken update");
@@ -458,9 +458,9 @@ TEST_F(BTBTAGETest, HistoryRecoveryCorrectness) {
     std::vector<PathFoldedHist> originalIndexFoldedHist;
 
     for (int i = 0; i < tage->numPredictors; i++) {
-        originalTagFoldedHist.push_back(tage->tagFoldedHist[i]);
-        originalAltTagFoldedHist.push_back(tage->altTagFoldedHist[i]);
-        originalIndexFoldedHist.push_back(tage->indexFoldedHist[i]);
+        originalTagFoldedHist.push_back(tage->threadHistory[0].tagFoldedHist[i]);
+        originalAltTagFoldedHist.push_back(tage->threadHistory[0].altTagFoldedHist[i]);
+        originalIndexFoldedHist.push_back(tage->threadHistory[0].indexFoldedHist[i]);
     }
 
     // Make a prediction
@@ -491,9 +491,9 @@ TEST_F(BTBTAGETest, HistoryRecoveryCorrectness) {
 
     // Verify recovery produced the expected history
     for (int i = 0; i < tage->numPredictors; i++) {
-        tage->tagFoldedHist[i].check(expectedHistory);
-        tage->altTagFoldedHist[i].check(expectedHistory);
-        tage->indexFoldedHist[i].check(expectedHistory);
+        tage->threadHistory[0].tagFoldedHist[i].check(expectedHistory);
+        tage->threadHistory[0].altTagFoldedHist[i].check(expectedHistory);
+        tage->threadHistory[0].indexFoldedHist[i].check(expectedHistory);
     }
 }
 
diff --git a/src/cpu/pred/btb/timed_base_pred.hh b/src/cpu/pred/btb/timed_base_pred.hh
index fce1a6aef1..db611fef25 100644
--- a/src/cpu/pred/btb/timed_base_pred.hh
+++ b/src/cpu/pred/btb/timed_base_pred.hh
@@ -61,7 +61,10 @@ class TimedBaseBTBPredictor: public SimObject
                               const boost::dynamic_bitset<> &history,
                               std::vector<FullBTBPrediction> &stagePreds) {}
 
-    virtual std::shared_ptr<void> getPredictionMeta() { return nullptr; }
+    virtual std::shared_ptr<void> getPredictionMeta(ThreadID tid = 0)
+    {
+        return nullptr;  // base implementation reports no meta for any tid
+    }
 
     virtual void specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) {}
     virtual void specUpdatePHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) {}
diff --git a/src/cpu/pred/btb/uras.cc b/src/cpu/pred/btb/uras.cc
index c507956d0e..53825d818a 100644
--- a/src/cpu/pred/btb/uras.cc
+++ b/src/cpu/pred/btb/uras.cc
@@ -85,8 +85,9 @@ BTBuRAS::putPCHistory(Addr startAddr, const boost::dynamic_bitset<> &history,
 }
 
 std::shared_ptr<void>
-BTBuRAS::getPredictionMeta()
+BTBuRAS::getPredictionMeta(ThreadID tid)
 {
+    (void)tid;
     std::shared_ptr<void> meta_void_ptr = std::make_shared<uRASMeta>(meta);
     return meta_void_ptr;
 }
diff --git a/src/cpu/pred/btb/uras.hh b/src/cpu/pred/btb/uras.hh
index cdcde96b54..4ba12b3099 100644
--- a/src/cpu/pred/btb/uras.hh
+++ b/src/cpu/pred/btb/uras.hh
@@ -43,7 +43,7 @@ class BTBuRAS : public TimedBaseBTBPredictor
         void putPCHistory(Addr startAddr, const boost::dynamic_bitset<> &history,
                           std::vector<FullBTBPrediction> &stagePreds) override;
         
-        std::shared_ptr<void> getPredictionMeta() override;
+        std::shared_ptr<void> getPredictionMeta(ThreadID tid = 0) override;
 
         void specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) override;
 
@@ -161,4 +161,4 @@ struct NonSpecRASTrace : public Record {
 }  // namespace branch_prediction
 
 }  // namespace gem5
-#endif  // __CPU_PRED_BTB_URAS_HH__
\ No newline at end of file
+#endif  // __CPU_PRED_BTB_URAS_HH__
diff --git a/src/cpu/simple/base.cc b/src/cpu/simple/base.cc
index fc91c8d2f3..27adf7f598 100644
--- a/src/cpu/simple/base.cc
+++ b/src/cpu/simple/base.cc
@@ -519,13 +519,14 @@ BaseSimpleCPU::readMiscReg(int misc_reg, ThreadID tid)
 }
 
 void
-BaseSimpleCPU::readGem5Regs()
+BaseSimpleCPU::readGem5Regs(ThreadID tid)
 {
+    auto diffAllStates = this->diffAllStates[tid];
     for (int i = 0; i < 32; i++) {
         diffAllStates->gem5RegFile[i] =
-            threadContexts[curThread]->getReg(RegId(IntRegClass, i));
+            threadContexts[tid]->getReg(RegId(IntRegClass, i));
         diffAllStates->gem5RegFile[i + 32] =
-            threadContexts[curThread]->getReg(RegId(FloatRegClass, i));
+            threadContexts[tid]->getReg(RegId(FloatRegClass, i));
     }
 }
 
diff --git a/src/cpu/simple/base.hh b/src/cpu/simple/base.hh
index b289ac778f..bcdd7c9066 100644
--- a/src/cpu/simple/base.hh
+++ b/src/cpu/simple/base.hh
@@ -207,7 +207,7 @@ class BaseSimpleCPU : public BaseCPU
 
     RegVal readMiscReg(int misc_reg, ThreadID tid) override;
 
-    void readGem5Regs() override;
+    void readGem5Regs(ThreadID tid) override;
 };
 
 } // namespace gem5
diff --git a/src/dev/riscv/HartCtrl.py b/src/dev/riscv/HartCtrl.py
new file mode 100644
index 0000000000..242c10cccd
--- /dev/null
+++ b/src/dev/riscv/HartCtrl.py
@@ -0,0 +1,13 @@
+from m5.params import *
+from m5.proxy import *
+
+from m5.objects.Device import BasicPioDevice
+
+
+class HartCtrl(BasicPioDevice):  # per-hart reset/release MMIO registers
+    type = 'HartCtrl'
+    cxx_header = "dev/riscv/hart_ctrl.hh"
+    cxx_class = 'gem5::HartCtrl'
+    pio_addr = 0x39001000  # default base address; one 64-bit word per hart
+    pio_size = Param.Addr(0x1000, "Hart control register space size")
+    num_threads = Param.Int("Number of threads in the system.")  # required
diff --git a/src/dev/riscv/SConscript b/src/dev/riscv/SConscript
index 15bf707400..267399e9c0 100755
--- a/src/dev/riscv/SConscript
+++ b/src/dev/riscv/SConscript
@@ -34,6 +34,7 @@ SimObject('HiFive.py', sim_objects=['HiFive', 'GenericRiscvPciHost'],
 SimObject('LupV.py', sim_objects=['LupV'], tags='riscv isa')
 SimObject('Clint.py', sim_objects=['Clint'], tags='riscv isa')
 SimObject('Lint.py', sim_objects=['Lint'], tags='riscv isa')
+SimObject('HartCtrl.py', sim_objects=['HartCtrl'], tags='riscv isa')
 SimObject('PlicDevice.py', sim_objects=['PlicIntDevice'], tags='riscv isa')
 SimObject('Plic.py', sim_objects=['Plic'], tags='riscv isa')
 SimObject('RTC.py', sim_objects=['RiscvRTC'], tags='riscv isa')
@@ -55,6 +56,7 @@ Source('hifive.cc', tags='riscv isa')
 Source('lupv.cc', tags='riscv isa')
 Source('clint.cc', tags='riscv isa')
 Source('lint.cc', tags='riscv isa')
+Source('hart_ctrl.cc', tags='riscv isa')
 Source('plic_device.cc', tags='riscv isa')
 Source('plic.cc', tags='riscv isa')
 Source('rtc.cc', tags='riscv isa')
diff --git a/src/dev/riscv/hart_ctrl.cc b/src/dev/riscv/hart_ctrl.cc
new file mode 100644
index 0000000000..b0afe6c8a9
--- /dev/null
+++ b/src/dev/riscv/hart_ctrl.cc
@@ -0,0 +1,98 @@
+#include "dev/riscv/hart_ctrl.hh"
+
+#include "cpu/thread_context.hh"
+#include "mem/packet_access.hh"
+#include "sim/system.hh"
+
+namespace gem5
+{
+
+HartCtrl::HartCtrl(const Params &p)
+    : BasicPioDevice(p, p.pio_size),
+      hartResetState(p.num_threads, 1)
+{
+    // Every hart starts with reset word 1; boot hart 0 starts released (0).
+    if (!hartResetState.empty()) {
+        hartResetState[0] = 0;
+    }
+}
+
+// Return the reset word of the hart whose 64-bit slot is being read.
+Tick
+HartCtrl::read(PacketPtr pkt)
+{
+    assert(pkt->getAddr() >= pioAddr && pkt->getAddr() < pioAddr + pioSize);
+    assert(pkt->getSize() > 0 && pkt->getSize() <= sizeof(uint64_t));
+
+    const Addr offset = pkt->getAddr() - pioAddr;
+    panic_if(offset % sizeof(uint64_t) != 0,
+             "HartCtrl only supports 64-bit aligned accesses: addr=%#lx",
+             pkt->getAddr());
+    const ThreadID tid = offset / sizeof(uint64_t);
+    panic_if(tid >= hartResetState.size(),
+             "HartCtrl access out of range: tid=%u addr=%#lx",
+             tid, pkt->getAddr());
+    // Truncate to the access size: a narrow load has fewer than 8 bytes.
+    pkt->setUintX(hartResetState[tid], ByteOrder::little);
+    pkt->makeAtomicResponse();
+    return pioDelay;
+}
+
+/**
+ * Handle a store to a hart's reset word.
+ *
+ * The register file holds one 64-bit word per hart; hart i owns the slot
+ * at byte offset 8 * i from pioAddr. The written value is recorded, and
+ * writing zero additionally activates that hart's thread context.
+ * Misaligned offsets, slots beyond the configured hart count and access
+ * sizes other than 1, 2, 4 or 8 bytes all panic.
+ *
+ * @param pkt write request; its payload becomes the new reset word
+ * @return pioDelay, the latency charged for the access
+ */
+Tick
+HartCtrl::write(PacketPtr pkt)
+{
+    assert(pkt->getAddr() >= pioAddr && pkt->getAddr() < pioAddr + pioSize);
+    assert(pkt->getSize() > 0 && pkt->getSize() <= sizeof(uint64_t));
+
+    // The slot must be addressed at its 8-byte-aligned base.
+    const Addr offset = pkt->getAddr() - pioAddr;
+    panic_if(offset % sizeof(uint64_t) != 0,
+             "HartCtrl only supports 64-bit aligned accesses: addr=%#lx",
+             pkt->getAddr());
+
+    // The slot index is the hart (thread) id.
+    const ThreadID tid = offset / sizeof(uint64_t);
+    panic_if(tid >= hartResetState.size(),
+             "HartCtrl access out of range: tid=%u addr=%#lx",
+             tid, pkt->getAddr());
+
+    // getUintX() zero-extends a 1/2/4/8-byte little-endian payload to 64
+    // bits and panics on any other size, so no hand-written switch over
+    // the access width is needed.
+    const uint64_t value = pkt->getUintX(ByteOrder::little);
+    hartResetState[tid] = value;
+
+    // A zero word means "released": wake the target hart.
+    if (value == 0) {
+        tryWakeHart(tid);
+    }
+
+    pkt->makeAtomicResponse();
+    return pioDelay;
+}
+
+void
+HartCtrl::tryWakeHart(ThreadID tid)
+{
+    // The hart must be backed by a thread context registered in the system.
+    panic_if(tid >= sys->threads.size(),
+             "HartCtrl wake target %u out of system thread range %zu",
+             tid, sys->threads.size());
+    auto *tc = sys->threads[tid];
+    panic_if(!tc, "HartCtrl target %u has no thread context", tid);
+    // Activate the context of the hart that was just released.
+    tc->activate();
+}
+
+} // namespace gem5
diff --git a/src/dev/riscv/hart_ctrl.hh b/src/dev/riscv/hart_ctrl.hh
new file mode 100644
index 0000000000..5fe47306f6
--- /dev/null
+++ b/src/dev/riscv/hart_ctrl.hh
@@ -0,0 +1,33 @@
+//
+// Created for Xiangshan bare-metal hart control MMIO.
+//
+
+#ifndef GEM5_HART_CTRL_HH
+#define GEM5_HART_CTRL_HH
+
+#include <vector>
+
+#include "dev/io_device.hh"
+#include "params/HartCtrl.hh"
+
+namespace gem5
+{
+
+class HartCtrl : public BasicPioDevice  // per-hart reset/release MMIO
+{
+  public:
+    typedef HartCtrlParams Params;
+    explicit HartCtrl(const Params &p);
+
+    Tick read(PacketPtr pkt) override;   // load a hart's reset word
+    Tick write(PacketPtr pkt) override;  // store it; writing 0 wakes the hart
+
+  private:
+    void tryWakeHart(ThreadID tid);  // activate hart tid's thread context
+
+    std::vector<uint64_t> hartResetState;  // one word per hart; 0 = released
+};
+
+} // namespace gem5
+
+#endif // GEM5_HART_CTRL_HH
diff --git a/src/sim/system.cc b/src/sim/system.cc
index 7bc4ec37ce..c640334f4d 100644
--- a/src/sim/system.cc
+++ b/src/sim/system.cc
@@ -562,8 +562,8 @@ void System::initState()
     }
 
     // have to initiate golden memory after checkpoint restored
-    if (numCPUs > 1 && enableDifftest) {
-        warn("Creating golden memory for multi-core difftest\n");
+    if (multiContextDifftest()) {
+        warn("Creating golden memory for multi-context difftest\n");
         assert(enableMemDedup);
         goldenMem = dedupMemManager.createCopyOnWriteBranch();
         goldenMemManager.initGoldenMem(physmem.getStartaddr(), memSize(), goldenMem);
diff --git a/src/sim/system.hh b/src/sim/system.hh
index db49b66926..1dca935d6e 100644
--- a/src/sim/system.hh
+++ b/src/sim/system.hh
@@ -416,6 +416,11 @@ class System : public SimObject, public PCEventScope
 
     bool multiCore() const { return numCPUs > 1; }
 
+    bool multiContextDifftest() const  // difftest with >1 CPU or multiThread
+    {
+        return enableDifftest && (multiCore() || multiThread);
+    }
+
     uint8_t *getGoldenMemPtr() const { return goldenMem; }
 
     GoldenGloablMem *getGoldenMemManager() { return &goldenMemManager; }