diff --git a/src/main/scala/coupledL2/Directory.scala b/src/main/scala/coupledL2/Directory.scala
index cbe3e2efa..968e16f90 100644
--- a/src/main/scala/coupledL2/Directory.scala
+++ b/src/main/scala/coupledL2/Directory.scala
@@ -284,7 +284,7 @@ class Directory(implicit p: Parameters) extends L2Module {
     chosenWay,
     PriorityEncoder(freeWayMask_s3)
   )
-  val hit_s3 = Cat(hitVec).orR || req_s3.cmoAll
+  val hit_s3 = Cat(hitVec).orR || (req_s3.cmoAll && VecInit(metaAll_s3.map(_.state =/= MetaData.INVALID))(req_s3.cmoWay))
   val way_s3 = Mux(req_s3.cmoAll, req_s3.cmoWay, Mux(hit_s3, hitWay, finalWay))
   val meta_s3 = metaAll_s3(way_s3)
   val tag_s3 = tagAll_s3(way_s3)
diff --git a/src/main/scala/coupledL2/L2Param.scala b/src/main/scala/coupledL2/L2Param.scala
index 764c5dd29..afcaf8d2c 100644
--- a/src/main/scala/coupledL2/L2Param.scala
+++ b/src/main/scala/coupledL2/L2Param.scala
@@ -103,7 +103,7 @@ case class L2Param(
   // L2 Flush
   enableL2Flush: Boolean = false,
   // AsyncBridge
-  enableCHIAsyncBridge: Option[Boolean] = None,
+  enableCHIAsyncBridge: Boolean = true,
   // Performance analysis
   enablePerf: Boolean = true,
   // RollingDB
diff --git a/src/main/scala/coupledL2/SinkA.scala b/src/main/scala/coupledL2/SinkA.scala
index 794d449d7..7a7839544 100644
--- a/src/main/scala/coupledL2/SinkA.scala
+++ b/src/main/scala/coupledL2/SinkA.scala
@@ -178,7 +178,7 @@ class SinkA(implicit p: Parameters) extends L2Module {
       }.otherwise {
         way.foreach { _ := wayVal + 1.U }
       }
-      when (mshrValid) {
+      when (!mshrValid) {
         state.foreach { _ := sCMOREQ }
       }.otherwise {
         state.foreach { _ := sWAITMSHR }
diff --git a/src/main/scala/coupledL2/tl2chi/MainPipe.scala b/src/main/scala/coupledL2/tl2chi/MainPipe.scala
index 0a6811aa8..e6da39ef7 100644
--- a/src/main/scala/coupledL2/tl2chi/MainPipe.scala
+++ b/src/main/scala/coupledL2/tl2chi/MainPipe.scala
@@ -296,6 +296,7 @@ class MainPipe(implicit p: Parameters) extends TL2CHIL2Module with HasCHIOpcodes
   io.toMSHRCtl.mshr_alloc_s3.bits.state := alloc_state
   io.toMSHRCtl.mshr_alloc_s3.bits.task match { case task =>
     task := req_s3
+    task.tag := Mux(io.cmoAllBlock.getOrElse(false.B), dirResult_s3.tag, req_s3.tag)
     task.bufIdx := 0.U(bufIdxBits.W)
     task.mshrTask := false.B
     task.aliasTask.foreach(_ := cache_alias)
diff --git a/src/main/scala/coupledL2/tl2chi/TL2CHICoupledL2.scala b/src/main/scala/coupledL2/tl2chi/TL2CHICoupledL2.scala
index 38b51a56a..dc9e1db9b 100644
--- a/src/main/scala/coupledL2/tl2chi/TL2CHICoupledL2.scala
+++ b/src/main/scala/coupledL2/tl2chi/TL2CHICoupledL2.scala
@@ -70,7 +70,7 @@ class TL2CHICoupledL2(implicit p: Parameters) extends CoupledL2Base {
 
     val io_chi = IO(new PortIO)
     val io_nodeID = IO(Input(UInt()))
-    val io_cpu_wfi = Option.when(cacheParams.enableL2Flush) (IO(Input(Bool())))
+    val io_cpu_halt = Option.when(cacheParams.enableL2Flush) (IO(Input(Bool())))
 
     // Check port width
     require(io_chi.tx.rsp.getWidth == io_chi.rx.rsp.getWidth);
@@ -125,6 +125,9 @@ class TL2CHICoupledL2(implicit p: Parameters) extends CoupledL2Base {
       )
     }
 
+    // Coherency enable from Link Layer: when 1, cacheable requests can be sent; otherwise they are gated
+    val coEnable = WireInit(false.B)
+
     slices match {
       case slices: Seq[Slice] =>
         // TXREQ
@@ -132,7 +135,12 @@ class TL2CHICoupledL2(implicit p: Parameters) extends CoupledL2Base {
         val txreq = Wire(DecoupledIO(new CHIREQ))
         slices.zip(txreq_arb.io.in.init).foreach { case (s, in) => in <> s.io.out.tx.req }
         txreq_arb.io.in.last <> mmio.io.tx.req
-        txreq <> txreq_arb.io.out
+        // Coherency gating: a cacheable (slice) request is held while coEnable is low; an MMIO request passes whenever it is the chosen input
+        val is_mmio = txreq_arb.io.chosen === slices.size.U // MMIO is wired to the arbiter's last input
+        val req_pass = coEnable || is_mmio // NOTE(review): gating happens after arbitration — confirm a held slice request cannot keep MMIO from being chosen
+        txreq.valid := txreq_arb.io.out.valid && req_pass
+        txreq.bits := txreq_arb.io.out.bits
+        txreq_arb.io.out.ready := txreq.ready && req_pass // a gated request is not dequeued from the arbiter
         txreq.bits.txnID := setSliceID(txreq_arb.io.out.bits.txnID, txreq_arb.io.chosen, mmio.io.tx.req.fire)
 
         // TXRSP
@@ -265,9 +273,9 @@ class TL2CHICoupledL2(implicit p: Parameters) extends CoupledL2Base {
         linkMonitor.io.nodeID := io_nodeID
         /* exit coherency when: l2 flush of all slices is done and core is in WFI state */
         linkMonitor.io.exitco.foreach { _ :=
-          Cat(slices.zipWithIndex.map { case (s, i) => s.io.l2FlushDone.getOrElse(false.B)}).andR && io_cpu_wfi.getOrElse(false.B)
+          Cat(slices.zipWithIndex.map { case (s, i) => s.io.l2FlushDone.getOrElse(false.B)}).andR && io_cpu_halt.getOrElse(false.B)
         }
-
+        coEnable := linkMonitor.io.coEnable
         /**
           * performance counters
           */
diff --git a/src/main/scala/coupledL2/tl2chi/chi/AsyncBridge.scala b/src/main/scala/coupledL2/tl2chi/chi/AsyncBridge.scala
index 675a68b38..73c34e2a3 100644
--- a/src/main/scala/coupledL2/tl2chi/chi/AsyncBridge.scala
+++ b/src/main/scala/coupledL2/tl2chi/chi/AsyncBridge.scala
@@ -50,6 +50,12 @@ class AsyncPortIO(
   val tx = new AsyncDownwardsLinkIO(params)
   val rx = Flipped(new AsyncUpwardsLinkIO(params))
 }
+
+class ChannelWithActive[T <: Data](gen: T) extends Bundle { // a ChannelIO bundled with an activity flag
+  val channel = new ChannelIO(gen) // the wrapped link channel, built from `gen`
+  val active = Bool() // FromAsyncBundle.channel drives this from the async sink's deq.valid (sink queue not empty)
+}
+
 /*
  * This module enhances the standard async bridge by adding a front-end shadow buffer
  * to decouple local processing from asynchronous latency and provide instant credit
@@ -73,15 +79,15 @@ object ToAsyncBundleWithBuf {
     /*
      1. Shadow Buffer (depth=16, flow mode for low latency)
      */
-    val shadow_buffer = Module(new Queue(chiselTypeOf(chn.flit), 16, flow = true, pipe = false))
+    val shadow_buffer = Module(new Queue(chiselTypeOf(chn.flit), 32, flow = true, pipe = false))
     if (name.isDefined) { shadow_buffer.suggestName("shadowBuffer_" + name.get) }
     shadow_buffer.io.enq.valid := chn.flitv
     shadow_buffer.io.enq.bits  := chn.flit
     /*
      2. For rx channel (CMN->L2), send out lcrdv right after a flit entering Shadow buffer if has space
      */
-    val deqReady = shadow_buffer.io.deq.ready
-    dontTouch(deqReady)
+    val hasSpace = shadow_buffer.io.count <= 16.U // high while at most 16 of the 32 shadow-buffer entries are in use
+    dontTouch(hasSpace) // keep the signal in the emitted design; it is also returned to the caller
     assert(!chn.flitv || shadow_buffer.io.enq.ready, s"${name.getOrElse("ToAsyncBundle")}: Shadow buffer overflow!")
     /*
      3. AsyncQueueSource (depth=4)
@@ -90,8 +96,35 @@ object ToAsyncBundleWithBuf {
     if (name.isDefined) { source.suggestName("asyncQSource_" + name.get) }
     source.io.enq <> shadow_buffer.io.deq
 
-    (source.io.async, deqReady)
+    (source.io.async, hasSpace)
+  }
+
+  def bitPulse(
+    bit: Bool, // pulse to carry across the clock crossing (callers pass lcrdv); each high cycle is one pulse
+    params: AsyncQueueParams = AsyncQueueParams(), // forwarded to the AsyncQueueSource
+    name: Option[String] = None // optional suffix for the generated instance names
+  ) = {
+    /*
+     1. Shadow Buffer (depth=16, flow mode for low latency): every cycle `bit` is high enqueues one entry
+     */
+    val shadow_buffer = Module(new Queue(Bool(), 16, flow = true, pipe = false))
+    if (name.isDefined) { shadow_buffer.suggestName("lcrdvShadowBuffer_" + name.get) }
+    shadow_buffer.io.enq.valid := bit
+    shadow_buffer.io.enq.bits  := DontCare
+    /*
+     2. AsyncQueueSource with a zero-width payload (depth comes from `params`); only valid/ready carry information
+     */
+    val source = Module(new AsyncQueueSource(UInt(0.W), params))
+    if (name.isDefined) { source.suggestName("asyncQBitSource_" + name.get) }
+    source.io.enq.valid := shadow_buffer.io.deq.valid
+    source.io.enq.bits := DontCare
+    // A pulse arriving while the shadow buffer is full would be silently lost; flag it like channel() does
+    assert(!bit || shadow_buffer.io.enq.ready, s"${name.getOrElse("ToAsyncBundleWithBuf.bitPulse")}: Shadow buffer overflow!")
+    shadow_buffer.io.deq.ready := source.io.enq.ready
+    // Hand the async side of the source back to the caller for connection
+    source.io.async
+  }
+
 }
 object ToAsyncBundle {
   def channel[T <: Data](
@@ -124,8 +157,9 @@ object FromAsyncBundle {
     async: AsyncBundle[UInt],
     params: AsyncQueueParams = AsyncQueueParams(),
     name: Option[String] = None,
-    lcrdvReady: Option[Bool]= None
-  ) = {
+    lcrdvReady: Option[Bool]= None,
+    withPowerAck: Boolean = false
+  ): Data = {
     val gen = chiselTypeOf(async.mem.head)
     val out = Wire(new ChannelIO(gen))
     val sink = Module(new AsyncQueueSink(gen, params))
@@ -137,7 +171,15 @@ object FromAsyncBundle {
     // flitpend and lcrdv are assigned independently
     out.flitpend := DontCare
     out.lcrdv := DontCare
-    out
+    // When withPowerAck is set, extend `out` with an 'active' flag indicating the sink queue is NOT empty
+    if (withPowerAck) {
+      val result = Wire(new ChannelWithActive(gen))
+      result.channel <> out
+      result.active := sink.io.deq.valid
+      result
+    } else {
+      out
+    }
   }
 
   def bitPulse[T <: Data](
@@ -232,18 +274,38 @@ class CHIAsyncBridgeSink(params: AsyncQueueParams = AsyncQueueParams())(implicit
     val async = Flipped(new AsyncPortIO(params))
     val deq = new PortIO
     val resetFinish = Output(Bool())
+    val powerAck = new Bundle {
+      val QACTIVE = Output(Bool())
+      val QACCEPTn = Output(Bool())
+      val QREQ = Input(Bool())
+    }
   })
 
+  val txState = RegInit(LinkStates.STOP)
+  val rxState = RegInit(LinkStates.STOP)
+
   val txreq_lcrdvReady = Wire(Bool())
   val txrsp_lcrdvReady = Wire(Bool())
   val txdat_lcrdvReady = Wire(Bool())
-  io.deq.tx.req <> FromAsyncBundle.channel(io.async.tx.req.flit, params, Some("txreq_flit"), Some(txreq_lcrdvReady))
-  io.deq.tx.rsp <> FromAsyncBundle.channel(io.async.tx.rsp.flit, params, Some("txrsp_flit"), Some(txrsp_lcrdvReady))
-  io.deq.tx.dat <> FromAsyncBundle.channel(io.async.tx.dat.flit, params, Some("txdat_flit"), Some(txdat_lcrdvReady))
 
-  io.async.tx.req.lcrdv <> ToAsyncBundle.bitPulse(io.deq.tx.req.lcrdv, params, Some("txreq_lcrdv"))
-  io.async.tx.rsp.lcrdv <> ToAsyncBundle.bitPulse(io.deq.tx.rsp.lcrdv, params, Some("txrsp_lcrdv"))
-  io.async.tx.dat.lcrdv <> ToAsyncBundle.bitPulse(io.deq.tx.dat.lcrdv, params, Some("txdat_lcrdv"))
+//  io.deq.tx.req <> FromAsyncBundle.channel(io.async.tx.req.flit, params, Some("txreq_flit"), Some(txreq_lcrdvReady))
+//  io.deq.tx.rsp <> FromAsyncBundle.channel(io.async.tx.rsp.flit, params, Some("txrsp_flit"), Some(txrsp_lcrdvReady))
+//  io.deq.tx.dat <> FromAsyncBundle.channel(io.async.tx.dat.flit, params, Some("txdat_flit"), Some(txdat_lcrdvReady))
+
+  val txreq = FromAsyncBundle.channel(io.async.tx.req.flit, params, Some("txreq_flit"), Some(txreq_lcrdvReady), true).asInstanceOf[ChannelWithActive[UInt]]
+  val txrsp = FromAsyncBundle.channel(io.async.tx.rsp.flit, params, Some("txrsp_flit"), Some(txrsp_lcrdvReady), true).asInstanceOf[ChannelWithActive[UInt]]
+  val txdat = FromAsyncBundle.channel(io.async.tx.dat.flit, params, Some("txdat_flit"), Some(txdat_lcrdvReady), true).asInstanceOf[ChannelWithActive[UInt]]
+  io.deq.tx.req <> txreq.channel
+  io.deq.tx.rsp <> txrsp.channel
+  io.deq.tx.dat <> txdat.channel
+  // Add handshake to confirm Sink Tx Queue is completely drained
+  val txActive = txreq.active || txrsp.active || txdat.active
+  io.powerAck.QACTIVE := txActive
+  io.powerAck.QACCEPTn := !(io.powerAck.QREQ && !txActive && txState === LinkStates.STOP)
+
+  io.async.tx.req.lcrdv <> ToAsyncBundleWithBuf.bitPulse(io.deq.tx.req.lcrdv, params, Some("txreq_lcrdv"))
+  io.async.tx.rsp.lcrdv <> ToAsyncBundleWithBuf.bitPulse(io.deq.tx.rsp.lcrdv, params, Some("txrsp_lcrdv"))
+  io.async.tx.dat.lcrdv <> ToAsyncBundleWithBuf.bitPulse(io.deq.tx.dat.lcrdv, params, Some("txdat_lcrdv"))
 
   val async_rx_rsp = ToAsyncBundleWithBuf.channel(io.deq.rx.rsp, params, Some("rxrsp_flit"))
   val async_rx_dat = ToAsyncBundleWithBuf.channel(io.deq.rx.dat, params, Some("rxdat_flit"))
@@ -294,9 +356,6 @@ class CHIAsyncBridgeSink(params: AsyncQueueParams = AsyncQueueParams())(implicit
   /*
    Duplicate Link Monitor tx/rx state FSM by using deq.rx deq.tx active signals which outuput to DownStream CHI
    */
-  val txState = RegInit(LinkStates.STOP)
-  val rxState = RegInit(LinkStates.STOP)
-
   Seq(txState, rxState).zip(MixedVecInit(Seq(io.deq.tx, io.deq.rx))).foreach { case (state, link) =>
     state := MuxLookup(Cat(link.linkactivereq, link.linkactiveack), LinkStates.STOP)(Seq(
       Cat(true.B, false.B) -> LinkStates.ACTIVATE,
@@ -322,6 +381,7 @@ class CHIAsyncBridgeSink(params: AsyncQueueParams = AsyncQueueParams())(implicit
    For tx channel, add l-credit manager module to generate 'ready' to block tx flit to DownStream CHI
    a. The maximum number of L-Credits in tx channel is 4 inside bridge
    b. Use L-Credits number more than 4 in CoupledL2 to cover lcrdv sync delay from DownStream CHI to CoupledL2
+   c. Normal flits are controlled by credits (from AsyncBridge Sink), Return flits in DEACTIVATE are controlled by credits (from L2)
    */
   val txin = WireInit(0.U asTypeOf(Flipped(new DecoupledPortIO()))) //fake Decoupled IO to provide flitv
   val txout = WireInit(0.U asTypeOf(new PortIO))//fake LCredit IO to provide lcrdv
@@ -334,9 +394,9 @@ class CHIAsyncBridgeSink(params: AsyncQueueParams = AsyncQueueParams())(implicit
   Decoupled2LCredit(txin.tx.req, txout.tx.req, LinkState(txState), Some("txreq"))
   Decoupled2LCredit(txin.tx.rsp, txout.tx.rsp, LinkState(txState), Some("txrsp"))
   Decoupled2LCredit(txin.tx.dat, txout.tx.dat, LinkState(txState), Some("txdat"))
-  txreq_lcrdvReady := txin.tx.req.ready
-  txrsp_lcrdvReady := txin.tx.rsp.ready
-  txdat_lcrdvReady := txin.tx.dat.ready
+  txreq_lcrdvReady := txin.tx.req.ready || txState === LinkStates.DEACTIVATE
+  txrsp_lcrdvReady := txin.tx.rsp.ready || txState === LinkStates.DEACTIVATE
+  txdat_lcrdvReady := txin.tx.dat.ready || txState === LinkStates.DEACTIVATE
 
   dontTouch(io)
 }
diff --git a/src/main/scala/coupledL2/tl2chi/chi/LinkLayer.scala b/src/main/scala/coupledL2/tl2chi/chi/LinkLayer.scala
index b939cfd4b..f4b9aac81 100644
--- a/src/main/scala/coupledL2/tl2chi/chi/LinkLayer.scala
+++ b/src/main/scala/coupledL2/tl2chi/chi/LinkLayer.scala
@@ -274,7 +274,7 @@ class Decoupled2LCredit[T <: Bundle](
 
   // The maximum number of L-Credits that a receiver can provide is 15.
   val lcreditsMax = 15
-  val enableCHIAsync = cacheParams.enableCHIAsyncBridge.getOrElse(false)
+  val enableCHIAsync = cacheParams.enableCHIAsyncBridge
   val overlcreditVal = if(enableCHIAsync) overlcreditNum.getOrElse(0) else 0 
   val lcreditsMaxAll = lcreditsMax + overlcreditVal
   val lcreditPool = RegInit(overlcreditVal.U(log2Up(lcreditsMaxAll+1).W))
@@ -328,6 +328,7 @@ class LinkMonitor(implicit p: Parameters) extends L2Module with HasCHIOpcodes {
     val in = Flipped(new DecoupledPortIO())
     val out = new PortIO
     val nodeID = Input(UInt(NODEID_WIDTH.W))
+    val coEnable = Output(Bool())
     val exitco = Option.when(cacheParams.enableL2Flush) (Input(Bool()))
   })
   // val s_stop :: s_activate :: s_run :: s_deactivate :: Nil = Enum(4)
@@ -357,6 +358,7 @@ class LinkMonitor(implicit p: Parameters) extends L2Module with HasCHIOpcodes {
   //exit coherecy + deactive tx/rx when l2 flush done
   val exitco = io.exitco.getOrElse(false.B)
   val exitcoDone = !io.out.syscoreq && !io.out.syscoack && RegNext(true.B, init = false.B)
+  io.coEnable := io.out.syscoreq && io.out.syscoack
 
   io.out.tx.linkactivereq := RegNext(!exitcoDone, init = false.B)
   io.out.rx.linkactiveack := RegNext(