diff --git a/src/main/scala/coupledL2/Directory.scala b/src/main/scala/coupledL2/Directory.scala index cbe3e2efa..968e16f90 100644 --- a/src/main/scala/coupledL2/Directory.scala +++ b/src/main/scala/coupledL2/Directory.scala @@ -284,7 +284,7 @@ class Directory(implicit p: Parameters) extends L2Module { chosenWay, PriorityEncoder(freeWayMask_s3) ) - val hit_s3 = Cat(hitVec).orR || req_s3.cmoAll + val hit_s3 = Cat(hitVec).orR || (req_s3.cmoAll && VecInit(metaAll_s3.map(_.state =/= MetaData.INVALID))(req_s3.cmoWay)) val way_s3 = Mux(req_s3.cmoAll, req_s3.cmoWay, Mux(hit_s3, hitWay, finalWay)) val meta_s3 = metaAll_s3(way_s3) val tag_s3 = tagAll_s3(way_s3) diff --git a/src/main/scala/coupledL2/L2Param.scala b/src/main/scala/coupledL2/L2Param.scala index 764c5dd29..afcaf8d2c 100644 --- a/src/main/scala/coupledL2/L2Param.scala +++ b/src/main/scala/coupledL2/L2Param.scala @@ -103,7 +103,7 @@ case class L2Param( // L2 Flush enableL2Flush: Boolean = false, // AsyncBridge - enableCHIAsyncBridge: Option[Boolean] = None, + enableCHIAsyncBridge: Boolean = true, // Performance analysis enablePerf: Boolean = true, // RollingDB diff --git a/src/main/scala/coupledL2/SinkA.scala b/src/main/scala/coupledL2/SinkA.scala index 794d449d7..7a7839544 100644 --- a/src/main/scala/coupledL2/SinkA.scala +++ b/src/main/scala/coupledL2/SinkA.scala @@ -178,7 +178,7 @@ class SinkA(implicit p: Parameters) extends L2Module { }.otherwise { way.foreach { _ := wayVal + 1.U } } - when (mshrValid) { + when (!mshrValid) { state.foreach { _ := sCMOREQ } }.otherwise { state.foreach { _ := sWAITMSHR } diff --git a/src/main/scala/coupledL2/tl2chi/MainPipe.scala b/src/main/scala/coupledL2/tl2chi/MainPipe.scala index 0a6811aa8..e6da39ef7 100644 --- a/src/main/scala/coupledL2/tl2chi/MainPipe.scala +++ b/src/main/scala/coupledL2/tl2chi/MainPipe.scala @@ -296,6 +296,7 @@ class MainPipe(implicit p: Parameters) extends TL2CHIL2Module with HasCHIOpcodes io.toMSHRCtl.mshr_alloc_s3.bits.state := alloc_state io.toMSHRCtl.mshr_alloc_s3.bits.task match { case task => task := req_s3 + task.tag := Mux(io.cmoAllBlock.getOrElse(false.B), dirResult_s3.tag, req_s3.tag) task.bufIdx := 0.U(bufIdxBits.W) task.mshrTask := false.B task.aliasTask.foreach(_ := cache_alias) diff --git a/src/main/scala/coupledL2/tl2chi/TL2CHICoupledL2.scala b/src/main/scala/coupledL2/tl2chi/TL2CHICoupledL2.scala index 38b51a56a..dc9e1db9b 100644 --- a/src/main/scala/coupledL2/tl2chi/TL2CHICoupledL2.scala +++ b/src/main/scala/coupledL2/tl2chi/TL2CHICoupledL2.scala @@ -70,7 +70,7 @@ class TL2CHICoupledL2(implicit p: Parameters) extends CoupledL2Base { val io_chi = IO(new PortIO) val io_nodeID = IO(Input(UInt())) - val io_cpu_wfi = Option.when(cacheParams.enableL2Flush) (IO(Input(Bool()))) + val io_cpu_halt = Option.when(cacheParams.enableL2Flush) (IO(Input(Bool()))) // Check port width require(io_chi.tx.rsp.getWidth == io_chi.rx.rsp.getWidth); @@ -125,6 +125,9 @@ class TL2CHICoupledL2(implicit p: Parameters) extends CoupledL2Base { ) } + //Coherency enable from Link Layer: when 1 cacheable request can be sent; otherwise they are gated + val coEnable = WireInit(false.B) + slices match { case slices: Seq[Slice] => // TXREQ @@ -132,7 +135,12 @@ class TL2CHICoupledL2(implicit p: Parameters) extends CoupledL2Base { val txreq = Wire(DecoupledIO(new CHIREQ)) slices.zip(txreq_arb.io.in.init).foreach { case (s, in) => in <> s.io.out.tx.req } txreq_arb.io.in.last <> mmio.io.tx.req - txreq <> txreq_arb.io.out + //Coherency Gating cacheable request, mmio always pass + val is_mmio = txreq_arb.io.chosen === slices.size.U + val req_pass = coEnable || is_mmio + txreq.valid := txreq_arb.io.out.valid && req_pass + txreq.bits := txreq_arb.io.out.bits + txreq_arb.io.out.ready := txreq.ready && req_pass txreq.bits.txnID := setSliceID(txreq_arb.io.out.bits.txnID, txreq_arb.io.chosen, mmio.io.tx.req.fire) // TXRSP @@ -265,9 +273,9 @@ class TL2CHICoupledL2(implicit p: Parameters) extends CoupledL2Base { linkMonitor.io.nodeID := io_nodeID /* exit coherency when: l2 flush of all slices is done and core is in WFI state */ linkMonitor.io.exitco.foreach { _ := - Cat(slices.zipWithIndex.map { case (s, i) => s.io.l2FlushDone.getOrElse(false.B)}).andR && io_cpu_wfi.getOrElse(false.B) + Cat(slices.zipWithIndex.map { case (s, i) => s.io.l2FlushDone.getOrElse(false.B)}).andR && io_cpu_halt.getOrElse(false.B) } - + coEnable := linkMonitor.io.coEnable /** * performance counters */ diff --git a/src/main/scala/coupledL2/tl2chi/chi/AsyncBridge.scala b/src/main/scala/coupledL2/tl2chi/chi/AsyncBridge.scala index 675a68b38..73c34e2a3 100644 --- a/src/main/scala/coupledL2/tl2chi/chi/AsyncBridge.scala +++ b/src/main/scala/coupledL2/tl2chi/chi/AsyncBridge.scala @@ -50,6 +50,12 @@ class AsyncPortIO( val tx = new AsyncDownwardsLinkIO(params) val rx = Flipped(new AsyncUpwardsLinkIO(params)) } + +class ChannelWithActive[T <: Data](gen: T) extends Bundle { + val channel = new ChannelIO(gen) + val active = Bool() +} + /* * This module enhances the standard async bridge by adding a front-end shadow buffer * to decouple local processing from asynchronous latency and provide instant credit @@ -73,15 +79,15 @@ object ToAsyncBundleWithBuf { /* 1. Shadow Buffer (depth=16, flow mode for low latency) */ - val shadow_buffer = Module(new Queue(chiselTypeOf(chn.flit), 16, flow = true, pipe = false)) + val shadow_buffer = Module(new Queue(chiselTypeOf(chn.flit), 32, flow = true, pipe = false)) if (name.isDefined) { shadow_buffer.suggestName("shadowBuffer_" + name.get) } shadow_buffer.io.enq.valid := chn.flitv shadow_buffer.io.enq.bits := chn.flit /* 2. For rx channel (CMN->L2), send out lcrdv right after a flit entering Shadow buffer if has space */ - val deqReady = shadow_buffer.io.deq.ready - dontTouch(deqReady) + val hasSpace = shadow_buffer.io.count <= 16.U + dontTouch(hasSpace) assert(!chn.flitv || shadow_buffer.io.enq.ready, s"${name.getOrElse("ToAsyncBundle")}: Shadow buffer overflow!") /* 3. AsyncQueueSource (depth=4) @@ -90,8 +96,35 @@ object ToAsyncBundleWithBuf { if (name.isDefined) { source.suggestName("asyncQSource_" + name.get) } source.io.enq <> shadow_buffer.io.deq - (source.io.async, deqReady) + (source.io.async, hasSpace) + } + + def bitPulse( + bit: Bool, + params: AsyncQueueParams = AsyncQueueParams(), + name: Option[String] = None + ) = { + /* + 1. Shadow Buffer (depth=16, flow mode for low latency) + */ + val shadow_buffer = Module(new Queue(Bool(), 16, flow = true, pipe = false)) + if (name.isDefined) { shadow_buffer.suggestName("lcrdvShadowBuffer_" + name.get) } + shadow_buffer.io.enq.valid := bit + shadow_buffer.io.enq.bits := DontCare + /* + 2. AsyncQueueSource (depth =4) + */ + val source = Module(new AsyncQueueSource(UInt(0.W), params)) + if (name.isDefined) { source.suggestName("asyncQBitSource_" + name.get) } + source.io.enq.valid := shadow_buffer.io.deq.valid + source.io.enq.bits := DontCare + + shadow_buffer.io.deq.ready := source.io.enq.ready + + source.io.async + } + } object ToAsyncBundle { def channel[T <: Data]( @@ -124,8 +157,9 @@ object FromAsyncBundle { async: AsyncBundle[UInt], params: AsyncQueueParams = AsyncQueueParams(), name: Option[String] = None, - lcrdvReady: Option[Bool]= None - ) = { + lcrdvReady: Option[Bool]= None, + withPowerAck: Boolean = false + ): Data = { val gen = chiselTypeOf(async.mem.head) val out = Wire(new ChannelIO(gen)) val sink = Module(new AsyncQueueSink(gen, params)) @@ -137,7 +171,15 @@ object FromAsyncBundle { // flitpend and lcrdv are assigned independently out.flitpend := DontCare out.lcrdv := DontCare - out + // extend out with 'Active' to indicate sink Queue is NOT empty + if (withPowerAck) { + val result = Wire(new ChannelWithActive(gen)) + result.channel <> out + result.active := sink.io.deq.valid + result + } else { + out + } } def bitPulse[T <: Data]( @@ -232,18 +274,38 @@ class CHIAsyncBridgeSink(params: AsyncQueueParams = AsyncQueueParams())(implicit val async = Flipped(new AsyncPortIO(params)) val deq = new PortIO val resetFinish = Output(Bool()) + val powerAck = new Bundle { + val QACTIVE = Output(Bool()) + val QACCEPTn = Output(Bool()) + val QREQ = Input(Bool()) + } }) + val txState = RegInit(LinkStates.STOP) + val rxState = RegInit(LinkStates.STOP) + val txreq_lcrdvReady = Wire(Bool()) val txrsp_lcrdvReady = Wire(Bool()) val txdat_lcrdvReady = Wire(Bool()) - io.deq.tx.req <> FromAsyncBundle.channel(io.async.tx.req.flit, params, Some("txreq_flit"), Some(txreq_lcrdvReady)) - io.deq.tx.rsp <> FromAsyncBundle.channel(io.async.tx.rsp.flit, params, Some("txrsp_flit"), Some(txrsp_lcrdvReady)) - io.deq.tx.dat <> FromAsyncBundle.channel(io.async.tx.dat.flit, params, Some("txdat_flit"), Some(txdat_lcrdvReady)) - io.async.tx.req.lcrdv <> ToAsyncBundle.bitPulse(io.deq.tx.req.lcrdv, params, Some("txreq_lcrdv")) - io.async.tx.rsp.lcrdv <> ToAsyncBundle.bitPulse(io.deq.tx.rsp.lcrdv, params, Some("txrsp_lcrdv")) - io.async.tx.dat.lcrdv <> ToAsyncBundle.bitPulse(io.deq.tx.dat.lcrdv, params, Some("txdat_lcrdv")) +// io.deq.tx.req <> FromAsyncBundle.channel(io.async.tx.req.flit, params, Some("txreq_flit"), Some(txreq_lcrdvReady)) +// io.deq.tx.rsp <> FromAsyncBundle.channel(io.async.tx.rsp.flit, params, Some("txrsp_flit"), Some(txrsp_lcrdvReady)) +// io.deq.tx.dat <> FromAsyncBundle.channel(io.async.tx.dat.flit, params, Some("txdat_flit"), Some(txdat_lcrdvReady)) + + val txreq = FromAsyncBundle.channel(io.async.tx.req.flit, params, Some("txreq_flit"), Some(txreq_lcrdvReady), true).asInstanceOf[ChannelWithActive[UInt]] + val txrsp = FromAsyncBundle.channel(io.async.tx.rsp.flit, params, Some("txrsp_flit"), Some(txrsp_lcrdvReady), true).asInstanceOf[ChannelWithActive[UInt]] + val txdat = FromAsyncBundle.channel(io.async.tx.dat.flit, params, Some("txdat_flit"), Some(txdat_lcrdvReady), true).asInstanceOf[ChannelWithActive[UInt]] + io.deq.tx.req <> txreq.channel + io.deq.tx.rsp <> txrsp.channel + io.deq.tx.dat <> txdat.channel + // Add handshake to confirm Sink Tx Queue is completely drained + val txActive = txreq.active || txrsp.active || txdat.active + io.powerAck.QACTIVE := txActive + io.powerAck.QACCEPTn := !(io.powerAck.QREQ && !txActive && txState === LinkStates.STOP) + + io.async.tx.req.lcrdv <> ToAsyncBundleWithBuf.bitPulse(io.deq.tx.req.lcrdv, params, Some("txreq_lcrdv")) + io.async.tx.rsp.lcrdv <> ToAsyncBundleWithBuf.bitPulse(io.deq.tx.rsp.lcrdv, params, Some("txrsp_lcrdv")) + io.async.tx.dat.lcrdv <> ToAsyncBundleWithBuf.bitPulse(io.deq.tx.dat.lcrdv, params, Some("txdat_lcrdv")) val async_rx_rsp = ToAsyncBundleWithBuf.channel(io.deq.rx.rsp, params, Some("rxrsp_flit")) val async_rx_dat = ToAsyncBundleWithBuf.channel(io.deq.rx.dat, params, Some("rxdat_flit")) @@ -294,9 +356,6 @@ class CHIAsyncBridgeSink(params: AsyncQueueParams = AsyncQueueParams())(implicit /* Duplicate Link Monitor tx/rx state FSM by using deq.rx deq.tx active signals which outuput to DownStream CHI */ - val txState = RegInit(LinkStates.STOP) - val rxState = RegInit(LinkStates.STOP) - Seq(txState, rxState).zip(MixedVecInit(Seq(io.deq.tx, io.deq.rx))).foreach { case (state, link) => state := MuxLookup(Cat(link.linkactivereq, link.linkactiveack), LinkStates.STOP)(Seq( Cat(true.B, false.B) -> LinkStates.ACTIVATE, @@ -322,6 +381,7 @@ class CHIAsyncBridgeSink(params: AsyncQueueParams = AsyncQueueParams())(implicit For tx channel, add l-credit manager module to generate 'ready' to block tx flit to DownStream CHI a. The maximum number of L-Credits in tx channel is 4 inside bridge b. Use L-Credits number more than 4 in CoupledL2 to cover lcrdv sync delay from DownStream CHI to CoupledL2 + c. Normal flits are controlled by credits (from AsyncBridge Sink), Return flits in DEACTIVATE are controlled by credits (from L2) */ val txin = WireInit(0.U asTypeOf(Flipped(new DecoupledPortIO()))) //fake Decoupled IO to provide flitv val txout = WireInit(0.U asTypeOf(new PortIO))//fake LCredit IO to provide lcrdv @@ -334,9 +394,9 @@ class CHIAsyncBridgeSink(params: AsyncQueueParams = AsyncQueueParams())(implicit Decoupled2LCredit(txin.tx.req, txout.tx.req, LinkState(txState), Some("txreq")) Decoupled2LCredit(txin.tx.rsp, txout.tx.rsp, LinkState(txState), Some("txrsp")) Decoupled2LCredit(txin.tx.dat, txout.tx.dat, LinkState(txState), Some("txdat")) - txreq_lcrdvReady := txin.tx.req.ready - txrsp_lcrdvReady := txin.tx.rsp.ready - txdat_lcrdvReady := txin.tx.dat.ready + txreq_lcrdvReady := txin.tx.req.ready || txState === LinkStates.DEACTIVATE + txrsp_lcrdvReady := txin.tx.rsp.ready || txState === LinkStates.DEACTIVATE + txdat_lcrdvReady := txin.tx.dat.ready || txState === LinkStates.DEACTIVATE dontTouch(io) } diff --git a/src/main/scala/coupledL2/tl2chi/chi/LinkLayer.scala b/src/main/scala/coupledL2/tl2chi/chi/LinkLayer.scala index b939cfd4b..f4b9aac81 100644 --- a/src/main/scala/coupledL2/tl2chi/chi/LinkLayer.scala +++ b/src/main/scala/coupledL2/tl2chi/chi/LinkLayer.scala @@ -274,7 +274,7 @@ class Decoupled2LCredit[T <: Bundle]( // The maximum number of L-Credits that a receiver can provide is 15. val lcreditsMax = 15 - val enableCHIAsync = cacheParams.enableCHIAsyncBridge.getOrElse(false) + val enableCHIAsync = cacheParams.enableCHIAsyncBridge val overlcreditVal = if(enableCHIAsync) overlcreditNum.getOrElse(0) else 0 val lcreditsMaxAll = lcreditsMax + overlcreditVal val lcreditPool = RegInit(overlcreditVal.U(log2Up(lcreditsMaxAll+1).W)) @@ -328,6 +328,7 @@ class LinkMonitor(implicit p: Parameters) extends L2Module with HasCHIOpcodes { val in = Flipped(new DecoupledPortIO()) val out = new PortIO val nodeID = Input(UInt(NODEID_WIDTH.W)) + val coEnable = Output(Bool()) val exitco = Option.when(cacheParams.enableL2Flush) (Input(Bool())) }) // val s_stop :: s_activate :: s_run :: s_deactivate :: Nil = Enum(4) @@ -357,6 +358,7 @@ class LinkMonitor(implicit p: Parameters) extends L2Module with HasCHIOpcodes { //exit coherecy + deactive tx/rx when l2 flush done val exitco = io.exitco.getOrElse(false.B) val exitcoDone = !io.out.syscoreq && !io.out.syscoack && RegNext(true.B, init = false.B) + io.coEnable := io.out.syscoreq && io.out.syscoack io.out.tx.linkactivereq := RegNext(!exitcoDone, init = false.B) io.out.rx.linkactiveack := RegNext(