Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
659446d
fix(AsyncBridge): handle lcredit return sequence safely
yulightenyu Mar 7, 2026
d7b9330
fix compile
yulightenyu Mar 7, 2026
1d51082
remove txsactive sysco related
yulightenyu Mar 7, 2026
64a42cd
fix(AsyncBridge): extend lcrdv AsyncQueue depth to 16
yulightenyu Mar 26, 2026
922f126
fix(LinkLayer): Enable txactive after sysco handshake done
yulightenyu Mar 26, 2026
85c4961
fix: CMOALL allocate MSHR, the tag should come from dirResult NOT fro…
yulightenyu Mar 26, 2026
de37d85
fix(SinkA): fix CMOALL should operate line by line
yulightenyu Mar 26, 2026
59af838
fix: remove change of lcrdv AsyncQueue depth change
yulightenyu Mar 26, 2026
27705ce
fix(AsyncBridge): add shadow buffer(16) for tx lcrdv
yulightenyu Mar 26, 2026
663381f
fix(LinkLayer): txsactive enable should NOT wait for sysco done
yulightenyu Mar 27, 2026
92888a9
fix: add coherency gating at the transaction layer TXREQ arbiter
yulightenyu Mar 27, 2026
735d033
fix(AsyncBridge): restore AsyncBridge performance commit
yulightenyu Apr 2, 2026
d70bdee
fix(L2Param): change enableCHIAsyncBridge params type
yulightenyu Apr 8, 2026
72d6260
fix(Linklayer): add Tx shadow buffer to cover lcrdv latency from Asyn…
yulightenyu Apr 3, 2026
e7707a6
Revert "fix(Linklayer): add Tx shadow buffer to cover lcrdv latency f…
yulightenyu Apr 8, 2026
7791c06
fix(LinkLayer): fix when params enableCHIAsyncBridge remove Option at…
yulightenyu Apr 9, 2026
ec9827b
fix(TL2CHICoupledL2): revert io_cpu_wfi to io_cpu_halt for Kunminghu-v2
yulightenyu Apr 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/main/scala/coupledL2/Directory.scala
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ class Directory(implicit p: Parameters) extends L2Module {
chosenWay,
PriorityEncoder(freeWayMask_s3)
)
val hit_s3 = Cat(hitVec).orR || req_s3.cmoAll
val hit_s3 = Cat(hitVec).orR || (req_s3.cmoAll && VecInit(metaAll_s3.map(_.state =/= MetaData.INVALID))(req_s3.cmoWay))
val way_s3 = Mux(req_s3.cmoAll, req_s3.cmoWay, Mux(hit_s3, hitWay, finalWay))
val meta_s3 = metaAll_s3(way_s3)
val tag_s3 = tagAll_s3(way_s3)
Expand Down
2 changes: 1 addition & 1 deletion src/main/scala/coupledL2/L2Param.scala
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ case class L2Param(
// L2 Flush
enableL2Flush: Boolean = false,
// AsyncBridge
enableCHIAsyncBridge: Option[Boolean] = None,
enableCHIAsyncBridge: Boolean = true,
// Performance analysis
enablePerf: Boolean = true,
// RollingDB
Expand Down
2 changes: 1 addition & 1 deletion src/main/scala/coupledL2/SinkA.scala
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ class SinkA(implicit p: Parameters) extends L2Module {
}.otherwise {
way.foreach { _ := wayVal + 1.U }
}
when (mshrValid) {
when (!mshrValid) {
state.foreach { _ := sCMOREQ }
}.otherwise {
state.foreach { _ := sWAITMSHR }
Expand Down
1 change: 1 addition & 0 deletions src/main/scala/coupledL2/tl2chi/MainPipe.scala
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,7 @@ class MainPipe(implicit p: Parameters) extends TL2CHIL2Module with HasCHIOpcodes
io.toMSHRCtl.mshr_alloc_s3.bits.state := alloc_state
io.toMSHRCtl.mshr_alloc_s3.bits.task match { case task =>
task := req_s3
task.tag := Mux(io.cmoAllBlock.getOrElse(false.B), dirResult_s3.tag, req_s3.tag)
task.bufIdx := 0.U(bufIdxBits.W)
task.mshrTask := false.B
task.aliasTask.foreach(_ := cache_alias)
Expand Down
16 changes: 12 additions & 4 deletions src/main/scala/coupledL2/tl2chi/TL2CHICoupledL2.scala
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ class TL2CHICoupledL2(implicit p: Parameters) extends CoupledL2Base {

val io_chi = IO(new PortIO)
val io_nodeID = IO(Input(UInt()))
val io_cpu_wfi = Option.when(cacheParams.enableL2Flush) (IO(Input(Bool())))
val io_cpu_halt = Option.when(cacheParams.enableL2Flush) (IO(Input(Bool())))

// Check port width
require(io_chi.tx.rsp.getWidth == io_chi.rx.rsp.getWidth);
Expand Down Expand Up @@ -125,14 +125,22 @@ class TL2CHICoupledL2(implicit p: Parameters) extends CoupledL2Base {
)
}

//Coherency enable from the Link Layer: when asserted, cacheable requests can be sent; otherwise they are gated
val coEnable = WireInit(false.B)

slices match {
case slices: Seq[Slice] =>
// TXREQ
val txreq_arb = Module(new RRArbiterInit(new CHIREQ, slices.size + 1)) // plus 1 for MMIO
val txreq = Wire(DecoupledIO(new CHIREQ))
slices.zip(txreq_arb.io.in.init).foreach { case (s, in) => in <> s.io.out.tx.req }
txreq_arb.io.in.last <> mmio.io.tx.req
txreq <> txreq_arb.io.out
//Coherency gating for cacheable requests; MMIO requests always pass
val is_mmio = txreq_arb.io.chosen === slices.size.U
val req_pass = coEnable || is_mmio
txreq.valid := txreq_arb.io.out.valid && req_pass
txreq.bits := txreq_arb.io.out.bits
txreq_arb.io.out.ready := txreq.ready && req_pass
txreq.bits.txnID := setSliceID(txreq_arb.io.out.bits.txnID, txreq_arb.io.chosen, mmio.io.tx.req.fire)

// TXRSP
Expand Down Expand Up @@ -265,9 +273,9 @@ class TL2CHICoupledL2(implicit p: Parameters) extends CoupledL2Base {
linkMonitor.io.nodeID := io_nodeID
/* exit coherency when: L2 flush of all slices is done and the core is halted (io_cpu_halt) */
linkMonitor.io.exitco.foreach { _ :=
Cat(slices.zipWithIndex.map { case (s, i) => s.io.l2FlushDone.getOrElse(false.B)}).andR && io_cpu_wfi.getOrElse(false.B)
Cat(slices.zipWithIndex.map { case (s, i) => s.io.l2FlushDone.getOrElse(false.B)}).andR && io_cpu_halt.getOrElse(false.B)
}

coEnable := linkMonitor.io.coEnable
/**
* performance counters
*/
Expand Down
98 changes: 79 additions & 19 deletions src/main/scala/coupledL2/tl2chi/chi/AsyncBridge.scala
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,12 @@ class AsyncPortIO(
val tx = new AsyncDownwardsLinkIO(params)
val rx = Flipped(new AsyncUpwardsLinkIO(params))
}

/**
 * A CHI channel bundle extended with an `active` indicator.
 *
 * Used by the async bridge sink to expose, alongside the channel itself,
 * whether its internal queue still holds flits (driven from the sink
 * queue's deq.valid), e.g. for the Q-channel power-management handshake.
 *
 * NOTE(review): Chisel Bundle bit layout follows field declaration order —
 * do not reorder these fields.
 */
class ChannelWithActive[T <: Data](gen: T) extends Bundle {
  val channel = new ChannelIO(gen) // the underlying CHI channel signals
  val active = Bool()              // high while the sink-side queue is non-empty
}

/*
* This module enhances the standard async bridge by adding a front-end shadow buffer
* to decouple local processing from asynchronous latency and provide instant credit
Expand All @@ -73,15 +79,15 @@ object ToAsyncBundleWithBuf {
/*
1. Shadow Buffer (depth=32, flow mode for low latency)
*/
val shadow_buffer = Module(new Queue(chiselTypeOf(chn.flit), 16, flow = true, pipe = false))
val shadow_buffer = Module(new Queue(chiselTypeOf(chn.flit), 32, flow = true, pipe = false))
if (name.isDefined) { shadow_buffer.suggestName("shadowBuffer_" + name.get) }
shadow_buffer.io.enq.valid := chn.flitv
shadow_buffer.io.enq.bits := chn.flit
/*
2. For the rx channel (CMN->L2), send out lcrdv right after a flit enters the shadow buffer, if there is space
*/
val deqReady = shadow_buffer.io.deq.ready
dontTouch(deqReady)
val hasSpace = shadow_buffer.io.count <= 16.U
dontTouch(hasSpace)
assert(!chn.flitv || shadow_buffer.io.enq.ready, s"${name.getOrElse("ToAsyncBundle")}: Shadow buffer overflow!")
/*
3. AsyncQueueSource (depth=4)
Expand All @@ -90,8 +96,35 @@ object ToAsyncBundleWithBuf {
if (name.isDefined) { source.suggestName("asyncQSource_" + name.get) }
source.io.enq <> shadow_buffer.io.deq

(source.io.async, deqReady)
(source.io.async, hasSpace)
}

/**
 * Carries a single-cycle pulse (an L-credit valid, lcrdv) across a clock
 * domain via an AsyncQueue, buffering bursts in a shadow queue first.
 *
 * @param bit    single-cycle pulse to transfer (each high cycle enqueues one token)
 * @param params AsyncQueue configuration for the crossing
 * @param name   optional suffix used in suggestName for waveform readability
 * @return       the source-side AsyncBundle to connect to the matching AsyncQueueSink
 */
def bitPulse(
    bit: Bool,
    params: AsyncQueueParams = AsyncQueueParams(),
    name: Option[String] = None
) = {
  /*
  1. Shadow Buffer (depth=16, flow mode for low latency)
  */
  val shadow_buffer = Module(new Queue(Bool(), 16, flow = true, pipe = false))
  if (name.isDefined) { shadow_buffer.suggestName("lcrdvShadowBuffer_" + name.get) }
  // Only the token's presence matters, so the payload is DontCare.
  // NOTE(review): enq.ready is not checked here — pulses arriving while the
  // buffer is full would be dropped; presumably bounded by the CHI L-credit
  // maximum (15) so 16 entries suffice — confirm.
  shadow_buffer.io.enq.valid := bit
  shadow_buffer.io.enq.bits := DontCare
  /*
  2. AsyncQueueSource (depth=4)
  */
  // Zero-width payload: the AsyncQueue transfers only the valid token.
  val source = Module(new AsyncQueueSource(UInt(0.W), params))
  if (name.isDefined) { source.suggestName("asyncQBitSource_" + name.get) }
  source.io.enq.valid := shadow_buffer.io.deq.valid
  source.io.enq.bits := DontCare

  // Drain the shadow buffer as fast as the async crossing accepts tokens.
  shadow_buffer.io.deq.ready := source.io.enq.ready

  // Result: the async half to be wired to the sink-side clock domain.
  source.io.async

}

}
object ToAsyncBundle {
def channel[T <: Data](
Expand Down Expand Up @@ -124,8 +157,9 @@ object FromAsyncBundle {
async: AsyncBundle[UInt],
params: AsyncQueueParams = AsyncQueueParams(),
name: Option[String] = None,
lcrdvReady: Option[Bool]= None
) = {
lcrdvReady: Option[Bool]= None,
withPowerAck: Boolean = false
): Data = {
val gen = chiselTypeOf(async.mem.head)
val out = Wire(new ChannelIO(gen))
val sink = Module(new AsyncQueueSink(gen, params))
Expand All @@ -137,7 +171,15 @@ object FromAsyncBundle {
// flitpend and lcrdv are assigned independently
out.flitpend := DontCare
out.lcrdv := DontCare
out
// extend out with 'Active' to indicate sink Queue is NOT empty
if (withPowerAck) {
val result = Wire(new ChannelWithActive(gen))
result.channel <> out
result.active := sink.io.deq.valid
result
} else {
out
}
}

def bitPulse[T <: Data](
Expand Down Expand Up @@ -232,18 +274,38 @@ class CHIAsyncBridgeSink(params: AsyncQueueParams = AsyncQueueParams())(implicit
val async = Flipped(new AsyncPortIO(params))
val deq = new PortIO
val resetFinish = Output(Bool())
val powerAck = new Bundle {
val QACTIVE = Output(Bool())
val QACCEPTn = Output(Bool())
val QREQ = Input(Bool())
}
})

val txState = RegInit(LinkStates.STOP)
val rxState = RegInit(LinkStates.STOP)

val txreq_lcrdvReady = Wire(Bool())
val txrsp_lcrdvReady = Wire(Bool())
val txdat_lcrdvReady = Wire(Bool())
io.deq.tx.req <> FromAsyncBundle.channel(io.async.tx.req.flit, params, Some("txreq_flit"), Some(txreq_lcrdvReady))
io.deq.tx.rsp <> FromAsyncBundle.channel(io.async.tx.rsp.flit, params, Some("txrsp_flit"), Some(txrsp_lcrdvReady))
io.deq.tx.dat <> FromAsyncBundle.channel(io.async.tx.dat.flit, params, Some("txdat_flit"), Some(txdat_lcrdvReady))

io.async.tx.req.lcrdv <> ToAsyncBundle.bitPulse(io.deq.tx.req.lcrdv, params, Some("txreq_lcrdv"))
io.async.tx.rsp.lcrdv <> ToAsyncBundle.bitPulse(io.deq.tx.rsp.lcrdv, params, Some("txrsp_lcrdv"))
io.async.tx.dat.lcrdv <> ToAsyncBundle.bitPulse(io.deq.tx.dat.lcrdv, params, Some("txdat_lcrdv"))
// io.deq.tx.req <> FromAsyncBundle.channel(io.async.tx.req.flit, params, Some("txreq_flit"), Some(txreq_lcrdvReady))
// io.deq.tx.rsp <> FromAsyncBundle.channel(io.async.tx.rsp.flit, params, Some("txrsp_flit"), Some(txrsp_lcrdvReady))
// io.deq.tx.dat <> FromAsyncBundle.channel(io.async.tx.dat.flit, params, Some("txdat_flit"), Some(txdat_lcrdvReady))

val txreq = FromAsyncBundle.channel(io.async.tx.req.flit, params, Some("txreq_flit"), Some(txreq_lcrdvReady), true).asInstanceOf[ChannelWithActive[UInt]]
val txrsp = FromAsyncBundle.channel(io.async.tx.rsp.flit, params, Some("txrsp_flit"), Some(txrsp_lcrdvReady), true).asInstanceOf[ChannelWithActive[UInt]]
val txdat = FromAsyncBundle.channel(io.async.tx.dat.flit, params, Some("txdat_flit"), Some(txdat_lcrdvReady), true).asInstanceOf[ChannelWithActive[UInt]]
io.deq.tx.req <> txreq.channel
io.deq.tx.rsp <> txrsp.channel
io.deq.tx.dat <> txdat.channel
// Add handshake to confirm Sink Tx Queue is completely drained
val txActive = txreq.active || txrsp.active || txdat.active
io.powerAck.QACTIVE := txActive
io.powerAck.QACCEPTn := !(io.powerAck.QREQ && !txActive && txState === LinkStates.STOP)

io.async.tx.req.lcrdv <> ToAsyncBundleWithBuf.bitPulse(io.deq.tx.req.lcrdv, params, Some("txreq_lcrdv"))
io.async.tx.rsp.lcrdv <> ToAsyncBundleWithBuf.bitPulse(io.deq.tx.rsp.lcrdv, params, Some("txrsp_lcrdv"))
io.async.tx.dat.lcrdv <> ToAsyncBundleWithBuf.bitPulse(io.deq.tx.dat.lcrdv, params, Some("txdat_lcrdv"))

val async_rx_rsp = ToAsyncBundleWithBuf.channel(io.deq.rx.rsp, params, Some("rxrsp_flit"))
val async_rx_dat = ToAsyncBundleWithBuf.channel(io.deq.rx.dat, params, Some("rxdat_flit"))
Expand Down Expand Up @@ -294,9 +356,6 @@ class CHIAsyncBridgeSink(params: AsyncQueueParams = AsyncQueueParams())(implicit
/*
Duplicate the Link Monitor tx/rx state FSM using the deq.tx/deq.rx linkactive signals which output to the downstream CHI
*/
val txState = RegInit(LinkStates.STOP)
val rxState = RegInit(LinkStates.STOP)

Seq(txState, rxState).zip(MixedVecInit(Seq(io.deq.tx, io.deq.rx))).foreach { case (state, link) =>
state := MuxLookup(Cat(link.linkactivereq, link.linkactiveack), LinkStates.STOP)(Seq(
Cat(true.B, false.B) -> LinkStates.ACTIVATE,
Expand All @@ -322,6 +381,7 @@ class CHIAsyncBridgeSink(params: AsyncQueueParams = AsyncQueueParams())(implicit
For tx channel, add l-credit manager module to generate 'ready' to block tx flit to DownStream CHI
a. The maximum number of L-Credits in tx channel is 4 inside bridge
b. Use L-Credits number more than 4 in CoupledL2 to cover lcrdv sync delay from DownStream CHI to CoupledL2
c. Normal flits are controlled by credits (from the AsyncBridge sink); credit-return flits in DEACTIVATE are controlled by credits (from L2)
*/
val txin = WireInit(0.U asTypeOf(Flipped(new DecoupledPortIO()))) //fake Decoupled IO to provide flitv
val txout = WireInit(0.U asTypeOf(new PortIO))//fake LCredit IO to provide lcrdv
Expand All @@ -334,9 +394,9 @@ class CHIAsyncBridgeSink(params: AsyncQueueParams = AsyncQueueParams())(implicit
Decoupled2LCredit(txin.tx.req, txout.tx.req, LinkState(txState), Some("txreq"))
Decoupled2LCredit(txin.tx.rsp, txout.tx.rsp, LinkState(txState), Some("txrsp"))
Decoupled2LCredit(txin.tx.dat, txout.tx.dat, LinkState(txState), Some("txdat"))
txreq_lcrdvReady := txin.tx.req.ready
txrsp_lcrdvReady := txin.tx.rsp.ready
txdat_lcrdvReady := txin.tx.dat.ready
txreq_lcrdvReady := txin.tx.req.ready || txState === LinkStates.DEACTIVATE
txrsp_lcrdvReady := txin.tx.rsp.ready || txState === LinkStates.DEACTIVATE
txdat_lcrdvReady := txin.tx.dat.ready || txState === LinkStates.DEACTIVATE

dontTouch(io)
}
4 changes: 3 additions & 1 deletion src/main/scala/coupledL2/tl2chi/chi/LinkLayer.scala
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ class Decoupled2LCredit[T <: Bundle](

// The maximum number of L-Credits that a receiver can provide is 15.
val lcreditsMax = 15
val enableCHIAsync = cacheParams.enableCHIAsyncBridge.getOrElse(false)
val enableCHIAsync = cacheParams.enableCHIAsyncBridge
val overlcreditVal = if(enableCHIAsync) overlcreditNum.getOrElse(0) else 0
val lcreditsMaxAll = lcreditsMax + overlcreditVal
val lcreditPool = RegInit(overlcreditVal.U(log2Up(lcreditsMaxAll+1).W))
Expand Down Expand Up @@ -328,6 +328,7 @@ class LinkMonitor(implicit p: Parameters) extends L2Module with HasCHIOpcodes {
val in = Flipped(new DecoupledPortIO())
val out = new PortIO
val nodeID = Input(UInt(NODEID_WIDTH.W))
val coEnable = Output(Bool())
val exitco = Option.when(cacheParams.enableL2Flush) (Input(Bool()))
})
// val s_stop :: s_activate :: s_run :: s_deactivate :: Nil = Enum(4)
Expand Down Expand Up @@ -357,6 +358,7 @@ class LinkMonitor(implicit p: Parameters) extends L2Module with HasCHIOpcodes {
//exit coherency + deactivate tx/rx when L2 flush is done
val exitco = io.exitco.getOrElse(false.B)
val exitcoDone = !io.out.syscoreq && !io.out.syscoack && RegNext(true.B, init = false.B)
io.coEnable := io.out.syscoreq && io.out.syscoack

io.out.tx.linkactivereq := RegNext(!exitcoDone, init = false.B)
io.out.rx.linkactiveack := RegNext(
Expand Down
Loading