xref: /XiangShan/src/main/scala/xiangshan/mem/sbuffer/Sbuffer.scala (revision 5668a921eb594c3ea72da43594b3fb54e05959a3)
1/***************************************************************************************
2* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
3* Copyright (c) 2020-2021 Peng Cheng Laboratory
4*
5* XiangShan is licensed under Mulan PSL v2.
6* You can use this software according to the terms and conditions of the Mulan PSL v2.
7* You may obtain a copy of Mulan PSL v2 at:
8*          http://license.coscl.org.cn/MulanPSL2
9*
10* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
11* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
12* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
13*
14* See the Mulan PSL v2 for more details.
15***************************************************************************************/
16
17package xiangshan.mem
18
19import chipsalliance.rocketchip.config.Parameters
20import chisel3._
21import chisel3.util._
22import xiangshan._
23import utils._
24import xiangshan.cache._
25import difftest._
26
/**
  * Flush handshake between a requester and the store buffer.
  *
  * Directions are given from the requester's point of view; `Sbuffer` instantiates
  * this bundle `Flipped`, so there `valid` is an input and `empty` is an output.
  */
class SbufferFlushBundle extends Bundle {
  // Requester asks the store buffer to drain.
  val valid = Output(Bool())
  // Store buffer reports that it has nothing left to write back.
  val empty = Input(Bool())
}
31
/**
  * Elaboration-time constants shared by the store buffer module and its bundles.
  *
  * A physical address is split as `ptag ++ offset`, where `offset` addresses a byte
  * inside one cache line.
  */
trait HasSbufferConst extends HasXSParameter {
  // Number of cycles counted by the per-entry coherence counter before its top bit is set.
  val EvictCycles = 1 << 20
  // Number of cycles counted by the per-entry replay counter before its top bit is set.
  val SbufferReplayDelayCycles = 16
  require(isPow2(EvictCycles))
  // One extra bit so that the counter's MSB becomes 1 exactly when EvictCycles is reached.
  val EvictCountBits = log2Up(EvictCycles+1)
  // Same scheme as above: MSB becomes 1 when SbufferReplayDelayCycles is reached.
  val MissqReplayCountBits = log2Up(SbufferReplayDelayCycles) + 1

  val SbufferIndexWidth: Int = log2Up(StoreBufferSize)
  // paddr = ptag + offset
  val CacheLineBytes: Int = CacheLineSize / 8
  val CacheLineWords: Int = CacheLineBytes / DataBytes
  val OffsetWidth: Int = log2Up(CacheLineBytes)
  val WordsWidth: Int = log2Up(CacheLineWords)
  val PTagWidth: Int = PAddrBits - OffsetWidth
  val VTagWidth: Int = VAddrBits - OffsetWidth
  // Width of DataWriteReq.wordOffset.
  // NOTE(review): Sbuffer drives that field with pa(PAddrBits-1, 3), i.e. PAddrBits - 3 bits;
  // the two widths agree only when WordsWidth == 3 -- confirm for other configurations.
  val WordOffsetWidth: Int = PAddrBits - WordsWidth
}
49
/**
  * Per-entry status bits of the store buffer, plus convenience predicates over them.
  */
class SbufferEntryState (implicit p: Parameters) extends SbufferBundle {
  val state_valid    = Bool() // this entry is active
  val state_inflight = Bool() // sbuffer is trying to write this entry to dcache
  val w_timeout = Bool() // waiting for resend store pipeline req timeout

  /** Entry holds no data. */
  def isInvalid(): Bool = !state_valid
  /** Entry holds data (it may or may not be on its way to the dcache). */
  def isValid(): Bool = state_valid
  /** Entry holds data and has not been handed to the dcache yet. */
  def isActive(): Bool = state_valid && !state_inflight
  /** Entry is currently being written to the dcache. */
  def isInflight(): Bool = state_inflight
}
60
/** Base class for store buffer bundles: an XSBundle with the sbuffer constants in scope. */
class SbufferBundle(implicit p: Parameters) extends XSBundle with HasSbufferConst
62
/**
  * One write request to the store buffer data array (see SbufferData).
  */
class DataWriteReq(implicit p: Parameters) extends SbufferBundle {
  // Target store buffer entry.
  val idx = UInt(SbufferIndexWidth.W)
  // Per-byte write enable for `data`.
  val mask = UInt((DataBits/8).W)
  // One data word.
  val data = UInt(DataBits.W)
  // Word address; SbufferData only looks at its low WordsWidth bits to pick the word in the line.
  val wordOffset = UInt(WordOffsetWidth.W)
  // When set, `data` is written to every word of the line regardless of mask/wordOffset.
  val wline = Bool()
}
70
/**
  * Data array of the store buffer: StoreBufferSize cache lines stored as bytes in registers.
  *
  * io.writeReq: one write port per store pipeline.
  * io.dataOut:  the full register contents, readable combinationally.
  *
  * The registers have no reset value. When several ports write the same byte in the
  * same cycle, the port with the highest index wins (Chisel last-connect semantics).
  */
class SbufferData(implicit p: Parameters) extends XSModule with HasSbufferConst {
  val io = IO(new Bundle(){
    val writeReq = Vec(StorePipelineWidth, Flipped(ValidIO(new DataWriteReq)))
    val dataOut = Output(Vec(StoreBufferSize, Vec(CacheLineWords, Vec(DataBytes, UInt(8.W)))))
  })

  val data = Reg(Vec(StoreBufferSize, Vec(CacheLineWords, Vec(DataBytes, UInt(8.W)))))

  val req = io.writeReq

  for(i <- 0 until StorePipelineWidth) {
    val wr = req(i).bits
    when(req(i).valid){
      for(word <- 0 until CacheLineWords){
        for(byte <- 0 until DataBytes){
          // A byte is written when it is enabled by the mask and lies in the addressed word,
          // or unconditionally when the request is a whole-line write.
          val byteSelected = wr.mask(byte) && (wr.wordOffset(WordsWidth-1, 0) === word.U)
          when(byteSelected || wr.wline){
            data(wr.idx)(word)(byte) := wr.data(byte*8+7, byte*8)
          }
        }
      }
    }
  }

  io.dataOut := data
}
98
/**
  * Store buffer ("sbuffer"): an array of StoreBufferSize cache-line-sized entries placed
  * between the store pipelines (io.in) and the dcache (io.dcache).
  *
  * What the code below does:
  *  - Enqueue: each incoming word store is merged into an active entry with the same
  *    physical tag, or allocated into an invalid entry. Invalid entries are searched in
  *    two banks (even / odd indices) so two stores can allocate in the same cycle.
  *  - Writeback: an entry is sent to the dcache when the buffer is draining, when
  *    occupancy reaches the threshold, when the entry's coherence counter times out, or
  *    when a previously replayed entry's delay counter expires.
  *  - Responses: a hit response frees the entry; a replay response arms the delay counter.
  *  - Forwarding: loads look up entries by virtual tag; the result is available one cycle
  *    later. A disagreement between virtual-tag and physical-tag match requests a drain.
  *
  * The enqueue logic indexes io.in(0) and io.in(1) directly, so it supports exactly two
  * store pipelines.
  */
class Sbuffer(implicit p: Parameters) extends DCacheModule with HasSbufferConst {
  val io = IO(new Bundle() {
    val hartId = Input(UInt(8.W))
    val in = Vec(StorePipelineWidth, Flipped(Decoupled(new DCacheWordReqWithVaddr)))  //Todo: store logic only support Width == 2 now
    val dcache = Flipped(new DCacheToSbufferIO)
    val forward = Vec(LoadPipelineWidth, Flipped(new LoadForwardQueryIO))
    val sqempty = Input(Bool())
    val flush = Flipped(new SbufferFlushBundle)
    val csrCtrl = Flipped(new CustomCSRCtrlIO)
  })

  // Line data lives in a separate module; all other per-entry state is kept here.
  val dataModule = Module(new SbufferData)
  dataModule.io.writeReq <> DontCare
  val writeReq = dataModule.io.writeReq

  val ptag = Reg(Vec(StoreBufferSize, UInt(PTagWidth.W)))
  val vtag = Reg(Vec(StoreBufferSize, UInt(VTagWidth.W)))
  val mask = Reg(Vec(StoreBufferSize, Vec(CacheLineWords, Vec(DataBytes, Bool()))))
  val data = dataModule.io.dataOut
  val stateVec = RegInit(VecInit(Seq.fill(StoreBufferSize)(0.U.asTypeOf(new SbufferEntryState))))
  val cohCount = RegInit(VecInit(Seq.fill(StoreBufferSize)(0.U(EvictCountBits.W))))
  val missqReplayCount = RegInit(VecInit(Seq.fill(StoreBufferSize)(0.U(MissqReplayCountBits.W))))

  // Driven in the "Send Dcache Req" section; declared early because the replay timeout
  // logic below reads it.
  val willSendDcacheReq = Wire(Bool())

  /*
       idle --[flush]   --> drain   --[buf empty]--> idle
            --[buf full]--> replace --[dcache resp]--> idle
  */
  // x_drain_all: drain store queue and sbuffer
  // x_drain_sbuffer: drain sbuffer only, block store queue to sbuffer write
  val x_idle :: x_replace :: x_drain_all :: x_drain_sbuffer :: Nil = Enum(4)
  // Enum(4) numbers the states 0..3 in declaration order, so bit 1 is set exactly for
  // the two drain states (2 and 3).
  def needDrain(state: UInt): Bool =
    state(1)
  val sbuffer_state = RegInit(x_idle)

  // ---------------------- Store Enq Sbuffer ---------------------

  // Upper PTagWidth bits of a physical address.
  def getPTag(pa: UInt): UInt =
    pa(PAddrBits - 1, PAddrBits - PTagWidth)

  // Upper VTagWidth bits of a virtual address.
  def getVTag(va: UInt): UInt =
    va(VAddrBits - 1, VAddrBits - VTagWidth)

  // Physical address with the low 3 bits dropped.
  def getWord(pa: UInt): UInt =
    pa(PAddrBits-1, 3)

  // Index of the 8-byte word inside its cache line.
  def getWordOffset(pa: UInt): UInt =
    pa(OffsetWidth-1, 3)

  // Rebuild a line-aligned address from a tag (low bits are zero).
  def getAddr(ptag: UInt): UInt =
    Cat(ptag, 0.U((PAddrBits - PTagWidth).W))

  // In-line offset with the low 3 bits cleared.
  def getByteOffset(offect: UInt): UInt =
    Cat(offect(OffsetWidth - 1, 3), 0.U(3.W))

  // True when `key` equals any element of `seq`; constant false for an empty `seq`.
  def isOneOf(key: UInt, seq: Seq[UInt]): Bool =
    if(seq.isEmpty) false.B else Cat(seq.map(_===key)).orR()

  // Apply `f` to every entry index.
  def widthMap[T <: Data](f: Int => T) = (0 until StoreBufferSize) map f

  // sbuffer entry count

  // Replacement policy. It is told about one access per store pipeline plus one access
  // for the replacement victim itself (last element of accessIdx).
  val plru = new PseudoLRU(StoreBufferSize)
  val accessIdx = Wire(Vec(StorePipelineWidth + 1, Valid(UInt(SbufferIndexWidth.W))))

  val replaceIdx = plru.way
  plru.access(accessIdx)

  //-------------------------cohCount-----------------------------
  // insert and merge: cohCount=0
  // every cycle cohCount+=1
  // if cohCount(EvictCountBits-1)==1, evict
  val cohTimeOutMask = VecInit(widthMap(i => cohCount(i)(EvictCountBits - 1) && stateVec(i).isActive()))
  val (cohTimeOutIdx, cohHasTimeOut) = PriorityEncoderWithFlag(cohTimeOutMask)
  // Same MSB scheme for entries waiting to be resent after a replay response.
  val missqReplayTimeOutMask = VecInit(widthMap(i => missqReplayCount(i)(MissqReplayCountBits - 1) && stateVec(i).w_timeout))
  val (missqReplayTimeOutIdx, missqReplayMayHasTimeOut) = PriorityEncoderWithFlag(missqReplayTimeOutMask)
  // The replay timeout is registered for one cycle and dropped if a request was
  // accepted into the send pipeline in that previous cycle.
  val missqReplayHasTimeOut = RegNext(missqReplayMayHasTimeOut) && !RegNext(willSendDcacheReq)
  val missqReplayTimeOutIdxReg = RegEnable(missqReplayTimeOutIdx, missqReplayMayHasTimeOut)

  val activeMask = VecInit(stateVec.map(s => s.isActive()))
  // When draining, the lowest-indexed active entry goes first.
  val drainIdx = PriorityEncoder(activeMask)

  val inflightMask = VecInit(stateVec.map(s => s.isInflight()))

  val inptags = io.in.map(in => getPTag(in.bits.addr))
  val invtags = io.in.map(in => getVTag(in.bits.vaddr))
  val sameTag = inptags(0) === inptags(1)
  val firstWord = getWord(io.in(0).bits.addr)
  val secondWord = getWord(io.in(1).bits.addr)
  val sameWord = firstWord === secondWord

  // merge condition
  // A request merges into an active (valid, not inflight) entry with the same physical tag.
  val mergeMask = Wire(Vec(StorePipelineWidth, Vec(StoreBufferSize, Bool())))
  val mergeIdx = mergeMask.map(PriorityEncoder(_))
  val canMerge = mergeMask.map(ParallelOR(_))

  for(i <- 0 until StorePipelineWidth){
    mergeMask(i) := widthMap(j =>
      inptags(i) === ptag(j) && activeMask(j)
    )
  }

  // insert condition
  // Invalid entries are searched separately in the even-indexed and odd-indexed banks.
  // enbufferSelReg picks the bank used by the first request; the second request uses
  // the same index as the first when both requests have the same ptag, otherwise it
  // uses the other bank.
  val invalidMask = VecInit(stateVec.map(s => s.isInvalid()))
  val evenInvalidMask = GetEvenBits(invalidMask.asUInt)
  val oddInvalidMask = GetOddBits(invalidMask.asUInt)

  val (evenRawInsertIdx, evenCanInsert) = PriorityEncoderWithFlag(evenInvalidMask)
  val (oddRawInsertIdx, oddCanInsert) = PriorityEncoderWithFlag(oddInvalidMask)
  // Convert the per-bank index back to an entry index by appending the bank bit.
  val evenInsertIdx = Cat(evenRawInsertIdx, 0.U(1.W))
  val oddInsertIdx = Cat(oddRawInsertIdx, 1.U(1.W))

  // Toggles on every cycle in which the first store port presents a request.
  val enbufferSelReg = RegInit(false.B)
  when(io.in(0).valid) {
    enbufferSelReg := ~enbufferSelReg
  }

  val firstInsertIdx = Mux(enbufferSelReg, evenInsertIdx, oddInsertIdx)
  val secondInsertIdx = Mux(sameTag,
    firstInsertIdx,
    Mux(~enbufferSelReg, evenInsertIdx, oddInsertIdx)
  )
  // No new stores are accepted while the sbuffer-only drain is in progress.
  val firstCanInsert = sbuffer_state =/= x_drain_sbuffer && Mux(enbufferSelReg, evenCanInsert, oddCanInsert)
  val secondCanInsert = sbuffer_state =/= x_drain_sbuffer && Mux(sameTag,
    firstCanInsert,
    Mux(~enbufferSelReg, evenCanInsert, oddCanInsert)
  )
  // Drain requests raised by the forward path (vtag/ptag match disagreement) and by the
  // merge path (vtag differs from the entry's vtag); delayed by one and two cycles.
  val forward_need_uarch_drain = WireInit(false.B)
  val merge_need_uarch_drain = WireInit(false.B)
  val do_uarch_drain = RegNext(forward_need_uarch_drain) || RegNext(RegNext(merge_need_uarch_drain))
  XSPerfAccumulate("do_uarch_drain", do_uarch_drain)

  // Readiness depends only on free entries, not on whether the request could merge.
  // The second port is additionally blocked when both requests target the same word
  // and whenever the first port is not ready.
  io.in(0).ready := firstCanInsert
  io.in(1).ready := secondCanInsert && !sameWord && io.in(0).ready

  /**
    * Allocate entry `insertIdx` for `req`: mark it valid, reset its coherence counter,
    * record both tags and set the byte mask bits written by this request.
    * When `flushMask` is set the whole mask of the entry is cleared first.
    */
  def wordReqToBufLine(req: DCacheWordReq, reqptag: UInt, reqvtag: UInt, insertIdx: UInt, wordOffset: UInt, flushMask: Bool): Unit = {
    stateVec(insertIdx).state_valid := true.B
    cohCount(insertIdx) := 0.U
    // missqReplayCount(insertIdx) := 0.U
    ptag(insertIdx) := reqptag
    vtag(insertIdx) := reqvtag // update vtag iff a new sbuffer line is allocated
    when(flushMask){
      for(j <- 0 until CacheLineWords){
        for(i <- 0 until DataBytes){
          mask(insertIdx)(j)(i) := false.B
        }
      }
    }
    // Placed after the clear above so that these bits win (last connect).
    for(i <- 0 until DataBytes){
      when(req.mask(i)){
        mask(insertIdx)(wordOffset)(i) := true.B
//        data(insertIdx)(wordOffset)(i) := req.data(i*8+7, i*8)
      }
    }
  }

  /**
    * Merge `req` into existing entry `mergeIdx`: reset its coherence counter and set the
    * byte mask bits written by this request. If the request's vtag differs from the
    * entry's vtag, a drain is requested.
    */
  def mergeWordReq(req: DCacheWordReq, reqptag: UInt, reqvtag: UInt, mergeIdx: UInt, wordOffset: UInt): Unit = {
    cohCount(mergeIdx) := 0.U
    // missqReplayCount(mergeIdx) := 0.U
    for(i <- 0 until DataBytes){
      when(req.mask(i)){
        mask(mergeIdx)(wordOffset)(i) := true.B
//        data(mergeIdx)(wordOffset)(i) := req.data(i*8+7, i*8)
      }
    }
    // check if vtag is the same, if not, trigger sbuffer flush
    when(reqvtag =/= vtag(mergeIdx)) {
      XSDebug("reqvtag =/= sbufvtag req(vtag %x ptag %x) sbuffer(vtag %x ptag %x)\n",
        reqvtag << OffsetWidth,
        reqptag << OffsetWidth,
        vtag(mergeIdx) << OffsetWidth,
        ptag(mergeIdx) << OffsetWidth
      )
      merge_need_uarch_drain := true.B
    }
  }

  for(((in, wordOffset), i) <- io.in.zip(Seq(firstWord, secondWord)).zipWithIndex){
    writeReq(i).valid := in.fire()
    writeReq(i).bits.wordOffset := wordOffset
    writeReq(i).bits.mask := in.bits.mask
    writeReq(i).bits.data := in.bits.data
    writeReq(i).bits.wline := in.bits.wline
    val insertIdx = if(i == 0) firstInsertIdx else secondInsertIdx
    // The second request must not clear the mask when it shares the entry being
    // allocated by the first request in the same cycle.
    val flushMask = if(i == 0) true.B else !sameTag
    // The replacement policy is updated one cycle after the store is accepted.
    accessIdx(i).valid := RegNext(in.fire())
    accessIdx(i).bits := RegNext(Mux(canMerge(i), mergeIdx(i), insertIdx))
    when(in.fire()){
      when(canMerge(i)){
        writeReq(i).bits.idx := mergeIdx(i)
        mergeWordReq(in.bits, inptags(i), invtags(i), mergeIdx(i), wordOffset)
        XSDebug(p"merge req $i to line [${mergeIdx(i)}]\n")
      }.otherwise({
        writeReq(i).bits.idx := insertIdx
        wordReqToBufLine(in.bits, inptags(i), invtags(i), insertIdx, wordOffset, flushMask)
        XSDebug(p"insert req $i to line[$insertIdx]\n")
      })
    }
  }


  for(i <- 0 until StoreBufferSize){
    XSDebug(stateVec(i).isValid(),
      p"[$i] timeout:${cohCount(i)(EvictCountBits-1)} state:${stateVec(i)}\n"
    )
  }

  for((req, i) <- io.in.zipWithIndex){
    XSDebug(req.fire(),
      p"accept req [$i]: " +
        p"addr:${Hexadecimal(req.bits.addr)} " +
        p"mask:${Binary(req.bits.mask)} " +
        p"data:${Hexadecimal(req.bits.data)}\n"
    )
    XSDebug(req.valid && !req.ready,
      p"req [$i] blocked by sbuffer\n"
    )
  }

  // ---------------------- Send Dcache Req ---------------------

  val sbuffer_empty = Cat(invalidMask).andR()
  // "sq_empty" here means that no store port is presenting a request this cycle.
  val sq_empty = !Cat(io.in.map(_.valid)).orR()
  val empty = sbuffer_empty && sq_empty
  val threshold = RegNext(io.csrCtrl.sbuffer_threshold +& 1.U)
  val validCount = PopCount(activeMask)
  // Evict when the number of active entries reaches the programmable threshold, or when
  // exactly StoreBufferSize-1 entries are active.
  val do_eviction = RegNext(validCount >= threshold || validCount === (StoreBufferSize-1).U, init = false.B)
  require((StoreBufferThreshold + 1) <= StoreBufferSize)

  XSDebug(p"validCount[$validCount]\n")

  io.flush.empty := RegNext(empty && io.sqempty)
  // lru.io.flush := sbuffer_state === x_drain_all && empty
  // State transitions; priority inside idle/replace is flush > uarch drain > eviction.
  switch(sbuffer_state){
    is(x_idle){
      when(io.flush.valid){
        sbuffer_state := x_drain_all
      }.elsewhen(do_uarch_drain){
        sbuffer_state := x_drain_sbuffer
      }.elsewhen(do_eviction){
        sbuffer_state := x_replace
      }
    }
    is(x_drain_all){
      when(empty){
        sbuffer_state := x_idle
      }
    }
    is(x_drain_sbuffer){
      when(sbuffer_empty){
        sbuffer_state := x_idle
      }
    }
    is(x_replace){
      when(io.flush.valid){
        sbuffer_state := x_drain_all
      }.elsewhen(do_uarch_drain){
        sbuffer_state := x_drain_sbuffer
      }.elsewhen(!do_eviction){
        sbuffer_state := x_idle
      }
    }
  }
  XSDebug(p"sbuffer state:${sbuffer_state} do eviction:${do_eviction} empty:${empty}\n")

  // True when no inflight entry has the same ptag as entry `idx`.
  def noSameBlockInflight(idx: UInt): Bool = {
    // stateVec(idx) itself must not be s_inflight
    !Cat(widthMap(i => inflightMask(i) && ptag(idx) === ptag(i))).orR()
  }

  val need_drain = needDrain(sbuffer_state)
  val need_replace = do_eviction || (sbuffer_state === x_replace)
  // Victim selection priority: replay timeout > drain > coherence timeout > PLRU victim.
  val evictionIdx = Mux(missqReplayHasTimeOut,
    missqReplayTimeOutIdxReg,
    Mux(need_drain,
      drainIdx,
      Mux(cohHasTimeOut, cohTimeOutIdx, replaceIdx)
    )
  )

  /*
      If there is a inflight dcache req which has same ptag with evictionIdx's ptag,
      current eviction should be blocked.
   */
  // A replay timeout resends an entry that is already inflight, so it bypasses the
  // active / same-block checks.
  val prepareValid = missqReplayHasTimeOut ||
    activeMask(evictionIdx) && (need_drain || cohHasTimeOut || need_replace) && noSameBlockInflight(evictionIdx)
  val prepareValidReg = RegInit(false.B)
  // when canSendDcacheReq, send dcache req stored in pipeline reg to dcache
  val canSendDcacheReq = io.dcache.req.ready || !prepareValidReg
  // when willSendDcacheReq, read dcache req data and store them in a pipeline reg
  willSendDcacheReq := prepareValid && canSendDcacheReq
  when(io.dcache.req.fire()){
    prepareValidReg := false.B
  }
  // Placed after the clear above: when the stage can advance, the new request wins.
  when(canSendDcacheReq){
    prepareValidReg := prepareValid
  }
  when(willSendDcacheReq){
    stateVec(evictionIdx).state_inflight := true.B
    stateVec(evictionIdx).w_timeout := false.B
    // stateVec(evictionIdx).s_pipe_req := true.B
    XSDebug(p"$evictionIdx will be sent to Dcache\n")
  }
  XSDebug(p"need drain:$need_drain cohHasTimeOut: $cohHasTimeOut need replace:$need_replace\n")
  XSDebug(p"drainIdx:$drainIdx tIdx:$cohTimeOutIdx replIdx:$replaceIdx " +
    p"blocked:${!noSameBlockInflight(evictionIdx)} v:${activeMask(evictionIdx)}\n")
  XSDebug(p"prepareValid:$prepareValid evictIdx:$evictionIdx dcache ready:${io.dcache.req.ready}\n")
  // Note: if other dcache req in the same block are inflight,
  // the lru update may not accurate
  // Touch the PLRU victim when it points at an invalid entry, or when it is the entry
  // actually chosen for replacement this cycle.
  accessIdx(StorePipelineWidth).valid := invalidMask(replaceIdx) || (
    need_replace && !need_drain && !cohHasTimeOut && !missqReplayHasTimeOut && canSendDcacheReq && activeMask(replaceIdx))
  accessIdx(StorePipelineWidth).bits := replaceIdx
  // Pipeline registers holding the request presented to the dcache.
  val evictionIdxReg = RegEnable(evictionIdx, enable = willSendDcacheReq)
  val evictionPTag = RegEnable(ptag(evictionIdx), enable = willSendDcacheReq)
  val evictionVTag = RegEnable(vtag(evictionIdx), enable = willSendDcacheReq)

  io.dcache.req.valid := prepareValidReg
  io.dcache.req.bits := DontCare
  io.dcache.req.bits.cmd    := MemoryOpConstants.M_XWR
  io.dcache.req.bits.addr   := getAddr(evictionPTag)
  io.dcache.req.bits.vaddr   := getAddr(evictionVTag)
  // Data and mask are read from the entry in the cycle the request is presented.
  io.dcache.req.bits.data  := data(evictionIdxReg).asUInt
  io.dcache.req.bits.mask  := mask(evictionIdxReg).asUInt
  io.dcache.req.bits.id := evictionIdxReg

  when (io.dcache.req.fire()) {
    assert(!(io.dcache.req.bits.vaddr === 0.U))
    assert(!(io.dcache.req.bits.addr === 0.U))
  }

  XSDebug(io.dcache.req.fire(),
    p"send buf [$evictionIdxReg] to Dcache, req fire\n"
  )

  // TODO: for timing reasons, dcache store pipe resp may need to be delayed
  // update sbuffer status according to dcache resp source

  // hit resp
  // A hit response frees the entry named by the response id.
  io.dcache.hit_resps.foreach(resp => {
    val dcache_resp_id = resp.bits.id
    when (resp.fire()) {
      stateVec(dcache_resp_id).state_inflight := false.B
      stateVec(dcache_resp_id).state_valid := false.B
      assert(!resp.bits.replay)
      assert(!resp.bits.miss) // not need to resp if miss, to be opted
      assert(stateVec(dcache_resp_id).state_inflight === true.B)
    }
  })

  // replay resp
  // A replay response keeps the entry inflight and starts its resend delay counter.
  val replay_resp_id = io.dcache.replay_resp.bits.id
  when (io.dcache.replay_resp.fire()) {
    missqReplayCount(replay_resp_id) := 0.U
    stateVec(replay_resp_id).w_timeout := true.B
    // waiting for timeout
    assert(io.dcache.replay_resp.bits.replay)
    assert(stateVec(replay_resp_id).state_inflight === true.B)
  }

  // TODO: reuse cohCount
  // Both counters saturate once their MSB is set.
  (0 until StoreBufferSize).foreach(i => {
    when(stateVec(i).w_timeout && stateVec(i).state_inflight && !missqReplayCount(i)(MissqReplayCountBits-1)) {
      missqReplayCount(i) := missqReplayCount(i) + 1.U
    }
    when(activeMask(i) && !cohTimeOutMask(i)){
      cohCount(i) := cohCount(i)+1.U
    }
  })

  if (env.EnableDifftest) {
    // hit resp
    // One difftest probe per hit-response source, reporting the line being retired.
    io.dcache.hit_resps.zipWithIndex.foreach{case (resp, index) => {
      val difftest = Module(new DifftestSbufferEvent)
      val dcache_resp_id = resp.bits.id
      difftest.io.clock := clock
      difftest.io.coreid := io.hartId
      difftest.io.index := index.U
      difftest.io.sbufferResp := RegNext(resp.fire())
      difftest.io.sbufferAddr := RegNext(getAddr(ptag(dcache_resp_id)))
      difftest.io.sbufferData := RegNext(data(dcache_resp_id).asTypeOf(Vec(CacheLineBytes, UInt(8.W))))
      difftest.io.sbufferMask := RegNext(mask(dcache_resp_id).asUInt)
    }}
  }

  // ---------------------- Load Data Forward ---------------------
  val mismatch = Wire(Vec(LoadPipelineWidth, Bool()))
  // Reduced over all load pipelines instead of naming pipes 0 and 1 explicitly.
  XSPerfAccumulate("vaddr_match_failed", mismatch.asUInt.orR)
  for ((forward, i) <- io.forward.zipWithIndex) {
    val vtag_matches = VecInit(widthMap(w => vtag(w) === getVTag(forward.vaddr)))
    val ptag_matches = VecInit(widthMap(w => ptag(w) === getPTag(forward.paddr)))
    // Entry selection uses the virtual tag; the physical tag is only used for the check below.
    val tag_matches = vtag_matches
    // One cycle after the query: some valid entry matched by vtag but not by ptag, or vice versa.
    val tag_mismatch = RegNext(forward.valid) && VecInit(widthMap(w =>
      RegNext(vtag_matches(w)) =/= RegNext(ptag_matches(w)) && RegNext((activeMask(w) || inflightMask(w)))
    )).asUInt.orR
    mismatch(i) := tag_mismatch
    when (tag_mismatch) {
      XSDebug("forward tag mismatch: pmatch %x vmatch %x vaddr %x paddr %x\n",
        RegNext(ptag_matches.asUInt),
        RegNext(vtag_matches.asUInt),
        RegNext(forward.vaddr),
        RegNext(forward.paddr)
      )
      forward_need_uarch_drain := true.B
    }
    val valid_tag_matches = widthMap(w => tag_matches(w) && activeMask(w))
    val inflight_tag_matches = widthMap(w => tag_matches(w) && inflightMask(w))
    val line_offset_mask = UIntToOH(getWordOffset(forward.paddr))

    // Match results are registered; data and mask are selected in the following cycle.
    val valid_tag_match_reg = valid_tag_matches.map(RegNext(_))
    val inflight_tag_match_reg = inflight_tag_matches.map(RegNext(_))
    val line_offset_reg = RegNext(line_offset_mask)
    // The addressed word of every entry is captured when the query is valid.
    val forward_data_candidate_reg = RegEnable(
      VecInit(data.map(entry => entry(getWordOffset(forward.paddr)))),
      forward.valid
    )

    val selectedValidMask = Mux1H(line_offset_reg, Mux1H(valid_tag_match_reg, mask).asTypeOf(Vec(CacheLineWords, Vec(DataBytes, Bool()))))
    val selectedValidData = Mux1H(valid_tag_match_reg, forward_data_candidate_reg)
    selectedValidData.suggestName("selectedValidData_"+i)

    val selectedInflightMask = Mux1H(line_offset_reg, Mux1H(inflight_tag_match_reg, mask).asTypeOf(Vec(CacheLineWords, Vec(DataBytes, Bool()))))
    val selectedInflightData = Mux1H(inflight_tag_match_reg, forward_data_candidate_reg)
    selectedInflightData.suggestName("selectedInflightData_"+i)

    // Same-cycle (unregistered) versions of the masks, used for forwardMaskFast.
    val selectedInflightMaskFast = Mux1H(line_offset_mask, Mux1H(inflight_tag_matches, mask).asTypeOf(Vec(CacheLineWords, Vec(DataBytes, Bool()))))
    val selectedValidMaskFast = Mux1H(line_offset_mask, Mux1H(valid_tag_matches, mask).asTypeOf(Vec(CacheLineWords, Vec(DataBytes, Bool()))))

    forward.dataInvalid := false.B // data in store line merge buffer is always ready
    forward.matchInvalid := tag_mismatch // paddr / vaddr cam result does not match
    for (j <- 0 until DataBytes) {
      forward.forwardMask(j) := false.B
      forward.forwardData(j) := DontCare

      // valid entries have higher priority than inflight entries
      // (the valid-entry assignment comes last, so it overrides the inflight one)
      when(selectedInflightMask(j)) {
        forward.forwardMask(j) := true.B
        forward.forwardData(j) := selectedInflightData(j)
      }
      when(selectedValidMask(j)) {
        forward.forwardMask(j) := true.B
        forward.forwardData(j) := selectedValidData(j)
      }

      forward.forwardMaskFast(j) := selectedInflightMaskFast(j) || selectedValidMaskFast(j)
    }
  }

  for (i <- 0 until StoreBufferSize) {
    XSDebug("sbf entry " + i + " : ptag %x vtag %x valid %x active %x inflight %x w_timeout %x\n",
      ptag(i) << OffsetWidth,
      vtag(i) << OffsetWidth,
      stateVec(i).isValid(),
      activeMask(i),
      inflightMask(i),
      stateVec(i).w_timeout
    )
  }

  // ---------------------- Performance counters ---------------------
  val perf_valid_entry_count = PopCount(VecInit(stateVec.map(s => !s.isInvalid())).asUInt)
  XSPerfHistogram("util", perf_valid_entry_count, true.B, 0, StoreBufferSize, 1)
  XSPerfAccumulate("sbuffer_req_valid", PopCount(VecInit(io.in.map(_.valid)).asUInt))
  XSPerfAccumulate("sbuffer_req_fire", PopCount(VecInit(io.in.map(_.fire())).asUInt))
  XSPerfAccumulate("sbuffer_merge", PopCount(VecInit(io.in.zipWithIndex.map({case (in, i) => in.fire() && canMerge(i)})).asUInt))
  XSPerfAccumulate("sbuffer_newline", PopCount(VecInit(io.in.zipWithIndex.map({case (in, i) => in.fire() && !canMerge(i)})).asUInt))
  XSPerfAccumulate("dcache_req_valid", io.dcache.req.valid)
  XSPerfAccumulate("dcache_req_fire", io.dcache.req.fire())
  XSPerfAccumulate("sbuffer_idle", sbuffer_state === x_idle)
  XSPerfAccumulate("sbuffer_flush", sbuffer_state === x_drain_sbuffer)
  XSPerfAccumulate("sbuffer_replace", sbuffer_state === x_replace)
  XSPerfAccumulate("evenCanInsert", evenCanInsert)
  XSPerfAccumulate("oddCanInsert", oddCanInsert)
  XSPerfAccumulate("mainpipe_resp_valid", io.dcache.main_pipe_hit_resp.fire())
  XSPerfAccumulate("refill_resp_valid", io.dcache.refill_hit_resp.fire())
  XSPerfAccumulate("replay_resp_valid", io.dcache.replay_resp.fire())
  XSPerfAccumulate("coh_timeout", cohHasTimeOut)

  // val (store_latency_sample, store_latency) = TransactionLatencyCounter(io.lsu.req.fire(), io.lsu.resp.fire())
  // XSPerfHistogram("store_latency", store_latency, store_latency_sample, 0, 100, 10)
  // XSPerfAccumulate("store_req", io.lsu.req.fire())

  val perfinfo = IO(new Bundle(){
    val perfEvents = Output(new PerfEventsBundle(10))
  })
  // NOTE(review): this list has 17 entries, and the wiring loop below zips it with
  // perfinfo.perfEvents.perf_events. If PerfEventsBundle(10) exposes 10 counters, only the
  // first 10 entries are connected -- confirm against PerfEventsBundle.
  // NOTE(review): the occupancy buckets use `<` then `>` around StoreBufferSize/4, so a
  // count exactly equal to StoreBufferSize/4 falls in no bucket.
  val perfEvents = Seq(
    ("sbuffer_req_valid ", PopCount(VecInit(io.in.map(_.valid)).asUInt)                                                                ),
    ("sbuffer_req_fire  ", PopCount(VecInit(io.in.map(_.fire())).asUInt)                                                               ),
    ("sbuffer_merge     ", PopCount(VecInit(io.in.zipWithIndex.map({case (in, i) => in.fire() && canMerge(i)})).asUInt)                ),
    ("sbuffer_newline   ", PopCount(VecInit(io.in.zipWithIndex.map({case (in, i) => in.fire() && !canMerge(i)})).asUInt)               ),
    ("dcache_req_valid  ", io.dcache.req.valid                                                                                         ),
    ("dcache_req_fire   ", io.dcache.req.fire()                                                                                        ),
    ("sbuffer_idle      ", sbuffer_state === x_idle                                                                                    ),
    ("sbuffer_flush     ", sbuffer_state === x_drain_sbuffer                                                                           ),
    ("sbuffer_replace   ", sbuffer_state === x_replace                                                                                 ),
    ("mpipe_resp_valid  ", io.dcache.main_pipe_hit_resp.fire()                                                                         ),
    ("refill_resp_valid ", io.dcache.refill_hit_resp.fire()                                                                            ),
    ("replay_resp_valid ", io.dcache.replay_resp.fire()                                                                                ),
    ("coh_timeout       ", cohHasTimeOut                                                                                               ),
    ("sbuffer_1/4_valid ", (perf_valid_entry_count < (StoreBufferSize.U/4.U))                                                          ),
    ("sbuffer_2/4_valid ", (perf_valid_entry_count > (StoreBufferSize.U/4.U)) & (perf_valid_entry_count <= (StoreBufferSize.U/2.U))    ),
    ("sbuffer_3/4_valid ", (perf_valid_entry_count > (StoreBufferSize.U/2.U)) & (perf_valid_entry_count <= (StoreBufferSize.U*3.U/4.U))),
    ("sbuffer_full_valid", (perf_valid_entry_count > (StoreBufferSize.U*3.U/4.U)))
  )

  for (((perf_out,(perf_name,perf)),i) <- perfinfo.perfEvents.perf_events.zip(perfEvents).zipWithIndex) {
    perf_out.incr_step := RegNext(perf)
  }
}
610