xref: /XiangShan/src/main/scala/xiangshan/backend/fu/wrapper/VFALU.scala (revision 060feae31eb5505cbd1e3372a6065e504247b56d)
package xiangshan.backend.fu.wrapper

import org.chipsalliance.cde.config.Parameters
import chisel3._
import chisel3.util._
import utils.XSError
import xiangshan.backend.fu.FuConfig
import xiangshan.backend.fu.vector.Bundles.{VLmul, VSew, ma}
import xiangshan.backend.fu.vector.utils.VecDataSplitModule
import xiangshan.backend.fu.vector.{Mgu, Mgtu, VecInfo, VecPipedFuncUnit}
import xiangshan.ExceptionNO
import yunsuan.{VfaluType, VfpuType}
import yunsuan.vector.VectorFloatAdder

class VFAlu(cfg: FuConfig)(implicit p: Parameters) extends VecPipedFuncUnit(cfg) {
  XSError(io.in.valid && io.in.bits.ctrl.fuOpType === VfpuType.dummy, "Vfalu OpType not supported")

  // params alias
  private val dataWidth = cfg.dataBits
  private val dataWidthOfDataModule = 64
  private val numVecModule = dataWidth / dataWidthOfDataModule
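  // e.g., with a 128-bit datapath (dataWidth = 128) this yields two 64-bit
  // adder lanes; each VectorFloatAdder also handles the narrower SEWs internally.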

  // io alias
  private val opcode  = fuOpType(4,0)
  private val resWiden  = fuOpType(5)
  private val opbWiden  = fuOpType(6)

  // modules
  private val vfalus = Seq.fill(numVecModule)(Module(new VectorFloatAdder))
  private val vs2Split = Module(new VecDataSplitModule(dataWidth, dataWidthOfDataModule))
  private val vs1Split = Module(new VecDataSplitModule(dataWidth, dataWidthOfDataModule))
  private val oldVdSplit  = Module(new VecDataSplitModule(dataWidth, dataWidthOfDataModule))
  private val mgu = Module(new Mgu(dataWidth))
  private val mgtu = Module(new Mgtu(dataWidth))

  /**
    * Input connections of [[vs2Split]], [[vs1Split]] and [[oldVdSplit]]
    */
  vs2Split.io.inVecData := vs2
  vs1Split.io.inVecData := vs1
  oldVdSplit.io.inVecData := oldVd

  /**
    * Input connections of [[vfalus]]
    */
  // Vec(vs2(31,0), vs2(63,32), vs2(95,64), vs2(127,96)) ==>
  // Vec(
  //   Cat(vs2(95,64),  vs2(31,0)),
  //   Cat(vs2(127,96), vs2(63,32)),
  // )
  private val vs2GroupedVec: Vec[UInt] = VecInit(vs2Split.io.outVec32b.zipWithIndex.groupBy(_._2 % 2).map(x => x._1 -> x._2.map(_._1)).values.map(x => Cat(x.reverse)).toSeq)
  private val vs1GroupedVec: Vec[UInt] = VecInit(vs1Split.io.outVec32b.zipWithIndex.groupBy(_._2 % 2).map(x => x._1 -> x._2.map(_._1)).values.map(x => Cat(x.reverse)).toSeq)
  private val resultData = Wire(Vec(numVecModule, UInt(dataWidthOfDataModule.W)))
  private val fflagsData = Wire(Vec(numVecModule, UInt(20.W)))
  private val srcMaskRShiftForReduction = Wire(UInt((8 * numVecModule).W))
  // for reduction
  val isFirstGroupUop = vuopIdx === 0.U ||
    (vuopIdx === 1.U && (vlmul === VLmul.m4 || vlmul === VLmul.m8)) ||
    ((vuopIdx === 2.U || vuopIdx === 3.U) && vlmul === VLmul.m8)
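  // Inferred intent: the "first group" uops are those that still consume fresh
  // source elements rather than partial sums, i.e. uop 0 for LMUL <= 2,
  // uops 0-1 for LMUL = 4, and uops 0-3 for LMUL = 8.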
  val maskRshiftWidthForReduction = Wire(UInt(6.W))
  maskRshiftWidthForReduction := Mux(fuOpType === VfaluType.vfredosum || fuOpType === VfaluType.vfwredosum,
    vuopIdx,
    Mux1H(Seq(
      (vsew === VSew.e16) -> (vuopIdx(1, 0) << 4),
      (vsew === VSew.e32) -> (vuopIdx(1, 0) << 3),
      (vsew === VSew.e64) -> (vuopIdx(1, 0) << 2),
    ))
  )
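  // Ordered reductions (vfredosum/vfwredosum) retire one element per uop, so
  // the mask advances by vuopIdx. Unordered reductions consume two source
  // registers per first-group uop, hence shifts of 2 * (VLEN / SEW) per uop:
  // 16/8/4 for e16/e32/e64 assuming VLEN = 128 (inferred from the shifts above).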
  val vlMaskForReduction = (~(Fill(VLEN, 1.U) << vl)).asUInt
  srcMaskRShiftForReduction := ((srcMask & vlMaskForReduction) >> maskRshiftWidthForReduction)(8 * numVecModule - 1, 0)

  def genMaskForReduction(inmask: UInt, sew: UInt, i: Int): UInt = {
    val f64MaskNum = dataWidth / 64 * 2
    val f32MaskNum = dataWidth / 32 * 2
    val f16MaskNum = dataWidth / 16 * 2
    val f64Mask = inmask(f64MaskNum - 1, 0)
    val f32Mask = inmask(f32MaskNum - 1, 0)
    val f16Mask = inmask(f16MaskNum - 1, 0)
    // vs2 is reordered, so the mask uses the high bits
    val f64FirstFoldMaskUnorder = Mux1H(
      Seq(
        vecCtrl.fpu.isFoldTo1_2 -> Cat(0.U(3.W), f64Mask(0), 0.U(3.W), f64Mask(1)),
      )
    )
    val f64FirstFoldMaskOrder = Mux1H(
      Seq(
        vecCtrl.fpu.isFoldTo1_2 -> Cat(0.U(3.W), f64Mask(1), 0.U(3.W), f64Mask(0))
      )
    )
    val f32FirstFoldMaskUnorder = Mux1H(
      Seq(
        vecCtrl.fpu.isFoldTo1_2 -> Cat(0.U(2.W), f32Mask(1), f32Mask(0), 0.U(2.W), f32Mask(3), f32Mask(2)),
        vecCtrl.fpu.isFoldTo1_4 -> Cat(0.U(3.W), f32Mask(0), 0.U(3.W), f32Mask(1)),
      )
    )
    val f32FirstFoldMaskOrder = Mux1H(
      Seq(
        vecCtrl.fpu.isFoldTo1_2 -> Cat(0.U(2.W), f32Mask(3), f32Mask(2), 0.U(2.W), f32Mask(1), f32Mask(0)),
        vecCtrl.fpu.isFoldTo1_4 -> Cat(0.U(3.W), f32Mask(1), 0.U(3.W), f32Mask(0)),
      )
    )
    val f16FirstFoldMaskUnorder = Mux1H(
      Seq(
        vecCtrl.fpu.isFoldTo1_2 -> Cat(f16Mask(7,4), f16Mask(3,0)),
        vecCtrl.fpu.isFoldTo1_4 -> Cat(0.U(2.W), f16Mask(1), f16Mask(0), 0.U(2.W), f16Mask(3), f16Mask(2)),
        vecCtrl.fpu.isFoldTo1_8 -> Cat(0.U(3.W), f16Mask(0), 0.U(3.W), f16Mask(1)),
      )
    )
    val f16FirstFoldMaskOrder = Mux1H(
      Seq(
        vecCtrl.fpu.isFoldTo1_2 -> Cat(f16Mask(7,4), f16Mask(3,0)),
        vecCtrl.fpu.isFoldTo1_4 -> Cat(0.U(2.W), f16Mask(3), f16Mask(2), 0.U(2.W), f16Mask(1), f16Mask(0)),
        vecCtrl.fpu.isFoldTo1_8 -> Cat(0.U(3.W), f16Mask(1), 0.U(3.W), f16Mask(0)),
      )
    )
    val f64FoldMask = Mux1H(
      Seq(
        vecCtrl.fpu.isFoldTo1_2 -> "b00010001".U,
      )
    )
    val f32FoldMask = Mux1H(
      Seq(
        vecCtrl.fpu.isFoldTo1_2 -> "b00110011".U,
        vecCtrl.fpu.isFoldTo1_4 -> "b00010001".U,
      )
    )
    val f16FoldMask = Mux1H(
      Seq(
        vecCtrl.fpu.isFoldTo1_2 -> "b11111111".U,
        vecCtrl.fpu.isFoldTo1_4 -> "b00110011".U,
        vecCtrl.fpu.isFoldTo1_8 -> "b00010001".U,
      )
    )
    // low 4 bits for vs2 (fp_a), high 4 bits for vs1 (fp_b)
    val isFold = vecCtrl.fpu.isFoldTo1_2 || vecCtrl.fpu.isFoldTo1_4 || vecCtrl.fpu.isFoldTo1_8
    val f64FirstNotFoldMask = Cat(0.U(3.W), f64Mask(i + 2), 0.U(3.W), f64Mask(i))
    val f32FirstNotFoldMask = Cat(0.U(2.W), f32Mask(i * 2 + 5, i * 2 + 4), 0.U(2.W), Cat(f32Mask(i * 2 + 1, i * 2)))
    val f16FirstNotFoldMask = Cat(f16Mask(i * 4 + 11, i * 4 + 8), f16Mask(i * 4 + 3, i * 4))
    val f64MaskI = Mux(fuOpType === VfaluType.vfredosum || fuOpType === VfaluType.vfwredosum,
      Mux(isFold, f64FirstFoldMaskOrder, f64FirstNotFoldMask),
      Mux(isFirstGroupUop,
        Mux(isFold, f64FirstFoldMaskUnorder, f64FirstNotFoldMask),
        Mux(isFold, f64FoldMask, Fill(8, 1.U))))
    val f32MaskI = Mux(fuOpType === VfaluType.vfredosum || fuOpType === VfaluType.vfwredosum,
      Mux(isFold, f32FirstFoldMaskOrder, f32FirstNotFoldMask),
      Mux(isFirstGroupUop,
        Mux(isFold, f32FirstFoldMaskUnorder, f32FirstNotFoldMask),
        Mux(isFold, f32FoldMask, Fill(8, 1.U))))
    val f16MaskI = Mux(fuOpType === VfaluType.vfredosum || fuOpType === VfaluType.vfwredosum,
      Mux(isFold, f16FirstFoldMaskOrder, f16FirstNotFoldMask),
      Mux(isFirstGroupUop,
        Mux(isFold, f16FirstFoldMaskUnorder, f16FirstNotFoldMask),
        Mux(isFold, f16FoldMask, Fill(8, 1.U))))
    val outMask = Mux1H(
      Seq(
        (sew === 3.U) -> f64MaskI,
        (sew === 2.U) -> f32MaskI,
        (sew === 1.U) -> f16MaskI,
      )
    )
    Mux(fuOpType === VfaluType.vfredosum || fuOpType === VfaluType.vfwredosum, outMask(0), outMask)
  }
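  // Worked example (assuming VLEN = 128, no folding): for SEW = 32 and lane i,
  // f32FirstNotFoldMask packs vs1's mask bits (2i+5, 2i+4) into the high nibble
  // and vs2's bits (2i+1, 2i) into the low nibble, matching the "low 4 bits for
  // fp_a, high 4 bits for fp_b" convention noted above.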
  def genMaskForMerge(inmask: UInt, sew: UInt, i: Int): UInt = {
    val f64MaskNum = dataWidth / 64
    val f32MaskNum = dataWidth / 32
    val f16MaskNum = dataWidth / 16
    val f64Mask = inmask(f64MaskNum - 1, 0)
    val f32Mask = inmask(f32MaskNum - 1, 0)
    val f16Mask = inmask(f16MaskNum - 1, 0)
    val f64MaskI = Cat(0.U(3.W), f64Mask(i))
    val f32MaskI = Cat(0.U(2.W), f32Mask(2 * i + 1, 2 * i))
    val f16MaskI = f16Mask(4 * i + 3, 4 * i)
    val outMask = Mux1H(
      Seq(
        (sew === 3.U) -> f64MaskI,
        (sew === 2.U) -> f32MaskI,
        (sew === 1.U) -> f16MaskI,
      )
    )
    outMask
  }
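  // Usage sketch: each 64-bit lane i receives a 4-bit merge mask (one bit for
  // its e64 element, two bits for its e32 elements, or four bits for its e16
  // elements), zero-extended to 4 bits when fewer elements fit in the lane.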
  val isScalarMove = (fuOpType === VfaluType.vfmv_f_s) || (fuOpType === VfaluType.vfmv_s_f)
  val srcMaskRShift = Wire(UInt((4 * numVecModule).W))
  val maskRshiftWidth = Wire(UInt(6.W))
  maskRshiftWidth := Mux1H(
    Seq(
      (vsew === VSew.e16) -> (vuopIdx(2, 0) << 3),
      (vsew === VSew.e32) -> (vuopIdx(2, 0) << 2),
      (vsew === VSew.e64) -> (vuopIdx(2, 0) << 1),
    )
  )
  srcMaskRShift := (srcMask >> maskRshiftWidth)(4 * numVecModule - 1, 0)
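  // Per-uop mask window: every uop covers VLEN / SEW elements (8/4/2 for
  // e16/e32/e64 assuming VLEN = 128), which matches the shifts of
  // vuopIdx << 3/2/1 above.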
  val fp_aIsFpCanonicalNAN = Wire(Vec(numVecModule, Bool()))
  val fp_bIsFpCanonicalNAN = Wire(Vec(numVecModule, Bool()))
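  // RISC-V NaN boxing: a scalar float narrower than FLEN must be boxed with
  // all-ones upper bits, and an improperly boxed value is treated as the
  // canonical NaN. The checks below flag e32/e16 scalar operands (only for
  // isFpToVecInst) whose upper 32/48 bits are not all ones.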
  vfalus.zipWithIndex.foreach {
    case (mod, i) =>
      mod.io.fire             := io.in.valid
      mod.io.fp_a             := Mux(opbWiden, vs1Split.io.outVec64b(i), vs2Split.io.outVec64b(i))  // very dirty TODO
      mod.io.fp_b             := Mux(opbWiden, vs2Split.io.outVec64b(i), vs1Split.io.outVec64b(i))  // very dirty TODO
      mod.io.widen_a          := Mux(opbWiden, Cat(vs1Split.io.outVec32b(i+numVecModule), vs1Split.io.outVec32b(i)), Cat(vs2Split.io.outVec32b(i+numVecModule), vs2Split.io.outVec32b(i)))
      mod.io.widen_b          := Mux(opbWiden, Cat(vs2Split.io.outVec32b(i+numVecModule), vs2Split.io.outVec32b(i)), Cat(vs1Split.io.outVec32b(i+numVecModule), vs1Split.io.outVec32b(i)))
      mod.io.frs1             := 0.U     // the vf form has already been converted to vv upstream
      mod.io.is_frs1          := false.B // the vf form has already been converted to vv upstream
      mod.io.mask             := Mux(isScalarMove, !vuopIdx.orR, genMaskForMerge(inmask = srcMaskRShift, sew = vsew, i = i))
      mod.io.maskForReduction := genMaskForReduction(inmask = srcMaskRShiftForReduction, sew = vsew, i = i)
      mod.io.uop_idx          := Mux(fuOpType === VfaluType.vfwredosum, 0.U, vuopIdx(0))
      mod.io.is_vec           := true.B // Todo
      mod.io.round_mode       := frm
      mod.io.fp_format        := Mux(resWiden, vsew + 1.U, vsew)
      mod.io.opb_widening     := opbWiden || (fuOpType === VfaluType.vfwredosum)
      mod.io.res_widening     := resWiden
      mod.io.op_code          := opcode
      resultData(i)           := mod.io.fp_result
      fflagsData(i)           := mod.io.fflags
      fp_aIsFpCanonicalNAN(i) := vecCtrl.fpu.isFpToVecInst & (
          ((vsew === VSew.e32) & (!vs2Split.io.outVec64b(i).head(32).andR)) |
          ((vsew === VSew.e16) & (!vs2Split.io.outVec64b(i).head(48).andR))
        )
      fp_bIsFpCanonicalNAN(i) := vecCtrl.fpu.isFpToVecInst & (
          ((vsew === VSew.e32) & (!vs1Split.io.outVec64b(i).head(32).andR)) |
          ((vsew === VSew.e16) & (!vs1Split.io.outVec64b(i).head(48).andR))
        )
      mod.io.fp_aIsFpCanonicalNAN := fp_aIsFpCanonicalNAN(i)
      mod.io.fp_bIsFpCanonicalNAN := fp_bIsFpCanonicalNAN(i)
  }
  val resultDataUInt = resultData.asUInt
  val cmpResultWidth = dataWidth / 16
  val cmpResult = Wire(Vec(cmpResultWidth, Bool()))
  for (i <- 0 until cmpResultWidth) {
    if (i == 0) {
      cmpResult(i) := resultDataUInt(0)
    }
    else if (i < dataWidth / 64) {
      cmpResult(i) := Mux1H(
        Seq(
          (outVecCtrl.vsew === 1.U) -> resultDataUInt(i * 16),
          (outVecCtrl.vsew === 2.U) -> resultDataUInt(i * 32),
          (outVecCtrl.vsew === 3.U) -> resultDataUInt(i * 64)
        )
      )
    }
    else if (i < dataWidth / 32) {
      cmpResult(i) := Mux1H(
        Seq(
          (outVecCtrl.vsew === 1.U) -> resultDataUInt(i * 16),
          (outVecCtrl.vsew === 2.U) -> resultDataUInt(i * 32),
          (outVecCtrl.vsew === 3.U) -> false.B
        )
      )
    }
    else if (i < dataWidth / 16) {
      cmpResult(i) := Mux(outVecCtrl.vsew === 1.U, resultDataUInt(i * 16), false.B)
    }
  }
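  // Layout note: compare results land one bit per 16-bit result slot, so only
  // the first dataWidth/SEW entries are meaningful for a given SEW; the
  // remaining entries are forced to false by the branches above.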

  val outEew = Mux(RegEnable(resWiden, io.in.fire), outVecCtrl.vsew + 1.U, outVecCtrl.vsew)
  val outVuopidx = outVecCtrl.vuopIdx(2, 0)
  val vlMax = ((VLEN / 8).U >> outEew).asUInt
  val lmulAbs = Mux(outVecCtrl.vlmul(2), (~outVecCtrl.vlmul(1, 0)).asUInt + 1.U, outVecCtrl.vlmul(1, 0))
  // vfmv_f_s needs vl=1, the last uop of a reduction needs vl=1, all other uops need vl=vlmax
  val numOfUopVFRED = {
    // addTime includes the final add with frs1
    val addTime = MuxLookup(outVecCtrl.vlmul, 1.U(4.W), Array(
      VLmul.m2 -> 2.U,
      VLmul.m4 -> 4.U,
      VLmul.m8 -> 8.U,
    ))
    val foldLastVlmul = MuxLookup(outVecCtrl.vsew, "b000".U, Array(
      VSew.e16 -> VLmul.mf8,
      VSew.e32 -> VLmul.mf4,
      VSew.e64 -> VLmul.mf2,
    ))
    // lmul < 1:  foldTime = vlmul - foldLastVlmul
    // lmul >= 1: foldTime = 0.U  - foldLastVlmul
    val foldTime = Mux(outVecCtrl.vlmul(2), outVecCtrl.vlmul, 0.U) - foldLastVlmul
    addTime + foldTime
  }
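  // Inferred formula: the uop count is one pairwise-add uop per source register
  // (addTime, which also covers the final add with frs1) plus the fold uops
  // needed to shrink one register down to a single element; foldTime is the
  // vlmul distance to the single-element LMUL of this SEW, using wrap-around
  // 3-bit arithmetic.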
  val reductionVl = Mux((outVecCtrl.vuopIdx === numOfUopVFRED - 1.U) || (outCtrl.fuOpType === VfaluType.vfredosum || outCtrl.fuOpType === VfaluType.vfwredosum), 1.U, vlMax)
  val outIsReduction = outCtrl.fuOpType === VfaluType.vfredusum ||
    outCtrl.fuOpType === VfaluType.vfredmax ||
    outCtrl.fuOpType === VfaluType.vfredmin ||
    outCtrl.fuOpType === VfaluType.vfredosum ||
    outCtrl.fuOpType === VfaluType.vfwredosum
  val outVlFix = Mux(
    outVecCtrl.fpu.isFpToVecInst || (outCtrl.fuOpType === VfaluType.vfmv_f_s),
    1.U,
    Mux(
      outCtrl.fuOpType === VfaluType.vfmv_s_f,
      outVl.orR,
      Mux(outIsReduction, reductionVl, outVl)
    )
  )
  val vlMaxAllUop = Wire(outVl.cloneType)
  vlMaxAllUop := Mux(outVecCtrl.vlmul(2), vlMax >> lmulAbs, vlMax << lmulAbs).asUInt
  val vlMaxThisUop = Mux(outVecCtrl.vlmul(2), vlMax >> lmulAbs, vlMax).asUInt
  val vlSetThisUop = Mux(outVlFix > outVuopidx * vlMaxThisUop, outVlFix - outVuopidx * vlMaxThisUop, 0.U)
  val vlThisUop = Wire(UInt(3.W))
  vlThisUop := Mux(vlSetThisUop < vlMaxThisUop, vlSetThisUop, vlMaxThisUop)
  val vlMaskRShift = Wire(UInt((4 * numVecModule).W))
  vlMaskRShift := Fill(4 * numVecModule, 1.U(1.W)) >> ((4 * numVecModule).U - vlThisUop)
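  // Thermometer mask: Fill(n, 1) >> (n - vlThisUop) leaves exactly vlThisUop
  // ones in the low-order bits, one enable bit per element of this uop.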

  private val needNoMask = outCtrl.fuOpType === VfaluType.vfmerge ||
    outCtrl.fuOpType === VfaluType.vfmv_s_f ||
    outIsReduction ||
    outVecCtrl.fpu.isFpToVecInst
  val maskToMgu = Mux(needNoMask, allMaskTrue, outSrcMask)
  val allFFlagsEn = Wire(Vec(4 * numVecModule, Bool()))
  val outSrcMaskRShift = Wire(UInt((4 * numVecModule).W))
  outSrcMaskRShift := (maskToMgu >> (outVecCtrl.vuopIdx(2, 0) * vlMax))(4 * numVecModule - 1, 0)
  val f16FFlagsEn = outSrcMaskRShift
  val f32FFlagsEn = Wire(Vec(numVecModule, UInt(4.W)))
  for (i <- 0 until numVecModule) {
    f32FFlagsEn(i) := Cat(Fill(2, 0.U), outSrcMaskRShift(2 * i + 1, 2 * i))
  }
  val f64FFlagsEn = Wire(Vec(numVecModule, UInt(4.W)))
  for (i <- 0 until numVecModule) {
    f64FFlagsEn(i) := Cat(Fill(3, 0.U), outSrcMaskRShift(i))
  }
  val fflagsEn = Mux1H(
    Seq(
      (outEew === 1.U) -> f16FFlagsEn.asUInt,
      (outEew === 2.U) -> f32FFlagsEn.asUInt,
      (outEew === 3.U) -> f64FFlagsEn.asUInt
    )
  )
  allFFlagsEn := Mux(outIsReduction, Fill(4 * numVecModule, 1.U), (fflagsEn & vlMaskRShift)).asTypeOf(allFFlagsEn)

  val allFFlags = fflagsData.asTypeOf(Vec(4 * numVecModule, UInt(5.W)))
  val outFFlags = allFFlagsEn.zip(allFFlags).map {
    case (en, fflags) => Mux(en, fflags, 0.U(5.W))
  }.reduce(_ | _)
  io.out.bits.res.fflags.get := outFFlags

  val cmpResultOldVd = Wire(UInt(cmpResultWidth.W))
  val cmpResultOldVdRshiftWidth = Wire(UInt(6.W))
  cmpResultOldVdRshiftWidth := Mux1H(
    Seq(
      (outVecCtrl.vsew === VSew.e16) -> (outVecCtrl.vuopIdx(2, 0) << 3),
      (outVecCtrl.vsew === VSew.e32) -> (outVecCtrl.vuopIdx(2, 0) << 2),
      (outVecCtrl.vsew === VSew.e64) -> (outVecCtrl.vuopIdx(2, 0) << 1),
    )
  )
  cmpResultOldVd := (outOldVd >> cmpResultOldVdRshiftWidth)(4 * numVecModule - 1, 0)
  val cmpResultForMgu = Wire(Vec(cmpResultWidth, Bool()))
  private val maxVdIdx = 8
  private val elementsInOneUop = Mux1H(
    Seq(
      (outEew === 1.U) -> cmpResultWidth.U(4.W),
      (outEew === 2.U) -> (cmpResultWidth / 2).U(4.W),
      (outEew === 3.U) -> (cmpResultWidth / 4).U(4.W),
    )
  )
  private val vdIdx = outVecCtrl.vuopIdx(2, 0)
  private val elementsComputed = Mux1H(Seq.tabulate(maxVdIdx)(i => (vdIdx === i.U) -> (elementsInOneUop * i.U)))
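  // elementsComputed equals vdIdx * elementsInOneUop, expanded as a one-hot
  // Mux1H over the (at most 8) vd indices so that only constant multiples are
  // generated instead of a general multiplier.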
  for (i <- 0 until cmpResultWidth) {
    val cmpResultWithVmask = Mux(outSrcMaskRShift(i), cmpResult(i), Mux(outVecCtrl.vma, true.B, cmpResultOldVd(i)))
    cmpResultForMgu(i) := Mux(elementsComputed +& i.U >= outVl, true.B, cmpResultWithVmask)
  }
  val outIsFold = outVecCtrl.fpu.isFoldTo1_2 || outVecCtrl.fpu.isFoldTo1_4 || outVecCtrl.fpu.isFoldTo1_8
  val outOldVdForREDO = Mux1H(Seq(
    (outVecCtrl.vsew === VSew.e16) -> (outOldVd >> 16),
    (outVecCtrl.vsew === VSew.e32) -> (outOldVd >> 32),
    (outVecCtrl.vsew === VSew.e64) -> (outOldVd >> 64),
  ))
  val outOldVdForWREDO = Mux(
    !outIsFold,
    Mux(outVecCtrl.vsew === VSew.e16, Cat(outOldVd(VLEN-1-16, 16), 0.U(32.W)), Cat(outOldVd(VLEN-1-32, 32), 0.U(64.W))),
    Mux(outVecCtrl.vsew === VSew.e16,
      // vuopIdx % 8 == 1
      Mux(outVecCtrl.vuopIdx(2, 0) === 1.U, outOldVd, outOldVd >> 16),
      // vuopIdx % 4 == 1
      Mux(outVecCtrl.vuopIdx(1, 0) === 1.U, outOldVd, outOldVd >> 32)
    ),
  )
  val outOldVdForRED = Mux(outCtrl.fuOpType === VfaluType.vfredosum, outOldVdForREDO, outOldVdForWREDO)
  val numOfUopVFREDOSUM = {
    val uvlMax = MuxLookup(outVecCtrl.vsew, 0.U, Array(
      VSew.e16 -> 8.U,
      VSew.e32 -> 4.U,
      VSew.e64 -> 2.U,
    ))
    val vlMax = Mux(outVecCtrl.vlmul(2), uvlMax >> (-outVecCtrl.vlmul)(1, 0), uvlMax << outVecCtrl.vlmul(1, 0)).asUInt
    vlMax
  }
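  // For an ordered reduction one element is consumed per uop, so the uop count
  // equals the element count: VLEN / SEW per register (8/4/2 for e16/e32/e64
  // assuming VLEN = 128), scaled up or down by LMUL.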
  val isOutOldVdForREDO = (outCtrl.fuOpType === VfaluType.vfredosum && outIsFold) || outCtrl.fuOpType === VfaluType.vfwredosum
  val taIsFalseForVFREDO = ((outCtrl.fuOpType === VfaluType.vfredosum) || (outCtrl.fuOpType === VfaluType.vfwredosum)) && (outVecCtrl.vuopIdx =/= numOfUopVFREDOSUM - 1.U)
  // outVecCtrl.fpu.isFpToVecInst means the instruction is a scalar float instruction, not a vector float instruction
  val notUseVl = outVecCtrl.fpu.isFpToVecInst || (outCtrl.fuOpType === VfaluType.vfmv_f_s)
  val notModifyVd = !notUseVl && (outVl === 0.U)
  mgu.io.in.vd := Mux(outVecCtrl.isDstMask, Cat(0.U((dataWidth / 16 * 15).W), cmpResultForMgu.asUInt), resultDataUInt)
  mgu.io.in.oldVd := Mux(isOutOldVdForREDO, outOldVdForRED, outOldVd)
  mgu.io.in.mask := maskToMgu
  mgu.io.in.info.ta := Mux(outCtrl.fuOpType === VfaluType.vfmv_f_s, true.B, Mux(taIsFalseForVFREDO, false.B, outVecCtrl.vta))
  mgu.io.in.info.ma := Mux(outCtrl.fuOpType === VfaluType.vfmv_s_f, true.B, outVecCtrl.vma)
  mgu.io.in.info.vl := outVlFix
  mgu.io.in.info.vlmul := outVecCtrl.vlmul
  mgu.io.in.info.valid := Mux(notModifyVd, false.B, io.in.valid)
  mgu.io.in.info.vstart := Mux(outVecCtrl.fpu.isFpToVecInst, 0.U, outVecCtrl.vstart)
  mgu.io.in.info.eew := outEew
  mgu.io.in.info.vsew := outVecCtrl.vsew
  mgu.io.in.info.vdIdx := Mux(outIsReduction, 0.U, outVecCtrl.vuopIdx)
  mgu.io.in.info.narrow := outVecCtrl.isNarrow
  mgu.io.in.info.dstMask := outVecCtrl.isDstMask
  mgu.io.in.isIndexedVls := false.B
  mgtu.io.in.vd := Mux(outVecCtrl.isDstMask, mgu.io.out.vd, resultDataUInt)
  mgtu.io.in.vl := outVl
  val resultFpMask = Wire(UInt(VLEN.W))
  val isFclass = outVecCtrl.fpu.isFpToVecInst && (outCtrl.fuOpType === VfaluType.vfclass)
  val fpCmpFuOpType = Seq(VfaluType.vfeq, VfaluType.vflt, VfaluType.vfle)
  val isCmp = outVecCtrl.fpu.isFpToVecInst && (fpCmpFuOpType.map(_ === outCtrl.fuOpType).reduce(_ | _))
  resultFpMask := Mux(isFclass || isCmp, Fill(16, 1.U(1.W)), Fill(VLEN, 1.U(1.W)))
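  // Scalar fclass/compare results are at most 16 bits wide, so only the low 16
  // bits are kept for them; all other results keep the full VLEN width.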
  // when the destination is a mask, the result needs to be masked by the mgtu
  io.out.bits.res.data := Mux(notModifyVd, outOldVd, Mux(outVecCtrl.isDstMask, mgtu.io.out.vd, mgu.io.out.vd) & resultFpMask)
  io.out.bits.ctrl.exceptionVec.get(ExceptionNO.illegalInstr) := mgu.io.out.illegal

}

class VFMgu(vlen: Int)(implicit p: Parameters) extends Module {
  val io = IO(new VFMguIO(vlen))

  val vd = io.in.vd
  val oldvd = io.in.oldVd
  val mask = io.in.mask
  val vsew = io.in.info.eew
  val num16bits = vlen / 16

}

class VFMguIO(vlen: Int)(implicit p: Parameters) extends Bundle {
  val in = new Bundle {
    val vd = Input(UInt(vlen.W))
    val oldVd = Input(UInt(vlen.W))
    val mask = Input(UInt(vlen.W))
    val info = Input(new VecInfo)
  }
  val out = new Bundle {
    val vd = Output(UInt(vlen.W))
  }
}