/*
 * Copyright © 2020 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */
#include "helpers.h"

using namespace aco;

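/* Each optimize.* test below builds a small piece of ACO IR with the global
 * Builder, runs the optimizer via finish_opt_test() and compares the printed
 * result against the pattern comments: "//>>" searches ahead for a matching
 * line, "//!" checks the next line, "//~variant!" restricts a check to one
 * test variant (e.g. gfx9) and "%name" binds an SSA temporary on first use.
 * optimize.neg checks that fneg()/fabs() of a temporary are folded into the
 * neg/abs input modifiers of the using instruction.
 */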
BEGIN_TEST(optimize.neg)
   for (unsigned i = GFX9; i <= GFX10; i++) {
      //>> v1: %a, v1: %b, s1: %c, s1: %d = p_startpgm
      if (!setup_cs("v1 v1 s1 s1", (amd_gfx_level)i))
         continue;

      //! v1: %res0 = v_mul_f32 %a, -%b
      //! p_unit_test 0, %res0
      Temp neg_b = fneg(inputs[1]);
      writeout(0, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], neg_b));

      //~gfx9! v1: %neg_a = v_mul_f32 -1.0, %a
      //~gfx9! v1: %res1 = v_max_f32 0x123456, %neg_a
      //~gfx10! v1: %res1 = v_max_f32 0x123456, -%a
      //! p_unit_test 1, %res1
      Temp neg_a = fneg(inputs[0]);
      writeout(1, bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::c32(0x123456u), neg_a));

      //! v1: %res2 = v_mul_f32 %a, %b
      //! p_unit_test 2, %res2
      Temp neg_neg_a = fneg(neg_a);
      writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), neg_neg_a, inputs[1]));

      //! v1: %res3 = v_mul_f32 |%a|, %b
      //! p_unit_test 3, %res3
      Temp abs_neg_a = fabs(neg_a);
      writeout(3, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), abs_neg_a, inputs[1]));

      //! v1: %res4 = v_mul_f32 -|%a|, %b
      //! p_unit_test 4, %res4
      Temp abs_a = fabs(inputs[0]);
      Temp neg_abs_a = fneg(abs_a);
      writeout(4, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), neg_abs_a, inputs[1]));

      //~gfx9! v1: %res5 = v_mul_f32 -%a, %b row_shl:1 bound_ctrl:1
      //~gfx10! v1: %res5 = v_mul_f32 -%a, %b row_shl:1 bound_ctrl:1 fi
      //! p_unit_test 5, %res5
      writeout(5,
               bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), neg_a, inputs[1], dpp_row_sl(1)));

      //! v1: %res6 = v_subrev_f32 %a, %b
      //! p_unit_test 6, %res6
      writeout(6, bld.vop2(aco_opcode::v_add_f32, bld.def(v1), neg_a, inputs[1]));

      //! v1: %res7 = v_sub_f32 %b, %a
      //! p_unit_test 7, %res7
      writeout(7, bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[1], neg_a));

      //! v1: %res8 = v_mul_f32 %a, -%c
      //! p_unit_test 8, %res8
      Temp neg_c = fneg(bld.copy(bld.def(v1), inputs[2]));
      writeout(8, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], neg_c));

      // //! v1: %res9 = v_mul_f32 |%neg_a|, %b
      // //! p_unit_test 9, %res9
      Temp abs_neg_abs_a = fabs(neg_abs_a);
      writeout(9, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), abs_neg_abs_a, inputs[1]));

      finish_opt_test();
   }
END_TEST

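/* Multiplications by 0.5/2.0/4.0 should be folded into the output modifier
 * (omod) of the producing instruction and med3(0, 1.0, x) into its clamp bit.
 * Both require flushed denormals for the respective bit size, and omod is
 * additionally skipped when signed zeros must be preserved, since it flushes
 * -0.0 to +0.0.
 */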
BEGIN_TEST(optimize.output_modifiers)
   //>> v1: %a, v1: %b = p_startpgm
   if (!setup_cs("v1 v1", GFX9))
      return;

   program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;

   /* 32-bit modifiers */

   //! v1: %res0 = v_add_f32 %a, %b *0.5
   //! p_unit_test 0, %res0
   Temp tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
   writeout(0, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x3f000000u), tmp));

   //! v1: %res1 = v_add_f32 %a, %b *2
   //! p_unit_test 1, %res1
   tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
   writeout(1, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));

   //! v1: %res2 = v_add_f32 %a, %b *4
   //! p_unit_test 2, %res2
   tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
   writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40800000u), tmp));

   //! v1: %res3 = v_add_f32 %a, %b clamp
   //! p_unit_test 3, %res3
   tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
   writeout(3, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
                        Operand::c32(0x3f800000u), tmp));

   //! v1: %res4 = v_add_f32 %a, %b *2 clamp
   //! p_unit_test 4, %res4
   tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
   tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp);
   writeout(4, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
                        Operand::c32(0x3f800000u), tmp));

   /* 16-bit modifiers */

   //! v2b: %res5 = v_add_f16 %a, %b *0.5
   //! p_unit_test 5, %res5
   tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
   writeout(5, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x3800u), tmp));

   //! v2b: %res6 = v_add_f16 %a, %b *2
   //! p_unit_test 6, %res6
   tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
   writeout(6, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp));

   //! v2b: %res7 = v_add_f16 %a, %b *4
   //! p_unit_test 7, %res7
   tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
   writeout(7, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4400u), tmp));

   //! v2b: %res8 = v_add_f16 %a, %b clamp
   //! p_unit_test 8, %res8
   tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
   writeout(8, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
                        Operand::c16(0x3c00u), tmp));

   //! v2b: %res9 = v_add_f16 %a, %b *2 clamp
   //! p_unit_test 9, %res9
   tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
   tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000), tmp);
   writeout(9, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
                        Operand::c16(0x3c00u), tmp));

   /* clamping is done after omod */

   //! v1: %res10_tmp = v_add_f32 %a, %b clamp
   //! v1: %res10 = v_mul_f32 2.0, %res10_tmp
   //! p_unit_test 10, %res10
   tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
   tmp = bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(), Operand::c32(0x3f800000u),
                  tmp);
   writeout(10, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));

   /* unsupported instructions */

   //! v1: %res11_tmp = v_xor_b32 %a, %b
   //! v1: %res11 = v_mul_f32 2.0, %res11_tmp
   //! p_unit_test 11, %res11
   tmp = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), inputs[0], inputs[1]);
   writeout(11, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));

   /* several users */

   //! v1: %res12_tmp = v_add_f32 %a, %b
   //! p_unit_test %res12_tmp
   //! v1: %res12 = v_mul_f32 2.0, %res12_tmp
   //! p_unit_test 12, %res12
   tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
   bld.pseudo(aco_opcode::p_unit_test, tmp);
   writeout(12, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));

   //! v1: %res13 = v_add_f32 %a, %b
   //! p_unit_test 13, %res13
   tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
   bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp);
   writeout(13, tmp);

   /* omod has no effect if denormals are enabled but clamp is fine */

   //>> BB1
   //! /* logical preds: BB0, / linear preds: BB0, / kind: */
   program->next_fp_mode.denorm32 = fp_denorm_keep;
   program->next_fp_mode.denorm16_64 = fp_denorm_flush;
   bld.reset(program->create_and_insert_block());
   program->blocks[0].linear_succs.push_back(1);
   program->blocks[0].logical_succs.push_back(1);
   program->blocks[1].linear_preds.push_back(0);
   program->blocks[1].logical_preds.push_back(0);

   //! v1: %res14_tmp = v_add_f32 %a, %b
   //! v1: %res14 = v_mul_f32 2.0, %res14_tmp
   //! p_unit_test 14, %res14
   tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
   writeout(14, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));

   //! v1: %res15 = v_add_f32 %a, %b clamp
   //! p_unit_test 15, %res15
   tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
   writeout(15, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
                         Operand::c32(0x3f800000u), tmp));

   //>> BB2
   //! /* logical preds: BB1, / linear preds: BB1, / kind: */
   program->next_fp_mode.denorm32 = fp_denorm_flush;
   program->next_fp_mode.denorm16_64 = fp_denorm_keep;
   bld.reset(program->create_and_insert_block());
   program->blocks[1].linear_succs.push_back(2);
   program->blocks[1].logical_succs.push_back(2);
   program->blocks[2].linear_preds.push_back(1);
   program->blocks[2].logical_preds.push_back(1);

   //! v2b: %res16_tmp = v_add_f16 %a, %b
   //! v2b: %res16 = v_mul_f16 2.0, %res16_tmp
   //! p_unit_test 16, %res16
   tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
   writeout(16, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp));

   //! v2b: %res17 = v_add_f16 %a, %b clamp
   //! p_unit_test 17, %res17
   tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
   writeout(17, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
                         Operand::c16(0x3c00u), tmp));

   /* omod flushes -0.0 to +0.0 */

   //>> BB3
   //! /* logical preds: BB2, / linear preds: BB2, / kind: uniform, */
   program->next_fp_mode.denorm32 = fp_denorm_flush;
   program->next_fp_mode.denorm16_64 = fp_denorm_flush;
   bld.reset(program->create_and_insert_block());
   bld.is_sz_preserve = true;
   program->blocks[2].linear_succs.push_back(3);
   program->blocks[2].logical_succs.push_back(3);
   program->blocks[3].linear_preds.push_back(2);
   program->blocks[3].logical_preds.push_back(2);

   //! v1: (SzPreserve)%res18_tmp = v_add_f32 %a, %b
   //! v1: (SzPreserve)%res18 = v_mul_f32 2.0, %res18_tmp
   //! p_unit_test 18, %res18
   tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
   writeout(18, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
   //! v1: (SzPreserve)%res19 = v_add_f32 %a, %b clamp
   //! p_unit_test 19, %res19
   tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
   writeout(19, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
                         Operand::c32(0x3f800000u), tmp));

   //! v2b: (SzPreserve)%res20_tmp = v_add_f16 %a, %b
   //! v2b: (SzPreserve)%res20 = v_mul_f16 2.0, %res20_tmp
   //! p_unit_test 20, %res20
   tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
   writeout(20, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp));
   //! v2b: (SzPreserve)%res21 = v_add_f16 %a, %b clamp
   //! p_unit_test 21, %res21
   tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
   writeout(21, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
                         Operand::c16(0x3c00u), tmp));

   finish_opt_test();
END_TEST

Temp
create_subbrev_co(Operand op0, Operand op1, Operand op2)
{
   return bld.vop2_e64(aco_opcode::v_subbrev_co_u32, bld.def(v1), bld.def(bld.lm), op0, op1, op2);
}

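/* v_subbrev_co(0, 0, carry) materializes a 0/-1 lane mask from a divergent
 * boolean, so v_and_b32 of a value with its result should be rewritten to
 * v_cndmask_b32. */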
BEGIN_TEST(optimize.cndmask)
   for (unsigned i = GFX9; i <= GFX10; i++) {
      //>> v1: %a, s1: %b, s2: %c = p_startpgm
      if (!setup_cs("v1 s1 s2", (amd_gfx_level)i))
         continue;

      Temp subbrev;

      //! v1: %res0 = v_cndmask_b32 0, %a, %c
      //! p_unit_test 0, %res0
      subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2]));
      writeout(0, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), inputs[0], subbrev));

      //! v1: %res1 = v_cndmask_b32 0, 42, %c
      //! p_unit_test 1, %res1
      subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2]));
      writeout(1, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(42u), subbrev));

      //~gfx9! v1: %subbrev, s2: %_ = v_subbrev_co_u32 0, 0, %c
      //~gfx9! v1: %res2 = v_and_b32 %b, %subbrev
      //~gfx10! v1: %res2 = v_cndmask_b32 0, %b, %c
      //! p_unit_test 2, %res2
      subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2]));
      writeout(2, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), inputs[1], subbrev));

      //! v1: %subbrev1, s2: %_ = v_subbrev_co_u32 0, 0, %c
      //! v1: %xor = v_xor_b32 %a, %subbrev1
      //! v1: %res3 = v_cndmask_b32 0, %xor, %c
      //! p_unit_test 3, %res3
      subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2]));
      Temp xor_a = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), inputs[0], subbrev);
      writeout(3, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), xor_a, subbrev));

      //! v1: %res4 = v_cndmask_b32 0, %a, %c
      //! p_unit_test 4, %res4
      Temp cndmask = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
                                  Operand::c32(1u), Operand(inputs[2]));
      Temp sub = bld.vsub32(bld.def(v1), Operand::zero(), cndmask);
      writeout(4, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(inputs[0]), sub));

      finish_opt_test();
   }
END_TEST

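/* Shift-then-add chains should be combined into s_lshlN_add_u32 (GFX9+),
 * v_lshl_add_u32 (GFX9+) or v_mad_u32_u24 (when the shifted operand is known
 * to be narrow enough for a 24-bit multiply). */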
BEGIN_TEST(optimize.add_lshl)
   for (unsigned i = GFX8; i <= GFX10; i++) {
      //>> s1: %a, v1: %b = p_startpgm
      if (!setup_cs("s1 v1", (amd_gfx_level)i))
         continue;

      Temp shift;

      //~gfx8! s1: %lshl0, s1: %_:scc = s_lshl_b32 %a, 3
      //~gfx8! s1: %res0, s1: %_:scc = s_add_u32 %lshl0, 4
      //~gfx(9|10)! s1: %res0, s1: %_:scc = s_lshl3_add_u32 %a, 4
      //! p_unit_test 0, %res0
      shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(inputs[0]),
                       Operand::c32(3u));
      writeout(0, bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), shift,
                           Operand::c32(4u)));

      //~gfx8! s1: %lshl1, s1: %_:scc = s_lshl_b32 %a, 3
      //~gfx8! s1: %add1, s1: %_:scc = s_add_u32 %lshl1, 4
      //~gfx8! v1: %add_co1, s2: %_ = v_add_co_u32 %lshl1, %b
      //~gfx8! v1: %res1, s2: %_ = v_add_co_u32 %add1, %add_co1
      //~gfx(9|10)! s1: %lshl1, s1: %_:scc = s_lshl3_add_u32 %a, 4
      //~gfx(9|10)! v1: %lshl_add = v_lshl_add_u32 %a, 3, %b
      //~gfx(9|10)! v1: %res1 = v_add_u32 %lshl1, %lshl_add
      //! p_unit_test 1, %res1
      shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(inputs[0]),
                       Operand::c32(3u));
      Temp sadd =
         bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), shift, Operand::c32(4u));
      Temp vadd = bld.vadd32(bld.def(v1), shift, Operand(inputs[1]));
      writeout(1, bld.vadd32(bld.def(v1), sadd, vadd));

      //~gfx8! s1: %lshl2, s1: %_:scc = s_lshl_b32 %a, 3
      //~gfx8! v1: %res2, s2: %_ = v_add_co_u32 %lshl2, %b
      //~gfx(9|10)! v1: %res2 = v_lshl_add_u32 %a, 3, %b
      //! p_unit_test 2, %res2
      Temp lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc),
                           Operand(inputs[0]), Operand::c32(3u));
      writeout(2, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      //~gfx8! s1: %lshl3, s1: %_:scc = s_lshl_b32 (is24bit)%a, 7
      //~gfx8! v1: %res3, s2: %_ = v_add_co_u32 %lshl3, %b
      //~gfx(9|10)! v1: %res3 = v_lshl_add_u32 (is24bit)%a, 7, %b
      //! p_unit_test 3, %res3
      Operand a_24bit = Operand(inputs[0]);
      a_24bit.set24bit(true);
      lshl =
         bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), a_24bit, Operand::c32(7u));
      writeout(3, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      //! s1: %lshl4, s1: %_:scc = s_lshl_b32 (is24bit)%a, 3
      //~gfx(8|9)! v1: %res4, s2: %carry = v_add_co_u32 %lshl4, %b
      //~gfx10! v1: %res4, s2: %carry = v_add_co_u32_e64 %lshl4, %b
      //! p_unit_test 4, %carry
      lshl =
         bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), a_24bit, Operand::c32(3u));
      Temp carry = bld.vadd32(bld.def(v1), lshl, Operand(inputs[1]), true).def(1).getTemp();
      writeout(4, carry);

      //~gfx8! s1: %lshl5, s1: %_:scc = s_lshl_b32 (is24bit)%a, (is24bit)%a
      //~gfx8! v1: %res5, s2: %_ = v_add_co_u32 %lshl5, %b
      //~gfx(9|10)! v1: %res5 = v_lshl_add_u32 (is24bit)%a, (is24bit)%a, %b
      //! p_unit_test 5, %res5
      lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), a_24bit, a_24bit);
      writeout(5, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      //~gfx8! v1: %res6 = v_mad_u32_u24 (is24bit)%a, 8, %b
      //~gfx(9|10)! v1: %res6 = v_lshl_add_u32 (is24bit)%a, 3, %b
      //! p_unit_test 6, %res6
      lshl =
         bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), a_24bit, Operand::c32(3u));
      writeout(6, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      //~gfx8! v1: %res7 = v_mad_u32_u24 (is16bit)%a, 16, %b
      //~gfx(9|10)! v1: %res7 = v_lshl_add_u32 (is16bit)%a, 4, %b
      //! p_unit_test 7, %res7
      Operand a_16bit = Operand(inputs[0]);
      a_16bit.set16bit(true);
      lshl =
         bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), a_16bit, Operand::c32(4u));
      writeout(7, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      finish_opt_test();
   }
END_TEST

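/* v_bcnt_u32_b32 computes popcount(src0) + src1, so a following addition can
 * be folded into its unused zero addend, unless the carry output of the
 * addition is needed. */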
BEGIN_TEST(optimize.bcnt)
   for (unsigned i = GFX8; i <= GFX10; i++) {
      //>> v1: %a, s1: %b = p_startpgm
      if (!setup_cs("v1 s1", (amd_gfx_level)i))
         continue;

      Temp bcnt;

      //! v1: %res0 = v_bcnt_u32_b32 %a, %a
      //! p_unit_test 0, %res0
      bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
      writeout(0, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0])));

      //! v1: %res1 = v_bcnt_u32_b32 %a, %b
      //! p_unit_test 1, %res1
      bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
      writeout(1, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[1])));

      //! v1: %res2 = v_bcnt_u32_b32 %a, 42
      //! p_unit_test 2, %res2
      bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
      writeout(2, bld.vadd32(bld.def(v1), bcnt, Operand::c32(42u)));

      //! v1: %bcnt3 = v_bcnt_u32_b32 %b, 0
      //~gfx8! v1: %res3, s2: %_ = v_add_co_u32 %bcnt3, %a
      //~gfx(9|10)! v1: %res3 = v_add_u32 %bcnt3, %a
      //! p_unit_test 3, %res3
      bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[1]), Operand::zero());
      writeout(3, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0])));

      //! v1: %bcnt4 = v_bcnt_u32_b32 %a, 0
      //~gfx(8|9)! v1: %add4, s2: %carry = v_add_co_u32 %bcnt4, %a
      //~gfx10! v1: %add4, s2: %carry = v_add_co_u32_e64 %bcnt4, %a
      //! p_unit_test 4, %carry
      bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
      Temp carry = bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0]), true).def(1).getTemp();
      writeout(4, carry);

      finish_opt_test();
   }
END_TEST

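/* Each clamp_config provides the min/max/med3 opcode triple and the
 * lower/upper bound constants for one data type tested by optimize.clamp. */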
struct clamp_config {
   const char* name;
   aco_opcode min, max, med3;
   Operand lb, ub;
};

static const clamp_config clamp_configs[] = {
   /* 0.0, 4.0 */
   {"_0,4f32", aco_opcode::v_min_f32, aco_opcode::v_max_f32, aco_opcode::v_med3_f32,
    Operand::zero(), Operand::c32(0x40800000u)},
   {"_0,4f16", aco_opcode::v_min_f16, aco_opcode::v_max_f16, aco_opcode::v_med3_f16,
    Operand::c16(0u), Operand::c16(0x4400)},
   /* -1.0, 0.0 */
   {"_-1,0f32", aco_opcode::v_min_f32, aco_opcode::v_max_f32, aco_opcode::v_med3_f32,
    Operand::c32(0xbf800000u), Operand::zero()},
   {"_-1,0f16", aco_opcode::v_min_f16, aco_opcode::v_max_f16, aco_opcode::v_med3_f16,
    Operand::c16(0xBC00), Operand::c16(0u)},
   /* 0, 3 */
   {"_0,3u32", aco_opcode::v_min_u32, aco_opcode::v_max_u32, aco_opcode::v_med3_u32,
    Operand::zero(), Operand::c32(3u)},
   {"_0,3u16", aco_opcode::v_min_u16, aco_opcode::v_max_u16, aco_opcode::v_med3_u16,
    Operand::c16(0u), Operand::c16(3u)},
   {"_0,3i32", aco_opcode::v_min_i32, aco_opcode::v_max_i32, aco_opcode::v_med3_i32,
    Operand::zero(), Operand::c32(3u)},
   {"_0,3i16", aco_opcode::v_min_i16, aco_opcode::v_max_i16, aco_opcode::v_med3_i16,
    Operand::c16(0u), Operand::c16(3u)},
   /* -5, 0 */
   {"_-5,0i32", aco_opcode::v_min_i32, aco_opcode::v_max_i32, aco_opcode::v_med3_i32,
    Operand::c32(0xfffffffbu), Operand::zero()},
   {"_-5,0i16", aco_opcode::v_min_i16, aco_opcode::v_max_i16, aco_opcode::v_med3_i16,
    Operand::c16(0xfffbu), Operand::c16(0u)},
};

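/* min(ub, max(lb, x)) and its commuted forms should be combined into a single
 * med3 when both bounds are constants and ub is not smaller than lb. */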
BEGIN_TEST(optimize.clamp)
   for (clamp_config cfg : clamp_configs) {
      if (!setup_cs("v1 v1 v1", GFX9, CHIP_UNKNOWN, cfg.name))
         continue;

      //! cfg: @match_func(min max med3 lb ub)
      fprintf(output, "cfg: %s ", instr_info.name[(int)cfg.min]);
      fprintf(output, "%s ", instr_info.name[(int)cfg.max]);
      fprintf(output, "%s ", instr_info.name[(int)cfg.med3]);
      aco_print_operand(&cfg.lb, output);
      fprintf(output, " ");
      aco_print_operand(&cfg.ub, output);
      fprintf(output, "\n");

      //>> v1: %a, v1: %b, v1: %c = p_startpgm

      //! v1: %res0 = @med3 @ub, @lb, %a
      //! p_unit_test 0, %res0
      writeout(0, bld.vop2(cfg.min, bld.def(v1), cfg.ub,
                           bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0])));

      //! v1: %res1 = @med3 @lb, @ub, %a
      //! p_unit_test 1, %res1
      writeout(1, bld.vop2(cfg.max, bld.def(v1), cfg.lb,
                           bld.vop2(cfg.min, bld.def(v1), cfg.ub, inputs[0])));

      /* min constant must be greater than max constant */
      //! v1: %res2_tmp = @min @lb, %a
      //! v1: %res2 = @max @ub, %res2_tmp
      //! p_unit_test 2, %res2
      writeout(2, bld.vop2(cfg.max, bld.def(v1), cfg.ub,
                           bld.vop2(cfg.min, bld.def(v1), cfg.lb, inputs[0])));

      //! v1: %res3_tmp = @max @ub, %a
      //! v1: %res3 = @min @lb, %res3_tmp
      //! p_unit_test 3, %res3
      writeout(3, bld.vop2(cfg.min, bld.def(v1), cfg.lb,
                           bld.vop2(cfg.max, bld.def(v1), cfg.ub, inputs[0])));

      /* needs two constants */

      //! v1: %res4_tmp = @max @lb, %a
      //! v1: %res4 = @min %b, %res4_tmp
      //! p_unit_test 4, %res4
      writeout(4, bld.vop2(cfg.min, bld.def(v1), inputs[1],
                           bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0])));

      //! v1: %res5_tmp = @max %b, %a
      //! v1: %res5 = @min @ub, %res5_tmp
      //! p_unit_test 5, %res5
      writeout(5, bld.vop2(cfg.min, bld.def(v1), cfg.ub,
                           bld.vop2(cfg.max, bld.def(v1), inputs[1], inputs[0])));

      //! v1: %res6_tmp = @max %c, %a
      //! v1: %res6 = @min %b, %res6_tmp
      //! p_unit_test 6, %res6
      writeout(6, bld.vop2(cfg.min, bld.def(v1), inputs[1],
                           bld.vop2(cfg.max, bld.def(v1), inputs[2], inputs[0])));

      /* correct NaN behaviour with precise */
      if (cfg.min == aco_opcode::v_min_f16 || cfg.min == aco_opcode::v_min_f32) {
         //~f(16|32)! v1: %res7 = @med3 @ub, @lb, %a
         //~f(16|32)! p_unit_test 7, %res7
         Builder::Result max = bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0]);
         max.def(0).setPrecise(true);
         Builder::Result min = bld.vop2(cfg.min, bld.def(v1), cfg.ub, max);
         min.def(0).setPrecise(true);
         writeout(7, min);

         //~f(16|32)! v1: (precise)%res8_tmp = @min @ub, %a
         //~f(16|32)! v1: %res8 = @max @lb, %res8_tmp
         //~f(16|32)! p_unit_test 8, %res8
         min = bld.vop2(cfg.min, bld.def(v1), cfg.ub, inputs[0]);
         min.def(0).setPrecise(true);
         writeout(8, bld.vop2(cfg.max, bld.def(v1), cfg.lb, min));
      }

      finish_opt_test();
   }
END_TEST

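/* Two v_add_u32 should be combined into v_add3_u32, except when one of the
 * additions already uses the clamp modifier. */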
BEGIN_TEST(optimize.add3)
   //>> v1: %a, v1: %b, v1: %c = p_startpgm
   if (!setup_cs("v1 v1 v1", GFX9))
      return;

   //! v1: %res0 = v_add3_u32 %a, %b, %c
   //! p_unit_test 0, %res0
   Builder::Result tmp = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);
   writeout(0, bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp));

   //! v1: %tmp1 = v_add_u32 %b, %c clamp
   //! v1: %res1 = v_add_u32 %a, %tmp1
   //! p_unit_test 1, %res1
   tmp = bld.vop2_e64(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);
   tmp->valu().clamp = true;
   writeout(1, bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp));

   //! v1: %tmp2 = v_add_u32 %b, %c
   //! v1: %res2 = v_add_u32 %a, %tmp2 clamp
   //! p_unit_test 2, %res2
   tmp = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);
   tmp = bld.vop2_e64(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp);
   tmp->valu().clamp = true;
   writeout(2, tmp);

   finish_opt_test();
END_TEST

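/* Chained min/max should form v_min3/v_max3, or the mixed v_maxmin/v_minmax
 * opcodes on GFX11, with fneg folded through via -max(a, b) = min(-a, -b). */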
BEGIN_TEST(optimize.minmax)
   for (unsigned i = GFX10_3; i <= GFX11; i++) {
      //>> v1: %a, v1: %b, v1: %c = p_startpgm
      if (!setup_cs("v1 v1 v1", (amd_gfx_level)i))
         continue;

      Temp a = inputs[0];
      Temp b = inputs[1];
      Temp c = inputs[2];

      //! v1: %res0 = v_min3_f32 %a, %b, %c
      //! p_unit_test 0, %res0
      writeout(0, fmin(c, fmin(a, b)));

      //! v1: %res1 = v_max3_f32 %a, %b, %c
      //! p_unit_test 1, %res1
      writeout(1, fmax(c, fmax(a, b)));

      //! v1: %res2 = v_min3_f32 -%a, -%b, %c
      //! p_unit_test 2, %res2
      writeout(2, fmin(c, fneg(fmax(a, b))));

      //! v1: %res3 = v_max3_f32 -%a, -%b, %c
      //! p_unit_test 3, %res3
      writeout(3, fmax(c, fneg(fmin(a, b))));

      //! v1: %res4 = v_max3_f32 -%a, %b, %c
      //! p_unit_test 4, %res4
      writeout(4, fmax(c, fneg(fmin(a, fneg(b)))));

      //~gfx10_3! v1: %res5_tmp = v_max_f32 %a, %b
      //~gfx10_3! v1: %res5 = v_min_f32 %c, %res5_tmp
      //~gfx11! v1: %res5 = v_maxmin_f32 %a, %b, %c
      //! p_unit_test 5, %res5
      writeout(5, fmin(c, fmax(a, b)));

      //~gfx10_3! v1: %res6_tmp = v_min_f32 %a, %b
      //~gfx10_3! v1: %res6 = v_max_f32 %c, %res6_tmp
      //~gfx11! v1: %res6 = v_minmax_f32 %a, %b, %c
      //! p_unit_test 6, %res6
      writeout(6, fmax(c, fmin(a, b)));

      //~gfx10_3! v1: %res7_tmp = v_min_f32 %a, %b
      //~gfx10_3! v1: %res7 = v_min_f32 %c, -%res7_tmp
      //~gfx11! v1: %res7 = v_maxmin_f32 -%a, -%b, %c
      //! p_unit_test 7, %res7
      writeout(7, fmin(c, fneg(fmin(a, b))));

      //~gfx10_3! v1: %res8_tmp = v_max_f32 %a, %b
      //~gfx10_3! v1: %res8 = v_max_f32 %c, -%res8_tmp
      //~gfx11! v1: %res8 = v_minmax_f32 -%a, -%b, %c
      //! p_unit_test 8, %res8
      writeout(8, fmax(c, fneg(fmax(a, b))));

      //~gfx10_3! v1: %res9_tmp = v_max_f32 %a, -%b
      //~gfx10_3! v1: %res9 = v_max_f32 %c, -%res9_tmp
      //~gfx11! v1: %res9 = v_minmax_f32 -%a, %b, %c
      //! p_unit_test 9, %res9
      writeout(9, fmax(c, fneg(fmax(a, fneg(b)))));

      finish_opt_test();
   }
END_TEST

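/* v_mul_u32_u24 followed by an addition should be fused into v_mad_u32_u24,
 * unless the carry output of the addition is needed. */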
BEGIN_TEST(optimize.mad_32_24)
   for (unsigned i = GFX8; i <= GFX9; i++) {
      //>> v1: %a, v1: %b, v1: %c = p_startpgm
      if (!setup_cs("v1 v1 v1", (amd_gfx_level)i))
         continue;

      //! v1: %res0 = v_mad_u32_u24 %b, %c, %a
      //! p_unit_test 0, %res0
      Temp mul = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), inputs[1], inputs[2]);
      writeout(0, bld.vadd32(bld.def(v1), inputs[0], mul));

      //! v1: %res1_tmp = v_mul_u32_u24 %b, %c
      //! v1: %_, s2: %res1 = v_add_co_u32 %a, %res1_tmp
      //! p_unit_test 1, %res1
      mul = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), inputs[1], inputs[2]);
      writeout(1, bld.vadd32(bld.def(v1), inputs[0], mul, true).def(1).getTemp());

      finish_opt_test();
   }
END_TEST

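/* Like optimize.add_lshl, but the shift is the VALU v_lshlrev_b32, which takes
 * the shift amount in the first source operand. */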
BEGIN_TEST(optimize.add_lshlrev)
   for (unsigned i = GFX8; i <= GFX10; i++) {
      //>> v1: %a, v1: %b, s1: %c = p_startpgm
      if (!setup_cs("v1 v1 s1", (amd_gfx_level)i))
         continue;

      Temp lshl;

      //~gfx8! v1: %lshl0 = v_lshlrev_b32 3, %a
      //~gfx8! v1: %res0, s2: %_ = v_add_co_u32 %lshl0, %b
      //~gfx(9|10)! v1: %res0 = v_lshl_add_u32 %a, 3, %b
      //! p_unit_test 0, %res0
      lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), Operand(inputs[0]));
      writeout(0, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      //~gfx8! v1: %lshl1 = v_lshlrev_b32 7, (is24bit)%a
      //~gfx8! v1: %res1, s2: %_ = v_add_co_u32 %lshl1, %b
      //~gfx(9|10)! v1: %res1 = v_lshl_add_u32 (is24bit)%a, 7, %b
      //! p_unit_test 1, %res1
      Operand a_24bit = Operand(inputs[0]);
      a_24bit.set24bit(true);
      lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(7u), a_24bit);
      writeout(1, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      //~gfx8! v1: %lshl2 = v_lshlrev_b32 (is24bit)%a, (is24bit)%b
      //~gfx8! v1: %res2, s2: %_ = v_add_co_u32 %lshl2, %b
      //~gfx(9|10)! v1: %res2 = v_lshl_add_u32 (is24bit)%b, (is24bit)%a, %b
      //! p_unit_test 2, %res2
      Operand b_24bit = Operand(inputs[1]);
      b_24bit.set24bit(true);
      lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), a_24bit, b_24bit);
      writeout(2, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      //~gfx8! v1: %res3 = v_mad_u32_u24 (is24bit)%a, 8, %b
      //~gfx(9|10)! v1: %res3 = v_lshl_add_u32 (is24bit)%a, 3, %b
      //! p_unit_test 3, %res3
      lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), a_24bit);
      writeout(3, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      //~gfx8! v1: %res4 = v_mad_u32_u24 (is16bit)%a, 16, %b
      //~gfx(9|10)! v1: %res4 = v_lshl_add_u32 (is16bit)%a, 4, %b
      //! p_unit_test 4, %res4
      Operand a_16bit = Operand(inputs[0]);
      a_16bit.set16bit(true);
      lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(4u), a_16bit);
      writeout(4, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      //~gfx8! v1: %res5 = v_mad_u32_u24 (is24bit)%c, 16, %c
      //~gfx(9|10)! v1: %res5 = v_lshl_add_u32 (is24bit)%c, 4, %c
      //! p_unit_test 5, %res5
      Operand c_24bit = Operand(inputs[2]);
      c_24bit.set24bit(true);
      lshl = bld.vop2_e64(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(4u), c_24bit);
      writeout(5, bld.vadd32(bld.def(v1), lshl, Operand(inputs[2])));

      finish_opt_test();
   }
END_TEST

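/* Helpers for optimize.denorm_propagation below: it enumerates combinations of
 * a source instruction, an fneg/fabs-like operation and a destination
 * instruction, and checks that the operation is only folded into an input
 * modifier when denormal flushing cannot change the result. */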
enum denorm_op {
   denorm_mul1 = 0,
   denorm_fneg = 1,
   denorm_fabs = 2,
   denorm_fnegabs = 3,
};

static const char* denorm_op_names[] = {
   "mul1",
   "fneg",
   "fabs",
   "fnegabs",
};

struct denorm_config {
   bool flush;
   unsigned op;
   aco_opcode src;
   aco_opcode dest;
};

static const char*
srcdest_op_name(aco_opcode op)
{
   switch (op) {
   case aco_opcode::v_cndmask_b32: return "cndmask";
   case aco_opcode::v_min_f32: return "min";
   case aco_opcode::v_rcp_f32: return "rcp";
   default: return "none";
   }
}

static Temp
emit_denorm_srcdest(aco_opcode op, Temp val)
{
   switch (op) {
   case aco_opcode::v_cndmask_b32:
      return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), val, inputs[1]);
   case aco_opcode::v_min_f32:
      return bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand::zero(), val);
   case aco_opcode::v_rcp_f32: return bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), val);
   default: return val;
   }
}

BEGIN_TEST(optimize.denorm_propagation)
   for (unsigned i = GFX8; i <= GFX9; i++) {
      std::vector<denorm_config> configs;
      for (bool flush : {false, true}) {
         for (denorm_op op : {denorm_mul1, denorm_fneg, denorm_fabs, denorm_fnegabs})
            configs.push_back({flush, op, aco_opcode::num_opcodes, aco_opcode::num_opcodes});

         for (aco_opcode dest : {aco_opcode::v_min_f32, aco_opcode::v_rcp_f32}) {
            for (denorm_op op : {denorm_mul1, denorm_fneg, denorm_fabs, denorm_fnegabs})
               configs.push_back({flush, op, aco_opcode::num_opcodes, dest});
         }

         for (aco_opcode src :
              {aco_opcode::v_cndmask_b32, aco_opcode::v_min_f32, aco_opcode::v_rcp_f32}) {
            for (denorm_op op : {denorm_mul1, denorm_fneg, denorm_fabs, denorm_fnegabs})
               configs.push_back({flush, op, src, aco_opcode::num_opcodes});
         }
      }

      for (denorm_config cfg : configs) {
         char subvariant[128];
         sprintf(subvariant, "_%s_%s_%s_%s", cfg.flush ? "flush" : "keep", srcdest_op_name(cfg.src),
                 denorm_op_names[(int)cfg.op], srcdest_op_name(cfg.dest));
         if (!setup_cs("v1 s2", (amd_gfx_level)i, CHIP_UNKNOWN, subvariant))
            continue;

         bool can_propagate = cfg.src == aco_opcode::v_rcp_f32 ||
                              (i >= GFX9 && cfg.src == aco_opcode::v_min_f32) ||
                              cfg.dest == aco_opcode::v_rcp_f32 ||
                              (i >= GFX9 && cfg.dest == aco_opcode::v_min_f32) || !cfg.flush;

         fprintf(output, "src, dest, op: %s %s %s\n", srcdest_op_name(cfg.src),
                 srcdest_op_name(cfg.dest), denorm_op_names[(int)cfg.op]);
         fprintf(output, "can_propagate: %u\n", can_propagate);
         //! src, dest, op: $src $dest $op
         //! can_propagate: #can_propagate
         //>> v1: %a, s2: %b = p_startpgm

         //; patterns = {'cndmask': 'v1: %{} = v_cndmask_b32 0, {}, %b',
         //;             'min': 'v1: %{} = v_min_f32 0, {}',
         //;             'rcp': 'v1: %{} = v_rcp_f32 {}'}
         //; ops = {'mul1': 'v1: %{} = v_mul_f32 1.0, %{}',
         //;        'fneg': 'v1: %{} = v_mul_f32 -1.0, %{}',
         //;        'fabs': 'v1: %{} = v_mul_f32 1.0, |%{}|',
         //;        'fnegabs': 'v1: %{} = v_mul_f32 -1.0, |%{}|'}
         //; inline_ops = {'mul1': '%{}', 'fneg': '-%{}', 'fabs': '|%{}|', 'fnegabs': '-|%{}|'}

         //; name = 'a'
         //; if src != 'none':
         //;    insert_pattern(patterns[src].format('src_res', '%'+name))
         //;    name = 'src_res'

         //; if can_propagate:
         //;    name = inline_ops[op].format(name)
         //; else:
         //;    insert_pattern(ops[op].format('op_res', name))
         //;    name = '%op_res'

         //; if dest != 'none':
         //;    insert_pattern(patterns[dest].format('dest_res', name))
         //;    name = '%dest_res'

         //; insert_pattern('v1: %res = v_cndmask_b32 0, {}, %b'.format(name))
         //! p_unit_test 0, %res

         program->blocks[0].fp_mode.denorm32 = cfg.flush ? fp_denorm_flush : fp_denorm_keep;

         Temp val = emit_denorm_srcdest(cfg.src, inputs[0]);
         switch (cfg.op) {
         case denorm_mul1:
            val = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x3f800000u), val);
            break;
         case denorm_fneg: val = fneg(val); break;
         case denorm_fabs: val = fabs(val); break;
         case denorm_fnegabs: val = fneg(fabs(val)); break;
         }
         val = emit_denorm_srcdest(cfg.dest, val);
         writeout(
            0, bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), val, inputs[1]));

         finish_opt_test();
      }
   }
END_TEST

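/* A v_mov_b32 with DPP should be folded into the using VALU instruction,
 * swapping commutative operands if necessary; SGPR operands, clamp and
 * non-floating-point users restrict which modifiers can move along with it. */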
BEGIN_TEST(optimizer.dpp)
   //>> v1: %a, v1: %b, s2: %c, s1: %d = p_startpgm
   if (!setup_cs("v1 v1 s2 s1", GFX10_3))
      return;

   Operand a(inputs[0]);
   Operand b(inputs[1]);
   Operand c(inputs[2]);
   Operand d(inputs[3]);

   /* basic optimization */
   //! v1: %res0 = v_add_f32 %a, %b row_mirror bound_ctrl:1 fi
   //! p_unit_test 0, %res0
   Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   Temp res0 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tmp0, b);
   writeout(0, res0);

   /* operand swapping */
   //! v1: %res1 = v_subrev_f32 %a, %b row_mirror bound_ctrl:1 fi
   //! p_unit_test 1, %res1
   Temp tmp1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   Temp res1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), b, tmp1);
   writeout(1, res1);

   //! v1: %tmp2 = v_mov_b32 %a row_mirror bound_ctrl:1 fi
   //! v1: %res2 = v_sub_f32 %b, %tmp2 row_half_mirror bound_ctrl:1 fi
   //! p_unit_test 2, %res2
   Temp tmp2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   Temp res2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), b, tmp2, dpp_row_half_mirror);
   writeout(2, res2);

   /* modifiers */
   //! v1: %res3 = v_max_f32 -%a, %b row_mirror bound_ctrl:1 fi
   //! p_unit_test 3, %res3
   auto tmp3 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   tmp3->dpp16().neg[0] = true;
   Temp res3 = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), tmp3, b);
   writeout(3, res3);

   //! v1: %res4 = v_max_f32 -%a, %b row_mirror bound_ctrl:1 fi
   //! p_unit_test 4, %res4
   Temp tmp4 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   auto res4 = bld.vop2_e64(aco_opcode::v_max_f32, bld.def(v1), tmp4, b);
   res4->valu().neg[0] = true;
   writeout(4, res4);

   //! v1: %tmp5 = v_mov_b32 %a row_mirror bound_ctrl:1 fi
   //! v1: %res5 = v_add_f32 %tmp5, %b clamp
   //! p_unit_test 5, %res5
   Temp tmp5 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   auto res5 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp5, b);
   res5->valu().clamp = true;
   writeout(5, res5);

   //! v1: %res6 = v_add_f32 |%a|, %b row_mirror bound_ctrl:1 fi
   //! p_unit_test 6, %res6
   auto tmp6 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   tmp6->dpp16().neg[0] = true;
   auto res6 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp6, b);
   res6->valu().abs[0] = true;
   writeout(6, res6);

   //! v1: %res7 = v_subrev_f32 %a, |%b| row_mirror bound_ctrl:1 fi
   //! p_unit_test 7, %res7
   Temp tmp7 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   auto res7 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), b, tmp7);
   res7->valu().abs[0] = true;
   writeout(7, res7);

   //! v1: %tmp11 = v_mov_b32 -%a row_mirror bound_ctrl:1 fi
   //! v1: %res11 = v_add_u32 %tmp11, %b
   //! p_unit_test 11, %res11
   auto tmp11 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   tmp11->dpp16().neg[0] = true;
   Temp res11 = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), tmp11, b);
   writeout(11, res11);

   //! v1: %tmp12 = v_mov_b32 -%a row_mirror bound_ctrl:1 fi
   //! v1: %res12 = v_add_f16 %tmp12, %b
   //! p_unit_test 12, %res12
   auto tmp12 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   tmp12->dpp16().neg[0] = true;
   Temp res12 = bld.vop2(aco_opcode::v_add_f16, bld.def(v1), tmp12, b);
   writeout(12, res12);

   /* vcc */
   //! v1: %res8 = v_cndmask_b32 %a, %b, %c:vcc row_mirror bound_ctrl:1 fi
   //! p_unit_test 8, %res8
   Temp tmp8 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   Temp res8 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp8, b, c);
   writeout(8, res8);

   /* sgprs */
   //! v1: %tmp9 = v_mov_b32 %a row_mirror bound_ctrl:1 fi
   //! v1: %res9 = v_add_f32 %tmp9, %d
   //! p_unit_test 9, %res9
   Temp tmp9 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   Temp res9 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp9, d);
   writeout(9, res9);

   //! v1: %tmp10 = v_mov_b32 %a row_mirror bound_ctrl:1 fi
   //! v1: %res10 = v_add_f32 %d, %tmp10
   //! p_unit_test 10, %res10
   Temp tmp10 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   Temp res10 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), d, tmp10);
   writeout(10, res10);

   finish_opt_test();
END_TEST

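/* Copy propagation into DPP instructions: propagating a uniform value
 * (constant or SGPR copy) into the swizzled first source makes the lane
 * swizzle a no-op, so the DPP modifiers are dropped; the remaining sources
 * must stay VGPRs, so copies are not propagated there. */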
BEGIN_TEST(optimize.dpp_prop)
   //>> v1: %a, s1: %b = p_startpgm
   if (!setup_cs("v1 s1", GFX10))
      return;

   //! v1: %one = p_parallelcopy 1
   //! v1: %res0 = v_mul_f32 1, %a
   //! p_unit_test 0, %res0
   Temp one = bld.copy(bld.def(v1), Operand::c32(1));
   writeout(0, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), one, inputs[0], dpp_row_rr(1)));

   //! v1: %res1 = v_mul_f32 %a, %one row_ror:1 bound_ctrl:1 fi
   //! p_unit_test 1, %res1
   writeout(1, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], one, dpp_row_rr(1)));

   //! v1: %res2 = v_mul_f32 0x12345678, %a
   //! p_unit_test 2, %res2
   Temp literal1 = bld.copy(bld.def(v1), Operand::c32(0x12345678u));
   writeout(2,
            bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), literal1, inputs[0], dpp_row_rr(1)));

   //! v1: %literal2 = p_parallelcopy 0x12345679
   //! v1: %res3 = v_mul_f32 %a, %literal2 row_ror:1 bound_ctrl:1 fi
   //! p_unit_test 3, %res3
   Temp literal2 = bld.copy(bld.def(v1), Operand::c32(0x12345679u));
   writeout(3,
            bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], literal2, dpp_row_rr(1)));

   //! v1: %b_v = p_parallelcopy %b
   //! v1: %res4 = v_mul_f32 %b, %a
   //! p_unit_test 4, %res4
   Temp b_v = bld.copy(bld.def(v1), inputs[1]);
   writeout(4, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), b_v, inputs[0], dpp_row_rr(1)));

   //! v1: %res5 = v_mul_f32 %a, %b_v row_ror:1 bound_ctrl:1 fi
   //! p_unit_test 5, %res5
   writeout(5, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], b_v, dpp_row_rr(1)));

   //! v1: %res6 = v_rcp_f32 %b
   //! p_unit_test 6, %res6
   writeout(6, bld.vop1_dpp(aco_opcode::v_rcp_f32, bld.def(v1), b_v, dpp_row_rr(1)));

   finish_opt_test();
END_TEST

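/* u2u16/p_as_uniform bit-casts between 16-bit and 32-bit values must block
 * propagation of input modifiers, omod and clamp across the size change. */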
BEGIN_TEST(optimize.casts)
   //>> v1: %a, v2b: %a16 = p_startpgm
   if (!setup_cs("v1 v2b", GFX10_3))
      return;

   Temp a = inputs[0];
   Temp a16 = inputs[1];

   program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;

   //! v1: %res0_tmp = v_mul_f32 -1.0, %a
   //! v2b: %res0 = v_mul_f16 %res0_tmp, %a16
   //! p_unit_test 0, %res0
   writeout(0, fmul(u2u16(fneg(a)), a16));

   //! v2b: %res1_tmp = v_mul_f16 -1.0, %a16
   //! v1: %res1 = v_mul_f32 %res1_tmp, %a
   //! p_unit_test 1, %res1
   writeout(1, fmul(bld.as_uniform(fneg(a16)), a));

   //! v1: %res2_tmp = v_mul_f32 -1.0, %a16
   //! v2b: %res2 = v_mul_f16 %res2_tmp, %a16
   //! p_unit_test 2, %res2
   writeout(2, fmul(u2u16(bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1),
                                       Operand::c32(0xbf800000u), bld.as_uniform(a16))),
                    a16));

   //! v1: %res3_tmp = v_mul_f32 %a, %a
   //! v2b: %res3 = v_add_f16 %res3_tmp, 0 clamp
   //! p_unit_test 3, %res3
   writeout(3, fsat(u2u16(fmul(a, a))));

   //! v2b: %res4_tmp = v_mul_f16 %a16, %a16
   //! v1: %res4 = v_add_f32 %res4_tmp, 0 clamp
   //! p_unit_test 4, %res4
   writeout(4, fsat(bld.as_uniform(fmul(a16, a16))));

   //! v1: %res5_tmp = v_mul_f32 %a, %a
   //! v2b: %res5 = v_mul_f16 2.0, %res5_tmp
   //! p_unit_test 5, %res5
   writeout(5, fmul(u2u16(fmul(a, a)), bld.copy(bld.def(v2b), Operand::c16(0x4000))));

   //! v2b: %res6_tmp = v_mul_f16 %a16, %a16
   //! v1: %res6 = v_mul_f32 2.0, %res6_tmp
   //! p_unit_test 6, %res6
   writeout(6,
            fmul(bld.as_uniform(fmul(a16, a16)), bld.copy(bld.def(v1), Operand::c32(0x40000000))));

   //! v1: %res7_tmp = v_mul_f32 %a, %a
   //! v2b: %res7 = v_add_f16 %res7_tmp, %a16
   //! p_unit_test 7, %res7
   writeout(7, fadd(u2u16(fmul(a, a)), a16));

   //! v2b: %res8_tmp = v_mul_f16 %a16, %a16
   //! v1: %res8 = v_add_f32 %res8_tmp, %a
   //! p_unit_test 8, %res8
   writeout(8, fadd(bld.as_uniform(fmul(a16, a16)), a));

   //! v1: %res9_tmp = v_mul_f32 %a, %a
   //! v2b: %res9 = v_mul_f16 -1.0, %res9_tmp
   //! p_unit_test 9, %res9
   writeout(9, fneg(u2u16(fmul(a, a))));

   //! v2b: %res10_tmp = v_mul_f16 %a16, %a16
   //! v1: %res10 = v_mul_f32 -1.0, %res10_tmp
   //! p_unit_test 10, %res10
   writeout(10, bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0xbf800000u),
                             bld.as_uniform(fmul(a16, a16))));

   finish_opt_test();
END_TEST

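/* The mad_mix tests cover v_fma_mix_f32/v_fma_mixlo_f16: f16->f32 conversions
 * of the sources and f32->f16 conversions of the result should be folded into
 * the per-operand lo()/hi() selects of a single mix instruction instead of
 * separate v_cvt instructions. */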
BEGIN_TEST(optimize.mad_mix.input_conv.basic)
   for (unsigned i = GFX9; i <= GFX10; i++) {
      //>> v1: %a, v2b: %a16 = p_startpgm
      if (!setup_cs("v1 v2b", (amd_gfx_level)i))
         continue;

      program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;

      Temp a = inputs[0];
      Temp a16 = inputs[1];

      //! v1: %res0 = v_fma_mix_f32 %a, lo(%a16), neg(0)
      //! p_unit_test 0, %res0
      writeout(0, fmul(a, f2f32(a16)));

      //! v1: %res1 = v_fma_mix_f32 1.0, lo(%a16), %a
      //! p_unit_test 1, %res1
      writeout(1, fadd(a, f2f32(a16)));

      //! v1: %res2 = v_fma_mix_f32 1.0, lo(%a16), %a
      //! p_unit_test 2, %res2
      writeout(2, fadd(f2f32(a16), a));

      //! v1: %res3 = v_fma_mix_f32 %a, %a, lo(%a16)
      //! p_unit_test 3, %res3
      writeout(3, fma(a, a, f2f32(a16)));

      //! v1: %res4 = v_fma_mix_f32 %a, %a, lo(%a16)
      //! p_unit_test 4, %res4
      writeout(4, fma(a, a, f2f32(a16)));

      finish_opt_test();
   }
END_TEST

BEGIN_TEST(optimize.mad_mix.input_conv.precision)
   for (unsigned i = GFX9; i <= GFX10; i++) {
      //>> v1: %a, v2b: %a16 = p_startpgm
      if (!setup_cs("v1 v2b", (amd_gfx_level)i))
         continue;

      program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;

      Temp a = inputs[0];
      Temp a16 = inputs[1];

      /* precise arithmetic */
      //~gfx9! v1: %res0_cvt = v_cvt_f32_f16 %a16
      //~gfx9! v1: (precise)%res0 = v_fma_f32 %a, %a, %res0_cvt
      //~gfx10! v1: (precise)%res0 = v_fma_mix_f32 %a, %a, lo(%a16)
      //! p_unit_test 0, %res0
      writeout(0, fma(a, a, f2f32(a16), bld.precise()));

      //! v2b: %res1_cvt = v_cvt_f16_f32 %a
      //! v2b: (precise)%res1 = v_mul_f16 %a16, %res1_cvt
      //! p_unit_test 1, %res1
      writeout(1, fmul(a16, f2f16(a), bld.precise()));

      //! v2b: %res2_cvt = v_cvt_f16_f32 %a
      //! v2b: (precise)%res2 = v_add_f16 %a16, %res2_cvt
      //! p_unit_test 2, %res2
      writeout(2, fadd(a16, f2f16(a), bld.precise()));

      //! v2b: %res3_cvt = v_cvt_f16_f32 %a
      //! v2b: (precise)%res3 = v_fma_f16 %a16, %a16, %res3_cvt
      //! p_unit_test 3, %res3
      writeout(3, fma(a16, a16, f2f16(a), bld.precise()));

      /* precise conversions */
      //! v2b: (precise)%res4_cvt = v_cvt_f16_f32 %a
      //! v2b: %res4 = v_mul_f16 %a16, %res4_cvt
      //! p_unit_test 4, %res4
      writeout(4, fmul(a16, f2f16(a, bld.precise())));

      //! v2b: (precise)%res5_cvt = v_cvt_f16_f32 %a
      //! v2b: %res5 = v_add_f16 %a16, %res5_cvt
      //! p_unit_test 5, %res5
      writeout(5, fadd(a16, f2f16(a, bld.precise())));

      //! v2b: (precise)%res6_cvt = v_cvt_f16_f32 %a
      //! v2b: %res6 = v_fma_f16 %a16, %a16, %res6_cvt
      //! p_unit_test 6, %res6
      writeout(6, fma(a16, a16, f2f16(a, bld.precise())));

      finish_opt_test();
   }
END_TEST

BEGIN_TEST(optimize.mad_mix.input_conv.modifiers)
   for (unsigned i = GFX9; i <= GFX11; i++) {
      if (i == GFX10_3)
         continue;
      //>> v1: %a, v2b: %a16 = p_startpgm
      if (!setup_cs("v1 v2b", (amd_gfx_level)i))
         continue;

      program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;

      Temp a = inputs[0];
      Temp a16 = inputs[1];

      /* check whether modifiers are preserved when converting to VOP3P */
      //! v1: %res0 = v_fma_mix_f32 -%a, lo(%a16), neg(0)
      //! p_unit_test 0, %res0
      writeout(0, fmul(fneg(a), f2f32(a16)));

      //! v1: %res1 = v_fma_mix_f32 |%a|, lo(%a16), neg(0)
      //! p_unit_test 1, %res1
      writeout(1, fmul(fabs(a), f2f32(a16)));

      /* fneg modifiers */
      //! v1: %res2 = v_fma_mix_f32 %a, -lo(%a16), neg(0)
      //! p_unit_test 2, %res2
      writeout(2, fmul(a, fneg(f2f32(a16))));

      //! v1: %res3 = v_fma_mix_f32 %a, -lo(%a16), neg(0)
      //! p_unit_test 3, %res3
      writeout(3, fmul(a, f2f32(fneg(a16))));

      /* fabs modifiers */
      //! v1: %res4 = v_fma_mix_f32 %a, |lo(%a16)|, neg(0)
      //! p_unit_test 4, %res4
      writeout(4, fmul(a, fabs(f2f32(a16))));

      //! v1: %res5 = v_fma_mix_f32 %a, |lo(%a16)|, neg(0)
      //! p_unit_test 5, %res5
      writeout(5, fmul(a, f2f32(fabs(a16))));

      /* both fabs and fneg modifiers */
      //! v1: %res6 = v_fma_mix_f32 %a, -|lo(%a16)|, neg(0)
      //! p_unit_test 6, %res6
      writeout(6, fmul(a, fneg(f2f32(fabs(a16)))));

      //! v1: %res7 = v_fma_mix_f32 %a, |lo(%a16)|, neg(0)
      //! p_unit_test 7, %res7
      writeout(7, fmul(a, fabs(f2f32(fabs(a16)))));

      //! v1: %res8 = v_fma_mix_f32 %a, -|lo(%a16)|, neg(0)
      //! p_unit_test 8, %res8
      writeout(8, fmul(a, fneg(fabs(f2f32(fabs(a16))))));

      //! v1: %res9 = v_fma_mix_f32 %a, -|lo(%a16)|, neg(0)
      //! p_unit_test 9, %res9
      writeout(9, fmul(a, f2f32(fneg(fabs(a16)))));

      //! v1: %res10 = v_fma_mix_f32 %a, |lo(%a16)|, neg(0)
      //! p_unit_test 10, %res10
      writeout(10, fmul(a, fneg(f2f32(fneg(fabs(a16))))));

      //! v1: %res11 = v_fma_mix_f32 %a, |lo(%a16)|, neg(0)
      //! p_unit_test 11, %res11
      writeout(11, fmul(a, fabs(f2f32(fneg(fabs(a16))))));

      //! v1: %res12 = v_fma_mix_f32 %a, -|lo(%a16)|, neg(0)
      //! p_unit_test 12, %res12
      writeout(12, fmul(a, fneg(fabs(f2f32(fneg(fabs(a16)))))));

      /* sdwa */
      //! v1: %res13 = v_fma_mix_f32 lo(%a), %a, neg(0)
      //! p_unit_test 13, %res13
      writeout(13, fmul(f2f32(ext_ushort(a, 0)), a));

      //! v1: %res14 = v_fma_mix_f32 hi(%a), %a, neg(0)
      //! p_unit_test 14, %res14
      writeout(14, fmul(f2f32(ext_ushort(a, 1)), a));

      //~gfx(9|10)! v1: %res15_cvt = v_cvt_f32_f16 %a dst_sel:uword0 src0_sel:dword
      //~gfx11! v1: %res15_cvt1 = v_fma_mix_f32 lo(%a), 1.0, neg(0)
      //~gfx11! v1: %res15_cvt = p_extract %res15_cvt1, 0, 16, 0
      //! v1: %res15 = v_mul_f32 %res15_cvt, %a
      //! p_unit_test 15, %res15
      writeout(15, fmul(ext_ushort(f2f32(a), 0), a));

      //~gfx(9|10)! v1: %res16_cvt = v_cvt_f32_f16 %a
      //~gfx(9|10)! v1: %res16 = v_mul_f32 %res16_cvt, %a dst_sel:dword src0_sel:uword1 src1_sel:dword
      //~gfx11! v1: %res16_cvt = v_fma_mix_f32 lo(%a), 1.0, neg(0)
      //~gfx11! v1: %res16_ext = p_extract %res16_cvt, 1, 16, 0
      //~gfx11! v1: %res16 = v_mul_f32 %res16_ext, %a
      //! p_unit_test 16, %res16
      writeout(16, fmul(ext_ushort(f2f32(a), 1), a));

      //~gfx(9|10)! v1: %res17_cvt = v_cvt_f32_f16 %a dst_sel:dword src0_sel:ubyte2
      //~gfx(9|10)! v1: %res17 = v_mul_f32 %res17_cvt, %a
      //~gfx11! v1: %res17_ext = p_extract %a, 2, 8, 0
      //~gfx11! v1: %res17 = v_fma_mix_f32 lo(%res17_ext), %a, neg(0)
      //! p_unit_test 17, %res17
      writeout(17, fmul(f2f32(ext_ubyte(a, 2)), a));

      finish_opt_test();
   }
END_TEST

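/* f2f16 of a 32-bit mul/add/fma result should become v_fma_mixlo_f16, also
 * absorbing f16->f32 converted sources. */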
BEGIN_TEST(optimize.mad_mix.output_conv.basic)
   for (unsigned i = GFX9; i <= GFX10; i++) {
      //>> v1: %a, v1: %b, v1: %c, v2b: %a16, v2b: %b16 = p_startpgm
      if (!setup_cs("v1 v1 v1 v2b v2b", (amd_gfx_level)i))
         continue;

      program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;

      Temp a = inputs[0];
      Temp b = inputs[1];
      Temp c = inputs[2];
      Temp a16 = inputs[3];
      Temp b16 = inputs[4];

      //! v2b: %res0 = v_fma_mixlo_f16 %a, %b, neg(lo(0))
      //! p_unit_test 0, %res0
      writeout(0, f2f16(fmul(a, b)));

      //! v2b: %res1 = v_fma_mixlo_f16 1.0, %a, %b
      //! p_unit_test 1, %res1
      writeout(1, f2f16(fadd(a, b)));

      //! v2b: %res2 = v_fma_mixlo_f16 %a, %b, %c
      //! p_unit_test 2, %res2
      writeout(2, f2f16(fma(a, b, c)));

      //! v2b: %res3 = v_fma_mixlo_f16 lo(%a16), %b, neg(lo(0))
      //! p_unit_test 3, %res3
      writeout(3, f2f16(fmul(f2f32(a16), b)));

      //! v2b: %res4 = v_fma_mixlo_f16 1.0, %a, lo(%b16)
      //! p_unit_test 4, %res4
      writeout(4, f2f16(fadd(a, f2f32(b16))));

      //! v2b: %res5 = v_fma_mixlo_f16 %a, lo(%b16), %c
      //! p_unit_test 5, %res5
      writeout(5, f2f16(fma(a, f2f32(b16), c)));

      finish_opt_test();
   }
END_TEST

BEGIN_TEST(optimize.mad_mix.output_conv.precision)
   for (unsigned i = GFX9; i <= GFX10; i++) {
      //>> v2b: %a16 = p_startpgm
      if (!setup_cs("v2b", (amd_gfx_level)i))
         continue;

      program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;

      Temp a16 = inputs[0];

      //! v2b: %res0_tmp = v_mul_f16 %a16, %a16
      //! v1: (precise)%res0 = v_cvt_f32_f16 %res0_tmp
      //! p_unit_test 0, %res0
      writeout(0, f2f32(fmul(a16, a16), bld.precise()));

      //! v2b: (precise)%res1_tmp = v_mul_f16 %a16, %a16
      //! v1: %res1 = v_cvt_f32_f16 %res1_tmp
      //! p_unit_test 1, %res1
      writeout(1, f2f32(fmul(a16, a16, bld.precise())));

      finish_opt_test();
   }
END_TEST

BEGIN_TEST(optimize.mad_mix.output_conv.modifiers)
   for (unsigned i = GFX9; i <= GFX10; i++) {
      //>> v1: %a, v1: %b, v2b: %a16, v2b: %b16 = p_startpgm
      if (!setup_cs("v1 v1 v2b v2b", (amd_gfx_level)i))
         continue;

      program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;

      Temp a = inputs[0];
      Temp b = inputs[1];
      Temp a16 = inputs[2];
      Temp b16 = inputs[3];

      /* fneg/fabs */
      //! v1: %res0_add = v_add_f32 %1, %2
      //! v2b: %res0 = v_cvt_f16_f32 |%res0_add|
      //! p_unit_test 0, %res0
      writeout(0, f2f16(fabs(fadd(a, b))));

      //! v1: %res1_add = v_add_f32 %1, %2
      //! v2b: %res1 = v_cvt_f16_f32 -%res1_add
      //! p_unit_test 1, %res1
      writeout(1, f2f16(fneg(fadd(a, b))));

      //! v2b: %res2_add = v_add_f16 %3, %4
      //! v1: %res2 = v_cvt_f32_f16 |%res2_add|
      //! p_unit_test 2, %res2
      writeout(2, f2f32(fabs(fadd(a16, b16))));

      //! v2b: %res3_add = v_add_f16 %3, %4
      //! v1: %res3 = v_cvt_f32_f16 -%res3_add
      //! p_unit_test 3, %res3
      writeout(3, f2f32(fneg(fadd(a16, b16))));

      /* sdwa */
      //! v2b: %res4_add = v_fma_mixlo_f16 1.0, %a, %b
      //! v2b: %res4 = p_extract %res4_add, 0, 8, 0
      //! p_unit_test 4, %res4
      writeout(4, ext_ubyte(f2f16(fadd(a, b)), 0));

      //! v1: %res5_mul = v_add_f32 %a, %b dst_sel:uword0 src0_sel:dword src1_sel:dword
      //! v2b: %res5 = v_cvt_f16_f32 %res5_mul
      //! p_unit_test 5, %res5
      writeout(5, f2f16(ext_ushort(fadd(a, b), 0)));

      finish_opt_test();
   }
END_TEST

BEGIN_TEST(optimize.mad_mix.fma.basic)
   for (unsigned i = GFX9; i <= GFX10; i++) {
      //>> v1: %a, v1: %b, v1: %c, v2b: %a16, v2b: %c16 = p_startpgm
      if (!setup_cs("v1 v1 v1 v2b v2b", (amd_gfx_level)i))
         continue;

      program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;

      Temp a = inputs[0];
      Temp b = inputs[1];
      Temp c = inputs[2];
      Temp a16 = inputs[3];
      Temp c16 = inputs[4];

      //! v1: %res0 = v_fma_mix_f32 lo(%a16), %b, %c
      //! p_unit_test 0, %res0
      writeout(0, fadd(fmul(f2f32(a16), b), c));

      //! v1: %res1 = v_fma_mix_f32 %a, %b, lo(%c16)
      //! p_unit_test 1, %res1
      writeout(1, fadd(fmul(a, b), f2f32(c16)));

      /* omod/clamp check */
      //! v1: %res2_mul = v_fma_mix_f32 lo(%a16), %b, neg(0)
      //! v1: %res2 = v_add_f32 %res2_mul, %c *2
      //! p_unit_test 2, %res2
      writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000),
                           fadd(fmul(f2f32(a16), b), c)));

      /* neg/abs modifiers */
      //! v1: %res3 = v_fma_mix_f32 -lo(%a16), %b, |lo(%c16)|
      //! p_unit_test 3, %res3
      writeout(3, fadd(fmul(fneg(f2f32(a16)), b), fabs(f2f32(c16))));

      //! v1: %res4 = v_fma_mix_f32 |%a|, |%b|, lo(%c16)
      //! p_unit_test 4, %res4
      writeout(4, fadd(fabs(fmul(fneg(a), fneg(b))), f2f32(c16)));

      //! v1: %res5 = v_fma_mix_f32 %a, -%b, lo(%c16)
      //! p_unit_test 5, %res5
      writeout(5, fadd(fneg(fmul(a, b)), f2f32(c16)));

      //! v1: %res6 = v_fma_mix_f32 |%a|, -|%b|, lo(%c16)
      //! p_unit_test 6, %res6
      writeout(6, fadd(fneg(fabs(fmul(fneg(a), fneg(b)))), f2f32(c16)));

      /* output conversions */
      //! v2b: %res7 = v_fma_mixlo_f16 %a, %b, %c
      //! p_unit_test 7, %res7
      writeout(7, f2f16(fadd(fmul(a, b), c)));

      finish_opt_test();
   }
END_TEST

BEGIN_TEST(optimize.mad_mix.fma.precision)
   for (unsigned i = GFX9; i <= GFX10; i++) {
      //>> v1: %a, v1: %b, v1: %c, v2b: %a16, v2b: %b16 = p_startpgm
      if (!setup_cs("v1 v1 v1 v2b v2b", (amd_gfx_level)i))
         continue;

      program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;

      Temp a = inputs[0];
      Temp b = inputs[1];
      Temp c = inputs[2];
      Temp a16 = inputs[3];
      Temp b16 = inputs[4];

      /* the optimization is precise for 32-bit on GFX9 */
      //~gfx9! v1: (precise)%res0 = v_fma_mix_f32 lo(%a16), %b, %c
      //~gfx10! v1: (precise)%res0_tmp = v_fma_mix_f32 lo(%a16), %b, neg(0)
      //~gfx10! v1: %res0 = v_add_f32 %res0_tmp, %c
      //! p_unit_test 0, %res0
      writeout(0, fadd(fmul(f2f32(a16), b, bld.precise()), c));

      //~gfx9! v1: (precise)%res1 = v_fma_mix_f32 lo(%a16), %b, %c
      //~gfx10! v1: %res1_tmp = v_fma_mix_f32 lo(%a16), %b, neg(0)
      //~gfx10! v1: (precise)%res1 = v_add_f32 %res1_tmp, %c
      //! p_unit_test 1, %res1
      writeout(1, fadd(fmul(f2f32(a16), b), c, bld.precise()));

      /* never promote 16-bit arithmetic to 32-bit */
      //! v2b: %res2_tmp = v_cvt_f16_f32 %a
      //! v2b: %res2 = v_add_f16 %res2_tmp, %b16
      //! p_unit_test 2, %res2
      writeout(2, fadd(f2f16(a), b16));

      //! v2b: %res3_tmp = v_cvt_f16_f32 %a
      //! v2b: %res3 = v_mul_f16 %res3_tmp, %b16
      //! p_unit_test 3, %res3
      writeout(3, fmul(f2f16(a), b16));

      //! v2b: %res4_tmp = v_mul_f16 %a16, %b16
      //! v1: %res4 = v_cvt_f32_f16 %res4_tmp
      //! p_unit_test 4, %res4
      writeout(4, f2f32(fmul(a16, b16)));

      //! v2b: %res5_tmp = v_add_f16 %a16, %b16
      //! v1: %res5 = v_cvt_f32_f16 %res5_tmp
      //! p_unit_test 5, %res5
      writeout(5, f2f32(fadd(a16, b16)));

      //! v2b: %res6_tmp = v_fma_mixlo_f16 %a, %b, neg(lo(0))
      //! v2b: %res6 = v_add_f16 %res6_tmp, %a16
      //! p_unit_test 6, %res6
      writeout(6, fadd(f2f16(fmul(a, b)), a16));

      //! v2b: %res7_tmp = v_mul_f16 %a16, %b16
      //! v1: %res7 = v_fma_mix_f32 1.0, lo(%res7_tmp), %c
      //! p_unit_test 7, %res7
      writeout(7, fadd(f2f32(fmul(a16, b16)), c));

      finish_opt_test();
   }
END_TEST

BEGIN_TEST(optimize.mad_mix.clamp)
   for (unsigned i = GFX9; i <= GFX10; i++) {
      //>> v1: %a, v2b: %a16 = p_startpgm
      if (!setup_cs("v1 v2b", (amd_gfx_level)i))
         continue;

      program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;

      Temp a = inputs[0];
      Temp a16 = inputs[1];

      //! v1: %res0 = v_fma_mix_f32 lo(%a16), %a, neg(0) clamp
      //! p_unit_test 0, %res0
      writeout(0, fsat(fmul(f2f32(a16), a)));

      //! v2b: %res1 = v_fma_mixlo_f16 %a, %a, neg(lo(0)) clamp
      //! p_unit_test 1, %res1
      writeout(1, f2f16(fsat(fmul(a, a))));

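      /* fsat applied before or after the f16 conversion folds to the same clamp bit:
       * the conversion is monotonic and maps [0.0, 1.0] onto itself. */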
      //! v2b: %res2 = v_fma_mixlo_f16 %a, %a, neg(lo(0)) clamp
      //! p_unit_test 2, %res2
      writeout(2, fsat(f2f16(fmul(a, a))));

      finish_opt_test();
   }
END_TEST

BEGIN_TEST(optimize.mad_mix.cast)
   for (unsigned i = GFX9; i <= GFX10; i++) {
      //>> v1: %a, v2b: %a16 = p_startpgm
      if (!setup_cs("v1 v2b", (amd_gfx_level)i))
         continue;

      program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;

      Temp a = inputs[0];
      Temp a16 = inputs[1];

      /* The optimizer copy-propagates v2b=p_extract_vector(v1, 0) and p_as_uniform, so
       * it has to check that the propagated value is still compatible.
       */
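      /* (e.g. in res0 below, the u2u16 bitcast yields the raw low half of the f32
       * result rather than an f16 value, so the v_cvt_f32_f16 cannot be folded away) */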

      //! v1: %res0_cvt = v_cvt_f32_f16 %a16
      //! v2b: %res0 = v_mul_f16 %res0_cvt, %a16
      //! p_unit_test 0, %res0
      writeout(0, fmul(u2u16(f2f32(a16)), a16));

      //! v2b: %res1_cvt = v_cvt_f16_f32 %a
      //! v1: %res1 = v_mul_f32 %res1_cvt, %a
      //! p_unit_test 1, %res1
      writeout(1, fmul(bld.as_uniform(f2f16(a)), a));

      //! v2b: %res2_mul = v_mul_f16 %a16, %a16
      //! v2b: %res2 = v_cvt_f16_f32 %res2_mul
      //! p_unit_test 2, %res2
      writeout(2, f2f16(bld.as_uniform(fmul(a16, a16))));

      //! v1: %res3_mul = v_mul_f32 %a, %a
      //! v1: %res3 = v_cvt_f32_f16 %res3_mul
      //! p_unit_test 3, %res3
      writeout(3, f2f32(u2u16(fmul(a, a))));

      //! v1: %res4_mul = v_fma_mix_f32 lo(%a16), %a, neg(0)
      //! v2b: %res4 = v_add_f16 %res4_mul, 0 clamp
      //! p_unit_test 4, %res4
      writeout(4, fsat(u2u16(fmul(f2f32(a16), a))));

      //! v2b: %res5_mul = v_fma_mixlo_f16 %a, %a, neg(lo(0))
      //! v1: %res5 = v_add_f32 %res5_mul, 0 clamp
      //! p_unit_test 5, %res5
      writeout(5, fsat(bld.as_uniform(f2f16(fmul(a, a)))));

      //! v1: %res6_mul = v_mul_f32 %a, %a
      //! v1: %res6 = v_fma_mix_f32 1.0, lo(%res6_mul), %a
      //! p_unit_test 6, %res6
      writeout(6, fadd(f2f32(u2u16(fmul(a, a))), a));

      //! v2b: %res7_mul = v_mul_f16 %a16, %a16
      //! v1: %res7 = v_fma_mix_f32 1.0, lo(%a16), %res7_mul
      //! p_unit_test 7, %res7
      writeout(7, fadd(bld.as_uniform(fmul(a16, a16)), f2f32(a16)));

      /* opsel_hi should be obtained from the original opcode, not the operand regclass */
      //! v1: %res8 = v_fma_mix_f32 lo(%a16), %a16, neg(0)
      //! p_unit_test 8, %res8
      writeout(8, fmul(f2f32(a16), a16));

      finish_opt_test();
   }
END_TEST

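/* Emits a v_pk_add of the given 32-bit constant (its fp16 halves swizzled per `swizzle`
 * via opsel_lo/opsel_hi) with input %a, and logs the value that the python checks below
 * should reconstruct from the printed operand. */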
static void
vop3p_constant(unsigned* idx, aco_opcode op, const char* swizzle, uint32_t val)
{
   uint32_t halves[2] = {val & 0xffff, val >> 16};
   uint32_t expected = halves[swizzle[0] - 'x'] | (halves[swizzle[1] - 'x'] << 16);
   fprintf(output, "Expected for %u: 0x%.8x / %u\n", *idx, expected, expected);

   unsigned opsel_lo = swizzle[0] == 'x' ? 0x0 : 0x1;
   unsigned opsel_hi = swizzle[1] == 'x' ? 0x2 : 0x3;
   writeout((*idx)++, bld.vop3p(op, bld.def(v1), bld.copy(bld.def(v1), Operand::c32(val)),
                                inputs[0], opsel_lo, opsel_hi));
}

BEGIN_TEST(optimize.vop3p_constants)
   for (aco_opcode op : {aco_opcode::v_pk_add_f16, aco_opcode::v_pk_add_u16}) {
      for (const char* swizzle : {"xx", "yy", "xy", "yx"}) {
         char variant[16];
         strcpy(variant, op == aco_opcode::v_pk_add_f16 ? "_f16" : "_u16");
         strcat(variant, "_");
         strcat(variant, swizzle);

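         /* The python below recomputes each instruction's constant operand from its
          * printed form (inline value, swizzle suffix, neg() and *[1,-1]-style
          * modifiers) and compares it against the value logged by vop3p_constant(). */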
         //; for i in range(36):
         //;    insert_pattern('Expected for %u: $_ / #expected%u' % (i, i))

         //>> v1: %a = p_startpgm
         if (!setup_cs("v1", GFX10_3, CHIP_UNKNOWN, variant))
            continue;

         //; opcode = 'v_pk_add_u16' if 'u16' in variant else 'v_pk_add_f16'
         //; for i in range(36):
         //;    insert_pattern('v1: %%res%u = %s $got%u %%a' % (i, opcode, i))
         //;    insert_pattern('p_unit_test %u, %%res%u' % (i, i))
         //! s_endpgm

         //; def parse_op(op):
         //;    is_int = opcode == 'v_pk_add_u16'
         //;    op = op.rstrip(',')
         //;
         //;    mods = lambda v: v
         //;    if op.endswith('*[1,-1]'):
         //;       mods = lambda v: v ^ 0x80000000
         //;       assert(not is_int)
         //;    elif op.endswith('*[-1,1]'):
         //;       mods = lambda v: v ^ 0x00008000
         //;       assert(not is_int)
         //;    elif op.startswith('neg('):
         //;       mods = lambda v: v ^ 0x80008000
         //;       assert(not is_int)
         //;       op = op[4:-1]
         //;    op = op.split('*')[0]
         //;
         //;    swizzle = lambda v: v
         //;    if op.endswith('.xx'):
         //;       swizzle = lambda v: ((v & 0xffff) | (v << 16)) & 0xffffffff
         //;    elif op.endswith('.yy'):
         //;       swizzle = lambda v: (v >> 16) | (v & 0xffff0000)
         //;    elif op.endswith('.yx'):
         //;       swizzle = lambda v: ((v >> 16) | (v << 16)) & 0xffffffff
         //;    op = op.rstrip('xy.')
         //;
         //;    val = None
         //;    if op.startswith('0x'):
         //;       val = int(op[2:], 16)
         //;    elif op == '-1.0':
         //;       val = 0xbf800000 if is_int else 0xbc00
         //;    elif op == '1.0':
         //;       val = 0x3f800000 if is_int else 0x3c00
         //;    else:
         //;       val = int(op) & 0xffffffff
         //;
         //;    return mods(swizzle(val))

         //; # Check correctness
         //; for i in range(36):
         //;    expected = globals()['expected%u' % i]
         //;    got = globals()['got%u' % i]
         //;    got_parsed = parse_op(got)
         //;    if got_parsed != expected:
         //;       raise Exception('Check %u failed: expected 0x%.8x, got 0x%.8x ("%s")' % (i, expected, got_parsed, got))

         //; # Check that all literals are ones that cannot be encoded as inline constants
         //; allowed_literals = [0x00004242, 0x0000fffe, 0x00308030, 0x0030ffff, 0x3c00ffff,
         //;                     0x42420000, 0x42424242, 0x4242c242, 0x4242ffff, 0x7ffefffe,
         //;                     0x80300030, 0xbeefdead, 0xc2424242, 0xdeadbeef, 0xfffe0000,
         //;                     0xfffe7ffe, 0xffff0030, 0xffff3c00, 0xffff4242, 0xc242c242,
         //;                     0x80308030, 0xdeaddead, 0xbeefbeef, 0x7ffe7ffe]
         //; if opcode == 'v_pk_add_u16':
         //;    allowed_literals.extend([0x00003c00, 0x3c000000, 0x3c003c00, 0x3c00bc00, 0xbc003c00, 0xbc00bc00])
         //; else:
         //;    allowed_literals.extend([0x00003f80, 0x3f800000, 0x3f803f80])
         //;
         //; for i in range(36):
         //;    got = globals()['got%u' % i].removeprefix('neg(')
         //;    if not got.startswith('0x'):
         //;       continue
         //;    got = int(got[2:].rstrip(',)').split('*')[0].split('.')[0], 16)
         //;    if got not in allowed_literals:
         //;       raise Exception('Literal check %u failed: 0x%.8x not in allowed literals' % (i, got))

         unsigned idx = 0;
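         /* Each base fp16 constant is exercised below in seven packed combinations:
          * a single half (low or high), padded with 0xffff, repeated, and with one
          * sign-flipped half. */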
         for (uint32_t constant : {0x3C00, 0x0030, 0xfffe, 0x4242}) {
            vop3p_constant(&idx, op, swizzle, constant);
            vop3p_constant(&idx, op, swizzle, constant | 0xffff0000);
            vop3p_constant(&idx, op, swizzle, constant | (constant << 16));
            vop3p_constant(&idx, op, swizzle, constant << 16);
            vop3p_constant(&idx, op, swizzle, (constant << 16) | 0x0000ffff);
            vop3p_constant(&idx, op, swizzle, constant | ((constant ^ 0x8000) << 16));
            vop3p_constant(&idx, op, swizzle, (constant ^ 0x8000) | (constant << 16));
         }

         for (uint32_t constant : {0x3f800000u, 0xfffffffeu, 0x00000030u, 0xdeadbeefu}) {
            uint32_t lo = constant & 0xffff;
            uint32_t hi = constant >> 16;
            vop3p_constant(&idx, op, swizzle, constant);
            vop3p_constant(&idx, op, swizzle, hi | (lo << 16));
         }

         finish_opt_test();
      }
   }
END_TEST

BEGIN_TEST(optimize.fmamix_two_literals)
   /* This test has to recreate literals sometimes because we don't combine them at all
    * if there's at least one uncombined use.
    */
   for (unsigned i = GFX10; i <= GFX10_3; i++) {
      //>> v1: %a, v1: %b = p_startpgm
      if (!setup_cs("v1 v1", (amd_gfx_level)i))
         continue;

      Temp a = inputs[0];
      Temp b = inputs[1];

      Temp c15 = bld.copy(bld.def(v1), Operand::c32(fui(1.5f)));
      Temp c30 = bld.copy(bld.def(v1), Operand::c32(fui(3.0f)));
      Temp c_denorm = bld.copy(bld.def(v1), Operand::c32(0x387fc000));
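      /* As fp16, 1.5 = 0x3e00 and 3.0 = 0x4200 (both exact), so a pair packs into a
       * single 32-bit literal such as 0x42003e00. 0x387fc000 is 1023 * 2^-24, which
       * converts exactly to the largest fp16 denormal, 0x03ff. */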

      //! v1: %res0 = v_fma_mix_f32 %a, lo(0x42003e00), hi(0x42003e00)
      //! p_unit_test 0, %res0
      writeout(0, fma(a, c15, c30));

      /* No need to use v_fma_mix_f32. */
      //! v1: %res1 = v_fmaak_f32 %a, %b, 0x40400000
      //! p_unit_test 1, %res1
      writeout(1, fma(a, b, c30));

      /* Separate mul/add can become v_fma_mix_f32 if it's not precise. */
      //! v1: %res2 = v_fma_mix_f32 %a, lo(0x42003e00), hi(0x42003e00)
      //! p_unit_test 2, %res2
      writeout(2, fadd(fmul(a, c15), c30));

      //~gfx10! v1: %c15 = p_parallelcopy 0x3fc00000
      c15 = bld.copy(bld.def(v1), Operand::c32(fui(1.5f)));
      c30 = bld.copy(bld.def(v1), Operand::c32(fui(3.0f)));

      /* v_fma_mix_f32 is a fused mul/add, so it can't be used for precise separate mul/add. */
      //~gfx10! v1: (precise)%res3 = v_madak_f32 %a, %c15, 0x40400000
      //~gfx10_3! v1: (precise)%res3_tmp = v_mul_f32 %a, 0x3fc00000
      //~gfx10_3! v1: %res3 = v_add_f32 %res3_tmp, 0x40400000
      //! p_unit_test 3, %res3
      writeout(3, fadd(bld.precise().vop2(aco_opcode::v_mul_f32, bld.def(v1), a, c15), c30));

      //~gfx10! v1: (precise)%res4 = v_madak_f32 %a, %c15, 0x40400000
      //~gfx10_3! v1: %res4_tmp = v_mul_f32 %a, 0x3fc00000
      //~gfx10_3! v1: (precise)%res4 = v_add_f32 %res4_tmp, 0x40400000
      //! p_unit_test 4, %res4
      writeout(4, bld.precise().vop2(aco_opcode::v_add_f32, bld.def(v1), fmul(a, c15), c30));

      /* A denormal fp16 literal is fine while denormals are kept (as here), but the
       * conversion is invalid once the mode flushes them (res6 below). */
      //! v1: %res5 = v_fma_mix_f32 %a, lo(0x3ff3e00), hi(0x3ff3e00)
      //! p_unit_test 5, %res5
      c15 = bld.copy(bld.def(v1), Operand::c32(fui(1.5f)));
      writeout(5, fma(a, c15, c_denorm));

      //>> BB1
      //! /* logical preds: BB0, / linear preds: BB0, / kind: uniform, */
      program->next_fp_mode.denorm16_64 = fp_denorm_flush;
      bld.reset(program->create_and_insert_block());
      program->blocks[0].linear_succs.push_back(1);
      program->blocks[0].logical_succs.push_back(1);
      program->blocks[1].linear_preds.push_back(0);
      program->blocks[1].logical_preds.push_back(0);

      //~gfx10; del c15
      //! v1: %c15 = p_parallelcopy 0x3fc00000
      //! v1: %res6 = v_fmaak_f32 %a, %c15, 0x387fc000
      //! p_unit_test 6, %res6
      c15 = bld.copy(bld.def(v1), Operand::c32(fui(1.5f)));
      c_denorm = bld.copy(bld.def(v1), Operand::c32(0x387fc000));
      writeout(6, fma(a, c15, c_denorm));

      /* Can't accept three unique fp16 literals: only one 32-bit literal (two fp16
       * halves) fits, so the third constant stays in a register. */
      //! v1: %c45 = p_parallelcopy 0x40900000
      //! v1: %res7 = v_fma_mix_f32 lo(0x42003e00), hi(0x42003e00), %c45
      //! p_unit_test 7, %res7
      Temp c45 = bld.copy(bld.def(v1), Operand::c32(fui(4.5f)));
      writeout(7, fma(c15, c30, c45));

      /* Modifiers must be preserved. */
      //! v1: %res8 = v_fma_mix_f32 -%a, lo(0x44804200), hi(0x44804200)
      //! p_unit_test 8, %res8
      writeout(8, fma(fneg(a), c30, c45));

      //! v1: %res9 = v_fma_mix_f32 lo(0x44804200), |%a|, hi(0x44804200)
      //! p_unit_test 9, %res9
      writeout(9, fma(c30, fabs(a), c45));

      //! v1: %res10 = v_fma_mix_f32 %a, lo(0x44804200), hi(0x44804200) clamp
      //! p_unit_test 10, %res10
      writeout(10, fsat(fma(a, c30, c45)));

      /* Output modifiers are not supported by v_fma_mix_f32. */
      c30 = bld.copy(bld.def(v1), Operand::c32(fui(3.0f)));
      //; del c45
      //! v1: %c45 = p_parallelcopy 0x40900000
      //! v1: %res11 = v_fma_f32 %a, 0x40400000, %c45 *0.5
      //! p_unit_test 11, %res11
      c45 = bld.copy(bld.def(v1), Operand::c32(fui(4.5f)));
      writeout(11, fmul(fma(a, c30, c45), bld.copy(bld.def(v1), Operand::c32(0x3f000000))));

      /* Has a literal which can't be represented as fp16. */
      //! v1: %c03 = p_parallelcopy 0x3e99999a
      //! v1: %res12 = v_fmaak_f32 %a, %c03, 0x40400000
      //! p_unit_test 12, %res12
      Temp c03 = bld.copy(bld.def(v1), Operand::c32(fui(0.3f)));
      writeout(12, fma(a, c03, c30));

      /* We should still use fmaak/fmamk if the two literals are identical. */
      //! v1: %res13 = v_fmaak_f32 0x40400000, %a, 0x40400000
      //! p_unit_test 13, %res13
      writeout(13, fma(a, c30, c30));

      finish_opt_test();
   }
END_TEST

BEGIN_TEST(optimize.fma_opsel)
   /* TODO make these work before GFX11 using SDWA. */
   for (unsigned i = GFX11; i <= GFX11; i++) {
      //>> v2b: %a, v2b: %b, v1: %c, v1: %d, v1: %e = p_startpgm
      if (!setup_cs("v2b v2b v1 v1 v1", (amd_gfx_level)i))
         continue;

      Temp a = inputs[0];
      Temp b = inputs[1];
      Temp c = inputs[2];
      Temp d = inputs[3];
      Temp e = inputs[4];
      Temp c_hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), c, Operand::c32(1));
      Temp d_hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), d, Operand::c32(1));
      Temp e_hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), e, Operand::c32(1));
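      /* p_extract_vector with a v2b definition and index 1 selects the high half of a
       * 32-bit VGPR; the optimizer is expected to fold it into the opsel bit of the
       * v_fma_f16 (printed as hi()). */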

      //! v2b: %res0 = v_fma_f16 %b, hi(%c), %a
      //! p_unit_test 0, %res0
      writeout(0, fadd(fmul(b, c_hi), a));

      //! v2b: %res1 = v_fma_f16 %a, %b, hi(%d)
      //! p_unit_test 1, %res1
      writeout(1, fadd(fmul(a, b), d_hi));

      //! v2b: %res2 = v_fma_f16 %a, %b, hi(%e)
      //! p_unit_test 2, %res2
      writeout(2, fma(a, b, e_hi));

      finish_opt_test();
   }
END_TEST

BEGIN_TEST(optimize.dpp_opsel)
   //>> v1: %a, v1: %b = p_startpgm
   if (!setup_cs("v1 v1", GFX11))
      return;

   Temp a = inputs[0];
   Temp b = inputs[1];

   Temp dpp16 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   Temp dpp16_hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), dpp16, Operand::c32(1));
   Temp dpp8 = bld.vop1_dpp8(aco_opcode::v_mov_b32, bld.def(v1), a);
   Temp dpp8_hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), dpp8, Operand::c32(1));

   Temp b_hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), b, Operand::c32(1));
   Temp b_lo = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), b, Operand::c32(0));

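   /* DPP applies only to src0, so folding the mov's DPP into the add may require
    * swapping the commutative operands, as in res1 below. */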
   //! v2b: %res0 = v_add_f16 hi(%a), hi(%b) row_mirror bound_ctrl:1 fi
   //! p_unit_test 0, %res0
   writeout(0, fadd(dpp16_hi, b_hi));

   //! v2b: %res1 = v_add_f16 hi(%a), %b dpp8:[0,0,0,0,0,0,0,0] fi
   //! p_unit_test 1, %res1
   writeout(1, fadd(b_lo, dpp8_hi));

   finish_opt_test();
END_TEST

BEGIN_TEST(optimize.apply_sgpr_swap_opsel)
   //>> v1: %a, s1: %b = p_startpgm
   if (!setup_cs("v1 s1", GFX11))
      return;

   Temp a = inputs[0];
   Temp b = inputs[1];

   Temp b_vgpr = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), bld.copy(bld.def(v1), b),
                            Operand::c32(0));

   Temp res0 = bld.tmp(v2b);
   VALU_instruction& valu = bld.vop2(aco_opcode::v_sub_f16, Definition(res0), a, b_vgpr)->valu();
   valu.opsel[0] = true;

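   /* opsel[0] marks src0 as hi(%a). A VOP2 instruction can take an SGPR only in src0,
    * so propagating %b requires switching to v_subrev_f16 and swapping the opsel bits
    * along with the operands. */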
   //! v2b: %res0 = v_subrev_f16 %b, hi(%a)
   //! p_unit_test 0, %res0
   writeout(0, res0);

   finish_opt_test();
END_TEST

BEGIN_TEST(optimize.max3_opsel)
   /* TODO make these work before GFX11 using SDWA. */
   for (unsigned i = GFX11; i <= GFX11; i++) {
      //>> v1: %a, v1: %b, v2b: %c = p_startpgm
      if (!setup_cs("v1 v1 v2b", (amd_gfx_level)i))
         continue;

      Temp a = inputs[0];
      Temp b = inputs[1];
      Temp c = inputs[2];

      Temp a_hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), a, Operand::c32(1));
      Temp b_hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), b, Operand::c32(1));

      //! v2b: %res0 = v_max3_f16 hi(%a), %c, hi(%b)
      //! p_unit_test 0, %res0
      writeout(0, bld.vop2(aco_opcode::v_max_f16, bld.def(v2b),
                           bld.vop2(aco_opcode::v_max_f16, bld.def(v2b), a_hi, c), b_hi));

      finish_opt_test();
   }
END_TEST

BEGIN_TEST(optimize.neg_mul_opsel)
   //>> v1: %a, v2b: %b = p_startpgm
   if (!setup_cs("v1 v2b", GFX11))
      return;

   Temp a = inputs[0];
   Temp b = inputs[1];

   Temp a_hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), a, Operand::c32(1));

   //! v2b: %res0 = v_mul_f16 -hi(%a), %b
   //! p_unit_test 0, %res0
   writeout(0, fneg(fmul(a_hi, b)));

   //! v1: %res1 = v_fma_mix_f32 -hi(%a), lo(%b), neg(0)
   //! p_unit_test 1, %res1
   writeout(1, fneg(fmul(f2f32(a_hi), f2f32(b))));

   finish_opt_test();
END_TEST

BEGIN_TEST(optimize.vinterp_inreg_output_modifiers)
   //>> v1: %a, v1: %b, v1: %c = p_startpgm
   if (!setup_cs("v1 v1 v1", GFX11))
      return;

   //! v1: %res0 = v_interp_p2_f32_inreg %a, %b, %c clamp
   //! p_unit_test 0, %res0
   Temp tmp = bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, bld.def(v1), inputs[0],
                                inputs[1], inputs[2]);
   writeout(0, fsat(tmp));

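   /* clamp folds into v_interp_p2_f32_inreg (res0/res2), but output modifiers do not:
    * to still apply the *2, the optimizer apparently rewrites the interp as the
    * equivalent v_fma_f32 with a quad_perm broadcast (res1), and keeps a separate
    * v_mul_f16 where that is not possible (res3). */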
   //! v1: %res1 = v_fma_f32 %b, %a, %c *2 quad_perm:[2,2,2,2] fi
   //! p_unit_test 1, %res1
   tmp = bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, bld.def(v1), inputs[1], inputs[0],
                           inputs[2]);
   tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp);
   writeout(1, tmp);

   //! v2b: %res2 = v_interp_p2_f16_f32_inreg %a, %b, %c clamp
   //! p_unit_test 2, %res2
   tmp = bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, bld.def(v2b), inputs[0],
                           inputs[1], inputs[2]);
   writeout(2, fsat(tmp));

   //! v2b: %tmp3 = v_interp_p2_f16_f32_inreg %b, %a, %c
   //! v2b: %res3 = v_mul_f16 2.0, %tmp3
   //! p_unit_test 3, %res3
   tmp = bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, bld.def(v2b), inputs[1],
                           inputs[0], inputs[2]);
   tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp);
   writeout(3, tmp);

   //! v2b: %res4 = v_fma_mixlo_f16 %c, %b, %a quad_perm:[2,2,2,2] fi
   //! p_unit_test 4, %res4
   tmp = bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, bld.def(v1), inputs[2], inputs[1],
                           inputs[0]);
   writeout(4, f2f16(tmp));

   finish_opt_test();
END_TEST

BEGIN_TEST(optimize.s_pack)
   //>> s1: %a, s1: %b, s1: %c = p_startpgm
   if (!setup_cs("s1 s1 s1", GFX11))
      return;

   Temp lo = bld.pseudo(aco_opcode::p_extract, bld.def(s1), bld.def(s1, scc), inputs[1],
                        Operand::c32(0), Operand::c32(16u), Operand::c32(false));
   Temp hi = bld.pseudo(aco_opcode::p_extract, bld.def(s1), bld.def(s1, scc), inputs[2],
                        Operand::c32(1), Operand::c32(16u), Operand::c32(false));

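   /* p_extract here selects a 16-bit half of an SGPR (index 0 = low, 1 = high); the
    * optimizer is expected to fold these into the matching s_pack_{l,h}{l,h}_b32_b16
    * variant. */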
   //! s1: %res0 = s_pack_lh_b32_b16 %b, %c
   //! p_unit_test 0, %res0
   writeout(0, bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), lo, hi));

   //! s1: %res1 = s_pack_ll_b32_b16 %b, %b
   //! p_unit_test 1, %res1
   writeout(1, bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), lo, lo));

   //! s1: %res2 = s_pack_hl_b32_b16 %c, %b
   //! p_unit_test 2, %res2
   writeout(2, bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), hi, lo));

   //! s1: %res3 = s_pack_hh_b32_b16 %c, %c
   //! p_unit_test 3, %res3
   writeout(3, bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), hi, hi));

   lo = bld.pseudo(aco_opcode::p_extract, bld.def(s1), bld.def(s1, scc), inputs[1], Operand::c32(0),
                   Operand::c32(16u), Operand::c32(false));
   hi = bld.pseudo(aco_opcode::p_extract, bld.def(s1), bld.def(s1, scc), inputs[2], Operand::c32(1),
                   Operand::c32(16u), Operand::c32(false));

   //! s1: %res4 = s_pack_ll_b32_b16 %a, %b
   //! p_unit_test 4, %res4
   writeout(4, bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), inputs[0], lo));

   //! s1: %res5 = s_pack_lh_b32_b16 %a, %c
   //! p_unit_test 5, %res5
   writeout(5, bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), inputs[0], hi));

   //! s1: %res6 = s_pack_ll_b32_b16 %b, %a
   //! p_unit_test 6, %res6
   writeout(6, bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), lo, inputs[0]));

   //! s1: %res7 = s_pack_hl_b32_b16 %c, %a
   //! p_unit_test 7, %res7
   writeout(7, bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), hi, inputs[0]));

   finish_opt_test();
END_TEST
