; xref: /aosp_15_r20/external/llvm/test/CodeGen/X86/pmul.ll (revision 9880d6810fe72a1726cb53787c6711e909410d58)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
7
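; This file checks the code llc is expected to emit for vector integer
; multiplies across element types (i8/i16/i32/i64) and vector widths
; (128/256/512 bits) on SSE2, SSE4.1, AVX2, AVX-512F and AVX-512BW.
; x86 has no byte-element multiply, so the v16i8 cases below are expected to
; sign-extend the bytes to 16-bit lanes, multiply with pmullw, then mask and
; pack the results back down to bytes.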
8define <16 x i8> @mul_v16i8c(<16 x i8> %i) nounwind  {
9; SSE2-LABEL: mul_v16i8c:
10; SSE2:       # BB#0: # %entry
11; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
12; SSE2-NEXT:    psraw $8, %xmm1
13; SSE2-NEXT:    movdqa %xmm0, %xmm2
14; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
15; SSE2-NEXT:    psraw $8, %xmm2
16; SSE2-NEXT:    pmullw %xmm1, %xmm2
17; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
18; SSE2-NEXT:    pand %xmm3, %xmm2
19; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
20; SSE2-NEXT:    psraw $8, %xmm0
21; SSE2-NEXT:    pmullw %xmm1, %xmm0
22; SSE2-NEXT:    pand %xmm3, %xmm0
23; SSE2-NEXT:    packuswb %xmm2, %xmm0
24; SSE2-NEXT:    retq
25;
26; SSE41-LABEL: mul_v16i8c:
27; SSE41:       # BB#0: # %entry
28; SSE41-NEXT:    pmovsxbw %xmm0, %xmm1
29; SSE41-NEXT:    pmovsxbw {{.*}}(%rip), %xmm2
30; SSE41-NEXT:    pmullw %xmm2, %xmm1
31; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
32; SSE41-NEXT:    pand %xmm3, %xmm1
33; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
34; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
35; SSE41-NEXT:    pmullw %xmm2, %xmm0
36; SSE41-NEXT:    pand %xmm3, %xmm0
37; SSE41-NEXT:    packuswb %xmm0, %xmm1
38; SSE41-NEXT:    movdqa %xmm1, %xmm0
39; SSE41-NEXT:    retq
40;
41; AVX2-LABEL: mul_v16i8c:
42; AVX2:       # BB#0: # %entry
43; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
44; AVX2-NEXT:    vpmovsxbw {{.*}}(%rip), %ymm1
45; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
46; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
47; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
48; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
49; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
50; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
51; AVX2-NEXT:    vzeroupper
52; AVX2-NEXT:    retq
53;
54; AVX512F-LABEL: mul_v16i8c:
55; AVX512F:       # BB#0: # %entry
56; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm0
57; AVX512F-NEXT:    vpmovsxbw {{.*}}(%rip), %ymm1
58; AVX512F-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
59; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
60; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
61; AVX512F-NEXT:    retq
62;
63; AVX512BW-LABEL: mul_v16i8c:
64; AVX512BW:       # BB#0: # %entry
65; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
66; AVX512BW-NEXT:    vpmovsxbw {{.*}}(%rip), %ymm1
67; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
68; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
69; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
70; AVX512BW-NEXT:    retq
71entry:
72  %A = mul <16 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 >
73  ret <16 x i8> %A
74}
75
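; i16 is the native case: a single pmullw/vpmullw multiplies the lanes directly.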
76define <8 x i16> @mul_v8i16c(<8 x i16> %i) nounwind  {
77; SSE-LABEL: mul_v8i16c:
78; SSE:       # BB#0: # %entry
79; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
80; SSE-NEXT:    retq
81;
82; AVX-LABEL: mul_v8i16c:
83; AVX:       # BB#0: # %entry
84; AVX-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
85; AVX-NEXT:    retq
86entry:
87  %A = mul <8 x i16> %i, < i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117 >
88  ret <8 x i16> %A
89}
90
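; SSE2 has no multiply that keeps the low 32 bits of each lane (pmulld is
; SSE4.1), so its expansion below multiplies the even and odd lanes separately
; with pmuludq and shuffles the low halves back together.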
91define <4 x i32> @mul_v4i32c(<4 x i32> %i) nounwind  {
92; SSE2-LABEL: mul_v4i32c:
93; SSE2:       # BB#0: # %entry
94; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [117,117,117,117]
95; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
96; SSE2-NEXT:    pmuludq %xmm1, %xmm0
97; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
98; SSE2-NEXT:    pmuludq %xmm1, %xmm2
99; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
100; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
101; SSE2-NEXT:    retq
102;
103; SSE41-LABEL: mul_v4i32c:
104; SSE41:       # BB#0: # %entry
105; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
106; SSE41-NEXT:    retq
107;
108; AVX-LABEL: mul_v4i32c:
109; AVX:       # BB#0: # %entry
110; AVX-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
111; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
112; AVX-NEXT:    retq
113entry:
114  %A = mul <4 x i32> %i, < i32 117, i32 117, i32 117, i32 117 >
115  ret <4 x i32> %A
116}
117
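; None of the targets tested here have a full 64-bit element multiply, so each
; lane is expected to be assembled from 32-bit pieces:
;   a*b = lo(a)*lo(b) + ((lo(a)*hi(b) + hi(a)*lo(b)) << 32)  (mod 2^64)
; The constant 117 has a zero high half, so only one cross product and one
; shift/add pair should appear in the output.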
118define <2 x i64> @mul_v2i64c(<2 x i64> %i) nounwind  {
119; SSE-LABEL: mul_v2i64c:
120; SSE:       # BB#0: # %entry
121; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [117,117]
122; SSE-NEXT:    movdqa %xmm0, %xmm2
123; SSE-NEXT:    pmuludq %xmm1, %xmm2
124; SSE-NEXT:    psrlq $32, %xmm0
125; SSE-NEXT:    pmuludq %xmm1, %xmm0
126; SSE-NEXT:    psllq $32, %xmm0
127; SSE-NEXT:    paddq %xmm2, %xmm0
128; SSE-NEXT:    retq
129;
130; AVX-LABEL: mul_v2i64c:
131; AVX:       # BB#0: # %entry
132; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [117,117]
133; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
134; AVX-NEXT:    vpsrlq $32, %xmm0, %xmm0
135; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
136; AVX-NEXT:    vpsllq $32, %xmm0, %xmm0
137; AVX-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
138; AVX-NEXT:    retq
139entry:
140  %A = mul <2 x i64> %i, < i64 117, i64 117 >
141  ret <2 x i64> %A
142}
143
144define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind  {
145; SSE2-LABEL: mul_v16i8:
146; SSE2:       # BB#0: # %entry
147; SSE2-NEXT:    movdqa %xmm1, %xmm2
148; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
149; SSE2-NEXT:    psraw $8, %xmm2
150; SSE2-NEXT:    movdqa %xmm0, %xmm3
151; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
152; SSE2-NEXT:    psraw $8, %xmm3
153; SSE2-NEXT:    pmullw %xmm2, %xmm3
154; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
155; SSE2-NEXT:    pand %xmm2, %xmm3
156; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
157; SSE2-NEXT:    psraw $8, %xmm1
158; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
159; SSE2-NEXT:    psraw $8, %xmm0
160; SSE2-NEXT:    pmullw %xmm1, %xmm0
161; SSE2-NEXT:    pand %xmm2, %xmm0
162; SSE2-NEXT:    packuswb %xmm3, %xmm0
163; SSE2-NEXT:    retq
164;
165; SSE41-LABEL: mul_v16i8:
166; SSE41:       # BB#0: # %entry
167; SSE41-NEXT:    pmovsxbw %xmm1, %xmm3
168; SSE41-NEXT:    pmovsxbw %xmm0, %xmm2
169; SSE41-NEXT:    pmullw %xmm3, %xmm2
170; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
171; SSE41-NEXT:    pand %xmm3, %xmm2
172; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
173; SSE41-NEXT:    pmovsxbw %xmm1, %xmm1
174; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
175; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
176; SSE41-NEXT:    pmullw %xmm1, %xmm0
177; SSE41-NEXT:    pand %xmm3, %xmm0
178; SSE41-NEXT:    packuswb %xmm0, %xmm2
179; SSE41-NEXT:    movdqa %xmm2, %xmm0
180; SSE41-NEXT:    retq
181;
182; AVX2-LABEL: mul_v16i8:
183; AVX2:       # BB#0: # %entry
184; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
185; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
186; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
187; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
188; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
189; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
190; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
191; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
192; AVX2-NEXT:    vzeroupper
193; AVX2-NEXT:    retq
194;
195; AVX512F-LABEL: mul_v16i8:
196; AVX512F:       # BB#0: # %entry
197; AVX512F-NEXT:    vpmovsxbw %xmm1, %ymm1
198; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm0
199; AVX512F-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
200; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
201; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
202; AVX512F-NEXT:    retq
203;
204; AVX512BW-LABEL: mul_v16i8:
205; AVX512BW:       # BB#0: # %entry
206; AVX512BW-NEXT:    vpmovsxbw %xmm1, %ymm1
207; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
208; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
209; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
210; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
211; AVX512BW-NEXT:    retq
212entry:
213  %A = mul <16 x i8> %i, %j
214  ret <16 x i8> %A
215}
216
217define <8 x i16> @mul_v8i16(<8 x i16> %i, <8 x i16> %j) nounwind  {
218; SSE-LABEL: mul_v8i16:
219; SSE:       # BB#0: # %entry
220; SSE-NEXT:    pmullw %xmm1, %xmm0
221; SSE-NEXT:    retq
222;
223; AVX-LABEL: mul_v8i16:
224; AVX:       # BB#0: # %entry
225; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
226; AVX-NEXT:    retq
227entry:
228  %A = mul <8 x i16> %i, %j
229  ret <8 x i16> %A
230}
231
232define <4 x i32> @mul_v4i32(<4 x i32> %i, <4 x i32> %j) nounwind  {
233; SSE2-LABEL: mul_v4i32:
234; SSE2:       # BB#0: # %entry
235; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
236; SSE2-NEXT:    pmuludq %xmm1, %xmm0
237; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
238; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
239; SSE2-NEXT:    pmuludq %xmm2, %xmm1
240; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
241; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
242; SSE2-NEXT:    retq
243;
244; SSE41-LABEL: mul_v4i32:
245; SSE41:       # BB#0: # %entry
246; SSE41-NEXT:    pmulld %xmm1, %xmm0
247; SSE41-NEXT:    retq
248;
249; AVX-LABEL: mul_v4i32:
250; AVX:       # BB#0: # %entry
251; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
252; AVX-NEXT:    retq
253entry:
254  %A = mul <4 x i32> %i, %j
255  ret <4 x i32> %A
256}
257
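; Unlike the constant case above, both operands have unknown high halves here,
; so both cross products (lo*hi and hi*lo) are expected in the expansion.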
258define <2 x i64> @mul_v2i64(<2 x i64> %i, <2 x i64> %j) nounwind  {
259; SSE-LABEL: mul_v2i64:
260; SSE:       # BB#0: # %entry
261; SSE-NEXT:    movdqa %xmm0, %xmm2
262; SSE-NEXT:    pmuludq %xmm1, %xmm2
263; SSE-NEXT:    movdqa %xmm1, %xmm3
264; SSE-NEXT:    psrlq $32, %xmm3
265; SSE-NEXT:    pmuludq %xmm0, %xmm3
266; SSE-NEXT:    psllq $32, %xmm3
267; SSE-NEXT:    paddq %xmm3, %xmm2
268; SSE-NEXT:    psrlq $32, %xmm0
269; SSE-NEXT:    pmuludq %xmm1, %xmm0
270; SSE-NEXT:    psllq $32, %xmm0
271; SSE-NEXT:    paddq %xmm2, %xmm0
272; SSE-NEXT:    retq
273;
274; AVX-LABEL: mul_v2i64:
275; AVX:       # BB#0: # %entry
276; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
277; AVX-NEXT:    vpsrlq $32, %xmm1, %xmm3
278; AVX-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
279; AVX-NEXT:    vpsllq $32, %xmm3, %xmm3
280; AVX-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
281; AVX-NEXT:    vpsrlq $32, %xmm0, %xmm0
282; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
283; AVX-NEXT:    vpsllq $32, %xmm0, %xmm0
284; AVX-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
285; AVX-NEXT:    retq
286entry:
287  %A = mul <2 x i64> %i, %j
288  ret <2 x i64> %A
289}
290
291declare void @foo()
292
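; The *spill tests call @foo so the vector arguments have to survive the call
; in stack slots. For the i32 multiply, SSE4.1 and AVX are expected to fold
; the reload straight into pmulld/vpmulld (the "Folded Reload" forms), while
; SSE2 and the i64 expansion reload the values explicitly first.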
293define <4 x i32> @mul_v4i32spill(<4 x i32> %i, <4 x i32> %j) nounwind  {
294; SSE2-LABEL: mul_v4i32spill:
295; SSE2:       # BB#0: # %entry
296; SSE2-NEXT:    subq $40, %rsp
297; SSE2-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
298; SSE2-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
299; SSE2-NEXT:    callq foo
300; SSE2-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
301; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
302; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
303; SSE2-NEXT:    pmuludq %xmm2, %xmm0
304; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
305; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
306; SSE2-NEXT:    pmuludq %xmm1, %xmm2
307; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
308; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
309; SSE2-NEXT:    addq $40, %rsp
310; SSE2-NEXT:    retq
311;
312; SSE41-LABEL: mul_v4i32spill:
313; SSE41:       # BB#0: # %entry
314; SSE41-NEXT:    subq $40, %rsp
315; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
316; SSE41-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
317; SSE41-NEXT:    callq foo
318; SSE41-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
319; SSE41-NEXT:    pmulld {{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
320; SSE41-NEXT:    addq $40, %rsp
321; SSE41-NEXT:    retq
322;
323; AVX-LABEL: mul_v4i32spill:
324; AVX:       # BB#0: # %entry
325; AVX-NEXT:    subq $40, %rsp
326; AVX-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
327; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
328; AVX-NEXT:    callq foo
329; AVX-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
330; AVX-NEXT:    vpmulld {{[0-9]+}}(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
331; AVX-NEXT:    addq $40, %rsp
332; AVX-NEXT:    retq
333entry:
334  ; Use a call to force spills.
335  call void @foo()
336  %A = mul <4 x i32> %i, %j
337  ret <4 x i32> %A
338}
339
340define <2 x i64> @mul_v2i64spill(<2 x i64> %i, <2 x i64> %j) nounwind  {
341; SSE-LABEL: mul_v2i64spill:
342; SSE:       # BB#0: # %entry
343; SSE-NEXT:    subq $40, %rsp
344; SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
345; SSE-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
346; SSE-NEXT:    callq foo
347; SSE-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
348; SSE-NEXT:    movdqa %xmm0, %xmm2
349; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
350; SSE-NEXT:    pmuludq %xmm3, %xmm2
351; SSE-NEXT:    movdqa %xmm3, %xmm1
352; SSE-NEXT:    psrlq $32, %xmm1
353; SSE-NEXT:    pmuludq %xmm0, %xmm1
354; SSE-NEXT:    psllq $32, %xmm1
355; SSE-NEXT:    paddq %xmm1, %xmm2
356; SSE-NEXT:    psrlq $32, %xmm0
357; SSE-NEXT:    pmuludq %xmm3, %xmm0
358; SSE-NEXT:    psllq $32, %xmm0
359; SSE-NEXT:    paddq %xmm2, %xmm0
360; SSE-NEXT:    addq $40, %rsp
361; SSE-NEXT:    retq
362;
363; AVX-LABEL: mul_v2i64spill:
364; AVX:       # BB#0: # %entry
365; AVX-NEXT:    subq $40, %rsp
366; AVX-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
367; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
368; AVX-NEXT:    callq foo
369; AVX-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
370; AVX-NEXT:    vmovdqa (%rsp), %xmm3 # 16-byte Reload
371; AVX-NEXT:    vpmuludq %xmm2, %xmm3, %xmm0
372; AVX-NEXT:    vpsrlq $32, %xmm2, %xmm1
373; AVX-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
374; AVX-NEXT:    vpsllq $32, %xmm1, %xmm1
375; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
376; AVX-NEXT:    vpsrlq $32, %xmm3, %xmm1
377; AVX-NEXT:    vpmuludq %xmm2, %xmm1, %xmm1
378; AVX-NEXT:    vpsllq $32, %xmm1, %xmm1
379; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
380; AVX-NEXT:    addq $40, %rsp
381; AVX-NEXT:    retq
382entry:
383  ; Use a call to force spills.
384  call void @foo()
385  %A = mul <2 x i64> %i, %j
386  ret <2 x i64> %A
387}
388
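; The 256-bit tests check how each target handles vectors wider than its
; preferred multiply width: the SSE targets work on two 128-bit halves, AVX2
; widens each half to 16-bit lanes in a ymm register, and AVX-512BW can do the
; whole v32i8 multiply with one zmm pmullw followed by a vpmovwb truncate.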
389define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind  {
390; SSE2-LABEL: mul_v32i8c:
391; SSE2:       # BB#0: # %entry
392; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
393; SSE2-NEXT:    psraw $8, %xmm2
394; SSE2-NEXT:    movdqa %xmm0, %xmm3
395; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
396; SSE2-NEXT:    psraw $8, %xmm3
397; SSE2-NEXT:    pmullw %xmm2, %xmm3
398; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
399; SSE2-NEXT:    pand %xmm4, %xmm3
400; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
401; SSE2-NEXT:    psraw $8, %xmm0
402; SSE2-NEXT:    pmullw %xmm2, %xmm0
403; SSE2-NEXT:    pand %xmm4, %xmm0
404; SSE2-NEXT:    packuswb %xmm3, %xmm0
405; SSE2-NEXT:    movdqa %xmm1, %xmm3
406; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
407; SSE2-NEXT:    psraw $8, %xmm3
408; SSE2-NEXT:    pmullw %xmm2, %xmm3
409; SSE2-NEXT:    pand %xmm4, %xmm3
410; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
411; SSE2-NEXT:    psraw $8, %xmm1
412; SSE2-NEXT:    pmullw %xmm2, %xmm1
413; SSE2-NEXT:    pand %xmm4, %xmm1
414; SSE2-NEXT:    packuswb %xmm3, %xmm1
415; SSE2-NEXT:    retq
416;
417; SSE41-LABEL: mul_v32i8c:
418; SSE41:       # BB#0: # %entry
419; SSE41-NEXT:    pmovsxbw %xmm0, %xmm2
420; SSE41-NEXT:    pmovsxbw {{.*}}(%rip), %xmm4
421; SSE41-NEXT:    pmullw %xmm4, %xmm2
422; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
423; SSE41-NEXT:    pand %xmm5, %xmm2
424; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
425; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
426; SSE41-NEXT:    pmullw %xmm4, %xmm0
427; SSE41-NEXT:    pand %xmm5, %xmm0
428; SSE41-NEXT:    packuswb %xmm0, %xmm2
429; SSE41-NEXT:    pmovsxbw %xmm1, %xmm3
430; SSE41-NEXT:    pmullw %xmm4, %xmm3
431; SSE41-NEXT:    pand %xmm5, %xmm3
432; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
433; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
434; SSE41-NEXT:    pmullw %xmm4, %xmm0
435; SSE41-NEXT:    pand %xmm5, %xmm0
436; SSE41-NEXT:    packuswb %xmm0, %xmm3
437; SSE41-NEXT:    movdqa %xmm2, %xmm0
438; SSE41-NEXT:    movdqa %xmm3, %xmm1
439; SSE41-NEXT:    retq
440;
441; AVX2-LABEL: mul_v32i8c:
442; AVX2:       # BB#0: # %entry
443; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
444; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
445; AVX2-NEXT:    vpmovsxbw {{.*}}(%rip), %ymm2
446; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
447; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
448; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
449; AVX2-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
450; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
451; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
452; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
453; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
454; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
455; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
456; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
457; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
458; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
459; AVX2-NEXT:    retq
460;
461; AVX512F-LABEL: mul_v32i8c:
462; AVX512F:       # BB#0: # %entry
463; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm1
464; AVX512F-NEXT:    vpmovsxbw {{.*}}(%rip), %ymm2
465; AVX512F-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
466; AVX512F-NEXT:    vpmovsxwd %ymm1, %zmm1
467; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
468; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
469; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm0
470; AVX512F-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
471; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
472; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
473; AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
474; AVX512F-NEXT:    retq
475;
476; AVX512BW-LABEL: mul_v32i8c:
477; AVX512BW:       # BB#0: # %entry
478; AVX512BW-NEXT:    vmovaps {{.*#+}} ymm1 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
479; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
480; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
481; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
482; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
483; AVX512BW-NEXT:    retq
484entry:
485  %A = mul <32 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 >
486  ret <32 x i8> %A
487}
488
489define <16 x i16> @mul_v16i16c(<16 x i16> %i) nounwind  {
490; SSE-LABEL: mul_v16i16c:
491; SSE:       # BB#0: # %entry
492; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117]
493; SSE-NEXT:    pmullw %xmm2, %xmm0
494; SSE-NEXT:    pmullw %xmm2, %xmm1
495; SSE-NEXT:    retq
496;
497; AVX-LABEL: mul_v16i16c:
498; AVX:       # BB#0: # %entry
499; AVX-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
500; AVX-NEXT:    retq
501entry:
502  %A = mul <16 x i16> %i, < i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117 >
503  ret <16 x i16> %A
504}
505
506define <8 x i32> @mul_v8i32c(<8 x i32> %i) nounwind  {
507; SSE2-LABEL: mul_v8i32c:
508; SSE2:       # BB#0: # %entry
509; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [117,117,117,117]
510; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
511; SSE2-NEXT:    pmuludq %xmm2, %xmm0
512; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
513; SSE2-NEXT:    pmuludq %xmm2, %xmm3
514; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
515; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
516; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
517; SSE2-NEXT:    pmuludq %xmm2, %xmm1
518; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
519; SSE2-NEXT:    pmuludq %xmm2, %xmm3
520; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
521; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
522; SSE2-NEXT:    retq
523;
524; SSE41-LABEL: mul_v8i32c:
525; SSE41:       # BB#0: # %entry
526; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [117,117,117,117]
527; SSE41-NEXT:    pmulld %xmm2, %xmm0
528; SSE41-NEXT:    pmulld %xmm2, %xmm1
529; SSE41-NEXT:    retq
530;
531; AVX-LABEL: mul_v8i32c:
532; AVX:       # BB#0: # %entry
533; AVX-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
534; AVX-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
535; AVX-NEXT:    retq
536entry:
537  %A = mul <8 x i32> %i, < i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117 >
538  ret <8 x i32> %A
539}
540
541define <4 x i64> @mul_v4i64c(<4 x i64> %i) nounwind  {
542; SSE-LABEL: mul_v4i64c:
543; SSE:       # BB#0: # %entry
544; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [117,117]
545; SSE-NEXT:    movdqa %xmm0, %xmm3
546; SSE-NEXT:    pmuludq %xmm2, %xmm3
547; SSE-NEXT:    psrlq $32, %xmm0
548; SSE-NEXT:    pmuludq %xmm2, %xmm0
549; SSE-NEXT:    psllq $32, %xmm0
550; SSE-NEXT:    paddq %xmm3, %xmm0
551; SSE-NEXT:    movdqa %xmm1, %xmm3
552; SSE-NEXT:    pmuludq %xmm2, %xmm3
553; SSE-NEXT:    psrlq $32, %xmm1
554; SSE-NEXT:    pmuludq %xmm2, %xmm1
555; SSE-NEXT:    psllq $32, %xmm1
556; SSE-NEXT:    paddq %xmm3, %xmm1
557; SSE-NEXT:    retq
558;
559; AVX-LABEL: mul_v4i64c:
560; AVX:       # BB#0: # %entry
561; AVX-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm1
562; AVX-NEXT:    vpmuludq %ymm1, %ymm0, %ymm2
563; AVX-NEXT:    vpsrlq $32, %ymm0, %ymm0
564; AVX-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
565; AVX-NEXT:    vpsllq $32, %ymm0, %ymm0
566; AVX-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
567; AVX-NEXT:    retq
568entry:
569  %A = mul <4 x i64> %i, < i64 117, i64 117, i64 117, i64 117 >
570  ret <4 x i64> %A
571}
572
573define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind  {
574; SSE2-LABEL: mul_v32i8:
575; SSE2:       # BB#0: # %entry
576; SSE2-NEXT:    movdqa %xmm2, %xmm4
577; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
578; SSE2-NEXT:    psraw $8, %xmm4
579; SSE2-NEXT:    movdqa %xmm0, %xmm5
580; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
581; SSE2-NEXT:    psraw $8, %xmm5
582; SSE2-NEXT:    pmullw %xmm4, %xmm5
583; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
584; SSE2-NEXT:    pand %xmm4, %xmm5
585; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
586; SSE2-NEXT:    psraw $8, %xmm2
587; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
588; SSE2-NEXT:    psraw $8, %xmm0
589; SSE2-NEXT:    pmullw %xmm2, %xmm0
590; SSE2-NEXT:    pand %xmm4, %xmm0
591; SSE2-NEXT:    packuswb %xmm5, %xmm0
592; SSE2-NEXT:    movdqa %xmm3, %xmm2
593; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
594; SSE2-NEXT:    psraw $8, %xmm2
595; SSE2-NEXT:    movdqa %xmm1, %xmm5
596; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
597; SSE2-NEXT:    psraw $8, %xmm5
598; SSE2-NEXT:    pmullw %xmm2, %xmm5
599; SSE2-NEXT:    pand %xmm4, %xmm5
600; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
601; SSE2-NEXT:    psraw $8, %xmm3
602; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
603; SSE2-NEXT:    psraw $8, %xmm1
604; SSE2-NEXT:    pmullw %xmm3, %xmm1
605; SSE2-NEXT:    pand %xmm4, %xmm1
606; SSE2-NEXT:    packuswb %xmm5, %xmm1
607; SSE2-NEXT:    retq
608;
609; SSE41-LABEL: mul_v32i8:
610; SSE41:       # BB#0: # %entry
611; SSE41-NEXT:    pmovsxbw %xmm2, %xmm5
612; SSE41-NEXT:    pmovsxbw %xmm0, %xmm4
613; SSE41-NEXT:    pmullw %xmm5, %xmm4
614; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
615; SSE41-NEXT:    pand %xmm5, %xmm4
616; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
617; SSE41-NEXT:    pmovsxbw %xmm2, %xmm2
618; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
619; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
620; SSE41-NEXT:    pmullw %xmm2, %xmm0
621; SSE41-NEXT:    pand %xmm5, %xmm0
622; SSE41-NEXT:    packuswb %xmm0, %xmm4
623; SSE41-NEXT:    pmovsxbw %xmm3, %xmm0
624; SSE41-NEXT:    pmovsxbw %xmm1, %xmm2
625; SSE41-NEXT:    pmullw %xmm0, %xmm2
626; SSE41-NEXT:    pand %xmm5, %xmm2
627; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
628; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
629; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
630; SSE41-NEXT:    pmovsxbw %xmm1, %xmm1
631; SSE41-NEXT:    pmullw %xmm0, %xmm1
632; SSE41-NEXT:    pand %xmm5, %xmm1
633; SSE41-NEXT:    packuswb %xmm1, %xmm2
634; SSE41-NEXT:    movdqa %xmm4, %xmm0
635; SSE41-NEXT:    movdqa %xmm2, %xmm1
636; SSE41-NEXT:    retq
637;
638; AVX2-LABEL: mul_v32i8:
639; AVX2:       # BB#0: # %entry
640; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
641; AVX2-NEXT:    vpmovsxbw %xmm2, %ymm2
642; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
643; AVX2-NEXT:    vpmovsxbw %xmm3, %ymm3
644; AVX2-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
645; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
646; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
647; AVX2-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
648; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
649; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
650; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
651; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
652; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
653; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
654; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
655; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
656; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
657; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
658; AVX2-NEXT:    retq
659;
660; AVX512F-LABEL: mul_v32i8:
661; AVX512F:       # BB#0: # %entry
662; AVX512F-NEXT:    vpmovsxbw %xmm1, %ymm2
663; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm3
664; AVX512F-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
665; AVX512F-NEXT:    vpmovsxwd %ymm2, %zmm2
666; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
667; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm1
668; AVX512F-NEXT:    vpmovsxbw %xmm1, %ymm1
669; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
670; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm0
671; AVX512F-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
672; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
673; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
674; AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
675; AVX512F-NEXT:    retq
676;
677; AVX512BW-LABEL: mul_v32i8:
678; AVX512BW:       # BB#0: # %entry
679; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
680; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
681; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
682; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
683; AVX512BW-NEXT:    retq
684entry:
685  %A = mul <32 x i8> %i, %j
686  ret <32 x i8> %A
687}
688
689define <16 x i16> @mul_v16i16(<16 x i16> %i, <16 x i16> %j) nounwind  {
690; SSE-LABEL: mul_v16i16:
691; SSE:       # BB#0: # %entry
692; SSE-NEXT:    pmullw %xmm2, %xmm0
693; SSE-NEXT:    pmullw %xmm3, %xmm1
694; SSE-NEXT:    retq
695;
696; AVX-LABEL: mul_v16i16:
697; AVX:       # BB#0: # %entry
698; AVX-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
699; AVX-NEXT:    retq
700entry:
701  %A = mul <16 x i16> %i, %j
702  ret <16 x i16> %A
703}
704
705define <8 x i32> @mul_v8i32(<8 x i32> %i, <8 x i32> %j) nounwind  {
706; SSE2-LABEL: mul_v8i32:
707; SSE2:       # BB#0: # %entry
708; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
709; SSE2-NEXT:    pmuludq %xmm2, %xmm0
710; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
711; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
712; SSE2-NEXT:    pmuludq %xmm4, %xmm2
713; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
714; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
715; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
716; SSE2-NEXT:    pmuludq %xmm3, %xmm1
717; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
718; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
719; SSE2-NEXT:    pmuludq %xmm2, %xmm3
720; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
721; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
722; SSE2-NEXT:    retq
723;
724; SSE41-LABEL: mul_v8i32:
725; SSE41:       # BB#0: # %entry
726; SSE41-NEXT:    pmulld %xmm2, %xmm0
727; SSE41-NEXT:    pmulld %xmm3, %xmm1
728; SSE41-NEXT:    retq
729;
730; AVX-LABEL: mul_v8i32:
731; AVX:       # BB#0: # %entry
732; AVX-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
733; AVX-NEXT:    retq
734entry:
735  %A = mul <8 x i32> %i, %j
736  ret <8 x i32> %A
737}
738
739define <4 x i64> @mul_v4i64(<4 x i64> %i, <4 x i64> %j) nounwind  {
740; SSE-LABEL: mul_v4i64:
741; SSE:       # BB#0: # %entry
742; SSE-NEXT:    movdqa %xmm0, %xmm4
743; SSE-NEXT:    pmuludq %xmm2, %xmm4
744; SSE-NEXT:    movdqa %xmm2, %xmm5
745; SSE-NEXT:    psrlq $32, %xmm5
746; SSE-NEXT:    pmuludq %xmm0, %xmm5
747; SSE-NEXT:    psllq $32, %xmm5
748; SSE-NEXT:    paddq %xmm5, %xmm4
749; SSE-NEXT:    psrlq $32, %xmm0
750; SSE-NEXT:    pmuludq %xmm2, %xmm0
751; SSE-NEXT:    psllq $32, %xmm0
752; SSE-NEXT:    paddq %xmm4, %xmm0
753; SSE-NEXT:    movdqa %xmm1, %xmm2
754; SSE-NEXT:    pmuludq %xmm3, %xmm2
755; SSE-NEXT:    movdqa %xmm3, %xmm4
756; SSE-NEXT:    psrlq $32, %xmm4
757; SSE-NEXT:    pmuludq %xmm1, %xmm4
758; SSE-NEXT:    psllq $32, %xmm4
759; SSE-NEXT:    paddq %xmm4, %xmm2
760; SSE-NEXT:    psrlq $32, %xmm1
761; SSE-NEXT:    pmuludq %xmm3, %xmm1
762; SSE-NEXT:    psllq $32, %xmm1
763; SSE-NEXT:    paddq %xmm2, %xmm1
764; SSE-NEXT:    retq
765;
766; AVX-LABEL: mul_v4i64:
767; AVX:       # BB#0: # %entry
768; AVX-NEXT:    vpmuludq %ymm1, %ymm0, %ymm2
769; AVX-NEXT:    vpsrlq $32, %ymm1, %ymm3
770; AVX-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
771; AVX-NEXT:    vpsllq $32, %ymm3, %ymm3
772; AVX-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
773; AVX-NEXT:    vpsrlq $32, %ymm0, %ymm0
774; AVX-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
775; AVX-NEXT:    vpsllq $32, %ymm0, %ymm0
776; AVX-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
777; AVX-NEXT:    retq
778entry:
779  %A = mul <4 x i64> %i, %j
780  ret <4 x i64> %A
781}
782
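; The 512-bit tests repeat the pattern one level up: the v64i8 input arrives in
; multiple registers and each target processes it in the same sized pieces as
; before, with AVX-512BW needing two zmm pmullw/vpmovwb pairs since the
; sign-extended product fills two zmm registers.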
783define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind  {
784; SSE2-LABEL: mul_v64i8c:
785; SSE2:       # BB#0: # %entry
786; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
787; SSE2-NEXT:    psraw $8, %xmm4
788; SSE2-NEXT:    movdqa %xmm0, %xmm6
789; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
790; SSE2-NEXT:    psraw $8, %xmm6
791; SSE2-NEXT:    pmullw %xmm4, %xmm6
792; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
793; SSE2-NEXT:    pand %xmm5, %xmm6
794; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
795; SSE2-NEXT:    psraw $8, %xmm0
796; SSE2-NEXT:    pmullw %xmm4, %xmm0
797; SSE2-NEXT:    pand %xmm5, %xmm0
798; SSE2-NEXT:    packuswb %xmm6, %xmm0
799; SSE2-NEXT:    movdqa %xmm1, %xmm6
800; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
801; SSE2-NEXT:    psraw $8, %xmm6
802; SSE2-NEXT:    pmullw %xmm4, %xmm6
803; SSE2-NEXT:    pand %xmm5, %xmm6
804; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
805; SSE2-NEXT:    psraw $8, %xmm1
806; SSE2-NEXT:    pmullw %xmm4, %xmm1
807; SSE2-NEXT:    pand %xmm5, %xmm1
808; SSE2-NEXT:    packuswb %xmm6, %xmm1
809; SSE2-NEXT:    movdqa %xmm2, %xmm6
810; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
811; SSE2-NEXT:    psraw $8, %xmm6
812; SSE2-NEXT:    pmullw %xmm4, %xmm6
813; SSE2-NEXT:    pand %xmm5, %xmm6
814; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
815; SSE2-NEXT:    psraw $8, %xmm2
816; SSE2-NEXT:    pmullw %xmm4, %xmm2
817; SSE2-NEXT:    pand %xmm5, %xmm2
818; SSE2-NEXT:    packuswb %xmm6, %xmm2
819; SSE2-NEXT:    movdqa %xmm3, %xmm6
820; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
821; SSE2-NEXT:    psraw $8, %xmm6
822; SSE2-NEXT:    pmullw %xmm4, %xmm6
823; SSE2-NEXT:    pand %xmm5, %xmm6
824; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
825; SSE2-NEXT:    psraw $8, %xmm3
826; SSE2-NEXT:    pmullw %xmm4, %xmm3
827; SSE2-NEXT:    pand %xmm5, %xmm3
828; SSE2-NEXT:    packuswb %xmm6, %xmm3
829; SSE2-NEXT:    retq
830;
831; SSE41-LABEL: mul_v64i8c:
832; SSE41:       # BB#0: # %entry
833; SSE41-NEXT:    movdqa %xmm1, %xmm4
834; SSE41-NEXT:    movdqa %xmm0, %xmm1
835; SSE41-NEXT:    pmovsxbw %xmm1, %xmm0
836; SSE41-NEXT:    pmovsxbw {{.*}}(%rip), %xmm6
837; SSE41-NEXT:    pmullw %xmm6, %xmm0
838; SSE41-NEXT:    movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
839; SSE41-NEXT:    pand %xmm7, %xmm0
840; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
841; SSE41-NEXT:    pmovsxbw %xmm1, %xmm1
842; SSE41-NEXT:    pmullw %xmm6, %xmm1
843; SSE41-NEXT:    pand %xmm7, %xmm1
844; SSE41-NEXT:    packuswb %xmm1, %xmm0
845; SSE41-NEXT:    pmovsxbw %xmm4, %xmm1
846; SSE41-NEXT:    pmullw %xmm6, %xmm1
847; SSE41-NEXT:    pand %xmm7, %xmm1
848; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
849; SSE41-NEXT:    pmovsxbw %xmm4, %xmm4
850; SSE41-NEXT:    pmullw %xmm6, %xmm4
851; SSE41-NEXT:    pand %xmm7, %xmm4
852; SSE41-NEXT:    packuswb %xmm4, %xmm1
853; SSE41-NEXT:    pmovsxbw %xmm2, %xmm4
854; SSE41-NEXT:    pmullw %xmm6, %xmm4
855; SSE41-NEXT:    pand %xmm7, %xmm4
856; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
857; SSE41-NEXT:    pmovsxbw %xmm2, %xmm2
858; SSE41-NEXT:    pmullw %xmm6, %xmm2
859; SSE41-NEXT:    pand %xmm7, %xmm2
860; SSE41-NEXT:    packuswb %xmm2, %xmm4
861; SSE41-NEXT:    pmovsxbw %xmm3, %xmm5
862; SSE41-NEXT:    pmullw %xmm6, %xmm5
863; SSE41-NEXT:    pand %xmm7, %xmm5
864; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
865; SSE41-NEXT:    pmovsxbw %xmm2, %xmm2
866; SSE41-NEXT:    pmullw %xmm6, %xmm2
867; SSE41-NEXT:    pand %xmm7, %xmm2
868; SSE41-NEXT:    packuswb %xmm2, %xmm5
869; SSE41-NEXT:    movdqa %xmm4, %xmm2
870; SSE41-NEXT:    movdqa %xmm5, %xmm3
871; SSE41-NEXT:    retq
872;
873; AVX2-LABEL: mul_v64i8c:
874; AVX2:       # BB#0: # %entry
875; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
876; AVX2-NEXT:    vpmovsxbw %xmm2, %ymm2
877; AVX2-NEXT:    vpmovsxbw {{.*}}(%rip), %ymm3
878; AVX2-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
879; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm4
880; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
881; AVX2-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
882; AVX2-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
883; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
884; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
885; AVX2-NEXT:    vpmullw %ymm3, %ymm0, %ymm0
886; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
887; AVX2-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
888; AVX2-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
889; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
890; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
891; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
892; AVX2-NEXT:    vpmovsxbw %xmm2, %ymm2
893; AVX2-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
894; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm4
895; AVX2-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
896; AVX2-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
897; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
898; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
899; AVX2-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
900; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
901; AVX2-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
902; AVX2-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
903; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
904; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
905; AVX2-NEXT:    retq
906;
907; AVX512F-LABEL: mul_v64i8c:
908; AVX512F:       # BB#0: # %entry
909; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm2
910; AVX512F-NEXT:    vpmovsxbw {{.*}}(%rip), %ymm3
911; AVX512F-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
912; AVX512F-NEXT:    vpmovsxwd %ymm2, %zmm2
913; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
914; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
915; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm0
916; AVX512F-NEXT:    vpmullw %ymm3, %ymm0, %ymm0
917; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
918; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
919; AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
920; AVX512F-NEXT:    vpmovsxbw %xmm1, %ymm2
921; AVX512F-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
922; AVX512F-NEXT:    vpmovsxwd %ymm2, %zmm2
923; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
924; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm1
925; AVX512F-NEXT:    vpmovsxbw %xmm1, %ymm1
926; AVX512F-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
927; AVX512F-NEXT:    vpmovsxwd %ymm1, %zmm1
928; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
929; AVX512F-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
930; AVX512F-NEXT:    retq
931;
932; AVX512BW-LABEL: mul_v64i8c:
933; AVX512BW:       # BB#0: # %entry
934; AVX512BW-NEXT:    vmovaps {{.*#+}} ymm1 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
935; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
936; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm2
937; AVX512BW-NEXT:    vpmullw %zmm1, %zmm2, %zmm2
938; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
939; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
940; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
941; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
942; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
943; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
944; AVX512BW-NEXT:    retq
945entry:
946  %A = mul <64 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 >
947  ret <64 x i8> %A
948}
949
950define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind  {
951; SSE2-LABEL: mul_v64i8:
952; SSE2:       # BB#0: # %entry
953; SSE2-NEXT:    movdqa %xmm4, %xmm8
954; SSE2-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
955; SSE2-NEXT:    psraw $8, %xmm8
956; SSE2-NEXT:    movdqa %xmm0, %xmm9
957; SSE2-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
958; SSE2-NEXT:    psraw $8, %xmm9
959; SSE2-NEXT:    pmullw %xmm8, %xmm9
960; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
961; SSE2-NEXT:    pand %xmm8, %xmm9
962; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
963; SSE2-NEXT:    psraw $8, %xmm4
964; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
965; SSE2-NEXT:    psraw $8, %xmm0
966; SSE2-NEXT:    pmullw %xmm4, %xmm0
967; SSE2-NEXT:    pand %xmm8, %xmm0
968; SSE2-NEXT:    packuswb %xmm9, %xmm0
969; SSE2-NEXT:    movdqa %xmm5, %xmm9
970; SSE2-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
971; SSE2-NEXT:    psraw $8, %xmm9
972; SSE2-NEXT:    movdqa %xmm1, %xmm4
973; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
974; SSE2-NEXT:    psraw $8, %xmm4
975; SSE2-NEXT:    pmullw %xmm9, %xmm4
976; SSE2-NEXT:    pand %xmm8, %xmm4
977; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
978; SSE2-NEXT:    psraw $8, %xmm5
979; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
980; SSE2-NEXT:    psraw $8, %xmm1
981; SSE2-NEXT:    pmullw %xmm5, %xmm1
982; SSE2-NEXT:    pand %xmm8, %xmm1
983; SSE2-NEXT:    packuswb %xmm4, %xmm1
984; SSE2-NEXT:    movdqa %xmm6, %xmm4
985; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
986; SSE2-NEXT:    psraw $8, %xmm4
987; SSE2-NEXT:    movdqa %xmm2, %xmm5
988; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
989; SSE2-NEXT:    psraw $8, %xmm5
990; SSE2-NEXT:    pmullw %xmm4, %xmm5
991; SSE2-NEXT:    pand %xmm8, %xmm5
992; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
993; SSE2-NEXT:    psraw $8, %xmm6
994; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
995; SSE2-NEXT:    psraw $8, %xmm2
996; SSE2-NEXT:    pmullw %xmm6, %xmm2
997; SSE2-NEXT:    pand %xmm8, %xmm2
998; SSE2-NEXT:    packuswb %xmm5, %xmm2
999; SSE2-NEXT:    movdqa %xmm7, %xmm4
1000; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1001; SSE2-NEXT:    psraw $8, %xmm4
1002; SSE2-NEXT:    movdqa %xmm3, %xmm5
1003; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1004; SSE2-NEXT:    psraw $8, %xmm5
1005; SSE2-NEXT:    pmullw %xmm4, %xmm5
1006; SSE2-NEXT:    pand %xmm8, %xmm5
1007; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1008; SSE2-NEXT:    psraw $8, %xmm7
1009; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1010; SSE2-NEXT:    psraw $8, %xmm3
1011; SSE2-NEXT:    pmullw %xmm7, %xmm3
1012; SSE2-NEXT:    pand %xmm8, %xmm3
1013; SSE2-NEXT:    packuswb %xmm5, %xmm3
1014; SSE2-NEXT:    retq
1015;
1016; SSE41-LABEL: mul_v64i8:
1017; SSE41:       # BB#0: # %entry
1018; SSE41-NEXT:    movdqa %xmm1, %xmm8
1019; SSE41-NEXT:    movdqa %xmm0, %xmm1
1020; SSE41-NEXT:    pmovsxbw %xmm4, %xmm9
1021; SSE41-NEXT:    pmovsxbw %xmm1, %xmm0
1022; SSE41-NEXT:    pmullw %xmm9, %xmm0
1023; SSE41-NEXT:    movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255]
1024; SSE41-NEXT:    pand %xmm9, %xmm0
1025; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
1026; SSE41-NEXT:    pmovsxbw %xmm4, %xmm4
1027; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
1028; SSE41-NEXT:    pmovsxbw %xmm1, %xmm1
1029; SSE41-NEXT:    pmullw %xmm4, %xmm1
1030; SSE41-NEXT:    pand %xmm9, %xmm1
1031; SSE41-NEXT:    packuswb %xmm1, %xmm0
1032; SSE41-NEXT:    pmovsxbw %xmm5, %xmm4
1033; SSE41-NEXT:    pmovsxbw %xmm8, %xmm1
1034; SSE41-NEXT:    pmullw %xmm4, %xmm1
1035; SSE41-NEXT:    pand %xmm9, %xmm1
1036; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[2,3,0,1]
1037; SSE41-NEXT:    pmovsxbw %xmm4, %xmm4
1038; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm8[2,3,0,1]
1039; SSE41-NEXT:    pmovsxbw %xmm5, %xmm5
1040; SSE41-NEXT:    pmullw %xmm4, %xmm5
1041; SSE41-NEXT:    pand %xmm9, %xmm5
1042; SSE41-NEXT:    packuswb %xmm5, %xmm1
1043; SSE41-NEXT:    pmovsxbw %xmm6, %xmm5
1044; SSE41-NEXT:    pmovsxbw %xmm2, %xmm4
1045; SSE41-NEXT:    pmullw %xmm5, %xmm4
1046; SSE41-NEXT:    pand %xmm9, %xmm4
1047; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[2,3,0,1]
1048; SSE41-NEXT:    pmovsxbw %xmm5, %xmm5
1049; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
1050; SSE41-NEXT:    pmovsxbw %xmm2, %xmm2
1051; SSE41-NEXT:    pmullw %xmm5, %xmm2
1052; SSE41-NEXT:    pand %xmm9, %xmm2
1053; SSE41-NEXT:    packuswb %xmm2, %xmm4
1054; SSE41-NEXT:    pmovsxbw %xmm7, %xmm2
1055; SSE41-NEXT:    pmovsxbw %xmm3, %xmm5
1056; SSE41-NEXT:    pmullw %xmm2, %xmm5
1057; SSE41-NEXT:    pand %xmm9, %xmm5
1058; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm7[2,3,0,1]
1059; SSE41-NEXT:    pmovsxbw %xmm2, %xmm2
1060; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
1061; SSE41-NEXT:    pmovsxbw %xmm3, %xmm3
1062; SSE41-NEXT:    pmullw %xmm2, %xmm3
1063; SSE41-NEXT:    pand %xmm9, %xmm3
1064; SSE41-NEXT:    packuswb %xmm3, %xmm5
1065; SSE41-NEXT:    movdqa %xmm4, %xmm2
1066; SSE41-NEXT:    movdqa %xmm5, %xmm3
1067; SSE41-NEXT:    retq
1068;
1069; AVX2-LABEL: mul_v64i8:
1070; AVX2:       # BB#0: # %entry
1071; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm4
1072; AVX2-NEXT:    vpmovsxbw %xmm4, %ymm4
1073; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm5
1074; AVX2-NEXT:    vpmovsxbw %xmm5, %ymm5
1075; AVX2-NEXT:    vpmullw %ymm4, %ymm5, %ymm5
1076; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm6
1077; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1078; AVX2-NEXT:    vpshufb %xmm4, %xmm6, %xmm6
1079; AVX2-NEXT:    vpshufb %xmm4, %xmm5, %xmm5
1080; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
1081; AVX2-NEXT:    vpmovsxbw %xmm2, %ymm2
1082; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
1083; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
1084; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
1085; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
1086; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
1087; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
1088; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm0
1089; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm2
1090; AVX2-NEXT:    vpmovsxbw %xmm2, %ymm2
1091; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5
1092; AVX2-NEXT:    vpmovsxbw %xmm5, %ymm5
1093; AVX2-NEXT:    vpmullw %ymm2, %ymm5, %ymm2
1094; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm5
1095; AVX2-NEXT:    vpshufb %xmm4, %xmm5, %xmm5
1096; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
1097; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
1098; AVX2-NEXT:    vpmovsxbw %xmm3, %ymm3
1099; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
1100; AVX2-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
1101; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
1102; AVX2-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
1103; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
1104; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
1105; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
1106; AVX2-NEXT:    retq
1107;
1108; AVX512F-LABEL: mul_v64i8:
1109; AVX512F:       # BB#0: # %entry
1110; AVX512F-NEXT:    vpmovsxbw %xmm2, %ymm4
1111; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm5
1112; AVX512F-NEXT:    vpmullw %ymm4, %ymm5, %ymm4
1113; AVX512F-NEXT:    vpmovsxwd %ymm4, %zmm4
1114; AVX512F-NEXT:    vpmovdb %zmm4, %xmm4
1115; AVX512F-NEXT:    vextracti128 $1, %ymm2, %xmm2
1116; AVX512F-NEXT:    vpmovsxbw %xmm2, %ymm2
1117; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
1118; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm0
1119; AVX512F-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
1120; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
1121; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
1122; AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm0
1123; AVX512F-NEXT:    vpmovsxbw %xmm3, %ymm2
1124; AVX512F-NEXT:    vpmovsxbw %xmm1, %ymm4
1125; AVX512F-NEXT:    vpmullw %ymm2, %ymm4, %ymm2
1126; AVX512F-NEXT:    vpmovsxwd %ymm2, %zmm2
1127; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
1128; AVX512F-NEXT:    vextracti128 $1, %ymm3, %xmm3
1129; AVX512F-NEXT:    vpmovsxbw %xmm3, %ymm3
1130; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm1
1131; AVX512F-NEXT:    vpmovsxbw %xmm1, %ymm1
1132; AVX512F-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
1133; AVX512F-NEXT:    vpmovsxwd %ymm1, %zmm1
1134; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
1135; AVX512F-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
1136; AVX512F-NEXT:    retq
1137;
1138; AVX512BW-LABEL: mul_v64i8:
1139; AVX512BW:       # BB#0: # %entry
1140; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm2
1141; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm3
1142; AVX512BW-NEXT:    vpmullw %zmm2, %zmm3, %zmm2
1143; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
1144; AVX512BW-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
1145; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
1146; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
1147; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
1148; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
1149; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
1150; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
1151; AVX512BW-NEXT:    retq
1152entry:
1153  %A = mul <64 x i8> %i, %j
1154  ret <64 x i8> %A
1155}