; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
;
; Codegen test for the vector integer 'mul' instruction on x86-64.
; Functions whose names end in "c" multiply by a splat of the constant 117;
; the others multiply two vector arguments. Every function is compiled five
; times (no -mattr, +sse4.1, +avx2, +avx512f, +avx512bw) and the emitted
; assembly is matched against the autogenerated check lines under the
; corresponding prefixes. Do not edit the check lines by hand; regenerate
; them with the script named in the NOTE line above.

; <16 x i8> multiplied by a splat of the constant 117.
define <16 x i8> @mul_v16i8c(<16 x i8> %i) nounwind {
; SSE2-LABEL: mul_v16i8c:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; SSE2-NEXT: psraw $8, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: pmullw %xmm1, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm0
; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v16i8c:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmovsxbw %xmm0, %xmm1
; SSE41-NEXT: pmovsxbw {{.*}}(%rip), %xmm2
; SSE41-NEXT: pmullw %xmm2, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm3, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
; SSE41-NEXT: pmullw %xmm2, %xmm0
; SSE41-NEXT: pand %xmm3, %xmm0
; SSE41-NEXT: packuswb %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v16i8c:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmovsxbw {{.*}}(%rip), %ymm1
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mul_v16i8c:
; AVX512F: # BB#0: # %entry
; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512F-NEXT: vpmovsxbw {{.*}}(%rip), %ymm1
; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_v16i8c:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT: vpmovsxbw {{.*}}(%rip), %ymm1
; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512BW-NEXT: retq
entry:
  %A = mul <16 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 >
  ret <16 x i8> %A
}

; <8 x i16> multiplied by a splat of the constant 117.
define <8 x i16> @mul_v8i16c(<8 x i16> %i) nounwind {
; SSE-LABEL: mul_v8i16c:
; SSE: # BB#0: # %entry
; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v8i16c:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
entry:
  %A = mul <8 x i16> %i, < i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117 >
  ret <8 x i16> %A
}

; <4 x i32> multiplied by a splat of the constant 117.
define <4 x i32> @mul_v4i32c(<4 x i32> %i) nounwind {
; SSE2-LABEL: mul_v4i32c:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [117,117,117,117]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v4i32c:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v4i32c:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
  %A = mul <4 x i32> %i, < i32 117, i32 117, i32 117, i32 117 >
  ret <4 x i32> %A
}

; <2 x i64> multiplied by a splat of the constant 117.
define <2 x i64> @mul_v2i64c(<2 x i64> %i) nounwind {
; SSE-LABEL: mul_v2i64c:
; SSE: # BB#0: # %entry
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [117,117]
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
; SSE-NEXT: psrlq $32, %xmm0
; SSE-NEXT: pmuludq %xmm1, %xmm0
; SSE-NEXT: psllq $32, %xmm0
; SSE-NEXT: paddq %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v2i64c:
; AVX: # BB#0: # %entry
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [117,117]
; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
entry:
  %A = mul <2 x i64> %i, < i64 117, i64 117 >
  ret <2 x i64> %A
}

; <16 x i8> multiplied by a second <16 x i8> argument.
define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind {
; SSE2-LABEL: mul_v16i8:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm3
; SSE2-NEXT: pmullw %xmm2, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm0
; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v16i8:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmovsxbw %xmm1, %xmm3
; SSE41-NEXT: pmovsxbw %xmm0, %xmm2
; SSE41-NEXT: pmullw %xmm3, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm3, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE41-NEXT: pmovsxbw %xmm1, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
; SSE41-NEXT: pmullw %xmm1, %xmm0
; SSE41-NEXT: pand %xmm3, %xmm0
; SSE41-NEXT: packuswb %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v16i8:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mul_v16i8:
; AVX512F: # BB#0: # %entry
; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_v16i8:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512BW-NEXT: retq
entry:
  %A = mul <16 x i8> %i, %j
  ret <16 x i8> %A
}

; <8 x i16> multiplied by a second <8 x i16> argument.
define <8 x i16> @mul_v8i16(<8 x i16> %i, <8 x i16> %j) nounwind {
; SSE-LABEL: mul_v8i16:
; SSE: # BB#0: # %entry
; SSE-NEXT: pmullw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v8i16:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
  %A = mul <8 x i16> %i, %j
  ret <8 x i16> %A
}

; <4 x i32> multiplied by a second <4 x i32> argument.
define <4 x i32> @mul_v4i32(<4 x i32> %i, <4 x i32> %j) nounwind {
; SSE2-LABEL: mul_v4i32:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v4i32:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v4i32:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
  %A = mul <4 x i32> %i, %j
  ret <4 x i32> %A
}

; <2 x i64> multiplied by a second <2 x i64> argument.
define <2 x i64> @mul_v2i64(<2 x i64> %i, <2 x i64> %j) nounwind {
; SSE-LABEL: mul_v2i64:
; SSE: # BB#0: # %entry
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: psrlq $32, %xmm3
; SSE-NEXT: pmuludq %xmm0, %xmm3
; SSE-NEXT: psllq $32, %xmm3
; SSE-NEXT: paddq %xmm3, %xmm2
; SSE-NEXT: psrlq $32, %xmm0
; SSE-NEXT: pmuludq %xmm1, %xmm0
; SSE-NEXT: psllq $32, %xmm0
; SSE-NEXT: paddq %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v2i64:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; AVX-NEXT: vpsrlq $32, %xmm1, %xmm3
; AVX-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX-NEXT: vpsllq $32, %xmm3, %xmm3
; AVX-NEXT: vpaddq %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
entry:
  %A = mul <2 x i64> %i, %j
  ret <2 x i64> %A
}

; External function declared without a body; the two "spill" functions that
; follow call it before multiplying.
declare void @foo()

; Same multiply as @mul_v4i32, but issued after a call to @foo. The expected
; assembly stores both arguments to the stack before the call and reloads
; them (or folds the reload into the multiply) afterwards.
define <4 x i32> @mul_v4i32spill(<4 x i32> %i, <4 x i32> %j) nounwind {
; SSE2-LABEL: mul_v4i32spill:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: subq $40, %rsp
; SSE2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT: callq foo
; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: addq $40, %rsp
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v4i32spill:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: subq $40, %rsp
; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE41-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; SSE41-NEXT: callq foo
; SSE41-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE41-NEXT: pmulld {{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
; SSE41-NEXT: addq $40, %rsp
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v4i32spill:
; AVX: # BB#0: # %entry
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: callq foo
; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT: vpmulld {{[0-9]+}}(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: retq
entry:
  ; Use a call to force spills.
  call void @foo()
  %A = mul <4 x i32> %i, %j
  ret <4 x i32> %A
}

; Same multiply as @mul_v2i64, but issued after a call to @foo. The expected
; assembly stores both arguments to the stack before the call and reloads
; them afterwards.
define <2 x i64> @mul_v2i64spill(<2 x i64> %i, <2 x i64> %j) nounwind {
; SSE-LABEL: mul_v2i64spill:
; SSE: # BB#0: # %entry
; SSE-NEXT: subq $40, %rsp
; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT: callq foo
; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
; SSE-NEXT: pmuludq %xmm3, %xmm2
; SSE-NEXT: movdqa %xmm3, %xmm1
; SSE-NEXT: psrlq $32, %xmm1
; SSE-NEXT: pmuludq %xmm0, %xmm1
; SSE-NEXT: psllq $32, %xmm1
; SSE-NEXT: paddq %xmm1, %xmm2
; SSE-NEXT: psrlq $32, %xmm0
; SSE-NEXT: pmuludq %xmm3, %xmm0
; SSE-NEXT: psllq $32, %xmm0
; SSE-NEXT: paddq %xmm2, %xmm0
; SSE-NEXT: addq $40, %rsp
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v2i64spill:
; AVX: # BB#0: # %entry
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: callq foo
; AVX-NEXT: vmovdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; AVX-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload
; AVX-NEXT: vpmuludq %xmm2, %xmm3, %xmm0
; AVX-NEXT: vpsrlq $32, %xmm2, %xmm1
; AVX-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
; AVX-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrlq $32, %xmm3, %xmm1
; AVX-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: retq
entry:
  ; Use a call to force spills.
  call void @foo()
  %A = mul <2 x i64> %i, %j
  ret <2 x i64> %A
}

; <32 x i8> multiplied by a splat of the constant 117.
define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind {
; SSE2-LABEL: mul_v32i8c:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm3
; SSE2-NEXT: pmullw %xmm2, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm4, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm0
; SSE2-NEXT: pmullw %xmm2, %xmm0
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm3
; SSE2-NEXT: pmullw %xmm2, %xmm3
; SSE2-NEXT: pand %xmm4, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm1
; SSE2-NEXT: pmullw %xmm2, %xmm1
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: packuswb %xmm3, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v32i8c:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmovsxbw %xmm0, %xmm2
; SSE41-NEXT: pmovsxbw {{.*}}(%rip), %xmm4
; SSE41-NEXT: pmullw %xmm4, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm5, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
; SSE41-NEXT: pmullw %xmm4, %xmm0
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: packuswb %xmm0, %xmm2
; SSE41-NEXT: pmovsxbw %xmm1, %xmm3
; SSE41-NEXT: pmullw %xmm4, %xmm3
; SSE41-NEXT: pand %xmm5, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
; SSE41-NEXT: pmullw %xmm4, %xmm0
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: packuswb %xmm0, %xmm3
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: movdqa %xmm3, %xmm1
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v32i8c:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vpmovsxbw {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mul_v32i8c:
; AVX512F: # BB#0: # %entry
; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX512F-NEXT: vpmovsxbw {{.*}}(%rip), %ymm2
; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_v32i8c:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: vmovaps {{.*#+}} ymm1 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
entry:
  %A = mul <32 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 >
  ret <32 x i8> %A
}

; <16 x i16> multiplied by a splat of the constant 117.
define <16 x i16> @mul_v16i16c(<16 x i16> %i) nounwind {
; SSE-LABEL: mul_v16i16c:
; SSE: # BB#0: # %entry
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117]
; SSE-NEXT: pmullw %xmm2, %xmm0
; SSE-NEXT: pmullw %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v16i16c:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT: retq
entry:
  %A = mul <16 x i16> %i, < i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117 >
  ret <16 x i16> %A
}

; <8 x i32> multiplied by a splat of the constant 117.
define <8 x i32> @mul_v8i32c(<8 x i32> %i) nounwind {
; SSE2-LABEL: mul_v8i32c:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v8i32c:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117]
; SSE41-NEXT: pmulld %xmm2, %xmm0
; SSE41-NEXT: pmulld %xmm2, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v8i32c:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
; AVX-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
entry:
  %A = mul <8 x i32> %i, < i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117 >
  ret <8 x i32> %A
}

; <4 x i64> multiplied by a splat of the constant 117.
define <4 x i64> @mul_v4i64c(<4 x i64> %i) nounwind {
; SSE-LABEL: mul_v4i64c:
; SSE: # BB#0: # %entry
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [117,117]
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: pmuludq %xmm2, %xmm3
; SSE-NEXT: psrlq $32, %xmm0
; SSE-NEXT: pmuludq %xmm2, %xmm0
; SSE-NEXT: psllq $32, %xmm0
; SSE-NEXT: paddq %xmm3, %xmm0
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: pmuludq %xmm2, %xmm3
; SSE-NEXT: psrlq $32, %xmm1
; SSE-NEXT: pmuludq %xmm2, %xmm1
; SSE-NEXT: psllq $32, %xmm1
; SSE-NEXT: paddq %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v4i64c:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1
; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
; AVX-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpsllq $32, %ymm0, %ymm0
; AVX-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX-NEXT: retq
entry:
  %A = mul <4 x i64> %i, < i64 117, i64 117, i64 117, i64 117 >
  ret <4 x i64> %A
}

; <32 x i8> multiplied by a second <32 x i8> argument.
define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind {
; SSE2-LABEL: mul_v32i8:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm5
; SSE2-NEXT: pmullw %xmm4, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm4, %xmm5
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm0
; SSE2-NEXT: pmullw %xmm2, %xmm0
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: packuswb %xmm5, %xmm0
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm5
; SSE2-NEXT: pmullw %xmm2, %xmm5
; SSE2-NEXT: pand %xmm4, %xmm5
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm1
; SSE2-NEXT: pmullw %xmm3, %xmm1
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: packuswb %xmm5, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v32i8:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmovsxbw %xmm2, %xmm5
; SSE41-NEXT: pmovsxbw %xmm0, %xmm4
; SSE41-NEXT: pmullw %xmm5, %xmm4
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm5, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE41-NEXT: pmovsxbw %xmm2, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
; SSE41-NEXT: pmullw %xmm2, %xmm0
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: packuswb %xmm0, %xmm4
; SSE41-NEXT: pmovsxbw %xmm3, %xmm0
; SSE41-NEXT: pmovsxbw %xmm1, %xmm2
; SSE41-NEXT: pmullw %xmm0, %xmm2
; SSE41-NEXT: pand %xmm5, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE41-NEXT: pmovsxbw %xmm1, %xmm1
; SSE41-NEXT: pmullw %xmm0, %xmm1
; SSE41-NEXT: pand %xmm5, %xmm1
; SSE41-NEXT: packuswb %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: movdqa %xmm2, %xmm1
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v32i8:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX2-NEXT: vpmovsxbw %xmm3, %ymm3
; AVX2-NEXT: vpmullw %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mul_v32i8:
; AVX512F: # BB#0: # %entry
; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm2
; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm3
; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_v32i8:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
entry:
  %A = mul <32 x i8> %i, %j
  ret <32 x i8> %A
}

; <16 x i16> multiplied by a second <16 x i16> argument.
define <16 x i16> @mul_v16i16(<16 x i16> %i, <16 x i16> %j) nounwind {
; SSE-LABEL: mul_v16i16:
; SSE: # BB#0: # %entry
; SSE-NEXT: pmullw %xmm2, %xmm0
; SSE-NEXT: pmullw %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v16i16:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
entry:
  %A = mul <16 x i16> %i, %j
  ret <16 x i16> %A
}

; <8 x i32> multiplied by a second <8 x i32> argument.
define <8 x i32> @mul_v8i32(<8 x i32> %i, <8 x i32> %j) nounwind {
; SSE2-LABEL: mul_v8i32:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v8i32:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmulld %xmm2, %xmm0
; SSE41-NEXT: pmulld %xmm3, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v8i32:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
entry:
  %A = mul <8 x i32> %i, %j
  ret <8 x i32> %A
}

; <4 x i64> multiplied by a second <4 x i64> argument.
define <4 x i64> @mul_v4i64(<4 x i64> %i, <4 x i64> %j) nounwind {
; SSE-LABEL: mul_v4i64:
; SSE: # BB#0: # %entry
; SSE-NEXT: movdqa %xmm0, %xmm4
; SSE-NEXT: pmuludq %xmm2, %xmm4
; SSE-NEXT: movdqa %xmm2, %xmm5
; SSE-NEXT: psrlq $32, %xmm5
; SSE-NEXT: pmuludq %xmm0, %xmm5
; SSE-NEXT: psllq $32, %xmm5
; SSE-NEXT: paddq %xmm5, %xmm4
; SSE-NEXT: psrlq $32, %xmm0
; SSE-NEXT: pmuludq %xmm2, %xmm0
; SSE-NEXT: psllq $32, %xmm0
; SSE-NEXT: paddq %xmm4, %xmm0
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: pmuludq %xmm3, %xmm2
; SSE-NEXT: movdqa %xmm3, %xmm4
; SSE-NEXT: psrlq $32, %xmm4
; SSE-NEXT: pmuludq %xmm1, %xmm4
; SSE-NEXT: psllq $32, %xmm4
; SSE-NEXT: paddq %xmm4, %xmm2
; SSE-NEXT: psrlq $32, %xmm1
; SSE-NEXT: pmuludq %xmm3, %xmm1
; SSE-NEXT: psllq $32, %xmm1
; SSE-NEXT: paddq %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v4i64:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
; AVX-NEXT: vpsrlq $32, %ymm1, %ymm3
; AVX-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
; AVX-NEXT: vpsllq $32, %ymm3, %ymm3
; AVX-NEXT: vpaddq %ymm3, %ymm2, %ymm2
; AVX-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpsllq $32, %ymm0, %ymm0
; AVX-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX-NEXT: retq
entry:
  %A = mul <4 x i64> %i, %j
  ret <4 x i64> %A
}

define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind {
; SSE2-LABEL: mul_v64i8c:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; SSE2-NEXT: psraw $8, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm6
; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm6
; SSE2-NEXT: pmullw %xmm4, %xmm6
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm5, %xmm6
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm0
; SSE2-NEXT:
pmullw %xmm4, %xmm0 797; SSE2-NEXT: pand %xmm5, %xmm0 798; SSE2-NEXT: packuswb %xmm6, %xmm0 799; SSE2-NEXT: movdqa %xmm1, %xmm6 800; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 801; SSE2-NEXT: psraw $8, %xmm6 802; SSE2-NEXT: pmullw %xmm4, %xmm6 803; SSE2-NEXT: pand %xmm5, %xmm6 804; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 805; SSE2-NEXT: psraw $8, %xmm1 806; SSE2-NEXT: pmullw %xmm4, %xmm1 807; SSE2-NEXT: pand %xmm5, %xmm1 808; SSE2-NEXT: packuswb %xmm6, %xmm1 809; SSE2-NEXT: movdqa %xmm2, %xmm6 810; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 811; SSE2-NEXT: psraw $8, %xmm6 812; SSE2-NEXT: pmullw %xmm4, %xmm6 813; SSE2-NEXT: pand %xmm5, %xmm6 814; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 815; SSE2-NEXT: psraw $8, %xmm2 816; SSE2-NEXT: pmullw %xmm4, %xmm2 817; SSE2-NEXT: pand %xmm5, %xmm2 818; SSE2-NEXT: packuswb %xmm6, %xmm2 819; SSE2-NEXT: movdqa %xmm3, %xmm6 820; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 821; SSE2-NEXT: psraw $8, %xmm6 822; SSE2-NEXT: pmullw %xmm4, %xmm6 823; SSE2-NEXT: pand %xmm5, %xmm6 824; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 825; SSE2-NEXT: psraw $8, %xmm3 826; SSE2-NEXT: pmullw %xmm4, %xmm3 827; SSE2-NEXT: pand %xmm5, %xmm3 828; SSE2-NEXT: packuswb %xmm6, %xmm3 829; SSE2-NEXT: retq 830; 831; SSE41-LABEL: mul_v64i8c: 832; SSE41: # BB#0: # %entry 833; SSE41-NEXT: movdqa %xmm1, %xmm4 834; SSE41-NEXT: movdqa %xmm0, %xmm1 835; SSE41-NEXT: pmovsxbw %xmm1, %xmm0 836; SSE41-NEXT: pmovsxbw {{.*}}(%rip), %xmm6 837; SSE41-NEXT: pmullw %xmm6, %xmm0 838; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] 839; SSE41-NEXT: pand %xmm7, %xmm0 840; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 841; SSE41-NEXT: pmovsxbw %xmm1, %xmm1 842; SSE41-NEXT: pmullw %xmm6, %xmm1 843; SSE41-NEXT: pand %xmm7, %xmm1 
844; SSE41-NEXT: packuswb %xmm1, %xmm0 845; SSE41-NEXT: pmovsxbw %xmm4, %xmm1 846; SSE41-NEXT: pmullw %xmm6, %xmm1 847; SSE41-NEXT: pand %xmm7, %xmm1 848; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] 849; SSE41-NEXT: pmovsxbw %xmm4, %xmm4 850; SSE41-NEXT: pmullw %xmm6, %xmm4 851; SSE41-NEXT: pand %xmm7, %xmm4 852; SSE41-NEXT: packuswb %xmm4, %xmm1 853; SSE41-NEXT: pmovsxbw %xmm2, %xmm4 854; SSE41-NEXT: pmullw %xmm6, %xmm4 855; SSE41-NEXT: pand %xmm7, %xmm4 856; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 857; SSE41-NEXT: pmovsxbw %xmm2, %xmm2 858; SSE41-NEXT: pmullw %xmm6, %xmm2 859; SSE41-NEXT: pand %xmm7, %xmm2 860; SSE41-NEXT: packuswb %xmm2, %xmm4 861; SSE41-NEXT: pmovsxbw %xmm3, %xmm5 862; SSE41-NEXT: pmullw %xmm6, %xmm5 863; SSE41-NEXT: pand %xmm7, %xmm5 864; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] 865; SSE41-NEXT: pmovsxbw %xmm2, %xmm2 866; SSE41-NEXT: pmullw %xmm6, %xmm2 867; SSE41-NEXT: pand %xmm7, %xmm2 868; SSE41-NEXT: packuswb %xmm2, %xmm5 869; SSE41-NEXT: movdqa %xmm4, %xmm2 870; SSE41-NEXT: movdqa %xmm5, %xmm3 871; SSE41-NEXT: retq 872; 873; AVX2-LABEL: mul_v64i8c: 874; AVX2: # BB#0: # %entry 875; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 876; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2 877; AVX2-NEXT: vpmovsxbw {{.*}}(%rip), %ymm3 878; AVX2-NEXT: vpmullw %ymm3, %ymm2, %ymm2 879; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 880; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 881; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4 882; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2 883; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] 884; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 885; AVX2-NEXT: vpmullw %ymm3, %ymm0, %ymm0 886; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 887; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4 888; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0 889; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] 890; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 891; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 892; AVX2-NEXT: vpmovsxbw %xmm2, 
%ymm2 893; AVX2-NEXT: vpmullw %ymm3, %ymm2, %ymm2 894; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 895; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4 896; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2 897; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] 898; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1 899; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1 900; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 901; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3 902; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1 903; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] 904; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 905; AVX2-NEXT: retq 906; 907; AVX512F-LABEL: mul_v64i8c: 908; AVX512F: # BB#0: # %entry 909; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm2 910; AVX512F-NEXT: vpmovsxbw {{.*}}(%rip), %ymm3 911; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2 912; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2 913; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 914; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 915; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0 916; AVX512F-NEXT: vpmullw %ymm3, %ymm0, %ymm0 917; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 918; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 919; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 920; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm2 921; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2 922; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2 923; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 924; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 925; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm1 926; AVX512F-NEXT: vpmullw %ymm3, %ymm1, %ymm1 927; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1 928; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 929; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 930; AVX512F-NEXT: retq 931; 932; AVX512BW-LABEL: mul_v64i8c: 933; AVX512BW: # BB#0: # %entry 934; AVX512BW-NEXT: vmovaps {{.*#+}} ymm1 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] 935; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1 936; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm2 937; AVX512BW-NEXT: vpmullw %zmm1, %zmm2, 
%zmm2 938; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2 939; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 940; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 941; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 942; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 943; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 944; AVX512BW-NEXT: retq 945entry: 946 %A = mul <64 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 > 947 ret <64 x i8> %A 948} 949 950define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind { 951; SSE2-LABEL: mul_v64i8: 952; SSE2: # BB#0: # %entry 953; SSE2-NEXT: movdqa %xmm4, %xmm8 954; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 955; SSE2-NEXT: psraw $8, %xmm8 956; SSE2-NEXT: movdqa %xmm0, %xmm9 957; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 958; SSE2-NEXT: psraw $8, %xmm9 959; SSE2-NEXT: pmullw %xmm8, %xmm9 960; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] 961; SSE2-NEXT: pand %xmm8, %xmm9 962; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 963; SSE2-NEXT: psraw $8, %xmm4 964; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 965; SSE2-NEXT: psraw $8, %xmm0 966; SSE2-NEXT: pmullw %xmm4, %xmm0 967; SSE2-NEXT: pand %xmm8, %xmm0 968; SSE2-NEXT: packuswb %xmm9, %xmm0 969; SSE2-NEXT: movdqa %xmm5, %xmm9 970; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 971; SSE2-NEXT: psraw $8, %xmm9 972; SSE2-NEXT: movdqa 
%xmm1, %xmm4 973; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 974; SSE2-NEXT: psraw $8, %xmm4 975; SSE2-NEXT: pmullw %xmm9, %xmm4 976; SSE2-NEXT: pand %xmm8, %xmm4 977; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 978; SSE2-NEXT: psraw $8, %xmm5 979; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 980; SSE2-NEXT: psraw $8, %xmm1 981; SSE2-NEXT: pmullw %xmm5, %xmm1 982; SSE2-NEXT: pand %xmm8, %xmm1 983; SSE2-NEXT: packuswb %xmm4, %xmm1 984; SSE2-NEXT: movdqa %xmm6, %xmm4 985; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 986; SSE2-NEXT: psraw $8, %xmm4 987; SSE2-NEXT: movdqa %xmm2, %xmm5 988; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 989; SSE2-NEXT: psraw $8, %xmm5 990; SSE2-NEXT: pmullw %xmm4, %xmm5 991; SSE2-NEXT: pand %xmm8, %xmm5 992; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 993; SSE2-NEXT: psraw $8, %xmm6 994; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 995; SSE2-NEXT: psraw $8, %xmm2 996; SSE2-NEXT: pmullw %xmm6, %xmm2 997; SSE2-NEXT: pand %xmm8, %xmm2 998; SSE2-NEXT: packuswb %xmm5, %xmm2 999; SSE2-NEXT: movdqa %xmm7, %xmm4 1000; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1001; SSE2-NEXT: psraw $8, %xmm4 1002; SSE2-NEXT: movdqa %xmm3, %xmm5 1003; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1004; SSE2-NEXT: psraw $8, %xmm5 1005; SSE2-NEXT: pmullw %xmm4, %xmm5 1006; SSE2-NEXT: pand %xmm8, %xmm5 1007; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1008; SSE2-NEXT: psraw $8, %xmm7 1009; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1010; SSE2-NEXT: psraw $8, %xmm3 1011; SSE2-NEXT: pmullw %xmm7, %xmm3 1012; SSE2-NEXT: pand %xmm8, %xmm3 1013; SSE2-NEXT: packuswb %xmm5, %xmm3 
1014; SSE2-NEXT: retq 1015; 1016; SSE41-LABEL: mul_v64i8: 1017; SSE41: # BB#0: # %entry 1018; SSE41-NEXT: movdqa %xmm1, %xmm8 1019; SSE41-NEXT: movdqa %xmm0, %xmm1 1020; SSE41-NEXT: pmovsxbw %xmm4, %xmm9 1021; SSE41-NEXT: pmovsxbw %xmm1, %xmm0 1022; SSE41-NEXT: pmullw %xmm9, %xmm0 1023; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255] 1024; SSE41-NEXT: pand %xmm9, %xmm0 1025; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] 1026; SSE41-NEXT: pmovsxbw %xmm4, %xmm4 1027; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 1028; SSE41-NEXT: pmovsxbw %xmm1, %xmm1 1029; SSE41-NEXT: pmullw %xmm4, %xmm1 1030; SSE41-NEXT: pand %xmm9, %xmm1 1031; SSE41-NEXT: packuswb %xmm1, %xmm0 1032; SSE41-NEXT: pmovsxbw %xmm5, %xmm4 1033; SSE41-NEXT: pmovsxbw %xmm8, %xmm1 1034; SSE41-NEXT: pmullw %xmm4, %xmm1 1035; SSE41-NEXT: pand %xmm9, %xmm1 1036; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,0,1] 1037; SSE41-NEXT: pmovsxbw %xmm4, %xmm4 1038; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,0,1] 1039; SSE41-NEXT: pmovsxbw %xmm5, %xmm5 1040; SSE41-NEXT: pmullw %xmm4, %xmm5 1041; SSE41-NEXT: pand %xmm9, %xmm5 1042; SSE41-NEXT: packuswb %xmm5, %xmm1 1043; SSE41-NEXT: pmovsxbw %xmm6, %xmm5 1044; SSE41-NEXT: pmovsxbw %xmm2, %xmm4 1045; SSE41-NEXT: pmullw %xmm5, %xmm4 1046; SSE41-NEXT: pand %xmm9, %xmm4 1047; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,3,0,1] 1048; SSE41-NEXT: pmovsxbw %xmm5, %xmm5 1049; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 1050; SSE41-NEXT: pmovsxbw %xmm2, %xmm2 1051; SSE41-NEXT: pmullw %xmm5, %xmm2 1052; SSE41-NEXT: pand %xmm9, %xmm2 1053; SSE41-NEXT: packuswb %xmm2, %xmm4 1054; SSE41-NEXT: pmovsxbw %xmm7, %xmm2 1055; SSE41-NEXT: pmovsxbw %xmm3, %xmm5 1056; SSE41-NEXT: pmullw %xmm2, %xmm5 1057; SSE41-NEXT: pand %xmm9, %xmm5 1058; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,0,1] 1059; SSE41-NEXT: pmovsxbw %xmm2, %xmm2 1060; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] 1061; SSE41-NEXT: pmovsxbw %xmm3, %xmm3 1062; SSE41-NEXT: pmullw %xmm2, %xmm3 
1063; SSE41-NEXT: pand %xmm9, %xmm3 1064; SSE41-NEXT: packuswb %xmm3, %xmm5 1065; SSE41-NEXT: movdqa %xmm4, %xmm2 1066; SSE41-NEXT: movdqa %xmm5, %xmm3 1067; SSE41-NEXT: retq 1068; 1069; AVX2-LABEL: mul_v64i8: 1070; AVX2: # BB#0: # %entry 1071; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 1072; AVX2-NEXT: vpmovsxbw %xmm4, %ymm4 1073; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5 1074; AVX2-NEXT: vpmovsxbw %xmm5, %ymm5 1075; AVX2-NEXT: vpmullw %ymm4, %ymm5, %ymm5 1076; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 1077; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 1078; AVX2-NEXT: vpshufb %xmm4, %xmm6, %xmm6 1079; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm5 1080; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] 1081; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2 1082; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 1083; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0 1084; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 1085; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 1086; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 1087; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 1088; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 1089; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm2 1090; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2 1091; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5 1092; AVX2-NEXT: vpmovsxbw %xmm5, %ymm5 1093; AVX2-NEXT: vpmullw %ymm2, %ymm5, %ymm2 1094; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm5 1095; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm5 1096; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 1097; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] 1098; AVX2-NEXT: vpmovsxbw %xmm3, %ymm3 1099; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1 1100; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1 1101; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 1102; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3 1103; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1 1104; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] 1105; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 1106; AVX2-NEXT: retq 1107; 1108; AVX512F-LABEL: mul_v64i8: 1109; AVX512F: # BB#0: # %entry 
1110; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm4 1111; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm5 1112; AVX512F-NEXT: vpmullw %ymm4, %ymm5, %ymm4 1113; AVX512F-NEXT: vpmovsxwd %ymm4, %zmm4 1114; AVX512F-NEXT: vpmovdb %zmm4, %xmm4 1115; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2 1116; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2 1117; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 1118; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0 1119; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm0 1120; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 1121; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 1122; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 1123; AVX512F-NEXT: vpmovsxbw %xmm3, %ymm2 1124; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm4 1125; AVX512F-NEXT: vpmullw %ymm2, %ymm4, %ymm2 1126; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2 1127; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 1128; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm3 1129; AVX512F-NEXT: vpmovsxbw %xmm3, %ymm3 1130; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 1131; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm1 1132; AVX512F-NEXT: vpmullw %ymm3, %ymm1, %ymm1 1133; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1 1134; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 1135; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 1136; AVX512F-NEXT: retq 1137; 1138; AVX512BW-LABEL: mul_v64i8: 1139; AVX512BW: # BB#0: # %entry 1140; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm2 1141; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm3 1142; AVX512BW-NEXT: vpmullw %zmm2, %zmm3, %zmm2 1143; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2 1144; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1 1145; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1 1146; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 1147; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 1148; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 1149; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1150; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 1151; AVX512BW-NEXT: retq 1152entry: 1153 %A = mul <64 x i8> %i, %j 1154 ret <64 x i8> %A 1155} 1156 1157