/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <stdint.h>
#include <x86intrin.h>

namespace renderscript {

/* Unsigned extend packed 8-bit integers (in the LSBs) into packed 32-bit integers */
static inline __m128i cvtepu8_epi32(__m128i x) {
#if defined(__SSE4_1__)
    return _mm_cvtepu8_epi32(x);
#elif defined(__SSSE3__)
    const __m128i M8to32 = _mm_set_epi32(0xffffff03, 0xffffff02, 0xffffff01, 0xffffff00);
    x = _mm_shuffle_epi8(x, M8to32);
    return x;
#else
#   error "Require at least SSSE3"
#endif
}

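/* Pack signed 32-bit integers to unsigned 16-bit with saturation
 * (_mm_packus_epi32, with an SSSE3 fallback) */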
static inline __m128i packus_epi32(__m128i lo, __m128i hi) {
#if defined(__SSE4_1__)
    return _mm_packus_epi32(lo, hi);
#elif defined(__SSSE3__)
    const __m128i C0 = _mm_set_epi32(0x0000, 0x0000, 0x0000, 0x0000);
    const __m128i C1 = _mm_set_epi32(0xffff, 0xffff, 0xffff, 0xffff);
    const __m128i M32to16L = _mm_set_epi32(0xffffffff, 0xffffffff, 0x0d0c0908, 0x05040100);
    const __m128i M32to16H = _mm_set_epi32(0x0d0c0908, 0x05040100, 0xffffffff, 0xffffffff);
    lo = _mm_and_si128(lo, _mm_cmpgt_epi32(lo, C0));
    lo = _mm_or_si128(lo, _mm_cmpgt_epi32(lo, C1));
    hi = _mm_and_si128(hi, _mm_cmpgt_epi32(hi, C0));
    hi = _mm_or_si128(hi, _mm_cmpgt_epi32(hi, C1));
    return _mm_or_si128(_mm_shuffle_epi8(lo, M32to16L),
                        _mm_shuffle_epi8(hi, M32to16H));
#else
#   error "Require at least SSSE3"
#endif
}

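/* Multiply packed 32-bit integers, keeping the low 32 bits of each product
 * (_mm_mullo_epi32, with an SSSE3 fallback) */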
static inline __m128i mullo_epi32(__m128i x, __m128i y) {
#if defined(__SSE4_1__)
    return _mm_mullo_epi32(x, y);
#elif defined(__SSSE3__)
    const __m128i Meven = _mm_set_epi32(0x00000000, 0xffffffff, 0x00000000, 0xffffffff);
    __m128i even = _mm_mul_epu32(x, y);
    __m128i odd = _mm_mul_epu32(_mm_srli_si128(x, 4),
                                _mm_srli_si128(y, 4));
    even = _mm_and_si128(even, Meven);
    odd = _mm_and_si128(odd, Meven);
    return _mm_or_si128(even, _mm_slli_si128(odd, 4));
#else
#   error "Require at least SSSE3"
#endif
}

/* 'mask' must be packed 8-bit values, each either 0x00 or 0xff */
static inline __m128i blendv_epi8(__m128i x, __m128i y, __m128i mask) {
#if defined(__SSE4_1__)
    return _mm_blendv_epi8(x, y, mask);
#elif defined(__SSSE3__)
    return _mm_or_si128(_mm_andnot_si128(mask, x), _mm_and_si128(y, mask));
#else
#   error "Require at least SSSE3"
#endif
}

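/* 3x3 convolution over 8-bit RGBA pixels with 16-bit coefficients;
 * accumulates in 32 bits, shifts right by 8, and emits two output pixels
 * (8 bytes) per iteration */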
extern "C" void rsdIntrinsicConvolve3x3_K(void *dst, const void *y0,
                                          const void *y1, const void *y2,
                                          const short *coef, uint32_t count) {
    __m128i x;
    __m128i c0, c2, c4, c6, c8;
    __m128i r0, r1, r2;
    __m128i p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11;
    __m128i o0, o1;
    uint32_t i;

    x = _mm_loadl_epi64((const __m128i *)(coef+0));
    c0 = _mm_shuffle_epi32(x, 0x00);
    c2 = _mm_shuffle_epi32(x, 0x55);
    x = _mm_loadl_epi64((const __m128i *)(coef+4));
    c4 = _mm_shuffle_epi32(x, 0x00);
    c6 = _mm_shuffle_epi32(x, 0x55);
    x = _mm_loadl_epi64((const __m128i *)(coef+8));
    c8 = _mm_shuffle_epi32(x, 0x00);

    for (i = 0; i < count; ++i) {

        p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0)), _mm_setzero_si128());
        p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128());
        p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128());
        p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128());
        p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128());
        p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128());
        p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128());
        p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128());
        p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128());
        p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128());
        p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128());
        p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128());

        o0 = _mm_madd_epi16(_mm_unpacklo_epi16(p0, p1), c0);
        o1 = _mm_madd_epi16(_mm_unpacklo_epi16(p1, p2), c0);

        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p2, p4), c2));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p3, p5), c2));

        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p5, p6), c4));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p6, p7), c4));

        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p8, p9), c6));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p9, p10), c6));

        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p10, _mm_setzero_si128()), c8));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p11, _mm_setzero_si128()), c8));

        o0 = _mm_srai_epi32(o0, 8);
        o1 = _mm_srai_epi32(o1, 8);

        o0 = packus_epi32(o0, o1);
        o0 = _mm_packus_epi16(o0, o0);
        _mm_storel_epi64((__m128i *)dst, o0);

        y0 = (const char *)y0 + 8;
        y1 = (const char *)y1 + 8;
        y2 = (const char *)y2 + 8;
        dst = (char *)dst + 8;
    }
}

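/* Apply a 4x4 color matrix of 16-bit fixed-point coefficients to RGBA8888
 * pixels (results are shifted right by 8); processes four pixels per iteration */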
void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
                                  const short *coef, uint32_t count) {
    const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                      14, 10, 6, 2,
                                      13,  9, 5, 1,
                                      12,  8, 4, 0);

    const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
    const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
    __m128i c0, c1, c2, c3;
    __m128i i4, o4;
    __m128i xy, zw;
    __m128i x2, y2, z2, w2;
    uint32_t i;

    c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
    c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
    c0 = _mm_unpacklo_epi16(c0, c1);

    c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
    c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
    c2 = _mm_unpacklo_epi16(c2, c3);

    for (i = 0; i < count; ++i) {
        i4 = _mm_loadu_si128((const __m128i *)src);
        xy = _mm_shuffle_epi8(i4, Mxy);
        zw = _mm_shuffle_epi8(i4, Mzw);

        x2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00));
        y2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55));
        z2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa));
        w2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xff));

        x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00)));
        y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55)));
        z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa)));
        w2 = _mm_add_epi32(w2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xff)));

        x2 = _mm_srai_epi32(x2, 8);
        y2 = _mm_srai_epi32(y2, 8);
        z2 = _mm_srai_epi32(z2, 8);
        w2 = _mm_srai_epi32(w2, 8);

        x2 = packus_epi32(x2, y2);
        z2 = packus_epi32(z2, w2);
        o4 = _mm_packus_epi16(x2, z2);

        o4 = _mm_shuffle_epi8(o4, T4x4);
        _mm_storeu_si128((__m128i *)dst, o4);

        src = (const char *)src + 16;
        dst = (char *)dst + 16;
    }
}

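/* 3x3 color matrix applied to the R, G and B channels only; alpha is copied
 * through unchanged */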
void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
                                  const short *coef, uint32_t count) {
    const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                      14, 10, 6, 2,
                                      13,  9, 5, 1,
                                      12,  8, 4, 0);

    const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
    const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);

    __m128i c0, c1, c2, c3;
    __m128i i4, o4;
    __m128i xy, zw;
    __m128i x2, y2, z2, w2;
    uint32_t i;

    c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
    c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
    c0 = _mm_unpacklo_epi16(c0, c1);

    c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
    c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
    c2 = _mm_unpacklo_epi16(c2, c3);

    for (i = 0; i < count; ++i) {
        i4 = _mm_loadu_si128((const __m128i *)src);
        xy = _mm_shuffle_epi8(i4, Mxy);
        zw = _mm_shuffle_epi8(i4, Mzw);

        x2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00));
        y2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55));
        z2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa));

        x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00)));
        y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55)));
        z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa)));

        x2 = _mm_srai_epi32(x2, 8);
        y2 = _mm_srai_epi32(y2, 8);
        z2 = _mm_srai_epi32(z2, 8);
        w2 = _mm_srli_epi32(zw, 16);

        x2 = packus_epi32(x2, y2);
        z2 = packus_epi32(z2, w2);
        o4 = _mm_packus_epi16(x2, z2);

        o4 = _mm_shuffle_epi8(o4, T4x4);
        _mm_storeu_si128((__m128i *)dst, o4);

        src = (const char *)src + 16;
        dst = (char *)dst + 16;
    }
}

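/* Dot-product variant of the color matrix: each pixel is reduced to
 * (r*coef[0] + g*coef[4] + b*coef[8] + a*coef[12]) >> 8, written to R, G and B;
 * alpha is copied through */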
void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
                                  const short *coef, uint32_t count) {
    const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                      14, 10, 6, 2,
                                      13,  9, 5, 1,
                                      12,  8, 4, 0);
    const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
    const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
    __m128i c0, c1, c2, c3;
    __m128i i4, o4;
    __m128i xy, zw;
    __m128i x2, y2, z2, w2;
    uint32_t i;

    c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
    c0 = _mm_shufflelo_epi16(c0, 0);
    c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
    c1 = _mm_shufflelo_epi16(c1, 0);
    c0 = _mm_unpacklo_epi16(c0, c1);

    c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
    c2 = _mm_shufflelo_epi16(c2, 0);
    c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
    c3 = _mm_shufflelo_epi16(c3, 0);
    c2 = _mm_unpacklo_epi16(c2, c3);

    for (i = 0; i < count; ++i) {
        i4 = _mm_loadu_si128((const __m128i *)src);

        xy = _mm_shuffle_epi8(i4, Mxy);
        zw = _mm_shuffle_epi8(i4, Mzw);

        x2 = _mm_madd_epi16(xy, c0);
        x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, c2));

        x2 = _mm_srai_epi32(x2, 8);
        y2 = x2;
        z2 = x2;
        w2 = _mm_srli_epi32(zw, 16);

        x2 = packus_epi32(x2, y2);
        z2 = packus_epi32(z2, w2);
        o4 = _mm_packus_epi16(x2, z2);

        o4 = _mm_shuffle_epi8(o4, T4x4);
        _mm_storeu_si128((__m128i *)dst, o4);

        src = (const char *)src + 16;
        dst = (char *)dst + 16;
    }
}

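/* Vertical blur pass: for two adjacent RGBA8 pixels, accumulates 'rct' rows,
 * each row scaled by the corresponding float weight in gptr, and stores the
 * two float4 sums (32 bytes per iteration) */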
void rsdIntrinsicBlurVFU4_K(void *dst,
                          const void *pin, int stride, const void *gptr,
                          int rct, int x1, int x2) {
    const char *pi;
    __m128i pi0, pi1;
    __m128 pf0, pf1;
    __m128 bp0, bp1;
    __m128 x;
    int r;

    for (; x1 < x2; x1 += 2) {
        pi = (const char *)pin + (x1 << 2);
        bp0 = _mm_setzero_ps();
        bp1 = _mm_setzero_ps();

        for (r = 0; r < rct; ++r) {
            x = _mm_load_ss((const float *)gptr + r);
            x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));

            pi0 = _mm_cvtsi32_si128(*(const int *)pi);
            pi1 = _mm_cvtsi32_si128(*((const int *)pi + 1));

            pf0 = _mm_cvtepi32_ps(cvtepu8_epi32(pi0));
            pf1 = _mm_cvtepi32_ps(cvtepu8_epi32(pi1));

            bp0 = _mm_add_ps(bp0, _mm_mul_ps(pf0, x));
            bp1 = _mm_add_ps(bp1, _mm_mul_ps(pf1, x));

            pi += stride;
        }

        _mm_storeu_ps((float *)dst, bp0);
        _mm_storeu_ps((float *)dst + 4, bp1);
        dst = (char *)dst + 32;
    }
}

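/* Horizontal blur pass over the float4 intermediate buffer: weights 'rct'
 * neighbouring float4 values by gptr and packs each sum back to one RGBA8
 * pixel per iteration */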
void rsdIntrinsicBlurHFU4_K(void *dst,
                          const void *pin, const void *gptr,
                          int rct, int x1, int x2) {
    const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
    const float *pi;
    __m128 pf, x, y;
    __m128i o;
    int r;

    for (; x1 < x2; ++x1) {
        /* rct is defined as 2*r+1 by the caller */
        x = _mm_load_ss((const float *)gptr);
        x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));

        pi = (const float *)pin + (x1 << 2);
        pf = _mm_mul_ps(x, _mm_load_ps(pi));

        for (r = 1; r < rct; r += 2) {
            x = _mm_load_ss((const float *)gptr + r);
            y = _mm_load_ss((const float *)gptr + r + 1);
            x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
            y = _mm_shuffle_ps(y, y, _MM_SHUFFLE(0, 0, 0, 0));

            pf = _mm_add_ps(pf, _mm_mul_ps(x, _mm_load_ps(pi + (r << 2))));
            pf = _mm_add_ps(pf, _mm_mul_ps(y, _mm_load_ps(pi + (r << 2) + 4)));
        }

        o = _mm_cvtps_epi32(pf);
        *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8));
        dst = (char *)dst + 4;
    }
}

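/* Single-channel variant of the horizontal blur pass: processes four float
 * samples per iteration, using _mm_alignr_epi8 to build the shifted windows */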
void rsdIntrinsicBlurHFU1_K(void *dst,
                          const void *pin, const void *gptr,
                          int rct, int x1, int x2) {
    const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
    const float *pi;
    __m128 pf, g0, g1, g2, g3, gx, p0, p1;
    __m128i o;
    int r;

    for (; x1 < x2; x1 += 4) {
        g0 = _mm_load_ss((const float *)gptr);
        g0 = _mm_shuffle_ps(g0, g0, _MM_SHUFFLE(0, 0, 0, 0));

        pi = (const float *)pin + x1;
        pf = _mm_mul_ps(g0, _mm_loadu_ps(pi));

        for (r = 1; r < rct; r += 4) {
            gx = _mm_loadu_ps((const float *)gptr + r);
            p0 = _mm_loadu_ps(pi + r);
            p1 = _mm_loadu_ps(pi + r + 4);

            g0 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(0, 0, 0, 0));
            pf = _mm_add_ps(pf, _mm_mul_ps(g0, p0));
            g1 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(1, 1, 1, 1));
            pf = _mm_add_ps(pf, _mm_mul_ps(g1, _mm_castsi128_ps(
                    _mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 4))));
            g2 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(2, 2, 2, 2));
            pf = _mm_add_ps(pf, _mm_mul_ps(g2, _mm_castsi128_ps(
                    _mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 8))));
            g3 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(3, 3, 3, 3));
            pf = _mm_add_ps(pf, _mm_mul_ps(g3, _mm_castsi128_ps(
                    _mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 12))));
        }

        o = _mm_cvtps_epi32(pf);
        *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8));
        dst = (char *)dst + 4;
    }
}

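/* Convert YUV (Y plane plus an interleaved chroma plane) to RGBA8888 using
 * the coefficients and biases in 'param'; alpha is forced to 255 */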
void rsdIntrinsicYuv_K(void *dst,
                       const unsigned char *pY, const unsigned char *pUV,
                       uint32_t count, const short *param) {
    __m128i biasY, biasUV;
    __m128i c0, c1, c2, c3, c4;

    biasY = _mm_set1_epi32(param[8]);   /*  16 */
    biasUV = _mm_set1_epi32(param[16]); /* 128 */

    c0 = _mm_set1_epi32(param[0]);  /*  298 */
    c1 = _mm_set1_epi32(param[1]);  /*  409 */
    c2 = _mm_set1_epi32(param[2]);  /* -100 */
    c3 = _mm_set1_epi32(param[3]);  /*  516 */
    c4 = _mm_set1_epi32(param[4]);  /* -208 */

    __m128i Y, UV, U, V, R, G, B, A;

    A = _mm_set1_epi32(255);
    uint32_t i;

    for (i = 0; i < (count << 1); ++i) {
        Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
        UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));

        Y = _mm_sub_epi32(Y, biasY);
        UV = _mm_sub_epi32(UV, biasUV);

        U = _mm_shuffle_epi32(UV, 0xf5);
        V = _mm_shuffle_epi32(UV, 0xa0);

        Y = mullo_epi32(Y, c0);

        R = _mm_add_epi32(Y, mullo_epi32(V, c1));
        R = _mm_add_epi32(R, biasUV);
        R = _mm_srai_epi32(R, 8);

        G = _mm_add_epi32(Y, mullo_epi32(U, c2));
        G = _mm_add_epi32(G, mullo_epi32(V, c4));
        G = _mm_add_epi32(G, biasUV);
        G = _mm_srai_epi32(G, 8);

        B = _mm_add_epi32(Y, mullo_epi32(U, c3));
        B = _mm_add_epi32(B, biasUV);
        B = _mm_srai_epi32(B, 8);

        __m128i y1, y2, y3, y4;

        y1 = packus_epi32(R, G);
        y2 = packus_epi32(B, A);
        y3 = _mm_packus_epi16(y1, y2);
        const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                          14, 10, 6, 2,
                                          13,  9, 5, 1,
                                          12,  8, 4, 0);
        y4 = _mm_shuffle_epi8(y3, T4x4);
        _mm_storeu_si128((__m128i *)dst, y4);
        pY += 4;
        pUV += 4;
        dst = (__m128i *)dst + 1;
    }
}

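/* Same as rsdIntrinsicYuv_K, but with the byte order of the interleaved
 * chroma plane swapped */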
void rsdIntrinsicYuvR_K(void *dst,
                       const unsigned char *pY, const unsigned char *pUV,
                       uint32_t count, const short *param) {
    __m128i biasY, biasUV;
    __m128i c0, c1, c2, c3, c4;

    biasY = _mm_set1_epi32(param[8]);   /*  16 */
    biasUV = _mm_set1_epi32(param[16]); /* 128 */

    c0 = _mm_set1_epi32(param[0]);  /*  298 */
    c1 = _mm_set1_epi32(param[1]);  /*  409 */
    c2 = _mm_set1_epi32(param[2]);  /* -100 */
    c3 = _mm_set1_epi32(param[3]);  /*  516 */
    c4 = _mm_set1_epi32(param[4]);  /* -208 */

    __m128i Y, UV, U, V, R, G, B, A;

    A = _mm_set1_epi32(255);
    uint32_t i;

    for (i = 0; i < (count << 1); ++i) {
        Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
        UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));

        Y = _mm_sub_epi32(Y, biasY);
        UV = _mm_sub_epi32(UV, biasUV);

        V = _mm_shuffle_epi32(UV, 0xf5);
        U = _mm_shuffle_epi32(UV, 0xa0);

        Y = mullo_epi32(Y, c0);

        R = _mm_add_epi32(Y, mullo_epi32(V, c1));
        R = _mm_add_epi32(R, biasUV);
        R = _mm_srai_epi32(R, 8);

        G = _mm_add_epi32(Y, mullo_epi32(U, c2));
        G = _mm_add_epi32(G, mullo_epi32(V, c4));
        G = _mm_add_epi32(G, biasUV);
        G = _mm_srai_epi32(G, 8);

        B = _mm_add_epi32(Y, mullo_epi32(U, c3));
        B = _mm_add_epi32(B, biasUV);
        B = _mm_srai_epi32(B, 8);

        __m128i y1, y2, y3, y4;

        y1 = packus_epi32(R, G);
        y2 = packus_epi32(B, A);
        y3 = _mm_packus_epi16(y1, y2);
        const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                          14, 10, 6, 2,
                                          13,  9, 5, 1,
                                          12,  8, 4, 0);
        y4 = _mm_shuffle_epi8(y3, T4x4);
        _mm_storeu_si128((__m128i *)dst, y4);
        pY += 4;
        pUV += 4;
        dst = (__m128i *)dst + 1;
    }
}

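/* YUV to RGBA8888 conversion with separate (planar) U and V planes */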
void rsdIntrinsicYuv2_K(void *dst,
                       const unsigned char *pY, const unsigned char *pU,
                       const unsigned char *pV, uint32_t count, const short *param) {
    __m128i biasY, biasUV;
    __m128i c0, c1, c2, c3, c4;

    biasY = _mm_set1_epi32(param[8]);   /*  16 */
    biasUV = _mm_set1_epi32(param[16]); /* 128 */

    c0 = _mm_set1_epi32(param[0]);  /*  298 */
    c1 = _mm_set1_epi32(param[1]);  /*  409 */
    c2 = _mm_set1_epi32(param[2]);  /* -100 */
    c3 = _mm_set1_epi32(param[3]);  /*  516 */
    c4 = _mm_set1_epi32(param[4]);  /* -208 */

    __m128i Y, U, V, R, G, B, A;

    A = _mm_set1_epi32(255);
    uint32_t i;

    for (i = 0; i < (count << 1); ++i) {
        Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
        U = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pU));
        V = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pV));

        Y = _mm_sub_epi32(Y, biasY);
        U = _mm_sub_epi32(U, biasUV);
        V = _mm_sub_epi32(V, biasUV);

        Y = mullo_epi32(Y, c0);

        R = _mm_add_epi32(Y, mullo_epi32(V, c1));
        R = _mm_add_epi32(R, biasUV);
        R = _mm_srai_epi32(R, 8);

        G = _mm_add_epi32(Y, mullo_epi32(U, c2));
        G = _mm_add_epi32(G, mullo_epi32(V, c4));
        G = _mm_add_epi32(G, biasUV);
        G = _mm_srai_epi32(G, 8);

        B = _mm_add_epi32(Y, mullo_epi32(U, c3));
        B = _mm_add_epi32(B, biasUV);
        B = _mm_srai_epi32(B, 8);

        __m128i y1, y2, y3, y4;

        y1 = packus_epi32(R, G);
        y2 = packus_epi32(B, A);
        y3 = _mm_packus_epi16(y1, y2);
        const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                          14, 10, 6, 2,
                                          13,  9, 5, 1,
                                          12,  8, 4, 0);
        y4 = _mm_shuffle_epi8(y3, T4x4);
        _mm_storeu_si128((__m128i *)dst, y4);
        pY += 4;
        pU += 4;
        pV += 4;
        dst = (__m128i *)dst + 1;
    }
}

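/* 5x5 convolution over 8-bit RGBA pixels with 16-bit coefficients;
 * accumulates in 32 bits, shifts right by 8, and emits four output pixels
 * (16 bytes) per iteration */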
extern "C" void rsdIntrinsicConvolve5x5_K(void *dst, const void *y0,
                                          const void *y1, const void *y2,
                                          const void *y3, const void *y4,
                                          const short *coef, uint32_t count) {
    __m128i x;
    __m128i c0, c2, c4, c6, c8, c10, c12;
    __m128i c14, c16, c18, c20, c22, c24;
    __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9;
    __m128i p0,  p1,  p2,  p3,  p4,  p5,  p6,  p7;
    __m128i p8,  p9, p10, p11, p12, p13, p14, p15;
    __m128i p16, p17, p18, p19, p20, p21, p22, p23;
    __m128i p24, p25, p26, p27, p28, p29, p30, p31;
    __m128i p32, p33, p34, p35, p36, p37, p38, p39;
    __m128i o0, o1, o2, o3;
    uint32_t i;

    x = _mm_loadl_epi64((const __m128i *)(coef+0));
    c0  = _mm_shuffle_epi32(x, 0x00);
    c2  = _mm_shuffle_epi32(x, 0x55);

    x = _mm_loadl_epi64((const __m128i *)(coef+4));
    c4  = _mm_shuffle_epi32(x, 0x00);
    c6  = _mm_shuffle_epi32(x, 0x55);

    x = _mm_loadl_epi64((const __m128i *)(coef+8));
    c8  = _mm_shuffle_epi32(x, 0x00);
    c10 = _mm_shuffle_epi32(x, 0x55);

    x = _mm_loadl_epi64((const __m128i *)(coef+12));
    c12 = _mm_shuffle_epi32(x, 0x00);
    c14 = _mm_shuffle_epi32(x, 0x55);

    x = _mm_loadl_epi64((const __m128i *)(coef+16));
    c16 = _mm_shuffle_epi32(x, 0x00);
    c18 = _mm_shuffle_epi32(x, 0x55);

    x = _mm_loadl_epi64((const __m128i *)(coef+20));
    c20 = _mm_shuffle_epi32(x, 0x00);
    c22 = _mm_shuffle_epi32(x, 0x55);

    x = _mm_loadl_epi64((const __m128i *)(coef+24));
    c24 = _mm_shuffle_epi32(x, 0x00);

    for (i = 0; i < count; ++i) {

        p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int32_t *)y0), _mm_setzero_si128());
        p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128());
        p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128());
        p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128());
        p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+4)), _mm_setzero_si128());
        p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+5)), _mm_setzero_si128());
        p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+6)), _mm_setzero_si128());
        p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+7)), _mm_setzero_si128());

        p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128());
        p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128());
        p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128());
        p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128());
        p12 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+4)), _mm_setzero_si128());
        p13 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+5)), _mm_setzero_si128());
        p14 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+6)), _mm_setzero_si128());
        p15 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+7)), _mm_setzero_si128());

        p16 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128());
        p17 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128());
        p18 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128());
        p19 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128());
        p20 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+4)), _mm_setzero_si128());
        p21 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+5)), _mm_setzero_si128());
        p22 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+6)), _mm_setzero_si128());
        p23 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+7)), _mm_setzero_si128());

        p24 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3)), _mm_setzero_si128());
        p25 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+1)), _mm_setzero_si128());
        p26 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+2)), _mm_setzero_si128());
        p27 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+3)), _mm_setzero_si128());
        p28 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+4)), _mm_setzero_si128());
        p29 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+5)), _mm_setzero_si128());
        p30 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+6)), _mm_setzero_si128());
        p31 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+7)), _mm_setzero_si128());

        p32 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4)), _mm_setzero_si128());
        p33 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+1)), _mm_setzero_si128());
        p34 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+2)), _mm_setzero_si128());
        p35 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+3)), _mm_setzero_si128());
        p36 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+4)), _mm_setzero_si128());
        p37 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+5)), _mm_setzero_si128());
        p38 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+6)), _mm_setzero_si128());
        p39 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+7)), _mm_setzero_si128());

        o0 =                   _mm_madd_epi16(_mm_unpacklo_epi16(p0,  p1),  c0);
        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p2,  p3),  c2));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p4,  p8),  c4));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p9,  p10), c6));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p11, p12), c8));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p16, p17), c10));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p18, p19), c12));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p20, p24), c14));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p25, p26), c16));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p27, p28), c18));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p32, p33), c20));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p34, p35), c22));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p36, _mm_setzero_si128()), c24));
        o0 = _mm_srai_epi32(o0, 8);

        o1 =                   _mm_madd_epi16(_mm_unpacklo_epi16(p1,  p2),  c0);
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p3,  p4),  c2));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p5,  p9),  c4));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p10, p11), c6));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p12, p13), c8));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p17, p18), c10));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p19, p20), c12));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p21, p25), c14));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p26, p27), c16));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p28, p29), c18));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p33, p34), c20));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p35, p36), c22));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p37, _mm_setzero_si128()), c24));
        o1 = _mm_srai_epi32(o1, 8);

        o2 =                   _mm_madd_epi16(_mm_unpacklo_epi16(p2,  p3),  c0);
        o2 = _mm_add_epi32(o2, _mm_madd_epi16(_mm_unpacklo_epi16(p4,  p5),  c2));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16(_mm_unpacklo_epi16(p6,  p10), c4));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16(_mm_unpacklo_epi16(p11, p12), c6));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16(_mm_unpacklo_epi16(p13, p14), c8));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16(_mm_unpacklo_epi16(p18, p19), c10));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16(_mm_unpacklo_epi16(p20, p21), c12));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16(_mm_unpacklo_epi16(p22, p26), c14));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16(_mm_unpacklo_epi16(p27, p28), c16));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16(_mm_unpacklo_epi16(p29, p30), c18));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16(_mm_unpacklo_epi16(p34, p35), c20));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16(_mm_unpacklo_epi16(p36, p37), c22));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16(_mm_unpacklo_epi16(p38, _mm_setzero_si128()), c24));
        o2 = _mm_srai_epi32(o2, 8);

        o3 =                   _mm_madd_epi16(_mm_unpacklo_epi16(p3,  p4),  c0);
        o3 = _mm_add_epi32(o3, _mm_madd_epi16(_mm_unpacklo_epi16(p5,  p6),  c2));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16(_mm_unpacklo_epi16(p7,  p11), c4));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16(_mm_unpacklo_epi16(p12, p13), c6));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16(_mm_unpacklo_epi16(p14, p15), c8));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16(_mm_unpacklo_epi16(p19, p20), c10));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16(_mm_unpacklo_epi16(p21, p22), c12));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16(_mm_unpacklo_epi16(p23, p27), c14));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16(_mm_unpacklo_epi16(p28, p29), c16));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16(_mm_unpacklo_epi16(p30, p31), c18));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16(_mm_unpacklo_epi16(p35, p36), c20));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16(_mm_unpacklo_epi16(p37, p38), c22));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16(_mm_unpacklo_epi16(p39, _mm_setzero_si128()), c24));
        o3 = _mm_srai_epi32(o3, 8);

        o0 = packus_epi32(o0, o1);
        o2 = packus_epi32(o2, o3);
        o0 = _mm_packus_epi16(o0, o2);
        _mm_storeu_si128((__m128i *)dst, o0);

        y0 = (const char *)y0 + 16;
        y1 = (const char *)y1 + 16;
        y2 = (const char *)y2 + 16;
        y3 = (const char *)y3 + 16;
        y4 = (const char *)y4 + 16;
        dst = (char *)dst + 16;
    }
}

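/* Porter-Duff "source over": dst = src + ((dst * (255 - src.a)) >> 8);
 * processes eight RGBA pixels (two 128-bit vectors) per iteration */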
void rsdIntrinsicBlendSrcOver_K(void *dst, const void *src, uint32_t count8) {
    __m128i all1s, ina, ins;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    all1s = _mm_set1_epi16(255);

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina));
        t0 = _mm_srli_epi16(t0, 8);
        t0 = _mm_add_epi16(t0, ins);

        ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina));
        t1 = _mm_srli_epi16(t1, 8);
        t1 = _mm_add_epi16(t1, ins);

        ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, ina));
        t2 = _mm_srli_epi16(t2, 8);
        t2 = _mm_add_epi16(t2, ins);

        ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina));
        t3 = _mm_srli_epi16(t3, 8);
        t3 = _mm_add_epi16(t3, ins);

        t0 = _mm_packus_epi16(t0, t1);
        t2 = _mm_packus_epi16(t2, t3);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

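/* Porter-Duff "destination over": dst = dst + ((src * (255 - dst.a)) >> 8) */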
void rsdIntrinsicBlendDstOver_K(void *dst, const void *src, uint32_t count8) {
    __m128i all1s, outa, outs;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    all1s = _mm_set1_epi16(255);

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, outa));
        t0 = _mm_srli_epi16(t0, 8);
        t0 = _mm_add_epi16(t0, outs);

        outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, outa));
        t1 = _mm_srli_epi16(t1, 8);
        t1 = _mm_add_epi16(t1, outs);

        outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, outa));
        t2 = _mm_srli_epi16(t2, 8);
        t2 = _mm_add_epi16(t2, outs);

        outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, outa));
        t3 = _mm_srli_epi16(t3, 8);
        t3 = _mm_add_epi16(t3, outs);

        t0 = _mm_packus_epi16(t0, t1);
        t2 = _mm_packus_epi16(t2, t3);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

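/* Porter-Duff "source in": dst = (src * dst.a) >> 8 */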
void rsdIntrinsicBlendSrcIn_K(void *dst, const void *src, uint32_t count8) {
    __m128i outa;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        outa = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        t0 = _mm_mullo_epi16(t0, outa);
        t0 = _mm_srli_epi16(t0, 8);

        outa = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        t1 = _mm_mullo_epi16(t1, outa);
        t1 = _mm_srli_epi16(t1, 8);

        outa = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        t2 = _mm_mullo_epi16(t2, outa);
        t2 = _mm_srli_epi16(t2, 8);

        outa = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        t3 = _mm_mullo_epi16(t3, outa);
        t3 = _mm_srli_epi16(t3, 8);

        t0 = _mm_packus_epi16(t0, t1);
        t2 = _mm_packus_epi16(t2, t3);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

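/*
 * "Destination-in" blend: dst = dst * src.a for all four channels
 * (src.a broadcast per pixel, /255 approximated by >>8).
 */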
void rsdIntrinsicBlendDstIn_K(void *dst, const void *src, uint32_t count8) {
    __m128i ina;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        ina = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        t0 = _mm_mullo_epi16(t0, ina);
        t0 = _mm_srli_epi16(t0, 8);

        ina = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        t1 = _mm_mullo_epi16(t1, ina);
        t1 = _mm_srli_epi16(t1, 8);

        ina = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        t2 = _mm_mullo_epi16(t2, ina);
        t2 = _mm_srli_epi16(t2, 8);

        ina = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        t3 = _mm_mullo_epi16(t3, ina);
        t3 = _mm_srli_epi16(t3, 8);

        t0 = _mm_packus_epi16(t0, t1);
        t2 = _mm_packus_epi16(t2, t3);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

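/*
 * "Source-out" blend: dst = src * (255 - dst.a) >> 8, i.e. the source
 * weighted by the inverse of the destination alpha.
 */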
void rsdIntrinsicBlendSrcOut_K(void *dst, const void *src, uint32_t count8) {
    __m128i all1s, outa;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    all1s = _mm_set1_epi16(255);

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        outa = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, outa));
        t0 = _mm_srli_epi16(t0, 8);

        outa = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, outa));
        t1 = _mm_srli_epi16(t1, 8);

        outa = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, outa));
        t2 = _mm_srli_epi16(t2, 8);

        outa = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, outa));
        t3 = _mm_srli_epi16(t3, 8);

        t0 = _mm_packus_epi16(t0, t1);
        t2 = _mm_packus_epi16(t2, t3);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

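/*
 * "Destination-out" blend: dst = dst * (255 - src.a) >> 8, i.e. the
 * destination weighted by the inverse of the source alpha.
 */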
void rsdIntrinsicBlendDstOut_K(void *dst, const void *src, uint32_t count8) {
    __m128i all1s, ina;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    all1s = _mm_set1_epi16(255);

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        ina = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina));
        t0 = _mm_srli_epi16(t0, 8);

        ina = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina));
        t1 = _mm_srli_epi16(t1, 8);

        ina = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, ina));
        t2 = _mm_srli_epi16(t2, 8);

        ina = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina));
        t3 = _mm_srli_epi16(t3, 8);

        t0 = _mm_packus_epi16(t0, t1);
        t2 = _mm_packus_epi16(t2, t3);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

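/*
 * "Source-atop" blend: dst.rgb = (src.rgb * dst.a + dst.rgb * (255 - src.a)) >> 8,
 * while the alpha byte of each pixel keeps the destination alpha, selected with
 * blendv_epi8 and the per-pixel alpha mask M0001.
 */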
void rsdIntrinsicBlendSrcAtop_K(void *dst, const void *src, uint32_t count8) {
    const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
    __m128i all1s, ina, outa, ins, outs;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    all1s = _mm_set1_epi16(255);

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t0 = _mm_sub_epi16(all1s, ina);
        t0 = _mm_mullo_epi16(t0, outs);
        t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(outa, ins));
        t0 = _mm_srli_epi16(t0, 8);

        ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t1 = _mm_sub_epi16(all1s, ina);
        t1 = _mm_mullo_epi16(t1, outs);
        t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(outa, ins));
        t1 = _mm_srli_epi16(t1, 8);

        ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t2 = _mm_sub_epi16(all1s, ina);
        t2 = _mm_mullo_epi16(t2, outs);
        t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(outa, ins));
        t2 = _mm_srli_epi16(t2, 8);

        ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t3 = _mm_sub_epi16(all1s, ina);
        t3 = _mm_mullo_epi16(t3, outs);
        t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(outa, ins));
        t3 = _mm_srli_epi16(t3, 8);

        t0 = _mm_packus_epi16(t0, t1);
        t0 = blendv_epi8(t0, out0, M0001);
        t2 = _mm_packus_epi16(t2, t3);
        t2 = blendv_epi8(t2, out1, M0001);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

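/*
 * "Destination-atop" blend: dst.rgb = (dst.rgb * src.a + src.rgb * (255 - dst.a)) >> 8,
 * while the alpha byte of each pixel takes the source alpha, selected with
 * blendv_epi8 and the per-pixel alpha mask M0001.
 */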
void rsdIntrinsicBlendDstAtop_K(void *dst, const void *src, uint32_t count8) {
    const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
    __m128i all1s, ina, ins, outa, outs;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    all1s = _mm_set1_epi16(255);

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t0 = _mm_sub_epi16(all1s, outa);
        t0 = _mm_mullo_epi16(t0, ins);
        t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(ina, outs));
        t0 = _mm_srli_epi16(t0, 8);

        ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t1 = _mm_sub_epi16(all1s, outa);
        t1 = _mm_mullo_epi16(t1, ins);
        t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(ina, outs));
        t1 = _mm_srli_epi16(t1, 8);

        ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t2 = _mm_sub_epi16(all1s, outa);
        t2 = _mm_mullo_epi16(t2, ins);
        t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(ina, outs));
        t2 = _mm_srli_epi16(t2, 8);

        ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t3 = _mm_sub_epi16(all1s, outa);
        t3 = _mm_mullo_epi16(t3, ins);
        t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(ina, outs));
        t3 = _mm_srli_epi16(t3, 8);

        t0 = _mm_packus_epi16(t0, t1);
        t0 = blendv_epi8(t0, in0, M0001);
        t2 = _mm_packus_epi16(t2, t3);
        t2 = blendv_epi8(t2, in1, M0001);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

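/*
 * XOR blend: a plain bitwise XOR of destination and source bytes
 * (a bitwise operation, not the Porter-Duff XOR compositing equation).
 */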
void rsdIntrinsicBlendXor_K(void *dst, const void *src, uint32_t count8) {
    __m128i in0, in1, out0, out1;
    uint32_t i;

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        out0 = _mm_xor_si128(out0, in0);
        out1 = _mm_xor_si128(out1, in1);

        _mm_storeu_si128((__m128i *)dst, out0);
        _mm_storeu_si128((__m128i *)dst + 1, out1);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

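/*
 * Multiply blend: dst = (dst * src) >> 8 for every channel, including alpha.
 */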
void rsdIntrinsicBlendMultiply_K(void *dst, const void *src, uint32_t count8) {
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        t0 = _mm_mullo_epi16(t0, _mm_unpacklo_epi8(out0, _mm_setzero_si128()));
        t0 = _mm_srli_epi16(t0, 8);

        t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        t1 = _mm_mullo_epi16(t1, _mm_unpackhi_epi8(out0, _mm_setzero_si128()));
        t1 = _mm_srli_epi16(t1, 8);

        t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        t2 = _mm_mullo_epi16(t2, _mm_unpacklo_epi8(out1, _mm_setzero_si128()));
        t2 = _mm_srli_epi16(t2, 8);

        t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        t3 = _mm_mullo_epi16(t3, _mm_unpackhi_epi8(out1, _mm_setzero_si128()));
        t3 = _mm_srli_epi16(t3, 8);

        t0 = _mm_packus_epi16(t0, t1);
        t2 = _mm_packus_epi16(t2, t3);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

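/*
 * Additive blend: dst = dst + src per byte, with unsigned saturation at 255.
 */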
void rsdIntrinsicBlendAdd_K(void *dst, const void *src, uint32_t count8) {
    __m128i in0, in1, out0, out1;
    uint32_t i;

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        out0 = _mm_adds_epu8(out0, in0);
        out1 = _mm_adds_epu8(out1, in1);

        _mm_storeu_si128((__m128i *)dst, out0);
        _mm_storeu_si128((__m128i *)dst + 1, out1);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

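/*
 * Subtractive blend: dst = dst - src per byte, with unsigned saturation at 0.
 */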
void rsdIntrinsicBlendSub_K(void *dst, const void *src, uint32_t count8) {
    __m128i in0, in1, out0, out1;
    uint32_t i;

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        out0 = _mm_subs_epu8(out0, in0);
        out1 = _mm_subs_epu8(out1, in1);

        _mm_storeu_si128((__m128i *)dst, out0);
        _mm_storeu_si128((__m128i *)dst + 1, out1);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

}  // namespace renderscript