1*32afb93cSXin Li /*
2*32afb93cSXin Li * Copyright (C) 2011 The Android Open Source Project
3*32afb93cSXin Li *
4*32afb93cSXin Li * Licensed under the Apache License, Version 2.0 (the "License");
5*32afb93cSXin Li * you may not use this file except in compliance with the License.
6*32afb93cSXin Li * You may obtain a copy of the License at
7*32afb93cSXin Li *
8*32afb93cSXin Li * http://www.apache.org/licenses/LICENSE-2.0
9*32afb93cSXin Li *
10*32afb93cSXin Li * Unless required by applicable law or agreed to in writing, software
11*32afb93cSXin Li * distributed under the License is distributed on an "AS IS" BASIS,
12*32afb93cSXin Li * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*32afb93cSXin Li * See the License for the specific language governing permissions and
14*32afb93cSXin Li * limitations under the License.
15*32afb93cSXin Li */
16*32afb93cSXin Li
17*32afb93cSXin Li #include <stdint.h>
18*32afb93cSXin Li #include <x86intrin.h>
19*32afb93cSXin Li
20*32afb93cSXin Li namespace renderscript {
21*32afb93cSXin Li
/* Unsigned-extend the 4 packed 8-bit integers (in the LSBs) into packed 32-bit integers */
cvtepu8_epi32(__m128i x)23*32afb93cSXin Li static inline __m128i cvtepu8_epi32(__m128i x) {
24*32afb93cSXin Li #if defined(__SSE4_1__)
25*32afb93cSXin Li return _mm_cvtepu8_epi32(x);
26*32afb93cSXin Li #elif defined(__SSSE3__)
27*32afb93cSXin Li const __m128i M8to32 = _mm_set_epi32(0xffffff03, 0xffffff02, 0xffffff01, 0xffffff00);
28*32afb93cSXin Li x = _mm_shuffle_epi8(x, M8to32);
29*32afb93cSXin Li return x;
30*32afb93cSXin Li #else
31*32afb93cSXin Li # error "Require at least SSSE3"
32*32afb93cSXin Li #endif
33*32afb93cSXin Li }
34*32afb93cSXin Li
/* Pack two vectors of signed 32-bit integers into one vector of unsigned
 * 16-bit integers with unsigned saturation (clamp to [0, 65535]);
 * lanes of 'lo' become results 0..3, lanes of 'hi' results 4..7.
 * Equivalent to SSE4.1 _mm_packus_epi32. */
static inline __m128i packus_epi32(__m128i lo, __m128i hi) {
#if defined(__SSE4_1__)
    return _mm_packus_epi32(lo, hi);
#elif defined(__SSSE3__)
    /* C0: per-lane 0;  C1: per-lane 65535 (0x0000ffff). */
    const __m128i C0 = _mm_set_epi32(0x0000, 0x0000, 0x0000, 0x0000);
    const __m128i C1 = _mm_set_epi32(0xffff, 0xffff, 0xffff, 0xffff);
    /* Byte shuffles gathering the low 16 bits of each 32-bit lane into the
     * low (L) or high (H) half of the result; 0xff selectors yield zero. */
    const __m128i M32to16L = _mm_set_epi32(0xffffffff, 0xffffffff, 0x0d0c0908, 0x05040100);
    const __m128i M32to16H = _mm_set_epi32(0x0d0c0908, 0x05040100, 0xffffffff, 0xffffffff);
    /* Clamp below: lanes <= 0 are zeroed (cmpgt mask is 0 for them). */
    lo = _mm_and_si128(lo, _mm_cmpgt_epi32(lo, C0));
    /* Clamp above: lanes > 65535 become 0xffffffff, whose low 16 bits
     * are exactly the saturated value 0xffff. */
    lo = _mm_or_si128(lo, _mm_cmpgt_epi32(lo, C1));
    hi = _mm_and_si128(hi, _mm_cmpgt_epi32(hi, C0));
    hi = _mm_or_si128(hi, _mm_cmpgt_epi32(hi, C1));
    /* Merge the two shuffled halves into the packed result. */
    return _mm_or_si128(_mm_shuffle_epi8(lo, M32to16L),
                        _mm_shuffle_epi8(hi, M32to16H));
#else
# error "Require at least SSSE3"
#endif
}
53*32afb93cSXin Li
/* Multiply packed 32-bit integers, keeping the low 32 bits of each product.
 * Equivalent to SSE4.1 _mm_mullo_epi32.  The low 32 bits of a product are
 * identical for signed and unsigned operands, so the unsigned
 * _mm_mul_epu32 is safe here. */
static inline __m128i mullo_epi32(__m128i x, __m128i y) {
#if defined(__SSE4_1__)
    return _mm_mullo_epi32(x, y);
#elif defined(__SSSE3__)
    /* Mask keeping only the low dword of each 64-bit product. */
    const __m128i Meven = _mm_set_epi32(0x00000000, 0xffffffff, 0x00000000, 0xffffffff);
    /* _mm_mul_epu32 multiplies lanes 0 and 2 into two 64-bit products;
     * shifting the inputs right by 4 bytes exposes lanes 1 and 3. */
    __m128i even = _mm_mul_epu32(x, y);
    __m128i odd = _mm_mul_epu32(_mm_srli_si128(x, 4),
                                _mm_srli_si128(y, 4));
    /* Keep the low 32 bits of each product, then re-interleave. */
    even = _mm_and_si128(even, Meven);
    odd = _mm_and_si128(odd, Meven);
    return _mm_or_si128(even, _mm_slli_si128(odd, 4));
#else
# error "Require at least SSSE3"
#endif
}
69*32afb93cSXin Li
/* 'mask' must be packed 8-bit lanes of 0x00 or 0xff */
blendv_epi8(__m128i x,__m128i y,__m128i mask)71*32afb93cSXin Li static inline __m128i blendv_epi8(__m128i x, __m128i y, __m128i mask) {
72*32afb93cSXin Li #if defined(__SSE4_1__)
73*32afb93cSXin Li return _mm_blendv_epi8(x, y, mask);
74*32afb93cSXin Li #elif defined(__SSSE3__)
75*32afb93cSXin Li return _mm_or_si128(_mm_andnot_si128(mask, x), _mm_and_si128(y, mask));
76*32afb93cSXin Li #else
77*32afb93cSXin Li # error "Require at least SSSE3"
78*32afb93cSXin Li #endif
79*32afb93cSXin Li }
80*32afb93cSXin Li
rsdIntrinsicConvolve3x3_K(void * dst,const void * y0,const void * y1,const void * y2,const short * coef,uint32_t count)81*32afb93cSXin Li extern "C" void rsdIntrinsicConvolve3x3_K(void *dst, const void *y0,
82*32afb93cSXin Li const void *y1, const void *y2,
83*32afb93cSXin Li const short *coef, uint32_t count) {
84*32afb93cSXin Li __m128i x;
85*32afb93cSXin Li __m128i c0, c2, c4, c6, c8;
86*32afb93cSXin Li __m128i r0, r1, r2;
87*32afb93cSXin Li __m128i p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11;
88*32afb93cSXin Li __m128i o0, o1;
89*32afb93cSXin Li uint32_t i;
90*32afb93cSXin Li
91*32afb93cSXin Li x = _mm_loadl_epi64((const __m128i *)(coef+0));
92*32afb93cSXin Li c0 = _mm_shuffle_epi32(x, 0x00);
93*32afb93cSXin Li c2 = _mm_shuffle_epi32(x, 0x55);
94*32afb93cSXin Li x = _mm_loadl_epi64((const __m128i *)(coef+4));
95*32afb93cSXin Li c4 = _mm_shuffle_epi32(x, 0x00);
96*32afb93cSXin Li c6 = _mm_shuffle_epi32(x, 0x55);
97*32afb93cSXin Li x = _mm_loadl_epi64((const __m128i *)(coef+8));
98*32afb93cSXin Li c8 = _mm_shuffle_epi32(x, 0x00);
99*32afb93cSXin Li
100*32afb93cSXin Li for (i = 0; i < count; ++i) {
101*32afb93cSXin Li
102*32afb93cSXin Li p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0)), _mm_setzero_si128());
103*32afb93cSXin Li p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128());
104*32afb93cSXin Li p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128());
105*32afb93cSXin Li p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128());
106*32afb93cSXin Li p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128());
107*32afb93cSXin Li p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128());
108*32afb93cSXin Li p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128());
109*32afb93cSXin Li p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128());
110*32afb93cSXin Li p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128());
111*32afb93cSXin Li p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128());
112*32afb93cSXin Li p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128());
113*32afb93cSXin Li p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128());
114*32afb93cSXin Li
115*32afb93cSXin Li o0 = _mm_madd_epi16(_mm_unpacklo_epi16(p0, p1), c0);
116*32afb93cSXin Li o1 = _mm_madd_epi16(_mm_unpacklo_epi16(p1, p2), c0);
117*32afb93cSXin Li
118*32afb93cSXin Li o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p2, p4), c2));
119*32afb93cSXin Li o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p3, p5), c2));
120*32afb93cSXin Li
121*32afb93cSXin Li o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p5, p6), c4));
122*32afb93cSXin Li o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p6, p7), c4));
123*32afb93cSXin Li
124*32afb93cSXin Li o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p8, p9), c6));
125*32afb93cSXin Li o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p9, p10), c6));
126*32afb93cSXin Li
127*32afb93cSXin Li o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p10, _mm_setzero_si128()), c8));
128*32afb93cSXin Li o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p11, _mm_setzero_si128()), c8));
129*32afb93cSXin Li
130*32afb93cSXin Li o0 = _mm_srai_epi32(o0, 8);
131*32afb93cSXin Li o1 = _mm_srai_epi32(o1, 8);
132*32afb93cSXin Li
133*32afb93cSXin Li o0 = packus_epi32(o0, o1);
134*32afb93cSXin Li o0 = _mm_packus_epi16(o0, o0);
135*32afb93cSXin Li _mm_storel_epi64((__m128i *)dst, o0);
136*32afb93cSXin Li
137*32afb93cSXin Li y0 = (const char *)y0 + 8;
138*32afb93cSXin Li y1 = (const char *)y1 + 8;
139*32afb93cSXin Li y2 = (const char *)y2 + 8;
140*32afb93cSXin Li dst = (char *)dst + 8;
141*32afb93cSXin Li }
142*32afb93cSXin Li }
143*32afb93cSXin Li
rsdIntrinsicColorMatrix4x4_K(void * dst,const void * src,const short * coef,uint32_t count)144*32afb93cSXin Li void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
145*32afb93cSXin Li const short *coef, uint32_t count) {
146*32afb93cSXin Li const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
147*32afb93cSXin Li 14, 10, 6, 2,
148*32afb93cSXin Li 13, 9, 5, 1,
149*32afb93cSXin Li 12, 8, 4, 0);
150*32afb93cSXin Li
151*32afb93cSXin Li const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
152*32afb93cSXin Li const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
153*32afb93cSXin Li __m128i c0, c1, c2, c3;
154*32afb93cSXin Li __m128i i4, o4;
155*32afb93cSXin Li __m128i xy, zw;
156*32afb93cSXin Li __m128i x2, y2, z2, w2;
157*32afb93cSXin Li uint32_t i;
158*32afb93cSXin Li
159*32afb93cSXin Li c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
160*32afb93cSXin Li c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
161*32afb93cSXin Li c0 = _mm_unpacklo_epi16(c0, c1);
162*32afb93cSXin Li
163*32afb93cSXin Li c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
164*32afb93cSXin Li c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
165*32afb93cSXin Li c2 = _mm_unpacklo_epi16(c2, c3);
166*32afb93cSXin Li
167*32afb93cSXin Li for (i = 0; i < count; ++i) {
168*32afb93cSXin Li i4 = _mm_load_si128((const __m128i *)src);
169*32afb93cSXin Li xy = _mm_shuffle_epi8(i4, Mxy);
170*32afb93cSXin Li zw = _mm_shuffle_epi8(i4, Mzw);
171*32afb93cSXin Li
172*32afb93cSXin Li x2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00));
173*32afb93cSXin Li y2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55));
174*32afb93cSXin Li z2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa));
175*32afb93cSXin Li w2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xff));
176*32afb93cSXin Li
177*32afb93cSXin Li x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00)));
178*32afb93cSXin Li y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55)));
179*32afb93cSXin Li z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa)));
180*32afb93cSXin Li w2 = _mm_add_epi32(w2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xff)));
181*32afb93cSXin Li
182*32afb93cSXin Li x2 = _mm_srai_epi32(x2, 8);
183*32afb93cSXin Li y2 = _mm_srai_epi32(y2, 8);
184*32afb93cSXin Li z2 = _mm_srai_epi32(z2, 8);
185*32afb93cSXin Li w2 = _mm_srai_epi32(w2, 8);
186*32afb93cSXin Li
187*32afb93cSXin Li x2 = packus_epi32(x2, y2);
188*32afb93cSXin Li z2 = packus_epi32(z2, w2);
189*32afb93cSXin Li o4 = _mm_packus_epi16(x2, z2);
190*32afb93cSXin Li
191*32afb93cSXin Li o4 = _mm_shuffle_epi8(o4, T4x4);
192*32afb93cSXin Li _mm_storeu_si128((__m128i *)dst, o4);
193*32afb93cSXin Li
194*32afb93cSXin Li src = (const char *)src + 16;
195*32afb93cSXin Li dst = (char *)dst + 16;
196*32afb93cSXin Li }
197*32afb93cSXin Li }
198*32afb93cSXin Li
/* Apply a 3x3 color matrix (16-bit Q8 coefficients) to RGBA8888 pixels:
 * R,G,B become clamp((M * in) >> 8); alpha is passed through unchanged.
 * Processes 4 pixels (16 bytes) per iteration; 'count' is the number of
 * 4-pixel groups. */
void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
                                  const short *coef, uint32_t count) {
    /* Reorders channel-grouped bytes back to interleaved RGBA. */
    const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                      14, 10, 6, 2,
                                      13, 9, 5, 1,
                                      12, 8, 4, 0);

    /* Gather R,G (Mxy) and B,A (Mzw) bytes of 4 pixels, zero-extended to
     * 16-bit lanes, pixel-interleaved for _mm_madd_epi16. */
    const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
    const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);

    __m128i c0, c1, c2, c3;
    __m128i i4, o4;
    __m128i xy, zw;
    __m128i x2, y2, z2, w2;
    uint32_t i;

    /* Interleave coefficient rows so madd-lane pairs hold
     * (coef[j], coef[j+4]) / (coef[j+8], coef[j+12]). */
    c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
    c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
    c0 = _mm_unpacklo_epi16(c0, c1);

    c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
    c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
    c2 = _mm_unpacklo_epi16(c2, c3);

    for (i = 0; i < count; ++i) {
        i4 = _mm_loadu_si128((const __m128i *)src);
        xy = _mm_shuffle_epi8(i4, Mxy);
        zw = _mm_shuffle_epi8(i4, Mzw);

        /* New R/G/B; no w2 madd here -- only 3 output channels. */
        x2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00));
        y2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55));
        z2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa));

        x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00)));
        y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55)));
        z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa)));

        /* Q8 -> integer; w2 recovers the untouched alpha from the high
         * 16 bits of each zw lane. */
        x2 = _mm_srai_epi32(x2, 8);
        y2 = _mm_srai_epi32(y2, 8);
        z2 = _mm_srai_epi32(z2, 8);
        w2 = _mm_srli_epi32(zw, 16);

        /* Clamp to u8, repack, and restore RGBA interleaving. */
        x2 = packus_epi32(x2, y2);
        z2 = packus_epi32(z2, w2);
        o4 = _mm_packus_epi16(x2, z2);

        o4 = _mm_shuffle_epi8(o4, T4x4);
        _mm_storeu_si128((__m128i *)dst, o4);

        src = (const char *)src + 16;
        dst = (char *)dst + 16;
    }
}
252*32afb93cSXin Li
/* Color-matrix "dot product" special case: a single coefficient row
 * (coef[0], coef[4], coef[8], coef[12]) is dotted with each pixel's
 * R,G,B,A, the result is replicated into R, G and B, and alpha is passed
 * through unchanged.  4 pixels per iteration; 'count' is the number of
 * 4-pixel groups. */
void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
                                  const short *coef, uint32_t count) {
    /* Reorders channel-grouped bytes back to interleaved RGBA. */
    const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                      14, 10, 6, 2,
                                      13, 9, 5, 1,
                                      12, 8, 4, 0);
    /* Gather R,G (Mxy) and B,A (Mzw) bytes of 4 pixels as 16-bit lanes. */
    const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
    const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
    __m128i c0, c1, c2, c3;
    __m128i i4, o4;
    __m128i xy, zw;
    __m128i x2, y2, z2, w2;
    uint32_t i;

    /* shufflelo(...,0) replicates coef[0]/coef[4]/coef[8]/coef[12] across
     * the low lanes; the unpacks then form the madd pairs
     * (coef[0],coef[4]) and (coef[8],coef[12]). */
    c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
    c0 = _mm_shufflelo_epi16(c0, 0);
    c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
    c1 = _mm_shufflelo_epi16(c1, 0);
    c0 = _mm_unpacklo_epi16(c0, c1);

    c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
    c2 = _mm_shufflelo_epi16(c2, 0);
    c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
    c3 = _mm_shufflelo_epi16(c3, 0);
    c2 = _mm_unpacklo_epi16(c2, c3);

    for (i = 0; i < count; ++i) {
        i4 = _mm_loadu_si128((const __m128i *)src);

        xy = _mm_shuffle_epi8(i4, Mxy);
        zw = _mm_shuffle_epi8(i4, Mzw);

        /* Single dot product per pixel: r*c0 + g*c4 + b*c8 + a*c12. */
        x2 = _mm_madd_epi16(xy, c0);
        x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, c2));

        /* Q8 -> integer, replicated into G and B; w2 recovers the
         * untouched alpha from the high 16 bits of each zw lane. */
        x2 = _mm_srai_epi32(x2, 8);
        y2 = x2;
        z2 = x2;
        w2 = _mm_srli_epi32(zw, 16);

        /* Clamp to u8, repack, and restore RGBA interleaving. */
        x2 = packus_epi32(x2, y2);
        z2 = packus_epi32(z2, w2);
        o4 = _mm_packus_epi16(x2, z2);

        o4 = _mm_shuffle_epi8(o4, T4x4);
        _mm_storeu_si128((__m128i *)dst, o4);

        src = (const char *)src + 16;
        dst = (char *)dst + 16;
    }
}
304*32afb93cSXin Li
/* Vertical gaussian pass over RGBA8888 input, producing float4 output.
 * pin:    top of the source column window; rows are 'stride' bytes apart.
 * gptr:   float kernel of 'rct' taps.
 * x1..x2: pixel-column range (x1 << 2 converts pixels to bytes).
 * Two pixels are handled per step, writing 2 float4 vectors (32 bytes)
 * to dst per step. */
void rsdIntrinsicBlurVFU4_K(void *dst,
                            const void *pin, int stride, const void *gptr,
                            int rct, int x1, int x2) {
    const char *pi;
    __m128i pi0, pi1;
    __m128 pf0, pf1;
    __m128 bp0, bp1;
    __m128 x;
    int r;

    for (; x1 < x2; x1 += 2) {
        pi = (const char *)pin + (x1 << 2);
        bp0 = _mm_setzero_ps();
        bp1 = _mm_setzero_ps();

        /* Walk down 'rct' rows, accumulating each weighted by its tap. */
        for (r = 0; r < rct; ++r) {
            /* Broadcast tap r to all four lanes. */
            x = _mm_load_ss((const float *)gptr + r);
            x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));

            /* Load two adjacent RGBA pixels and widen u8 -> float. */
            pi0 = _mm_cvtsi32_si128(*(const int *)pi);
            pi1 = _mm_cvtsi32_si128(*((const int *)pi + 1));

            pf0 = _mm_cvtepi32_ps(cvtepu8_epi32(pi0));
            pf1 = _mm_cvtepi32_ps(cvtepu8_epi32(pi1));

            bp0 = _mm_add_ps(bp0, _mm_mul_ps(pf0, x));
            bp1 = _mm_add_ps(bp1, _mm_mul_ps(pf1, x));

            pi += stride;
        }

        /* First float4 at dst, second at dst + 16 bytes. */
        _mm_storeu_ps((float *)dst, bp0);
        _mm_storeu_ps((float *)dst + 4, bp1);
        dst = (char *)dst + 32;
    }
}
341*32afb93cSXin Li
/* Horizontal gaussian pass over the float4 intermediate produced by the
 * vertical pass, emitting one RGBA8888 pixel (4 bytes) per column.
 * pin:    float4 pixels; gptr: float kernel of 'rct' taps.
 * x1..x2: output pixel range. */
void rsdIntrinsicBlurHFU4_K(void *dst,
                            const void *pin, const void *gptr,
                            int rct, int x1, int x2) {
    /* Selects byte 0 of each 32-bit lane -> packs 4 ints into 4 bytes. */
    const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
    const float *pi;
    __m128 pf, x, y;
    __m128i o;
    int r;

    for (; x1 < x2; ++x1) {
        /* rct is defined as 2*r+1 by the caller (always odd), so after
         * tap 0 the remaining taps are consumed in pairs below. */
        x = _mm_load_ss((const float *)gptr);
        x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));

        pi = (const float *)pin + (x1 << 2);
        pf = _mm_mul_ps(x, _mm_load_ps(pi));

        /* Two taps per iteration, each weighting one float4 pixel. */
        for (r = 1; r < rct; r += 2) {
            x = _mm_load_ss((const float *)gptr + r);
            y = _mm_load_ss((const float *)gptr + r + 1);
            x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
            y = _mm_shuffle_ps(y, y, _MM_SHUFFLE(0, 0, 0, 0));

            pf = _mm_add_ps(pf, _mm_mul_ps(x, _mm_load_ps(pi + (r << 2))));
            pf = _mm_add_ps(pf, _mm_mul_ps(y, _mm_load_ps(pi + (r << 2) + 4)));
        }

        /* Round to int and narrow each lane to one byte. */
        o = _mm_cvtps_epi32(pf);
        *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8));
        dst = (char *)dst + 4;
    }
}
374*32afb93cSXin Li
rsdIntrinsicBlurHFU1_K(void * dst,const void * pin,const void * gptr,int rct,int x1,int x2)375*32afb93cSXin Li void rsdIntrinsicBlurHFU1_K(void *dst,
376*32afb93cSXin Li const void *pin, const void *gptr,
377*32afb93cSXin Li int rct, int x1, int x2) {
378*32afb93cSXin Li const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
379*32afb93cSXin Li const float *pi;
380*32afb93cSXin Li __m128 pf, g0, g1, g2, g3, gx, p0, p1;
381*32afb93cSXin Li __m128i o;
382*32afb93cSXin Li int r;
383*32afb93cSXin Li
384*32afb93cSXin Li for (; x1 < x2; x1+=4) {
385*32afb93cSXin Li g0 = _mm_load_ss((const float *)gptr);
386*32afb93cSXin Li g0 = _mm_shuffle_ps(g0, g0, _MM_SHUFFLE(0, 0, 0, 0));
387*32afb93cSXin Li
388*32afb93cSXin Li pi = (const float *)pin + x1;
389*32afb93cSXin Li pf = _mm_mul_ps(g0, _mm_loadu_ps(pi));
390*32afb93cSXin Li
391*32afb93cSXin Li for (r = 1; r < rct; r += 4) {
392*32afb93cSXin Li gx = _mm_loadu_ps((const float *)gptr + r);
393*32afb93cSXin Li p0 = _mm_loadu_ps(pi + r);
394*32afb93cSXin Li p1 = _mm_loadu_ps(pi + r + 4);
395*32afb93cSXin Li
396*32afb93cSXin Li g0 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(0, 0, 0, 0));
397*32afb93cSXin Li pf = _mm_add_ps(pf, _mm_mul_ps(g0, p0));
398*32afb93cSXin Li g1 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(1, 1, 1, 1));
399*32afb93cSXin Li pf = _mm_add_ps(pf, _mm_mul_ps(g1, _mm_alignr_epi8(p1, p0, 4)));
400*32afb93cSXin Li g2 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(2, 2, 2, 2));
401*32afb93cSXin Li pf = _mm_add_ps(pf, _mm_mul_ps(g2, _mm_alignr_epi8(p1, p0, 8)));
402*32afb93cSXin Li g3 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(3, 3, 3, 3));
403*32afb93cSXin Li pf = _mm_add_ps(pf, _mm_mul_ps(g3, _mm_alignr_epi8(p1, p0, 12)));
404*32afb93cSXin Li }
405*32afb93cSXin Li
406*32afb93cSXin Li o = _mm_cvtps_epi32(pf);
407*32afb93cSXin Li *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8));
408*32afb93cSXin Li dst = (char *)dst + 4;
409*32afb93cSXin Li }
410*32afb93cSXin Li }
411*32afb93cSXin Li
/* YUV -> RGBA8888 conversion with interleaved chroma (semi-planar).
 * pY:    4 luma bytes consumed per iteration (4 output pixels).
 * pUV:   4 interleaved chroma bytes per iteration (2 pairs, each shared
 *        by 2 adjacent pixels).  The lane picks below take U from odd
 *        bytes and V from even bytes, i.e. V,U order -- NOTE(review):
 *        looks like NV21 ordering; confirm against the caller.
 * count: the loop runs count*2 iterations of 4 pixels each.
 * param: conversion constants; expected values noted inline. */
void rsdIntrinsicYuv_K(void *dst,
                       const unsigned char *pY, const unsigned char *pUV,
                       uint32_t count, const short *param) {
    __m128i biasY, biasUV;
    __m128i c0, c1, c2, c3, c4;

    biasY = _mm_set1_epi32(param[8]); /* 16 */
    biasUV = _mm_set1_epi32(param[16]); /* 128 */

    c0 = _mm_set1_epi32(param[0]); /* 298 */
    c1 = _mm_set1_epi32(param[1]); /* 409 */
    c2 = _mm_set1_epi32(param[2]); /* -100 */
    c3 = _mm_set1_epi32(param[3]); /* 516 */
    c4 = _mm_set1_epi32(param[4]); /* -208 */

    __m128i Y, UV, U, V, R, G, B, A;

    /* Fully opaque alpha for every output pixel. */
    A = _mm_set1_epi32(255);
    uint32_t i;

    for (i = 0; i < (count << 1); ++i) {
        /* Widen 4 luma / 4 chroma bytes to 32-bit lanes. */
        Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
        UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));

        Y = _mm_sub_epi32(Y, biasY);
        UV = _mm_sub_epi32(UV, biasUV);

        /* Duplicate each chroma sample across its two pixels:
         * 0xf5 picks lanes 1,1,3,3; 0xa0 picks lanes 0,0,2,2. */
        U = _mm_shuffle_epi32(UV, 0xf5);
        V = _mm_shuffle_epi32(UV, 0xa0);

        Y = mullo_epi32(Y, c0);

        /* The biasUV add before each >>8 is a +128 rounding term (the
         * biasUV register happens to hold 128 and is reused). */
        R = _mm_add_epi32(Y, mullo_epi32(V, c1));
        R = _mm_add_epi32(R, biasUV);
        R = _mm_srai_epi32(R, 8);

        G = _mm_add_epi32(Y, mullo_epi32(U, c2));
        G = _mm_add_epi32(G, mullo_epi32(V, c4));
        G = _mm_add_epi32(G, biasUV);
        G = _mm_srai_epi32(G, 8);

        B = _mm_add_epi32(Y, mullo_epi32(U, c3));
        B = _mm_add_epi32(B, biasUV);
        B = _mm_srai_epi32(B, 8);

        __m128i y1, y2, y3, y4;

        /* Clamp channels to u8 and re-interleave to RGBA. */
        y1 = packus_epi32(R, G);
        y2 = packus_epi32(B, A);
        y3 = _mm_packus_epi16(y1, y2);
        const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                          14, 10, 6, 2,
                                          13, 9, 5, 1,
                                          12, 8, 4, 0);
        y4 = _mm_shuffle_epi8(y3, T4x4);
        _mm_storeu_si128((__m128i *)dst, y4);
        pY += 4;
        pUV += 4;
        dst = (__m128i *)dst + 1;
    }
}
473*32afb93cSXin Li
/* YUV -> RGBA8888 conversion, identical to rsdIntrinsicYuv_K except that
 * the U/V lane selections are swapped: here V comes from odd chroma
 * bytes and U from even bytes, i.e. U,V order -- NOTE(review): looks
 * like NV12 ordering; confirm against the caller.
 * See rsdIntrinsicYuv_K for the meaning of count and param. */
void rsdIntrinsicYuvR_K(void *dst,
                        const unsigned char *pY, const unsigned char *pUV,
                        uint32_t count, const short *param) {
    __m128i biasY, biasUV;
    __m128i c0, c1, c2, c3, c4;

    biasY = _mm_set1_epi32(param[8]); /* 16 */
    biasUV = _mm_set1_epi32(param[16]); /* 128 */

    c0 = _mm_set1_epi32(param[0]); /* 298 */
    c1 = _mm_set1_epi32(param[1]); /* 409 */
    c2 = _mm_set1_epi32(param[2]); /* -100 */
    c3 = _mm_set1_epi32(param[3]); /* 516 */
    c4 = _mm_set1_epi32(param[4]); /* -208 */

    __m128i Y, UV, U, V, R, G, B, A;

    /* Fully opaque alpha for every output pixel. */
    A = _mm_set1_epi32(255);
    uint32_t i;

    for (i = 0; i < (count << 1); ++i) {
        /* Widen 4 luma / 4 chroma bytes to 32-bit lanes. */
        Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
        UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));

        Y = _mm_sub_epi32(Y, biasY);
        UV = _mm_sub_epi32(UV, biasUV);

        /* Chroma duplication with U/V roles swapped vs. Yuv_K:
         * 0xf5 picks lanes 1,1,3,3; 0xa0 picks lanes 0,0,2,2. */
        V = _mm_shuffle_epi32(UV, 0xf5);
        U = _mm_shuffle_epi32(UV, 0xa0);

        Y = mullo_epi32(Y, c0);

        /* biasUV (128) doubles as the +0.5 rounding term before >>8. */
        R = _mm_add_epi32(Y, mullo_epi32(V, c1));
        R = _mm_add_epi32(R, biasUV);
        R = _mm_srai_epi32(R, 8);

        G = _mm_add_epi32(Y, mullo_epi32(U, c2));
        G = _mm_add_epi32(G, mullo_epi32(V, c4));
        G = _mm_add_epi32(G, biasUV);
        G = _mm_srai_epi32(G, 8);

        B = _mm_add_epi32(Y, mullo_epi32(U, c3));
        B = _mm_add_epi32(B, biasUV);
        B = _mm_srai_epi32(B, 8);

        __m128i y1, y2, y3, y4;

        /* Clamp channels to u8 and re-interleave to RGBA. */
        y1 = packus_epi32(R, G);
        y2 = packus_epi32(B, A);
        y3 = _mm_packus_epi16(y1, y2);
        const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                          14, 10, 6, 2,
                                          13, 9, 5, 1,
                                          12, 8, 4, 0);
        y4 = _mm_shuffle_epi8(y3, T4x4);
        _mm_storeu_si128((__m128i *)dst, y4);
        pY += 4;
        pUV += 4;
        dst = (__m128i *)dst + 1;
    }
}
535*32afb93cSXin Li
/* YUV -> RGBA8888 conversion with fully planar chroma: pU and pV each
 * supply one chroma byte per pixel (no duplication/subsampling handled
 * here, unlike the semi-planar variants above).
 * pY/pU/pV: 4 bytes consumed from each plane per iteration (4 pixels).
 * count:    the loop runs count*2 iterations of 4 pixels each.
 * param:    conversion constants; expected values noted inline. */
void rsdIntrinsicYuv2_K(void *dst,
                        const unsigned char *pY, const unsigned char *pU,
                        const unsigned char *pV, uint32_t count, const short *param) {
    __m128i biasY, biasUV;
    __m128i c0, c1, c2, c3, c4;

    biasY = _mm_set1_epi32(param[8]); /* 16 */
    biasUV = _mm_set1_epi32(param[16]); /* 128 */

    c0 = _mm_set1_epi32(param[0]); /* 298 */
    c1 = _mm_set1_epi32(param[1]); /* 409 */
    c2 = _mm_set1_epi32(param[2]); /* -100 */
    c3 = _mm_set1_epi32(param[3]); /* 516 */
    c4 = _mm_set1_epi32(param[4]); /* -208 */

    __m128i Y, U, V, R, G, B, A;

    /* Fully opaque alpha for every output pixel. */
    A = _mm_set1_epi32(255);
    uint32_t i;

    for (i = 0; i < (count << 1); ++i) {
        /* Widen 4 bytes from each plane to 32-bit lanes. */
        Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
        U = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pU));
        V = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pV));

        Y = _mm_sub_epi32(Y, biasY);
        U = _mm_sub_epi32(U, biasUV);
        V = _mm_sub_epi32(V, biasUV);

        Y = mullo_epi32(Y, c0);

        /* biasUV (128) doubles as the +0.5 rounding term before >>8. */
        R = _mm_add_epi32(Y, mullo_epi32(V, c1));
        R = _mm_add_epi32(R, biasUV);
        R = _mm_srai_epi32(R, 8);

        G = _mm_add_epi32(Y, mullo_epi32(U, c2));
        G = _mm_add_epi32(G, mullo_epi32(V, c4));
        G = _mm_add_epi32(G, biasUV);
        G = _mm_srai_epi32(G, 8);

        B = _mm_add_epi32(Y, mullo_epi32(U, c3));
        B = _mm_add_epi32(B, biasUV);
        B = _mm_srai_epi32(B, 8);

        __m128i y1, y2, y3, y4;

        /* Clamp channels to u8 and re-interleave to RGBA. */
        y1 = packus_epi32(R, G);
        y2 = packus_epi32(B, A);
        y3 = _mm_packus_epi16(y1, y2);
        const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                          14, 10, 6, 2,
                                          13, 9, 5, 1,
                                          12, 8, 4, 0);
        y4 = _mm_shuffle_epi8(y3, T4x4);
        _mm_storeu_si128((__m128i *)dst, y4);
        pY += 4;
        pU += 4;
        pV += 4;
        dst = (__m128i *)dst + 1;
    }
}
597*32afb93cSXin Li
rsdIntrinsicConvolve5x5_K(void * dst,const void * y0,const void * y1,const void * y2,const void * y3,const void * y4,const short * coef,uint32_t count)598*32afb93cSXin Li extern "C" void rsdIntrinsicConvolve5x5_K(void *dst, const void *y0,
599*32afb93cSXin Li const void *y1, const void *y2,
600*32afb93cSXin Li const void *y3, const void *y4,
601*32afb93cSXin Li const short *coef, uint32_t count) {
602*32afb93cSXin Li __m128i x;
603*32afb93cSXin Li __m128i c0, c2, c4, c6, c8, c10, c12;
604*32afb93cSXin Li __m128i c14, c16, c18, c20, c22, c24;
605*32afb93cSXin Li __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9;
606*32afb93cSXin Li __m128i p0, p1, p2, p3, p4, p5, p6, p7;
607*32afb93cSXin Li __m128i p8, p9, p10, p11, p12, p13, p14, p15;
608*32afb93cSXin Li __m128i p16, p17, p18, p19, p20, p21, p22, p23;
609*32afb93cSXin Li __m128i p24, p25, p26, p27, p28, p29, p30, p31;
610*32afb93cSXin Li __m128i p32, p33, p34, p35, p36, p37, p38, p39;
611*32afb93cSXin Li __m128i o0, o1, o2, o3;
612*32afb93cSXin Li uint32_t i;
613*32afb93cSXin Li
614*32afb93cSXin Li x = _mm_loadl_epi64((const __m128i *)(coef+0));
615*32afb93cSXin Li c0 = _mm_shuffle_epi32(x, 0x00);
616*32afb93cSXin Li c2 = _mm_shuffle_epi32(x, 0x55);
617*32afb93cSXin Li
618*32afb93cSXin Li x = _mm_loadl_epi64((const __m128i *)(coef+4));
619*32afb93cSXin Li c4 = _mm_shuffle_epi32(x, 0x00);
620*32afb93cSXin Li c6 = _mm_shuffle_epi32(x, 0x55);
621*32afb93cSXin Li
622*32afb93cSXin Li x = _mm_loadl_epi64((const __m128i *)(coef+8));
623*32afb93cSXin Li c8 = _mm_shuffle_epi32(x, 0x00);
624*32afb93cSXin Li c10 = _mm_shuffle_epi32(x, 0x55);
625*32afb93cSXin Li
626*32afb93cSXin Li x = _mm_loadl_epi64((const __m128i *)(coef+12));
627*32afb93cSXin Li c12 = _mm_shuffle_epi32(x, 0x00);
628*32afb93cSXin Li c14 = _mm_shuffle_epi32(x, 0x55);
629*32afb93cSXin Li
630*32afb93cSXin Li x = _mm_loadl_epi64((const __m128i *)(coef+16));
631*32afb93cSXin Li c16 = _mm_shuffle_epi32(x, 0x00);
632*32afb93cSXin Li c18 = _mm_shuffle_epi32(x, 0x55);
633*32afb93cSXin Li
634*32afb93cSXin Li x = _mm_loadl_epi64((const __m128i *)(coef+20));
635*32afb93cSXin Li c20 = _mm_shuffle_epi32(x, 0x00);
636*32afb93cSXin Li c22 = _mm_shuffle_epi32(x, 0x55);
637*32afb93cSXin Li
638*32afb93cSXin Li x = _mm_loadl_epi64((const __m128i *)(coef+24));
639*32afb93cSXin Li c24 = _mm_shuffle_epi32(x, 0x00);
640*32afb93cSXin Li
641*32afb93cSXin Li for (i = 0; i < count; ++i) {
642*32afb93cSXin Li
643*32afb93cSXin Li p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int32_t *)y0), _mm_setzero_si128());
644*32afb93cSXin Li p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128());
645*32afb93cSXin Li p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128());
646*32afb93cSXin Li p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128());
647*32afb93cSXin Li p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+4)), _mm_setzero_si128());
648*32afb93cSXin Li p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+5)), _mm_setzero_si128());
649*32afb93cSXin Li p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+6)), _mm_setzero_si128());
650*32afb93cSXin Li p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+7)), _mm_setzero_si128());
651*32afb93cSXin Li
652*32afb93cSXin Li p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128());
653*32afb93cSXin Li p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128());
654*32afb93cSXin Li p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128());
655*32afb93cSXin Li p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128());
656*32afb93cSXin Li p12 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+4)), _mm_setzero_si128());
657*32afb93cSXin Li p13 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+5)), _mm_setzero_si128());
658*32afb93cSXin Li p14 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+6)), _mm_setzero_si128());
659*32afb93cSXin Li p15 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+7)), _mm_setzero_si128());
660*32afb93cSXin Li
661*32afb93cSXin Li p16 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128());
662*32afb93cSXin Li p17 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128());
663*32afb93cSXin Li p18 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128());
664*32afb93cSXin Li p19 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128());
665*32afb93cSXin Li p20 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+4)), _mm_setzero_si128());
666*32afb93cSXin Li p21 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+5)), _mm_setzero_si128());
667*32afb93cSXin Li p22 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+6)), _mm_setzero_si128());
668*32afb93cSXin Li p23 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+7)), _mm_setzero_si128());
669*32afb93cSXin Li
670*32afb93cSXin Li p24 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3)), _mm_setzero_si128());
671*32afb93cSXin Li p25 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+1)), _mm_setzero_si128());
672*32afb93cSXin Li p26 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+2)), _mm_setzero_si128());
673*32afb93cSXin Li p27 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+3)), _mm_setzero_si128());
674*32afb93cSXin Li p28 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+4)), _mm_setzero_si128());
675*32afb93cSXin Li p29 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+5)), _mm_setzero_si128());
676*32afb93cSXin Li p30 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+6)), _mm_setzero_si128());
677*32afb93cSXin Li p31 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+7)), _mm_setzero_si128());
678*32afb93cSXin Li
679*32afb93cSXin Li p32 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4)), _mm_setzero_si128());
680*32afb93cSXin Li p33 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+1)), _mm_setzero_si128());
681*32afb93cSXin Li p34 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+2)), _mm_setzero_si128());
682*32afb93cSXin Li p35 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+3)), _mm_setzero_si128());
683*32afb93cSXin Li p36 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+4)), _mm_setzero_si128());
684*32afb93cSXin Li p37 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+5)), _mm_setzero_si128());
685*32afb93cSXin Li p38 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+6)), _mm_setzero_si128());
686*32afb93cSXin Li p39 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+7)), _mm_setzero_si128());
687*32afb93cSXin Li
688*32afb93cSXin Li o0 = _mm_madd_epi16( _mm_unpacklo_epi16(p0, p1), c0);
689*32afb93cSXin Li o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p2, p3), c2));
690*32afb93cSXin Li o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p8), c4));
691*32afb93cSXin Li o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p9,p10), c6));
692*32afb93cSXin Li o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12), c8));
693*32afb93cSXin Li o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p16, p17), c10));
694*32afb93cSXin Li o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c12));
695*32afb93cSXin Li o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p24), c14));
696*32afb93cSXin Li o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p25,p26), c16));
697*32afb93cSXin Li o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c18));
698*32afb93cSXin Li o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p32, p33), c20));
699*32afb93cSXin Li o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c22));
700*32afb93cSXin Li o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p36, _mm_setzero_si128()), c24));
701*32afb93cSXin Li o0 = _mm_srai_epi32(o0, 8);
702*32afb93cSXin Li
703*32afb93cSXin Li o1 = _mm_madd_epi16( _mm_unpacklo_epi16(p1, p2), c0);
704*32afb93cSXin Li o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p3,p4), c2));
705*32afb93cSXin Li o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p9), c4));
706*32afb93cSXin Li o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p10,p11), c6));
707*32afb93cSXin Li o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p12,p13), c8));
708*32afb93cSXin Li o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p17,p18), c10));
709*32afb93cSXin Li o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p19,p20), c12));
710*32afb93cSXin Li o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p21,p25), c14));
711*32afb93cSXin Li o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p26, p27), c16));
712*32afb93cSXin Li o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c18));
713*32afb93cSXin Li o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p33, p34), c20));
714*32afb93cSXin Li o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c22));
715*32afb93cSXin Li o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p37, _mm_setzero_si128()), c24));
716*32afb93cSXin Li o1 = _mm_srai_epi32(o1, 8);
717*32afb93cSXin Li
718*32afb93cSXin Li o2 = _mm_madd_epi16( _mm_unpacklo_epi16(p2,p3), c0);
719*32afb93cSXin Li o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p5), c2));
720*32afb93cSXin Li o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p6, p10), c4));
721*32afb93cSXin Li o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12), c6));
722*32afb93cSXin Li o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p13, p14), c8));
723*32afb93cSXin Li o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c10));
724*32afb93cSXin Li o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p21), c12));
725*32afb93cSXin Li o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p22, p26), c14));
726*32afb93cSXin Li o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c16));
727*32afb93cSXin Li o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p29, p30), c18));
728*32afb93cSXin Li o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c20));
729*32afb93cSXin Li o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p36, p37), c22));
730*32afb93cSXin Li o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p38, _mm_setzero_si128()), c24));
731*32afb93cSXin Li o2 = _mm_srai_epi32(o2, 8);
732*32afb93cSXin Li
733*32afb93cSXin Li o3 = _mm_madd_epi16( _mm_unpacklo_epi16(p3,p4), c0);
734*32afb93cSXin Li o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p6), c2));
735*32afb93cSXin Li o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p7, p11), c4));
736*32afb93cSXin Li o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p12, p13), c6));
737*32afb93cSXin Li o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p14, p15), c8));
738*32afb93cSXin Li o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p19, p20), c10));
739*32afb93cSXin Li o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p21, p22), c12));
740*32afb93cSXin Li o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p23, p27), c14));
741*32afb93cSXin Li o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c16));
742*32afb93cSXin Li o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p30, p31), c18));
743*32afb93cSXin Li o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c20));
744*32afb93cSXin Li o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p37,p38), c22));
745*32afb93cSXin Li o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p39, _mm_setzero_si128()), c24));
746*32afb93cSXin Li o3 = _mm_srai_epi32(o3, 8);
747*32afb93cSXin Li
748*32afb93cSXin Li o0 = packus_epi32(o0, o1);
749*32afb93cSXin Li o2 = packus_epi32(o2, o3);
750*32afb93cSXin Li o0 = _mm_packus_epi16(o0, o2);
751*32afb93cSXin Li _mm_storeu_si128((__m128i *)dst, o0);
752*32afb93cSXin Li
753*32afb93cSXin Li y0 = (const char *)y0 + 16;
754*32afb93cSXin Li y1 = (const char *)y1 + 16;
755*32afb93cSXin Li y2 = (const char *)y2 + 16;
756*32afb93cSXin Li y3 = (const char *)y3 + 16;
757*32afb93cSXin Li y4 = (const char *)y4 + 16;
758*32afb93cSXin Li dst = (char *)dst + 16;
759*32afb93cSXin Li }
760*32afb93cSXin Li }
761*32afb93cSXin Li
/*
 * "Src over" blend on RGBA_8888: out = src + dst * (255 - src.alpha) >> 8.
 * (Fixed-point: divides by 256 rather than 255, matching the original.)
 * Processes count8 groups of 8 pixels (two 16-byte vectors) per iteration.
 */
void rsdIntrinsicBlendSrcOver_K(void *dst, const void *src, uint32_t count8) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i max8 = _mm_set1_epi16(255);
    const __m128i *sp = (const __m128i *)src;
    __m128i *dp = (__m128i *)dst;

    for (uint32_t n = 0; n < count8; ++n) {
        __m128i s0 = _mm_loadu_si128(sp);
        __m128i s1 = _mm_loadu_si128(sp + 1);
        __m128i d0 = _mm_loadu_si128(dp);
        __m128i d1 = _mm_loadu_si128(dp + 1);
        __m128i ss, aa, r0, r1, r2, r3;

        /* Each quarter: widen 4 pixels to 16-bit lanes, broadcast the alpha
         * word across each pixel, then dst*(255-a)>>8 + src. */
        ss = _mm_unpacklo_epi8(s0, zero);
        aa = _mm_shufflehi_epi16(_mm_shufflelo_epi16(ss, 0xFF), 0xFF);
        r0 = _mm_add_epi16(_mm_srli_epi16(
                 _mm_mullo_epi16(_mm_unpacklo_epi8(d0, zero),
                                 _mm_sub_epi16(max8, aa)), 8), ss);

        ss = _mm_unpackhi_epi8(s0, zero);
        aa = _mm_shufflehi_epi16(_mm_shufflelo_epi16(ss, 0xFF), 0xFF);
        r1 = _mm_add_epi16(_mm_srli_epi16(
                 _mm_mullo_epi16(_mm_unpackhi_epi8(d0, zero),
                                 _mm_sub_epi16(max8, aa)), 8), ss);

        ss = _mm_unpacklo_epi8(s1, zero);
        aa = _mm_shufflehi_epi16(_mm_shufflelo_epi16(ss, 0xFF), 0xFF);
        r2 = _mm_add_epi16(_mm_srli_epi16(
                 _mm_mullo_epi16(_mm_unpacklo_epi8(d1, zero),
                                 _mm_sub_epi16(max8, aa)), 8), ss);

        ss = _mm_unpackhi_epi8(s1, zero);
        aa = _mm_shufflehi_epi16(_mm_shufflelo_epi16(ss, 0xFF), 0xFF);
        r3 = _mm_add_epi16(_mm_srli_epi16(
                 _mm_mullo_epi16(_mm_unpackhi_epi8(d1, zero),
                                 _mm_sub_epi16(max8, aa)), 8), ss);

        /* Saturating narrow back to bytes and store. */
        _mm_storeu_si128(dp,     _mm_packus_epi16(r0, r1));
        _mm_storeu_si128(dp + 1, _mm_packus_epi16(r2, r3));

        sp += 2;
        dp += 2;
    }
}
817*32afb93cSXin Li
/*
 * "Dst over" blend on RGBA_8888: out = dst + src * (255 - dst.alpha) >> 8.
 * (Fixed-point: divides by 256 rather than 255, matching the original.)
 * Processes count8 groups of 8 pixels (two 16-byte vectors) per iteration.
 */
void rsdIntrinsicBlendDstOver_K(void *dst, const void *src, uint32_t count8) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i max8 = _mm_set1_epi16(255);
    const __m128i *sp = (const __m128i *)src;
    __m128i *dp = (__m128i *)dst;

    for (uint32_t n = 0; n < count8; ++n) {
        __m128i s0 = _mm_loadu_si128(sp);
        __m128i s1 = _mm_loadu_si128(sp + 1);
        __m128i d0 = _mm_loadu_si128(dp);
        __m128i d1 = _mm_loadu_si128(dp + 1);
        __m128i dd, aa, r0, r1, r2, r3;

        /* Each quarter: widen 4 pixels, broadcast dst alpha across each
         * pixel, then src*(255-da)>>8 + dst. */
        dd = _mm_unpacklo_epi8(d0, zero);
        aa = _mm_shufflehi_epi16(_mm_shufflelo_epi16(dd, 0xFF), 0xFF);
        r0 = _mm_add_epi16(_mm_srli_epi16(
                 _mm_mullo_epi16(_mm_unpacklo_epi8(s0, zero),
                                 _mm_sub_epi16(max8, aa)), 8), dd);

        dd = _mm_unpackhi_epi8(d0, zero);
        aa = _mm_shufflehi_epi16(_mm_shufflelo_epi16(dd, 0xFF), 0xFF);
        r1 = _mm_add_epi16(_mm_srli_epi16(
                 _mm_mullo_epi16(_mm_unpackhi_epi8(s0, zero),
                                 _mm_sub_epi16(max8, aa)), 8), dd);

        dd = _mm_unpacklo_epi8(d1, zero);
        aa = _mm_shufflehi_epi16(_mm_shufflelo_epi16(dd, 0xFF), 0xFF);
        r2 = _mm_add_epi16(_mm_srli_epi16(
                 _mm_mullo_epi16(_mm_unpacklo_epi8(s1, zero),
                                 _mm_sub_epi16(max8, aa)), 8), dd);

        dd = _mm_unpackhi_epi8(d1, zero);
        aa = _mm_shufflehi_epi16(_mm_shufflelo_epi16(dd, 0xFF), 0xFF);
        r3 = _mm_add_epi16(_mm_srli_epi16(
                 _mm_mullo_epi16(_mm_unpackhi_epi8(s1, zero),
                                 _mm_sub_epi16(max8, aa)), 8), dd);

        /* Saturating narrow back to bytes and store. */
        _mm_storeu_si128(dp,     _mm_packus_epi16(r0, r1));
        _mm_storeu_si128(dp + 1, _mm_packus_epi16(r2, r3));

        sp += 2;
        dp += 2;
    }
}
874*32afb93cSXin Li
/*
 * "Src in" blend on RGBA_8888: out = src * dst.alpha >> 8.
 * (Fixed-point: divides by 256 rather than 255, matching the original.)
 * Processes count8 groups of 8 pixels (two 16-byte vectors) per iteration.
 */
void rsdIntrinsicBlendSrcIn_K(void *dst, const void *src, uint32_t count8) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i *sp = (const __m128i *)src;
    __m128i *dp = (__m128i *)dst;

    for (uint32_t n = 0; n < count8; ++n) {
        __m128i s0 = _mm_loadu_si128(sp);
        __m128i s1 = _mm_loadu_si128(sp + 1);
        __m128i d0 = _mm_loadu_si128(dp);
        __m128i d1 = _mm_loadu_si128(dp + 1);
        __m128i da, r0, r1, r2, r3;

        /* Each quarter: broadcast dst alpha across each widened pixel and
         * scale the src channels by it. */
        da = _mm_shufflehi_epi16(_mm_shufflelo_epi16(
                 _mm_unpacklo_epi8(d0, zero), 0xFF), 0xFF);
        r0 = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(s0, zero), da), 8);

        da = _mm_shufflehi_epi16(_mm_shufflelo_epi16(
                 _mm_unpackhi_epi8(d0, zero), 0xFF), 0xFF);
        r1 = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpackhi_epi8(s0, zero), da), 8);

        da = _mm_shufflehi_epi16(_mm_shufflelo_epi16(
                 _mm_unpacklo_epi8(d1, zero), 0xFF), 0xFF);
        r2 = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(s1, zero), da), 8);

        da = _mm_shufflehi_epi16(_mm_shufflelo_epi16(
                 _mm_unpackhi_epi8(d1, zero), 0xFF), 0xFF);
        r3 = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpackhi_epi8(s1, zero), da), 8);

        /* Saturating narrow back to bytes and store. */
        _mm_storeu_si128(dp,     _mm_packus_epi16(r0, r1));
        _mm_storeu_si128(dp + 1, _mm_packus_epi16(r2, r3));

        sp += 2;
        dp += 2;
    }
}
924*32afb93cSXin Li
/*
 * "Dst in" blend on RGBA_8888: out = dst * src.alpha >> 8.
 * (Fixed-point: divides by 256 rather than 255, matching the original.)
 * Processes count8 groups of 8 pixels (two 16-byte vectors) per iteration.
 */
void rsdIntrinsicBlendDstIn_K(void *dst, const void *src, uint32_t count8) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i *sp = (const __m128i *)src;
    __m128i *dp = (__m128i *)dst;

    for (uint32_t n = 0; n < count8; ++n) {
        __m128i s0 = _mm_loadu_si128(sp);
        __m128i s1 = _mm_loadu_si128(sp + 1);
        __m128i d0 = _mm_loadu_si128(dp);
        __m128i d1 = _mm_loadu_si128(dp + 1);
        __m128i sa, r0, r1, r2, r3;

        /* Each quarter: broadcast src alpha across each widened pixel and
         * scale the dst channels by it. */
        sa = _mm_shufflehi_epi16(_mm_shufflelo_epi16(
                 _mm_unpacklo_epi8(s0, zero), 0xFF), 0xFF);
        r0 = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(d0, zero), sa), 8);

        sa = _mm_shufflehi_epi16(_mm_shufflelo_epi16(
                 _mm_unpackhi_epi8(s0, zero), 0xFF), 0xFF);
        r1 = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpackhi_epi8(d0, zero), sa), 8);

        sa = _mm_shufflehi_epi16(_mm_shufflelo_epi16(
                 _mm_unpacklo_epi8(s1, zero), 0xFF), 0xFF);
        r2 = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(d1, zero), sa), 8);

        sa = _mm_shufflehi_epi16(_mm_shufflelo_epi16(
                 _mm_unpackhi_epi8(s1, zero), 0xFF), 0xFF);
        r3 = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpackhi_epi8(d1, zero), sa), 8);

        /* Saturating narrow back to bytes and store. */
        _mm_storeu_si128(dp,     _mm_packus_epi16(r0, r1));
        _mm_storeu_si128(dp + 1, _mm_packus_epi16(r2, r3));

        sp += 2;
        dp += 2;
    }
}
974*32afb93cSXin Li
/*
 * "Src out" blend on RGBA_8888: out = src * (255 - dst.alpha) >> 8.
 * (Fixed-point: divides by 256 rather than 255, matching the original.)
 * Processes count8 groups of 8 pixels (two 16-byte vectors) per iteration.
 */
void rsdIntrinsicBlendSrcOut_K(void *dst, const void *src, uint32_t count8) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i max8 = _mm_set1_epi16(255);
    const __m128i *sp = (const __m128i *)src;
    __m128i *dp = (__m128i *)dst;

    for (uint32_t n = 0; n < count8; ++n) {
        __m128i s0 = _mm_loadu_si128(sp);
        __m128i s1 = _mm_loadu_si128(sp + 1);
        __m128i d0 = _mm_loadu_si128(dp);
        __m128i d1 = _mm_loadu_si128(dp + 1);
        __m128i da, r0, r1, r2, r3;

        /* Each quarter: broadcast dst alpha, scale src by its complement. */
        da = _mm_shufflehi_epi16(_mm_shufflelo_epi16(
                 _mm_unpacklo_epi8(d0, zero), 0xFF), 0xFF);
        r0 = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(s0, zero),
                                            _mm_sub_epi16(max8, da)), 8);

        da = _mm_shufflehi_epi16(_mm_shufflelo_epi16(
                 _mm_unpackhi_epi8(d0, zero), 0xFF), 0xFF);
        r1 = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpackhi_epi8(s0, zero),
                                            _mm_sub_epi16(max8, da)), 8);

        da = _mm_shufflehi_epi16(_mm_shufflelo_epi16(
                 _mm_unpacklo_epi8(d1, zero), 0xFF), 0xFF);
        r2 = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(s1, zero),
                                            _mm_sub_epi16(max8, da)), 8);

        da = _mm_shufflehi_epi16(_mm_shufflelo_epi16(
                 _mm_unpackhi_epi8(d1, zero), 0xFF), 0xFF);
        r3 = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpackhi_epi8(s1, zero),
                                            _mm_sub_epi16(max8, da)), 8);

        /* Saturating narrow back to bytes and store. */
        _mm_storeu_si128(dp,     _mm_packus_epi16(r0, r1));
        _mm_storeu_si128(dp + 1, _mm_packus_epi16(r2, r3));

        sp += 2;
        dp += 2;
    }
}
1026*32afb93cSXin Li
/*
 * "Dst out" blend on RGBA_8888: out = dst * (255 - src.alpha) >> 8.
 * (Fixed-point: divides by 256 rather than 255, matching the original.)
 * Processes count8 groups of 8 pixels (two 16-byte vectors) per iteration.
 */
void rsdIntrinsicBlendDstOut_K(void *dst, const void *src, uint32_t count8) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i max8 = _mm_set1_epi16(255);
    const __m128i *sp = (const __m128i *)src;
    __m128i *dp = (__m128i *)dst;

    for (uint32_t n = 0; n < count8; ++n) {
        __m128i s0 = _mm_loadu_si128(sp);
        __m128i s1 = _mm_loadu_si128(sp + 1);
        __m128i d0 = _mm_loadu_si128(dp);
        __m128i d1 = _mm_loadu_si128(dp + 1);
        __m128i sa, r0, r1, r2, r3;

        /* Each quarter: broadcast src alpha, scale dst by its complement. */
        sa = _mm_shufflehi_epi16(_mm_shufflelo_epi16(
                 _mm_unpacklo_epi8(s0, zero), 0xFF), 0xFF);
        r0 = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(d0, zero),
                                            _mm_sub_epi16(max8, sa)), 8);

        sa = _mm_shufflehi_epi16(_mm_shufflelo_epi16(
                 _mm_unpackhi_epi8(s0, zero), 0xFF), 0xFF);
        r1 = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpackhi_epi8(d0, zero),
                                            _mm_sub_epi16(max8, sa)), 8);

        sa = _mm_shufflehi_epi16(_mm_shufflelo_epi16(
                 _mm_unpacklo_epi8(s1, zero), 0xFF), 0xFF);
        r2 = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(d1, zero),
                                            _mm_sub_epi16(max8, sa)), 8);

        sa = _mm_shufflehi_epi16(_mm_shufflelo_epi16(
                 _mm_unpackhi_epi8(s1, zero), 0xFF), 0xFF);
        r3 = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpackhi_epi8(d1, zero),
                                            _mm_sub_epi16(max8, sa)), 8);

        /* Saturating narrow back to bytes and store. */
        _mm_storeu_si128(dp,     _mm_packus_epi16(r0, r1));
        _mm_storeu_si128(dp + 1, _mm_packus_epi16(r2, r3));

        sp += 2;
        dp += 2;
    }
}
1078*32afb93cSXin Li
/*
 * "Src atop" blend on RGBA_8888:
 *   out = (src * dst.alpha + dst * (255 - src.alpha)) >> 8
 * with the destination's alpha byte re-selected afterwards so the result's
 * alpha stays exactly dst.alpha (the >>8 arithmetic would otherwise
 * introduce a small rounding error in the alpha channel).
 * Processes count8 groups of 8 pixels (two 16-byte vectors) per iteration.
 *
 * NOTE(review): relies on blendv_epi8 (defined earlier in this file)
 * selecting the second operand's bytes where the mask byte's high bit is
 * set — confirm against its definition.
 */
void rsdIntrinsicBlendSrcAtop_K(void *dst, const void *src, uint32_t count8) {
    /* Mask with 0xff in the alpha (top) byte of each 32-bit pixel. */
    const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
    __m128i all1s, ina, outa, ins, outs;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    all1s = _mm_set1_epi16(255);

    for (i = 0; i < count8; ++i) {
        /* 8 pixels per iteration: two unaligned 16-byte loads per buffer. */
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        /* Low 4 pixels of in0/out0: widen bytes to 16-bit lanes, then
         * broadcast each pixel's alpha word across its four channels. */
        ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        /* dst*(255-sa) + src*da, saturating 16-bit add, then >>8. */
        t0 = _mm_sub_epi16(all1s, ina);
        t0 = _mm_mullo_epi16(t0, outs);
        t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(outa, ins));
        t0 = _mm_srli_epi16(t0, 8);

        /* High 4 pixels of in0/out0. */
        ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t1 = _mm_sub_epi16(all1s, ina);
        t1 = _mm_mullo_epi16(t1, outs);
        t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(outa, ins));
        t1 = _mm_srli_epi16(t1, 8);

        /* Low 4 pixels of in1/out1. */
        ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t2 = _mm_sub_epi16(all1s, ina);
        t2 = _mm_mullo_epi16(t2, outs);
        t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(outa, ins));
        t2 = _mm_srli_epi16(t2, 8);

        /* High 4 pixels of in1/out1. */
        ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t3 = _mm_sub_epi16(all1s, ina);
        t3 = _mm_mullo_epi16(t3, outs);
        t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(outa, ins));
        t3 = _mm_srli_epi16(t3, 8);

        /* Saturating narrow to bytes; restore the exact dst alpha byte. */
        t0 = _mm_packus_epi16(t0, t1);
        t0 = blendv_epi8(t0, out0, M0001);
        t2 = _mm_packus_epi16(t2, t3);
        t2 = blendv_epi8(t2, out1, M0001);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}
1149*32afb93cSXin Li
/* DstAtop blend kernel: for each pixel,
 *   result.rgb ~= (src.rgb * (255 - dst.a) + dst.rgb * src.a) >> 8
 *   result.a    = src.a
 * Processes count8 * 8 RGBA pixels per call: each loop iteration handles
 * two 16-byte vectors (8 pixels).  The >> 8 is the usual fast
 * approximation of dividing by 255.
 * NOTE(review): assumes alpha is the high byte of each 32-bit pixel
 * (RGBA byte order) -- both the 0xff000000 mask and the 0xFF shuffle
 * immediates rely on that layout; confirm against callers. */
void rsdIntrinsicBlendDstAtop_K(void *dst, const void *src, uint32_t count8) {
    /* Per-32-bit-lane mask selecting only the alpha byte of each pixel. */
    const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
    __m128i all1s, ina, ins, outa, outs;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    all1s = _mm_set1_epi16(255);

    for (i = 0; i < count8; ++i) {
        /* Unaligned loads of 8 src and 8 dst pixels. */
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        /* Chunk 1: low 2 pixels of in0/out0.  Widen bytes to 16-bit
         * lanes, then broadcast each pixel's alpha (lane 3 of each
         * 4-lane group, hence shuffle immediate 0xFF) across its four
         * channel lanes. */
        ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        /* t0 = (src * (255 - dst.a) + dst * src.a) >> 8, with a
         * saturating add guarding against 16-bit overflow of the sum. */
        t0 = _mm_sub_epi16(all1s, outa);
        t0 = _mm_mullo_epi16(t0, ins);
        t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(ina, outs));
        t0 = _mm_srli_epi16(t0, 8);

        /* Chunk 2: high 2 pixels of in0/out0 (same computation). */
        ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t1 = _mm_sub_epi16(all1s, outa);
        t1 = _mm_mullo_epi16(t1, ins);
        t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(ina, outs));
        t1 = _mm_srli_epi16(t1, 8);

        /* Chunk 3: low 2 pixels of in1/out1. */
        ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t2 = _mm_sub_epi16(all1s, outa);
        t2 = _mm_mullo_epi16(t2, ins);
        t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(ina, outs));
        t2 = _mm_srli_epi16(t2, 8);

        /* Chunk 4: high 2 pixels of in1/out1. */
        ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t3 = _mm_sub_epi16(all1s, outa);
        t3 = _mm_mullo_epi16(t3, ins);
        t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(ina, outs));
        t3 = _mm_srli_epi16(t3, 8);

        /* Narrow back to bytes, then force result alpha to the src
         * alpha by blending the masked alpha byte from in0/in1. */
        t0 = _mm_packus_epi16(t0, t1);
        t0 = blendv_epi8(t0, in0, M0001);
        t2 = _mm_packus_epi16(t2, t3);
        t2 = blendv_epi8(t2, in1, M0001);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}
1220*32afb93cSXin Li
/* XOR blend kernel: dst ^= src, byte-for-byte.
 * Processes count8 * 8 RGBA pixels per call; each loop iteration handles
 * two 16-byte vectors (32 bytes) with unaligned loads/stores. */
void rsdIntrinsicBlendXor_K(void *dst, const void *src, uint32_t count8) {
    const __m128i *sp = (const __m128i *)src;
    __m128i *dp = (__m128i *)dst;

    while (count8-- > 0) {
        __m128i s0 = _mm_loadu_si128(sp);
        __m128i s1 = _mm_loadu_si128(sp + 1);
        __m128i d0 = _mm_loadu_si128(dp);
        __m128i d1 = _mm_loadu_si128(dp + 1);

        _mm_storeu_si128(dp,     _mm_xor_si128(d0, s0));
        _mm_storeu_si128(dp + 1, _mm_xor_si128(d1, s1));

        sp += 2;
        dp += 2;
    }
}
1241*32afb93cSXin Li
/* Multiply blend kernel: dst = (dst * src) >> 8 per byte (fast
 * approximation of dst * src / 255).  Processes count8 * 8 RGBA pixels
 * per call; each iteration handles two 16-byte vectors.  Bytes are
 * widened to 16-bit lanes so the product (max 255*255) cannot overflow
 * before the shift. */
void rsdIntrinsicBlendMultiply_K(void *dst, const void *src, uint32_t count8) {
    const __m128i *sp = (const __m128i *)src;
    __m128i *dp = (__m128i *)dst;
    const __m128i zero = _mm_setzero_si128();

    for (uint32_t i = 0; i < count8; ++i, sp += 2, dp += 2) {
        /* The same widen-multiply-shift applies to each of the two
         * vectors in this 8-pixel group. */
        for (int k = 0; k < 2; ++k) {
            __m128i s = _mm_loadu_si128(sp + k);
            __m128i d = _mm_loadu_si128(dp + k);

            __m128i lo = _mm_srli_epi16(
                _mm_mullo_epi16(_mm_unpacklo_epi8(s, zero),
                                _mm_unpacklo_epi8(d, zero)), 8);
            __m128i hi = _mm_srli_epi16(
                _mm_mullo_epi16(_mm_unpackhi_epi8(s, zero),
                                _mm_unpackhi_epi8(d, zero)), 8);

            _mm_storeu_si128(dp + k, _mm_packus_epi16(lo, hi));
        }
    }
}
1278*32afb93cSXin Li
/* Additive blend kernel: dst = saturating_add(dst, src) per byte
 * (clamps at 255).  Processes count8 * 8 RGBA pixels per call; each
 * iteration handles two 16-byte vectors with unaligned loads/stores. */
void rsdIntrinsicBlendAdd_K(void *dst, const void *src, uint32_t count8) {
    const __m128i *sp = (const __m128i *)src;
    __m128i *dp = (__m128i *)dst;

    for (uint32_t i = 0; i < count8; ++i, sp += 2, dp += 2) {
        __m128i sum0 = _mm_adds_epu8(_mm_loadu_si128(dp), _mm_loadu_si128(sp));
        __m128i sum1 = _mm_adds_epu8(_mm_loadu_si128(dp + 1), _mm_loadu_si128(sp + 1));
        _mm_storeu_si128(dp, sum0);
        _mm_storeu_si128(dp + 1, sum1);
    }
}
1299*32afb93cSXin Li
/* Subtractive blend kernel: dst = saturating_sub(dst, src) per byte
 * (clamps at 0).  Processes count8 * 8 RGBA pixels per call; each
 * iteration handles two 16-byte vectors with unaligned loads/stores. */
void rsdIntrinsicBlendSub_K(void *dst, const void *src, uint32_t count8) {
    const __m128i *sp = (const __m128i *)src;
    __m128i *dp = (__m128i *)dst;

    for (uint32_t i = 0; i < count8; ++i, sp += 2, dp += 2) {
        __m128i diff0 = _mm_subs_epu8(_mm_loadu_si128(dp), _mm_loadu_si128(sp));
        __m128i diff1 = _mm_subs_epu8(_mm_loadu_si128(dp + 1), _mm_loadu_si128(sp + 1));
        _mm_storeu_si128(dp, diff0);
        _mm_storeu_si128(dp + 1, diff1);
    }
}
1320*32afb93cSXin Li
1321*32afb93cSXin Li } // namespace renderscript
1322