// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <immintrin.h>

#include <xnnpack/common.h>
#include <xnnpack/dwconv.h>
#include <xnnpack/gavgpool.h>
#include <xnnpack/gemm.h>
#include <xnnpack/ibilinear.h>
#include <xnnpack/igemm.h>
#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/math.h>
#include <xnnpack/maxpool.h>
#include <xnnpack/prelu.h>
#include <xnnpack/unaligned.h>
#include <xnnpack/vadd.h>
#include <xnnpack/vcvt.h>
#include <xnnpack/vlrelu.h>
#include <xnnpack/vmul.h>
#include <xnnpack/vunary.h>

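// Converts IEEE FP16 values (stored as uint16_t) to FP32 using integer SSE4.1
// operations, 16 elements per iteration. The sign bit is split off first;
// judging by the parameter names, normalized inputs are rescaled via a shifted
// mantissa/exponent plus exp_offset and an exp_scale multiply, while denormal
// inputs are rebuilt by pairing the mantissa with a magic exponent and
// subtracting magic_bias. A per-lane compare against denorm_cutoff selects
// between the two results before the sign is OR-ed back in.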
void xnn_f16_f32_vcvt_ukernel__sse41_int16_x16(
    size_t n,
    const void* input,
    float* output,
    const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(n != 0);
  assert(n % sizeof(uint16_t) == 0);
  assert(input != NULL);
  assert(output != NULL);

  const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask);
  const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset);
  const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale);
  const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask);
  const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias);
  const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff);

  const uint16_t* i = (const uint16_t*) input;
  for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
    const __m128i vh0 = _mm_loadu_si128((const __m128i*) i);
    const __m128i vh1 = _mm_loadu_si128((const __m128i*) (i + 8));
    i += 16;

    const __m128i vsign0 = _mm_and_si128(vh0, vsign_mask);
    const __m128i vsign1 = _mm_and_si128(vh1, vsign_mask);

    const __m128i vnonsign0 = _mm_xor_si128(vh0, vsign0);
    const __m128i vnonsign1 = _mm_xor_si128(vh1, vsign1);

    const __m128i vprenorm0 = _mm_slli_epi16(vnonsign0, 13);
    const __m128i vprenorm1 = _mm_add_epi16(_mm_srli_epi16(vnonsign0, 3), vexp_offset);
    const __m128i vprenorm2 = _mm_slli_epi16(vnonsign1, 13);
    const __m128i vprenorm3 = _mm_add_epi16(_mm_srli_epi16(vnonsign1, 3), vexp_offset);

    const __m128i vnorm0 = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vprenorm0, vprenorm1)), vexp_scale));
    const __m128i vnorm1 = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vprenorm0, vprenorm1)), vexp_scale));
    const __m128i vnorm2 = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vprenorm2, vprenorm3)), vexp_scale));
    const __m128i vnorm3 = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vprenorm2, vprenorm3)), vexp_scale));

    const __m128i vdenorm0 = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vnonsign0, vmagic_mask)), vmagic_bias));
    const __m128i vdenorm1 = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vnonsign0, vmagic_mask)), vmagic_bias));
    const __m128i vdenorm2 = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vnonsign1, vmagic_mask)), vmagic_bias));
    const __m128i vdenorm3 = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vnonsign1, vmagic_mask)), vmagic_bias));

    const __m128i vmask0 = _mm_cmpgt_epi16(vnonsign0, vdenorm_cutoff);
    const __m128i vmask1 = _mm_cmpgt_epi16(vnonsign1, vdenorm_cutoff);

    const __m128i vf0 = _mm_or_si128(_mm_unpacklo_epi16(_mm_setzero_si128(), vsign0),
      _mm_blendv_epi8(vdenorm0, vnorm0, _mm_cvtepi16_epi32(vmask0)));
    const __m128i vf1 = _mm_or_si128(_mm_unpackhi_epi16(_mm_setzero_si128(), vsign0),
      _mm_blendv_epi8(vdenorm1, vnorm1, _mm_unpackhi_epi16(vmask0, vmask0)));
    const __m128i vf2 = _mm_or_si128(_mm_unpacklo_epi16(_mm_setzero_si128(), vsign1),
      _mm_blendv_epi8(vdenorm2, vnorm2, _mm_cvtepi16_epi32(vmask1)));
    const __m128i vf3 = _mm_or_si128(_mm_unpackhi_epi16(_mm_setzero_si128(), vsign1),
      _mm_blendv_epi8(vdenorm3, vnorm3, _mm_unpackhi_epi16(vmask1, vmask1)));

    _mm_storeu_ps(output, _mm_castsi128_ps(vf0));
    _mm_storeu_ps(output + 4, _mm_castsi128_ps(vf1));
    _mm_storeu_ps(output + 8, _mm_castsi128_ps(vf2));
    _mm_storeu_ps(output + 12, _mm_castsi128_ps(vf3));
    output += 16;
  }
  for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
    const __m128i vh = _mm_loadu_si128((const __m128i*) i);
    i += 8;

    const __m128i vsign = _mm_and_si128(vh, vsign_mask);

    const __m128i vnonsign = _mm_xor_si128(vh, vsign);

    const __m128i vprenorm_lo = _mm_slli_epi16(vnonsign, 13);
    const __m128i vprenorm_hi = _mm_add_epi16(_mm_srli_epi16(vnonsign, 3), vexp_offset);

    const __m128i vnorm_lo = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vprenorm_lo, vprenorm_hi)), vexp_scale));
    const __m128i vnorm_hi = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vprenorm_lo, vprenorm_hi)), vexp_scale));

    const __m128i vdenorm_lo = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vnonsign, vmagic_mask)), vmagic_bias));
    const __m128i vdenorm_hi = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vnonsign, vmagic_mask)), vmagic_bias));

    const __m128i vmask = _mm_cmpgt_epi16(vnonsign, vdenorm_cutoff);

    const __m128i vf_lo = _mm_or_si128(_mm_unpacklo_epi16(_mm_setzero_si128(), vsign),
      _mm_blendv_epi8(vdenorm_lo, vnorm_lo, _mm_cvtepi16_epi32(vmask)));

    const __m128i vf_hi = _mm_or_si128(_mm_unpackhi_epi16(_mm_setzero_si128(), vsign),
      _mm_blendv_epi8(vdenorm_hi, vnorm_hi, _mm_unpackhi_epi16(vmask, vmask)));

    _mm_storeu_ps(output, _mm_castsi128_ps(vf_lo));
    _mm_storeu_ps(output + 4, _mm_castsi128_ps(vf_hi));
    output += 8;
  }
  if XNN_UNPREDICTABLE(n != 0) {
    const __m128i vh = _mm_loadu_si128((const __m128i*) i);

    const __m128i vsign = _mm_and_si128(vh, vsign_mask);

    const __m128i vnonsign = _mm_xor_si128(vh, vsign);

    const __m128i vprenorm_lo = _mm_slli_epi16(vnonsign, 13);
    const __m128i vprenorm_hi = _mm_add_epi16(_mm_srli_epi16(vnonsign, 3), vexp_offset);

    const __m128i vnorm_lo = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vprenorm_lo, vprenorm_hi)), vexp_scale));
    const __m128i vnorm_hi = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vprenorm_lo, vprenorm_hi)), vexp_scale));

    const __m128i vdenorm_lo = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vnonsign, vmagic_mask)), vmagic_bias));
    const __m128i vdenorm_hi = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vnonsign, vmagic_mask)), vmagic_bias));

    const __m128i vmask = _mm_cmpgt_epi16(vnonsign, vdenorm_cutoff);

    __m128i vf = _mm_or_si128(_mm_unpacklo_epi16(_mm_setzero_si128(), vsign),
      _mm_blendv_epi8(vdenorm_lo, vnorm_lo, _mm_cvtepi16_epi32(vmask)));

    if (n & (4 * sizeof(uint16_t))) {
      _mm_storeu_ps(output, _mm_castsi128_ps(vf));
      output += 4;

      vf = _mm_or_si128(_mm_unpackhi_epi16(_mm_setzero_si128(), vsign),
        _mm_blendv_epi8(vdenorm_hi, vnorm_hi, _mm_unpackhi_epi16(vmask, vmask)));
    }
    if (n & (2 * sizeof(uint16_t))) {
      _mm_storel_pi((__m64*) output, _mm_castsi128_ps(vf));
      output += 2;

      vf = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(vf), _mm_castsi128_ps(vf)));
    }
    if (n & (1 * sizeof(uint16_t))) {
      _mm_store_ss(output, _mm_castsi128_ps(vf));
    }
  }
}

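// Converts FP32 to IEEE FP16, 8 elements per iteration. Reading inferred from
// the parameter names (scale_to_inf, scale_to_zero, nanh): overflow and
// underflow are handled by scaling toward infinity and back toward zero before
// the half-precision exponent and mantissa fields are extracted and packed;
// lanes whose magnitude exceeds expw_max are replaced with the canonical NaN,
// and the sign bits are packed and OR-ed back at the end.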
void xnn_f32_f16_vcvt_ukernel__sse41_x8(
    size_t n,
    const float* input,
    void* output,
    const union xnn_f32_f16_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(n != 0);
  assert(n % sizeof(float) == 0);
  assert(input != NULL);
  assert(output != NULL);

  const __m128 vnonsign_mask = _mm_load_ps((const float*) params->sse2.nonsign_mask);
  const __m128i vexp_bias = _mm_load_si128((const __m128i*) params->sse2.exp_bias);
  const __m128 vscale_to_inf = _mm_load_ps(params->sse2.scale_to_inf);
  const __m128i vexpw_max = _mm_load_si128((const __m128i*) params->sse2.expw_max);
  const __m128 vscale_to_zero = _mm_load_ps(params->sse2.scale_to_zero);
  const __m128i vbias_min = _mm_load_si128((const __m128i*) params->sse2.bias_min);
  const __m128i vmanth_mask = _mm_load_si128((const __m128i*) params->sse2.manth_mask);
  const __m128i vexph_mask = _mm_load_si128((const __m128i*) params->sse2.exph_mask);
  const __m128i vnanh = _mm_load_si128((const __m128i*) params->sse2.nanh);

  uint16_t* o = (uint16_t*) output;
  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
    const __m128 vx_lo = _mm_loadu_ps(input);
    const __m128 vx_hi = _mm_loadu_ps(input + 4);
    input += 8;

    const __m128 vabsx_lo = _mm_and_ps(vx_lo, vnonsign_mask);
    const __m128 vabsx_hi = _mm_and_ps(vx_hi, vnonsign_mask);

    const __m128 vsignx_lo = _mm_xor_ps(vx_lo, vabsx_lo);
    const __m128 vsignx_hi = _mm_xor_ps(vx_hi, vabsx_hi);
    __m128i vbias_lo = _mm_add_epi32(_mm_castps_si128(vabsx_lo), vexp_bias);
    __m128i vbias_hi = _mm_add_epi32(_mm_castps_si128(vabsx_hi), vexp_bias);
    __m128 vf_lo = _mm_mul_ps(vabsx_lo, vscale_to_inf);
    __m128 vf_hi = _mm_mul_ps(vabsx_hi, vscale_to_inf);
    const __m128i vnanmaskw_lo = _mm_cmpgt_epi32(_mm_castps_si128(vabsx_lo), vexpw_max);
    const __m128i vnanmaskw_hi = _mm_cmpgt_epi32(_mm_castps_si128(vabsx_hi), vexpw_max);

    vbias_lo = _mm_and_si128(vbias_lo, vexpw_max);
    vbias_hi = _mm_and_si128(vbias_hi, vexpw_max);
    vf_lo = _mm_mul_ps(vf_lo, vscale_to_zero);
    vf_hi = _mm_mul_ps(vf_hi, vscale_to_zero);
    const __m128i vnanmaskh = _mm_packs_epi32(vnanmaskw_lo, vnanmaskw_hi);
    const __m128i vsignh = _mm_packs_epi32(_mm_castps_si128(vsignx_lo), _mm_castps_si128(vsignx_hi));

    vbias_lo = _mm_max_epi16(vbias_lo, vbias_min);
    vbias_hi = _mm_max_epi16(vbias_hi, vbias_min);

    vf_lo = _mm_add_ps(vf_lo, _mm_castsi128_ps(vbias_lo));
    vf_hi = _mm_add_ps(vf_hi, _mm_castsi128_ps(vbias_hi));

    __m128i vexpw_lo = _mm_srli_epi32(_mm_castps_si128(vf_lo), 13);
    __m128i vexpw_hi = _mm_srli_epi32(_mm_castps_si128(vf_hi), 13);
    const __m128i vmantw_lo = _mm_and_si128(_mm_castps_si128(vf_lo), vmanth_mask);
    const __m128i vmantw_hi = _mm_and_si128(_mm_castps_si128(vf_hi), vmanth_mask);

    vexpw_lo = _mm_and_si128(vexpw_lo, vexph_mask);
    vexpw_hi = _mm_and_si128(vexpw_hi, vexph_mask);

    const __m128i vnonsignw_lo = _mm_add_epi32(vmantw_lo, vexpw_lo);
    const __m128i vnonsignw_hi = _mm_add_epi32(vmantw_hi, vexpw_hi);

    const __m128i vnonsignh = _mm_packs_epi32(vnonsignw_lo, vnonsignw_hi);

    const __m128i vabsh = _mm_blendv_epi8(vnonsignh, vnanh, vnanmaskh);

    const __m128i vh = _mm_or_si128(vabsh, vsignh);

    _mm_storeu_si128((__m128i*) o, vh);
    o += 8;
  }
  if XNN_UNPREDICTABLE(n != 0) {
    const __m128 vx_lo = _mm_loadu_ps(input);
    const float* input_hi = (const float*) ((uintptr_t) input + (n & (4 * sizeof(float))));
    const __m128 vx_hi = _mm_loadu_ps(input_hi);

    const __m128 vabsx_lo = _mm_and_ps(vx_lo, vnonsign_mask);
    const __m128 vabsx_hi = _mm_and_ps(vx_hi, vnonsign_mask);

    const __m128 vsignx_lo = _mm_xor_ps(vx_lo, vabsx_lo);
    const __m128 vsignx_hi = _mm_xor_ps(vx_hi, vabsx_hi);
    __m128i vbias_lo = _mm_add_epi32(_mm_castps_si128(vabsx_lo), vexp_bias);
    __m128i vbias_hi = _mm_add_epi32(_mm_castps_si128(vabsx_hi), vexp_bias);
    __m128 vf_lo = _mm_mul_ps(vabsx_lo, vscale_to_inf);
    __m128 vf_hi = _mm_mul_ps(vabsx_hi, vscale_to_inf);
    const __m128i vnanmaskw_lo = _mm_cmpgt_epi32(_mm_castps_si128(vabsx_lo), vexpw_max);
    const __m128i vnanmaskw_hi = _mm_cmpgt_epi32(_mm_castps_si128(vabsx_hi), vexpw_max);

    vbias_lo = _mm_and_si128(vbias_lo, vexpw_max);
    vbias_hi = _mm_and_si128(vbias_hi, vexpw_max);
    vf_lo = _mm_mul_ps(vf_lo, vscale_to_zero);
    vf_hi = _mm_mul_ps(vf_hi, vscale_to_zero);
    const __m128i vnanmaskh = _mm_packs_epi32(vnanmaskw_lo, vnanmaskw_hi);
    const __m128i vsignh = _mm_packs_epi32(_mm_castps_si128(vsignx_lo), _mm_castps_si128(vsignx_hi));

    vbias_lo = _mm_max_epi16(vbias_lo, vbias_min);
    vbias_hi = _mm_max_epi16(vbias_hi, vbias_min);

    vf_lo = _mm_add_ps(vf_lo, _mm_castsi128_ps(vbias_lo));
    vf_hi = _mm_add_ps(vf_hi, _mm_castsi128_ps(vbias_hi));

    __m128i vexpw_lo = _mm_srli_epi32(_mm_castps_si128(vf_lo), 13);
    __m128i vexpw_hi = _mm_srli_epi32(_mm_castps_si128(vf_hi), 13);
    const __m128i vmantw_lo = _mm_and_si128(_mm_castps_si128(vf_lo), vmanth_mask);
    const __m128i vmantw_hi = _mm_and_si128(_mm_castps_si128(vf_hi), vmanth_mask);

    vexpw_lo = _mm_and_si128(vexpw_lo, vexph_mask);
    vexpw_hi = _mm_and_si128(vexpw_hi, vexph_mask);

    const __m128i vnonsignw_lo = _mm_add_epi32(vmantw_lo, vexpw_lo);
    const __m128i vnonsignw_hi = _mm_add_epi32(vmantw_hi, vexpw_hi);

    const __m128i vnonsignh = _mm_packs_epi32(vnonsignw_lo, vnonsignw_hi);

    const __m128i vabsh = _mm_blendv_epi8(vnonsignh, vnanh, vnanmaskh);

    __m128i vh = _mm_or_si128(vabsh, vsignh);

    if (n & (4 * sizeof(float))) {
      _mm_storel_epi64((__m128i*) o, vh);
      vh = _mm_unpackhi_epi64(vh, vh);
      o += 4;
    }
    if (n & (2 * sizeof(float))) {
      unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(vh));
      vh = _mm_srli_epi64(vh, 32);
      o += 2;
    }
    if (n & (1 * sizeof(float))) {
      *o = (uint16_t) _mm_extract_epi16(vh, 0);
    }
  }
}

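// PReLU over 2 rows x 8 channels per iteration: computes slope*x and uses
// _mm_blendv_ps with x itself as the mask, so lanes whose sign bit is set
// (negative x) take the product and non-negative lanes pass x through.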
void xnn_f32_prelu_ukernel__sse41_2x8(
    size_t rows,
    size_t channels,
    const float*restrict input,
    size_t input_stride,
    const float*restrict weights,
    float*restrict output,
    size_t output_stride) XNN_OOB_READS
{
  assert(rows != 0);
  assert(channels != 0);
  assert(channels % sizeof(float) == 0);

  const float* i0 = input;
  float* o0 = output;
  const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
  float* o1 = (float*) ((uintptr_t) o0 + output_stride);

  const size_t input_increment = input_stride * 2 - channels;
  const size_t output_increment = output_stride * 2 - channels;

  do {
    if XNN_UNPREDICTABLE(rows < 2) {
      i1 = i0;
      o1 = o0;
    }

    const float* w = weights;
    size_t c = channels;
    for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) {
      const __m128 vw0123 = _mm_load_ps(w);
      const __m128 vw4567 = _mm_load_ps(w + 4);
      w += 8;

      const __m128 vi0x0123 = _mm_loadu_ps(i0);
      const __m128 vi0x4567 = _mm_loadu_ps(i0 + 4);
      i0 += 8;
      const __m128 vi1x0123 = _mm_loadu_ps(i1);
      const __m128 vi1x4567 = _mm_loadu_ps(i1 + 4);
      i1 += 8;

      const __m128 vprod0x0123 = _mm_mul_ps(vi0x0123, vw0123);
      const __m128 vprod0x4567 = _mm_mul_ps(vi0x4567, vw4567);
      const __m128 vprod1x0123 = _mm_mul_ps(vi1x0123, vw0123);
      const __m128 vprod1x4567 = _mm_mul_ps(vi1x4567, vw4567);

      const __m128 vacc0x0123 = _mm_blendv_ps(vi0x0123, vprod0x0123, vi0x0123);
      const __m128 vacc0x4567 = _mm_blendv_ps(vi0x4567, vprod0x4567, vi0x4567);
      const __m128 vacc1x0123 = _mm_blendv_ps(vi1x0123, vprod1x0123, vi1x0123);
      const __m128 vacc1x4567 = _mm_blendv_ps(vi1x4567, vprod1x4567, vi1x4567);

      _mm_storeu_ps(o0, vacc0x0123);
      _mm_storeu_ps(o0 + 4, vacc0x4567);
      o0 += 8;
      _mm_storeu_ps(o1, vacc1x0123);
      _mm_storeu_ps(o1 + 4, vacc1x4567);
      o1 += 8;
    }
    for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) {
      const __m128 vw0123 = _mm_load_ps(w);
      w += 4;

      const __m128 vi0x0123 = _mm_loadu_ps(i0);
      i0 += 4;
      const __m128 vi1x0123 = _mm_loadu_ps(i1);
      i1 += 4;

      const __m128 vprod0x0123 = _mm_mul_ps(vi0x0123, vw0123);
      const __m128 vprod1x0123 = _mm_mul_ps(vi1x0123, vw0123);

      __m128 vacc0x0123 = _mm_blendv_ps(vi0x0123, vprod0x0123, vi0x0123);
      __m128 vacc1x0123 = _mm_blendv_ps(vi1x0123, vprod1x0123, vi1x0123);

      _mm_storeu_ps(o0, vacc0x0123);
      o0 += 4;
      _mm_storeu_ps(o1, vacc1x0123);
      o1 += 4;
    }
    if XNN_UNLIKELY(c != 0) {
      const __m128 vw0123 = _mm_load_ps(w);
      w = (const float*) ((uintptr_t) w + c);

      const __m128 vi0x0123 = _mm_loadu_ps(i0);
      i0 = (const float*) ((uintptr_t) i0 + c);
      const __m128 vi1x0123 = _mm_loadu_ps(i1);
      i1 = (const float*) ((uintptr_t) i1 + c);

      const __m128 vprod0x0123 = _mm_mul_ps(vi0x0123, vw0123);
      const __m128 vprod1x0123 = _mm_mul_ps(vi1x0123, vw0123);

      __m128 vacc0x0123 = _mm_blendv_ps(vi0x0123, vprod0x0123, vi0x0123);
      __m128 vacc1x0123 = _mm_blendv_ps(vi1x0123, vprod1x0123, vi1x0123);

      if (c & (2 * sizeof(float))) {
        _mm_storel_pi((__m64*) o0, vacc0x0123);
        _mm_storel_pi((__m64*) o1, vacc1x0123);

        vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
        vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);

        o0 += 2;
        o1 += 2;
      }
      if (c & (1 * sizeof(float))) {
        _mm_store_ss(o0, vacc0x0123);
        _mm_store_ss(o1, vacc1x0123);

        o0 += 1;
        o1 += 1;
      }
    }
    i0 = (const float*) ((uintptr_t) i0 + input_increment);
    o0 = (float*) ((uintptr_t) o0 + output_increment);
    i1 = (const float*) ((uintptr_t) i1 + input_increment);
    o1 = (float*) ((uintptr_t) o1 + output_increment);
    rows = doz(rows, 2);
  } while (rows != 0);
}

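// Quantizes FP32 to QS8, 32 elements per iteration: scale, clamp against
// output_max - zero_point in FP32 (so the later saturating packs cannot exceed
// the quantized maximum), convert with round-to-nearest-even
// (_mm_cvtps_epi32), pack with signed saturation, add the zero point, and
// clamp to output_min.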
void xnn_f32_qs8_vcvt_ukernel__sse41_x32(
    size_t n,
    const float* x,
    int8_t* y,
    const union xnn_f32_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(n != 0);
  assert(n % sizeof(float) == 0);
  assert(x != NULL);
  assert(y != NULL);

  const __m128 vscale = _mm_load_ps(params->sse4.scale);
  const __m128 voutput_max_less_zero_point = _mm_load_ps(params->sse4.output_max_less_zero_point);
  const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
  const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);

  for (; n >= 32 * sizeof(float); n -= 32 * sizeof(float)) {
    __m128 vx0123 = _mm_loadu_ps(x);
    __m128 vx4567 = _mm_loadu_ps(x + 4);
    __m128 vx89AB = _mm_loadu_ps(x + 8);
    __m128 vxCDEF = _mm_loadu_ps(x + 12);
    __m128 vxGHIJ = _mm_loadu_ps(x + 16);
    __m128 vxKLMN = _mm_loadu_ps(x + 20);
    __m128 vxOPQR = _mm_loadu_ps(x + 24);
    __m128 vxSTUV = _mm_loadu_ps(x + 28);
    x += 32;

    vx0123 = _mm_mul_ps(vx0123, vscale);
    vx4567 = _mm_mul_ps(vx4567, vscale);
    vx89AB = _mm_mul_ps(vx89AB, vscale);
    vxCDEF = _mm_mul_ps(vxCDEF, vscale);
    vxGHIJ = _mm_mul_ps(vxGHIJ, vscale);
    vxKLMN = _mm_mul_ps(vxKLMN, vscale);
    vxOPQR = _mm_mul_ps(vxOPQR, vscale);
    vxSTUV = _mm_mul_ps(vxSTUV, vscale);

    vx0123 = _mm_min_ps(vx0123, voutput_max_less_zero_point);
    vx4567 = _mm_min_ps(vx4567, voutput_max_less_zero_point);
    vx89AB = _mm_min_ps(vx89AB, voutput_max_less_zero_point);
    vxCDEF = _mm_min_ps(vxCDEF, voutput_max_less_zero_point);
    vxGHIJ = _mm_min_ps(vxGHIJ, voutput_max_less_zero_point);
    vxKLMN = _mm_min_ps(vxKLMN, voutput_max_less_zero_point);
    vxOPQR = _mm_min_ps(vxOPQR, voutput_max_less_zero_point);
    vxSTUV = _mm_min_ps(vxSTUV, voutput_max_less_zero_point);

    const __m128i vy0123 = _mm_cvtps_epi32(vx0123);
    const __m128i vy4567 = _mm_cvtps_epi32(vx4567);
    const __m128i vy89AB = _mm_cvtps_epi32(vx89AB);
    const __m128i vyCDEF = _mm_cvtps_epi32(vxCDEF);
    const __m128i vyGHIJ = _mm_cvtps_epi32(vxGHIJ);
    const __m128i vyKLMN = _mm_cvtps_epi32(vxKLMN);
    const __m128i vyOPQR = _mm_cvtps_epi32(vxOPQR);
    const __m128i vySTUV = _mm_cvtps_epi32(vxSTUV);

    __m128i vy01234567 = _mm_packs_epi32(vy0123, vy4567);
    __m128i vy89ABCDEF = _mm_packs_epi32(vy89AB, vyCDEF);
    __m128i vyGHIJKLMN = _mm_packs_epi32(vyGHIJ, vyKLMN);
    __m128i vyOPQRSTUV = _mm_packs_epi32(vyOPQR, vySTUV);

    vy01234567 = _mm_adds_epi16(vy01234567, voutput_zero_point);
    vy89ABCDEF = _mm_adds_epi16(vy89ABCDEF, voutput_zero_point);
    vyGHIJKLMN = _mm_adds_epi16(vyGHIJKLMN, voutput_zero_point);
    vyOPQRSTUV = _mm_adds_epi16(vyOPQRSTUV, voutput_zero_point);


    __m128i vy0123456789ABCDEF = _mm_packs_epi16(vy01234567, vy89ABCDEF);
    __m128i vyGHIJKLMNOPQRSTUV = _mm_packs_epi16(vyGHIJKLMN, vyOPQRSTUV);

    vy0123456789ABCDEF = _mm_max_epi8(vy0123456789ABCDEF, voutput_min);
    vyGHIJKLMNOPQRSTUV = _mm_max_epi8(vyGHIJKLMNOPQRSTUV, voutput_min);

    _mm_storeu_si128((__m128i*) y, vy0123456789ABCDEF);
    _mm_storeu_si128((__m128i*) (y + 16), vyGHIJKLMNOPQRSTUV);
    y += 32;
  }
  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
    __m128 vx_lo = _mm_loadu_ps(x);
    __m128 vx_hi = _mm_loadu_ps(x + 4);
    x += 8;

    vx_lo = _mm_mul_ps(vx_lo, vscale);
    vx_hi = _mm_mul_ps(vx_hi, vscale);

    vx_lo = _mm_min_ps(vx_lo, voutput_max_less_zero_point);
    vx_hi = _mm_min_ps(vx_hi, voutput_max_less_zero_point);

    const __m128i vy_lo = _mm_cvtps_epi32(vx_lo);
    const __m128i vy_hi = _mm_cvtps_epi32(vx_hi);

    __m128i vy = _mm_packs_epi32(vy_lo, vy_hi);
    vy = _mm_adds_epi16(vy, voutput_zero_point);
    vy = _mm_packs_epi16(vy, vy);
    vy = _mm_max_epi8(vy, voutput_min);

    _mm_storel_epi64((__m128i*) y, vy);
    y += 8;
  }
  if XNN_UNLIKELY(n != 0) {
    __m128 vx_lo = _mm_loadu_ps(x);
    const float* x_hi = (const float*) ((uintptr_t) x + (n & (4 * sizeof(float))));
    __m128 vx_hi = _mm_loadu_ps(x_hi);

    vx_lo = _mm_mul_ps(vx_lo, vscale);
    vx_hi = _mm_mul_ps(vx_hi, vscale);

    vx_lo = _mm_min_ps(vx_lo, voutput_max_less_zero_point);
    vx_hi = _mm_min_ps(vx_hi, voutput_max_less_zero_point);

    const __m128i vy_lo = _mm_cvtps_epi32(vx_lo);
    const __m128i vy_hi = _mm_cvtps_epi32(vx_hi);

    __m128i vy = _mm_packs_epi32(vy_lo, vy_hi);
    vy = _mm_adds_epi16(vy, voutput_zero_point);
    vy = _mm_packs_epi16(vy, vy);
    vy = _mm_max_epi8(vy, voutput_min);

    if (n & (4 * sizeof(float))) {
      unaligned_store_u32(y, (uint32_t) _mm_cvtsi128_si32(vy));
      y += 4;
      vy = _mm_srli_epi64(vy, 32);
    }
    if (n & (2 * sizeof(float))) {
      unaligned_store_u16(y, (uint16_t) _mm_extract_epi16(vy, 0));
      y += 2;
      vy = _mm_srli_epi32(vy, 16);
    }
    if (n & (1 * sizeof(float))) {
      *y = (int8_t) _mm_extract_epi8(vy, 0);
    }
  }
}

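// LeakyReLU: the same sign-bit blend as the PReLU kernel above, with a single
// broadcast slope instead of per-channel weights.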
void xnn_f32_vlrelu_ukernel__sse41_x8(
    size_t n,
    const float* x,
    float* y,
    const union xnn_f32_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(n != 0);
  assert(n % sizeof(float) == 0);

  const __m128 vslope = _mm_load_ps(params->sse.slope);
  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
    const __m128 vx0123 = _mm_loadu_ps(x);
    const __m128 vx4567 = _mm_loadu_ps(x + 4);
    x += 8;

    __m128 vacc0123 = _mm_mul_ps(vx0123, vslope);
    __m128 vacc4567 = _mm_mul_ps(vx4567, vslope);

    vacc0123 = _mm_blendv_ps(vx0123, vacc0123, vx0123);
    vacc4567 = _mm_blendv_ps(vx4567, vacc4567, vx4567);

    _mm_storeu_ps(y, vacc0123);
    _mm_storeu_ps(y + 4, vacc4567);
    y += 8;
  }
  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
    const __m128 vx = _mm_loadu_ps(x);
    x += 4;

    __m128 vacc = _mm_mul_ps(vx, vslope);
    vacc = _mm_blendv_ps(vx, vacc, vx);

    _mm_storeu_ps(y, vacc);
    y += 4;
  }
  if XNN_UNLIKELY(n != 0) {
    const __m128 vx = _mm_loadu_ps(x);

    __m128 vacc = _mm_mul_ps(vx, vslope);
    vacc = _mm_blendv_ps(vx, vacc, vx);

    if (n & (2 * sizeof(float))) {
      _mm_storel_pi((__m64*) y, vacc);
      vacc = _mm_movehl_ps(vacc, vacc);
      y += 2;
    }
    if (n & (1 * sizeof(float))) {
      _mm_store_ss(y, vacc);
    }
  }
}

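// Rounds toward negative infinity (floor) via _mm_round_ps.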
void xnn_f32_vrndd_ukernel__sse41_x8(
    size_t n,
    const float* x,
    float* y,
    const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(n != 0);
  assert(n % sizeof(float) == 0);

  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
    const __m128 vx0123 = _mm_loadu_ps(x);
    const __m128 vx4567 = _mm_loadu_ps(x + 4);
    x += 8;

    const __m128 vy0123 = _mm_round_ps(vx0123, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
    const __m128 vy4567 = _mm_round_ps(vx4567, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);

    _mm_storeu_ps(y, vy0123);
    _mm_storeu_ps(y + 4, vy4567);
    y += 8;
  }
  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
    const __m128 vx = _mm_loadu_ps(x);
    x += 4;

    const __m128 vy = _mm_round_ps(vx, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);

    _mm_storeu_ps(y, vy);
    y += 4;
  }
  if XNN_UNLIKELY(n != 0) {
    const __m128 vx = _mm_loadu_ps(x);
    __m128 vy = _mm_round_ps(vx, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
    if (n & (2 * sizeof(float))) {
      _mm_storel_pi((__m64*) y, vy);
      vy = _mm_movehl_ps(vy, vy);
      y += 2;
    }
    if (n & (1 * sizeof(float))) {
      _mm_store_ss(y, vy);
    }
  }
}

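// Rounds to nearest, ties to even, via _mm_round_ps.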
void xnn_f32_vrndne_ukernel__sse41_x8(
    size_t n,
    const float* x,
    float* y,
    const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(n != 0);
  assert(n % sizeof(float) == 0);

  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
    const __m128 vx0123 = _mm_loadu_ps(x);
    const __m128 vx4567 = _mm_loadu_ps(x + 4);
    x += 8;

    const __m128 vy0123 = _mm_round_ps(vx0123, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    const __m128 vy4567 = _mm_round_ps(vx4567, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);

    _mm_storeu_ps(y, vy0123);
    _mm_storeu_ps(y + 4, vy4567);
    y += 8;
  }
  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
    const __m128 vx = _mm_loadu_ps(x);
    x += 4;

    const __m128 vy = _mm_round_ps(vx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);

    _mm_storeu_ps(y, vy);
    y += 4;
  }
  if XNN_UNLIKELY(n != 0) {
    const __m128 vx = _mm_loadu_ps(x);
    __m128 vy = _mm_round_ps(vx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    if (n & (2 * sizeof(float))) {
      _mm_storel_pi((__m64*) y, vy);
      vy = _mm_movehl_ps(vy, vy);
      y += 2;
    }
    if (n & (1 * sizeof(float))) {
      _mm_store_ss(y, vy);
    }
  }
}

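// Rounds toward positive infinity (ceiling) via _mm_round_ps.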
void xnn_f32_vrndu_ukernel__sse41_x8(
    size_t n,
    const float* x,
    float* y,
    const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(n != 0);
  assert(n % sizeof(float) == 0);

  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
    const __m128 vx0123 = _mm_loadu_ps(x);
    const __m128 vx4567 = _mm_loadu_ps(x + 4);
    x += 8;

    const __m128 vy0123 = _mm_round_ps(vx0123, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
    const __m128 vy4567 = _mm_round_ps(vx4567, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);

    _mm_storeu_ps(y, vy0123);
    _mm_storeu_ps(y + 4, vy4567);
    y += 8;
  }
  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
    const __m128 vx = _mm_loadu_ps(x);
    x += 4;

    const __m128 vy = _mm_round_ps(vx, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);

    _mm_storeu_ps(y, vy);
    y += 4;
  }
  if XNN_UNLIKELY(n != 0) {
    const __m128 vx = _mm_loadu_ps(x);
    __m128 vy = _mm_round_ps(vx, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
    if (n & (2 * sizeof(float))) {
      _mm_storel_pi((__m64*) y, vy);
      vy = _mm_movehl_ps(vy, vy);
      y += 2;
    }
    if (n & (1 * sizeof(float))) {
      _mm_store_ss(y, vy);
    }
  }
}

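// Rounds toward zero (truncation) via _mm_round_ps.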
void xnn_f32_vrndz_ukernel__sse41_x8(
    size_t n,
    const float* x,
    float* y,
    const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(n != 0);
  assert(n % sizeof(float) == 0);

  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
    const __m128 vx0123 = _mm_loadu_ps(x);
    const __m128 vx4567 = _mm_loadu_ps(x + 4);
    x += 8;

    const __m128 vy0123 = _mm_round_ps(vx0123, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
    const __m128 vy4567 = _mm_round_ps(vx4567, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);

    _mm_storeu_ps(y, vy0123);
    _mm_storeu_ps(y + 4, vy4567);
    y += 8;
  }
  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
    const __m128 vx = _mm_loadu_ps(x);
    x += 4;

    const __m128 vy = _mm_round_ps(vx, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);

    _mm_storeu_ps(y, vy);
    y += 4;
  }
  if XNN_UNLIKELY(n != 0) {
    const __m128 vx = _mm_loadu_ps(x);
    __m128 vy = _mm_round_ps(vx, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
    if (n & (2 * sizeof(float))) {
      _mm_storel_pi((__m64*) y, vy);
      vy = _mm_movehl_ps(vy, vy);
      y += 2;
    }
    if (n & (1 * sizeof(float))) {
      _mm_store_ss(y, vy);
    }
  }
}

extern XNN_INTERNAL const float xnn_table_exp2minus_k_over_64[64];

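// Sigmoid via exp of the negated absolute input (reading inferred from the
// rr2_lut64_p2 naming): z = -|x| is decomposed as z = n*ln2 + t with a
// magic-bias trick, 2**n is reconstructed from the 64-entry table of
// 2**(-k/64) above plus an exponent add, a degree-2 polynomial refines
// e**t, and f = e**z / (1 + e**z) is formed with a division. Inputs below
// denorm_cutoff flush to zero, and a final sign-bit blend restores 1 - f
// for positive x.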
void xnn_f32_vsigmoid_ukernel__sse41_rr2_lut64_p2_div_x8(
    size_t n,
    const float* x,
    float* y,
    const union xnn_f32_sigmoid_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(n % sizeof(float) == 0);

  const __m128 vsign_mask = _mm_load_ps(params->sse2_rr2_lut64_p2.sign_mask);
  const __m128 vmagic_bias = _mm_load_ps(params->sse2_rr2_lut64_p2.magic_bias);
  const __m128 vlog2e = _mm_load_ps(params->sse2_rr2_lut64_p2.log2e);
  const __m128i vindex_mask = _mm_load_si128((const __m128i*) params->sse2_rr2_lut64_p2.index_mask);
  const __m128 vminus_ln2_hi = _mm_load_ps(params->sse2_rr2_lut64_p2.minus_ln2_hi);
  const __m128 vminus_ln2_lo = _mm_load_ps(params->sse2_rr2_lut64_p2.minus_ln2_lo);
  const __m128 vc2 = _mm_load_ps(params->sse2_rr2_lut64_p2.c2);
  const __m128 vone = _mm_load_ps(params->sse2_rr2_lut64_p2.one);
  const __m128 vdenorm_cutoff = _mm_load_ps(params->sse2_rr2_lut64_p2.denorm_cutoff);

  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
    const __m128 vx0123 = _mm_loadu_ps(x);
    const __m128 vx4567 = _mm_loadu_ps(x + 4);
    x += 8;

    const __m128 vz0123 = _mm_or_ps(vx0123, vsign_mask);
    const __m128 vz4567 = _mm_or_ps(vx4567, vsign_mask);

    __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vz0123, vlog2e), vmagic_bias);
    __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vz4567, vlog2e), vmagic_bias);

    const __m128i ve0123 = _mm_slli_epi32(_mm_castps_si128(vn0123), 17);
    const __m128i ve4567 = _mm_slli_epi32(_mm_castps_si128(vn4567), 17);

    const __m128i vidx0123 = _mm_slli_epi32(_mm_and_si128(_mm_castps_si128(vn0123), vindex_mask), 2);
    const __m128i vidx4567 = _mm_slli_epi32(_mm_and_si128(_mm_castps_si128(vn4567), vindex_mask), 2);

    #if XNN_ARCH_X86_64
      const uint64_t vidx01 = (uint64_t) _mm_cvtsi128_si64(vidx0123);
      const uint64_t vidx23 = (uint64_t) _mm_extract_epi64(vidx0123, 1);
      const __m128i vl0 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx01)));
      const __m128i vl2 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx23)));
      const __m128i vl01 = _mm_insert_epi32(vl0, *((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx01 >> 32))), 1);
      const __m128i vl23 = _mm_insert_epi32(vl2, *((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx23 >> 32))), 1);
      const __m128i vl0123 = _mm_unpacklo_epi64(vl01, vl23);
      const uint64_t vidx45 = (uint64_t) _mm_cvtsi128_si64(vidx4567);
      const uint64_t vidx67 = (uint64_t) _mm_extract_epi64(vidx4567, 1);
      const __m128i vl4 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx45)));
      const __m128i vl6 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx67)));
      const __m128i vl45 = _mm_insert_epi32(vl4, *((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx45 >> 32))), 1);
      const __m128i vl67 = _mm_insert_epi32(vl6, *((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx67 >> 32))), 1);
      const __m128i vl4567 = _mm_unpacklo_epi64(vl45, vl67);
    #else  // !XNN_ARCH_X86_64
      const uint32_t vidx0 = (uint32_t) _mm_cvtsi128_si32(vidx0123);
      const uint32_t vidx1 = (uint32_t) _mm_extract_epi16(vidx0123, 2);
      const uint32_t vidx2 = (uint32_t) _mm_extract_epi16(vidx0123, 4);
      const uint32_t vidx3 = (uint32_t) _mm_extract_epi16(vidx0123, 6);
      const __m128i vl0 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + vidx0)));
      const __m128i vl2 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + vidx2)));
      const __m128i vl01 = _mm_insert_epi32(vl0, *((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + vidx1)), 1);
      const __m128i vl23 = _mm_insert_epi32(vl2, *((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + vidx3)), 1);
      const __m128i vl0123 = _mm_unpacklo_epi64(vl01, vl23);
      const uint32_t vidx4 = (uint32_t) _mm_cvtsi128_si32(vidx4567);
      const uint32_t vidx5 = (uint32_t) _mm_extract_epi16(vidx4567, 2);
      const uint32_t vidx6 = (uint32_t) _mm_extract_epi16(vidx4567, 4);
      const uint32_t vidx7 = (uint32_t) _mm_extract_epi16(vidx4567, 6);
      const __m128i vl4 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + vidx4)));
      const __m128i vl6 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + vidx6)));
      const __m128i vl45 = _mm_insert_epi32(vl4, *((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + vidx5)), 1);
      const __m128i vl67 = _mm_insert_epi32(vl6, *((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + vidx7)), 1);
      const __m128i vl4567 = _mm_unpacklo_epi64(vl45, vl67);
    #endif  // XNN_ARCH_X86_64

    const __m128 vs0123 = _mm_castsi128_ps(_mm_add_epi32(vl0123, ve0123));
    const __m128 vs4567 = _mm_castsi128_ps(_mm_add_epi32(vl4567, ve4567));

    vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
    vn4567 = _mm_sub_ps(vn4567, vmagic_bias);

    __m128 vt0123 = _mm_add_ps(vz0123, _mm_mul_ps(vn0123, vminus_ln2_hi));
    __m128 vt4567 = _mm_add_ps(vz4567, _mm_mul_ps(vn4567, vminus_ln2_hi));

    vt0123 = _mm_add_ps(vt0123, _mm_mul_ps(vn0123, vminus_ln2_lo));
    vt4567 = _mm_add_ps(vt4567, _mm_mul_ps(vn4567, vminus_ln2_lo));

    __m128 vp0123 = _mm_mul_ps(vt0123, vc2);
    __m128 vp4567 = _mm_mul_ps(vt4567, vc2);

    vp0123 = _mm_add_ps(vt0123, _mm_mul_ps(vp0123, vt0123));
    vp4567 = _mm_add_ps(vt4567, _mm_mul_ps(vp4567, vt4567));

    const __m128 vy0123 = _mm_add_ps(vs0123, _mm_mul_ps(vs0123, vp0123));
    const __m128 vy4567 = _mm_add_ps(vs4567, _mm_mul_ps(vs4567, vp4567));

    __m128 vf0123 = _mm_div_ps(vy0123, _mm_add_ps(vy0123, vone));
    __m128 vf4567 = _mm_div_ps(vy4567, _mm_add_ps(vy4567, vone));

    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vz0123, vdenorm_cutoff), vf0123);
    vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vz4567, vdenorm_cutoff), vf4567);

    vf0123 = _mm_blendv_ps(_mm_sub_ps(vone, vf0123), vf0123, vx0123);
    vf4567 = _mm_blendv_ps(_mm_sub_ps(vone, vf4567), vf4567, vx4567);

    _mm_storeu_ps(y, vf0123);
    _mm_storeu_ps(y + 4, vf4567);
    y += 8;
  }
  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
    const __m128 vx = _mm_loadu_ps(x);
    x += 4;

    const __m128 vz = _mm_or_ps(vx, vsign_mask);

    __m128 vn = _mm_add_ps(_mm_mul_ps(vz, vlog2e), vmagic_bias);
    const __m128i ve = _mm_slli_epi32(_mm_castps_si128(vn), 17);

    const __m128i vidx = _mm_slli_epi32(_mm_and_si128(_mm_castps_si128(vn), vindex_mask), 2);
    #if XNN_ARCH_X86_64
      const uint64_t vidx_lo = (uint64_t) _mm_cvtsi128_si64(vidx);
      const uint64_t vidx_hi = (uint64_t) _mm_extract_epi64(vidx, 1);
      const __m128i vl_ll = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo)));
      const __m128i vl_hl = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi)));
      const __m128i vl_lo = _mm_insert_epi32(vl_ll, *((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32))), 1);
      const __m128i vl_hi = _mm_insert_epi32(vl_hl, *((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32))), 1);
    #else  // !XNN_ARCH_X86_64
      const __m128i vl_ll = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) _mm_cvtsi128_si32(vidx))));
      const __m128i vl_hl = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) _mm_extract_epi16(vidx, 4))));
      const __m128i vl_lo = _mm_insert_epi32(vl_ll, *((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) _mm_extract_epi16(vidx, 2))), 1);
      const __m128i vl_hi = _mm_insert_epi32(vl_hl, *((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) _mm_extract_epi16(vidx, 6))), 1);
    #endif  // XNN_ARCH_X86_64
    const __m128i vl = _mm_unpacklo_epi64(vl_lo, vl_hi);

    const __m128 vs = _mm_castsi128_ps(_mm_add_epi32(vl, ve));
    vn = _mm_sub_ps(vn, vmagic_bias);

    __m128 vt = _mm_add_ps(vz, _mm_mul_ps(vn, vminus_ln2_hi));
    vt = _mm_add_ps(vt, _mm_mul_ps(vn, vminus_ln2_lo));

    __m128 vp = _mm_mul_ps(vt, vc2);
    vp = _mm_add_ps(vt, _mm_mul_ps(vp, vt));

    const __m128 vy = _mm_add_ps(vs, _mm_mul_ps(vs, vp));

    __m128 vf = _mm_div_ps(vy, _mm_add_ps(vy, vone));
    vf = _mm_andnot_ps(_mm_cmplt_ps(vz, vdenorm_cutoff), vf);
    vf = _mm_blendv_ps(_mm_sub_ps(vone, vf), vf, vx);

    _mm_storeu_ps(y, vf);
    y += 4;
  }
  if XNN_UNLIKELY(n != 0) {
    const __m128 vx = _mm_loadu_ps(x);

    const __m128 vz = _mm_or_ps(vx, vsign_mask);

    __m128 vn = _mm_add_ps(_mm_mul_ps(vz, vlog2e), vmagic_bias);
    const __m128i ve = _mm_slli_epi32(_mm_castps_si128(vn), 17);

    const __m128i vidx = _mm_slli_epi32(_mm_and_si128(_mm_castps_si128(vn), vindex_mask), 2);
    #if XNN_ARCH_X86_64
      const uint64_t vidx_lo = (uint64_t) _mm_cvtsi128_si64(vidx);
      const uint64_t vidx_hi = (uint64_t) _mm_extract_epi64(vidx, 1);
      const __m128i vl_ll = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo)));
      const __m128i vl_hl = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi)));
      const __m128i vl_lo = _mm_insert_epi32(vl_ll, *((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32))), 1);
      const __m128i vl_hi = _mm_insert_epi32(vl_hl, *((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32))), 1);
    #else  // !XNN_ARCH_X86_64
      const __m128i vl_ll = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) _mm_cvtsi128_si32(vidx))));
      const __m128i vl_hl = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) _mm_extract_epi16(vidx, 4))));
      const __m128i vl_lo = _mm_insert_epi32(vl_ll, *((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) _mm_extract_epi16(vidx, 2))), 1);
      const __m128i vl_hi = _mm_insert_epi32(vl_hl, *((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) _mm_extract_epi16(vidx, 6))), 1);
    #endif  // XNN_ARCH_X86_64
    const __m128i vl = _mm_unpacklo_epi64(vl_lo, vl_hi);

    const __m128 vs = _mm_castsi128_ps(_mm_add_epi32(vl, ve));
    vn = _mm_sub_ps(vn, vmagic_bias);

    __m128 vt = _mm_add_ps(vz, _mm_mul_ps(vn, vminus_ln2_hi));
    vt = _mm_add_ps(vt, _mm_mul_ps(vn, vminus_ln2_lo));

    __m128 vp = _mm_mul_ps(vt, vc2);
    vp = _mm_add_ps(vt, _mm_mul_ps(vp, vt));

    const __m128 vy = _mm_add_ps(vs, _mm_mul_ps(vs, vp));

    __m128 vf = _mm_div_ps(vy, _mm_add_ps(vy, vone));
    vf = _mm_andnot_ps(_mm_cmplt_ps(vz, vdenorm_cutoff), vf);
    vf = _mm_blendv_ps(_mm_sub_ps(vone, vf), vf, vx);

    if (n & (2 * sizeof(float))) {
      _mm_storel_pi((__m64*) y, vf);
      vf = _mm_movehl_ps(vf, vf);
      y += 2;
    }
    if (n & (1 * sizeof(float))) {
      _mm_store_ss(y, vf);
    }
  }
}

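// Depthwise convolution with 25 taps (presumably a 5x5 window) over up to 8
// channels per iteration, with per-channel ("qc8") quantization. The "mul16"
// variant sign-extends int8 inputs and weights to 16 bits, multiplies with
// _mm_mullo_epi16, and widens the products into 32-bit accumulators.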
void xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul16(
    size_t channels,
    size_t output_width,
    const int8_t** input,
    const void* weights,
    int8_t* output,
    size_t input_stride,
    size_t output_increment,
    size_t input_offset,
    const int8_t* zero,
    const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(channels != 0);
  assert(output_width != 0);

  do {
    const int8_t* i0 = input[0];
    assert(i0 != NULL);
    if XNN_UNPREDICTABLE(i0 != zero) {
      i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
    }
    const int8_t* i1 = input[1];
    assert(i1 != NULL);
    if XNN_UNPREDICTABLE(i1 != zero) {
      i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
    }
    const int8_t* i2 = input[2];
    assert(i2 != NULL);
    if XNN_UNPREDICTABLE(i2 != zero) {
      i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
    }
    const int8_t* i3 = input[3];
    assert(i3 != NULL);
    if XNN_UNPREDICTABLE(i3 != zero) {
      i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
    }
    const int8_t* i4 = input[4];
    assert(i4 != NULL);
    if XNN_UNPREDICTABLE(i4 != zero) {
      i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
    }
    const int8_t* i5 = input[5];
    assert(i5 != NULL);
    if XNN_UNPREDICTABLE(i5 != zero) {
      i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
    }
    const int8_t* i6 = input[6];
    assert(i6 != NULL);
    if XNN_UNPREDICTABLE(i6 != zero) {
      i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
    }
    const int8_t* i7 = input[7];
    assert(i7 != NULL);
    if XNN_UNPREDICTABLE(i7 != zero) {
      i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
    }
    const int8_t* i8 = input[8];
    assert(i8 != NULL);
    if XNN_UNPREDICTABLE(i8 != zero) {
      i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
    }
    const int8_t* i9 = input[9];
    assert(i9 != NULL);
    if XNN_UNPREDICTABLE(i9 != zero) {
      i9 = (const int8_t*) ((uintptr_t) i9 + input_offset);
    }
    const int8_t* i10 = input[10];
    assert(i10 != NULL);
    if XNN_UNPREDICTABLE(i10 != zero) {
      i10 = (const int8_t*) ((uintptr_t) i10 + input_offset);
    }
    const int8_t* i11 = input[11];
    assert(i11 != NULL);
    if XNN_UNPREDICTABLE(i11 != zero) {
      i11 = (const int8_t*) ((uintptr_t) i11 + input_offset);
    }
    const int8_t* i12 = input[12];
    assert(i12 != NULL);
    if XNN_UNPREDICTABLE(i12 != zero) {
      i12 = (const int8_t*) ((uintptr_t) i12 + input_offset);
    }
    const int8_t* i13 = input[13];
    assert(i13 != NULL);
    if XNN_UNPREDICTABLE(i13 != zero) {
      i13 = (const int8_t*) ((uintptr_t) i13 + input_offset);
    }
    const int8_t* i14 = input[14];
    assert(i14 != NULL);
    if XNN_UNPREDICTABLE(i14 != zero) {
      i14 = (const int8_t*) ((uintptr_t) i14 + input_offset);
    }
    const int8_t* i15 = input[15];
    assert(i15 != NULL);
    if XNN_UNPREDICTABLE(i15 != zero) {
      i15 = (const int8_t*) ((uintptr_t) i15 + input_offset);
    }
    const int8_t* i16 = input[16];
    assert(i16 != NULL);
    if XNN_UNPREDICTABLE(i16 != zero) {
      i16 = (const int8_t*) ((uintptr_t) i16 + input_offset);
    }
    const int8_t* i17 = input[17];
    assert(i17 != NULL);
    if XNN_UNPREDICTABLE(i17 != zero) {
      i17 = (const int8_t*) ((uintptr_t) i17 + input_offset);
    }
    const int8_t* i18 = input[18];
    assert(i18 != NULL);
    if XNN_UNPREDICTABLE(i18 != zero) {
      i18 = (const int8_t*) ((uintptr_t) i18 + input_offset);
    }
    const int8_t* i19 = input[19];
    assert(i19 != NULL);
    if XNN_UNPREDICTABLE(i19 != zero) {
      i19 = (const int8_t*) ((uintptr_t) i19 + input_offset);
    }
    const int8_t* i20 = input[20];
    assert(i20 != NULL);
    if XNN_UNPREDICTABLE(i20 != zero) {
      i20 = (const int8_t*) ((uintptr_t) i20 + input_offset);
    }
    const int8_t* i21 = input[21];
    assert(i21 != NULL);
    if XNN_UNPREDICTABLE(i21 != zero) {
      i21 = (const int8_t*) ((uintptr_t) i21 + input_offset);
    }
    const int8_t* i22 = input[22];
    assert(i22 != NULL);
    if XNN_UNPREDICTABLE(i22 != zero) {
      i22 = (const int8_t*) ((uintptr_t) i22 + input_offset);
    }
    const int8_t* i23 = input[23];
    assert(i23 != NULL);
    if XNN_UNPREDICTABLE(i23 != zero) {
      i23 = (const int8_t*) ((uintptr_t) i23 + input_offset);
    }
    const int8_t* i24 = input[24];
    assert(i24 != NULL);
    if XNN_UNPREDICTABLE(i24 != zero) {
      i24 = (const int8_t*) ((uintptr_t) i24 + input_offset);
    }
    input = (const int8_t**) ((uintptr_t) input + input_stride);

    size_t c = channels;
    const void* w = weights;
    for (; c >= 8; c -= 8) {
      __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
      __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));


      const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
      const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
      const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t)));
      const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
      i0 += 8;


      __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));

      const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
      const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
      const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t)));
      const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
      i1 += 8;


      vprod01234567 = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));

      const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
      const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
      const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t)));
      const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
      i2 += 8;


      vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));

      const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
      const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(vi3x01234567);
      const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t)));
      const __m128i vxk3x01234567 = _mm_cvtepi8_epi16(vk3x01234567);
      i3 += 8;


      vprod01234567 = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));

      const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
      const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(vi4x01234567);
      const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t)));
      const __m128i vxk4x01234567 = _mm_cvtepi8_epi16(vk4x01234567);
      i4 += 8;


      vprod01234567 = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));

      const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
      const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(vi5x01234567);
      const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t)));
      const __m128i vxk5x01234567 = _mm_cvtepi8_epi16(vk5x01234567);
      i5 += 8;


      vprod01234567 = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));

      const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
      const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(vi6x01234567);
      const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t)));
      const __m128i vxk6x01234567 = _mm_cvtepi8_epi16(vk6x01234567);
      i6 += 8;


      vprod01234567 = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));

      const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
      const __m128i vxi7x01234567 = _mm_cvtepi8_epi16(vi7x01234567);
      const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t)));
      const __m128i vxk7x01234567 = _mm_cvtepi8_epi16(vk7x01234567);
      i7 += 8;


      vprod01234567 = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));

      const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
      const __m128i vxi8x01234567 = _mm_cvtepi8_epi16(vi8x01234567);
      const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t)));
      const __m128i vxk8x01234567 = _mm_cvtepi8_epi16(vk8x01234567);
      i8 += 8;


      vprod01234567 = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));

      const __m128i vi9x01234567 = _mm_loadl_epi64((const __m128i*) i9);
      const __m128i vxi9x01234567 = _mm_cvtepi8_epi16(vi9x01234567);
      const __m128i vk9x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t)));
      const __m128i vxk9x01234567 = _mm_cvtepi8_epi16(vk9x01234567);
      i9 += 8;


      vprod01234567 = _mm_mullo_epi16(vxi9x01234567, vxk9x01234567);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));

      const __m128i vi10x01234567 = _mm_loadl_epi64((const __m128i*) i10);
      const __m128i vxi10x01234567 = _mm_cvtepi8_epi16(vi10x01234567);
      const __m128i vk10x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 80 * sizeof(int8_t)));
      const __m128i vxk10x01234567 = _mm_cvtepi8_epi16(vk10x01234567);
      i10 += 8;


      vprod01234567 = _mm_mullo_epi16(vxi10x01234567, vxk10x01234567);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));

      const __m128i vi11x01234567 = _mm_loadl_epi64((const __m128i*) i11);
      const __m128i vxi11x01234567 = _mm_cvtepi8_epi16(vi11x01234567);
      const __m128i vk11x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 88 * sizeof(int8_t)));
      const __m128i vxk11x01234567 = _mm_cvtepi8_epi16(vk11x01234567);
      i11 += 8;


      vprod01234567 = _mm_mullo_epi16(vxi11x01234567, vxk11x01234567);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));

      const __m128i vi12x01234567 = _mm_loadl_epi64((const __m128i*) i12);
      const __m128i vxi12x01234567 = _mm_cvtepi8_epi16(vi12x01234567);
      const __m128i vk12x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 96 * sizeof(int8_t)));
      const __m128i vxk12x01234567 = _mm_cvtepi8_epi16(vk12x01234567);
      i12 += 8;


      vprod01234567 = _mm_mullo_epi16(vxi12x01234567, vxk12x01234567);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));

      const __m128i vi13x01234567 = _mm_loadl_epi64((const __m128i*) i13);
      const __m128i vxi13x01234567 = _mm_cvtepi8_epi16(vi13x01234567);
      const __m128i vk13x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 104 * sizeof(int8_t)));
      const __m128i vxk13x01234567 = _mm_cvtepi8_epi16(vk13x01234567);
      i13 += 8;


      vprod01234567 = _mm_mullo_epi16(vxi13x01234567, vxk13x01234567);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));

      const __m128i vi14x01234567 = _mm_loadl_epi64((const __m128i*) i14);
      const __m128i vxi14x01234567 = _mm_cvtepi8_epi16(vi14x01234567);
      const __m128i vk14x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 112 * sizeof(int8_t)));
      const __m128i vxk14x01234567 = _mm_cvtepi8_epi16(vk14x01234567);
      i14 += 8;


      vprod01234567 = _mm_mullo_epi16(vxi14x01234567, vxk14x01234567);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));

      const __m128i vi15x01234567 = _mm_loadl_epi64((const __m128i*) i15);
      const __m128i vxi15x01234567 = _mm_cvtepi8_epi16(vi15x01234567);
      const __m128i vk15x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 120 * sizeof(int8_t)));
      const __m128i vxk15x01234567 = _mm_cvtepi8_epi16(vk15x01234567);
      i15 += 8;


      vprod01234567 = _mm_mullo_epi16(vxi15x01234567, vxk15x01234567);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));

      const __m128i vi16x01234567 = _mm_loadl_epi64((const __m128i*) i16);
      const __m128i vxi16x01234567 = _mm_cvtepi8_epi16(vi16x01234567);
      const __m128i vk16x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 128 * sizeof(int8_t)));
      const __m128i vxk16x01234567 = _mm_cvtepi8_epi16(vk16x01234567);
      i16 += 8;


      vprod01234567 = _mm_mullo_epi16(vxi16x01234567, vxk16x01234567);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));

      const __m128i vi17x01234567 = _mm_loadl_epi64((const __m128i*) i17);
      const __m128i vxi17x01234567 = _mm_cvtepi8_epi16(vi17x01234567);
      const __m128i vk17x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 136 * sizeof(int8_t)));
      const __m128i vxk17x01234567 = _mm_cvtepi8_epi16(vk17x01234567);
      i17 += 8;


      vprod01234567 = _mm_mullo_epi16(vxi17x01234567, vxk17x01234567);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));

      const __m128i vi18x01234567 = _mm_loadl_epi64((const __m128i*) i18);
      const __m128i vxi18x01234567 = _mm_cvtepi8_epi16(vi18x01234567);
      const __m128i vk18x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 144 * sizeof(int8_t)));
      const __m128i vxk18x01234567 = _mm_cvtepi8_epi16(vk18x01234567);
      i18 += 8;


      vprod01234567 = _mm_mullo_epi16(vxi18x01234567, vxk18x01234567);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));

      const __m128i vi19x01234567 = _mm_loadl_epi64((const __m128i*) i19);
      const __m128i vxi19x01234567 = _mm_cvtepi8_epi16(vi19x01234567);
      const __m128i vk19x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 152 * sizeof(int8_t)));
      const __m128i vxk19x01234567 = _mm_cvtepi8_epi16(vk19x01234567);
      i19 += 8;


      vprod01234567 = _mm_mullo_epi16(vxi19x01234567, vxk19x01234567);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));

      const __m128i vi20x01234567 = _mm_loadl_epi64((const __m128i*) i20);
      const __m128i vxi20x01234567 = _mm_cvtepi8_epi16(vi20x01234567);
      const __m128i vk20x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 160 * sizeof(int8_t)));
      const __m128i vxk20x01234567 = _mm_cvtepi8_epi16(vk20x01234567);
      i20 += 8;


      vprod01234567 = _mm_mullo_epi16(vxi20x01234567, vxk20x01234567);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));

      const __m128i vi21x01234567 = _mm_loadl_epi64((const __m128i*) i21);
1377 const __m128i vxi21x01234567 = _mm_cvtepi8_epi16(vi21x01234567);
1378 const __m128i vk21x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 168 * sizeof(int8_t)));
1379 const __m128i vxk21x01234567 = _mm_cvtepi8_epi16(vk21x01234567);
1380 i21 += 8;
1381
1382
1383 vprod01234567 = _mm_mullo_epi16(vxi21x01234567, vxk21x01234567);
1384
1385 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1386 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1387
1388 const __m128i vi22x01234567 = _mm_loadl_epi64((const __m128i*) i22);
1389 const __m128i vxi22x01234567 = _mm_cvtepi8_epi16(vi22x01234567);
1390 const __m128i vk22x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 176 * sizeof(int8_t)));
1391 const __m128i vxk22x01234567 = _mm_cvtepi8_epi16(vk22x01234567);
1392 i22 += 8;
1393
1394
1395 vprod01234567 = _mm_mullo_epi16(vxi22x01234567, vxk22x01234567);
1396
1397 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1398 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1399
1400 const __m128i vi23x01234567 = _mm_loadl_epi64((const __m128i*) i23);
1401 const __m128i vxi23x01234567 = _mm_cvtepi8_epi16(vi23x01234567);
1402 const __m128i vk23x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 184 * sizeof(int8_t)));
1403 const __m128i vxk23x01234567 = _mm_cvtepi8_epi16(vk23x01234567);
1404 i23 += 8;
1405
1406
1407 vprod01234567 = _mm_mullo_epi16(vxi23x01234567, vxk23x01234567);
1408
1409 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1410 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1411
1412 const __m128i vi24x01234567 = _mm_loadl_epi64((const __m128i*) i24);
1413 const __m128i vxi24x01234567 = _mm_cvtepi8_epi16(vi24x01234567);
1414 const __m128i vk24x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 192 * sizeof(int8_t)));
1415 const __m128i vxk24x01234567 = _mm_cvtepi8_epi16(vk24x01234567);
1416 i24 += 8;
1417
1418
1419 vprod01234567 = _mm_mullo_epi16(vxi24x01234567, vxk24x01234567);
1420
1421 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1422 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1423
1424 w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 200 * sizeof(int8_t));
1425
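      // Requantization: convert the int32 accumulators to float, multiply by
      // the per-channel scales stored right after the kernel taps, clamp
      // against the top of the quantized output range, and convert back to
      // int32 (_mm_cvtps_epi32 rounds per the current MXCSR mode,
      // round-to-nearest-even by default).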
1426 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
1427 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
1428
1429 const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
1430 const __m128 vscale4567 = _mm_loadu_ps((const float*) w + 4);
1431 w = (const void*) ((const float*) w + 8);
1432 vscaled0123 = _mm_mul_ps(vscaled0123, vscale0123);
1433 vscaled4567 = _mm_mul_ps(vscaled4567, vscale4567);
1434
1435 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
1436 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
1437 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
1438
1439 vacc0123 = _mm_cvtps_epi32(vscaled0123);
1440 vacc4567 = _mm_cvtps_epi32(vscaled4567);
1441
1442 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
1443 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
1444
1445
1446 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
1447
1448 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
1449 vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
1450
1451 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
1452 output += 8;
1453 }
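    // Channel tail: up to 7 leftover channels are processed with the same
    // 8-lane arithmetic (the XNN_OOB_READS annotation on this kernel permits
    // the loads to read past the valid channels); only the valid lanes are
    // stored below.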
1454 if XNN_UNLIKELY(c != 0) {
1455 {
1456 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
1457 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
1458
1459
1460 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
1461 const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
1462 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t)));
1463 const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
1464
1465
1466 __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
1467
1468 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1469 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1470
1471 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
1472 const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
1473 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t)));
1474 const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
1475
1476
1477 vprod01234567 = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567);
1478
1479 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1480 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1481
1482 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
1483 const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
1484 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t)));
1485 const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
1486
1487
1488 vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
1489
1490 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1491 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1492
1493 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
1494 const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(vi3x01234567);
1495 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t)));
1496 const __m128i vxk3x01234567 = _mm_cvtepi8_epi16(vk3x01234567);
1497
1498
1499 vprod01234567 = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567);
1500
1501 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1502 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1503
1504 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
1505 const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(vi4x01234567);
1506 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t)));
1507 const __m128i vxk4x01234567 = _mm_cvtepi8_epi16(vk4x01234567);
1508
1509
1510 vprod01234567 = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
1511
1512 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1513 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1514
1515 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
1516 const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(vi5x01234567);
1517 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t)));
1518 const __m128i vxk5x01234567 = _mm_cvtepi8_epi16(vk5x01234567);
1519
1520
1521 vprod01234567 = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567);
1522
1523 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1524 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1525
1526 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
1527 const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(vi6x01234567);
1528 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t)));
1529 const __m128i vxk6x01234567 = _mm_cvtepi8_epi16(vk6x01234567);
1530
1531
1532 vprod01234567 = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
1533
1534 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1535 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1536
1537 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
1538 const __m128i vxi7x01234567 = _mm_cvtepi8_epi16(vi7x01234567);
1539 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t)));
1540 const __m128i vxk7x01234567 = _mm_cvtepi8_epi16(vk7x01234567);
1541
1542
1543 vprod01234567 = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567);
1544
1545 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1546 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1547
1548 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
1549 const __m128i vxi8x01234567 = _mm_cvtepi8_epi16(vi8x01234567);
1550 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t)));
1551 const __m128i vxk8x01234567 = _mm_cvtepi8_epi16(vk8x01234567);
1552
1553
1554 vprod01234567 = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
1555
1556 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1557 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1558
1559 const __m128i vi9x01234567 = _mm_loadl_epi64((const __m128i*) i9);
1560 const __m128i vxi9x01234567 = _mm_cvtepi8_epi16(vi9x01234567);
1561 const __m128i vk9x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t)));
1562 const __m128i vxk9x01234567 = _mm_cvtepi8_epi16(vk9x01234567);
1563
1564
1565 vprod01234567 = _mm_mullo_epi16(vxi9x01234567, vxk9x01234567);
1566
1567 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1568 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1569
1570 const __m128i vi10x01234567 = _mm_loadl_epi64((const __m128i*) i10);
1571 const __m128i vxi10x01234567 = _mm_cvtepi8_epi16(vi10x01234567);
1572 const __m128i vk10x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 80 * sizeof(int8_t)));
1573 const __m128i vxk10x01234567 = _mm_cvtepi8_epi16(vk10x01234567);
1574
1575
1576 vprod01234567 = _mm_mullo_epi16(vxi10x01234567, vxk10x01234567);
1577
1578 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1579 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1580
1581 const __m128i vi11x01234567 = _mm_loadl_epi64((const __m128i*) i11);
1582 const __m128i vxi11x01234567 = _mm_cvtepi8_epi16(vi11x01234567);
1583 const __m128i vk11x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 88 * sizeof(int8_t)));
1584 const __m128i vxk11x01234567 = _mm_cvtepi8_epi16(vk11x01234567);
1585
1586
1587 vprod01234567 = _mm_mullo_epi16(vxi11x01234567, vxk11x01234567);
1588
1589 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1590 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1591
1592 const __m128i vi12x01234567 = _mm_loadl_epi64((const __m128i*) i12);
1593 const __m128i vxi12x01234567 = _mm_cvtepi8_epi16(vi12x01234567);
1594 const __m128i vk12x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 96 * sizeof(int8_t)));
1595 const __m128i vxk12x01234567 = _mm_cvtepi8_epi16(vk12x01234567);
1596
1597
1598 vprod01234567 = _mm_mullo_epi16(vxi12x01234567, vxk12x01234567);
1599
1600 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1601 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1602
1603 const __m128i vi13x01234567 = _mm_loadl_epi64((const __m128i*) i13);
1604 const __m128i vxi13x01234567 = _mm_cvtepi8_epi16(vi13x01234567);
1605 const __m128i vk13x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 104 * sizeof(int8_t)));
1606 const __m128i vxk13x01234567 = _mm_cvtepi8_epi16(vk13x01234567);
1607
1608
1609 vprod01234567 = _mm_mullo_epi16(vxi13x01234567, vxk13x01234567);
1610
1611 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1612 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1613
1614 const __m128i vi14x01234567 = _mm_loadl_epi64((const __m128i*) i14);
1615 const __m128i vxi14x01234567 = _mm_cvtepi8_epi16(vi14x01234567);
1616 const __m128i vk14x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 112 * sizeof(int8_t)));
1617 const __m128i vxk14x01234567 = _mm_cvtepi8_epi16(vk14x01234567);
1618
1619
1620 vprod01234567 = _mm_mullo_epi16(vxi14x01234567, vxk14x01234567);
1621
1622 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1623 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1624
1625 const __m128i vi15x01234567 = _mm_loadl_epi64((const __m128i*) i15);
1626 const __m128i vxi15x01234567 = _mm_cvtepi8_epi16(vi15x01234567);
1627 const __m128i vk15x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 120 * sizeof(int8_t)));
1628 const __m128i vxk15x01234567 = _mm_cvtepi8_epi16(vk15x01234567);
1629
1630
1631 vprod01234567 = _mm_mullo_epi16(vxi15x01234567, vxk15x01234567);
1632
1633 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1634 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1635
1636 const __m128i vi16x01234567 = _mm_loadl_epi64((const __m128i*) i16);
1637 const __m128i vxi16x01234567 = _mm_cvtepi8_epi16(vi16x01234567);
1638 const __m128i vk16x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 128 * sizeof(int8_t)));
1639 const __m128i vxk16x01234567 = _mm_cvtepi8_epi16(vk16x01234567);
1640
1641
1642 vprod01234567 = _mm_mullo_epi16(vxi16x01234567, vxk16x01234567);
1643
1644 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1645 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1646
1647 const __m128i vi17x01234567 = _mm_loadl_epi64((const __m128i*) i17);
1648 const __m128i vxi17x01234567 = _mm_cvtepi8_epi16(vi17x01234567);
1649 const __m128i vk17x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 136 * sizeof(int8_t)));
1650 const __m128i vxk17x01234567 = _mm_cvtepi8_epi16(vk17x01234567);
1651
1652
1653 vprod01234567 = _mm_mullo_epi16(vxi17x01234567, vxk17x01234567);
1654
1655 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1656 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1657
1658 const __m128i vi18x01234567 = _mm_loadl_epi64((const __m128i*) i18);
1659 const __m128i vxi18x01234567 = _mm_cvtepi8_epi16(vi18x01234567);
1660 const __m128i vk18x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 144 * sizeof(int8_t)));
1661 const __m128i vxk18x01234567 = _mm_cvtepi8_epi16(vk18x01234567);
1662
1663
1664 vprod01234567 = _mm_mullo_epi16(vxi18x01234567, vxk18x01234567);
1665
1666 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1667 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1668
1669 const __m128i vi19x01234567 = _mm_loadl_epi64((const __m128i*) i19);
1670 const __m128i vxi19x01234567 = _mm_cvtepi8_epi16(vi19x01234567);
1671 const __m128i vk19x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 152 * sizeof(int8_t)));
1672 const __m128i vxk19x01234567 = _mm_cvtepi8_epi16(vk19x01234567);
1673
1674
1675 vprod01234567 = _mm_mullo_epi16(vxi19x01234567, vxk19x01234567);
1676
1677 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1678 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1679
1680 const __m128i vi20x01234567 = _mm_loadl_epi64((const __m128i*) i20);
1681 const __m128i vxi20x01234567 = _mm_cvtepi8_epi16(vi20x01234567);
1682 const __m128i vk20x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 160 * sizeof(int8_t)));
1683 const __m128i vxk20x01234567 = _mm_cvtepi8_epi16(vk20x01234567);
1684
1685
1686 vprod01234567 = _mm_mullo_epi16(vxi20x01234567, vxk20x01234567);
1687
1688 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1689 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1690
1691 const __m128i vi21x01234567 = _mm_loadl_epi64((const __m128i*) i21);
1692 const __m128i vxi21x01234567 = _mm_cvtepi8_epi16(vi21x01234567);
1693 const __m128i vk21x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 168 * sizeof(int8_t)));
1694 const __m128i vxk21x01234567 = _mm_cvtepi8_epi16(vk21x01234567);
1695
1696
1697 vprod01234567 = _mm_mullo_epi16(vxi21x01234567, vxk21x01234567);
1698
1699 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1700 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1701
1702 const __m128i vi22x01234567 = _mm_loadl_epi64((const __m128i*) i22);
1703 const __m128i vxi22x01234567 = _mm_cvtepi8_epi16(vi22x01234567);
1704 const __m128i vk22x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 176 * sizeof(int8_t)));
1705 const __m128i vxk22x01234567 = _mm_cvtepi8_epi16(vk22x01234567);
1706
1707
1708 vprod01234567 = _mm_mullo_epi16(vxi22x01234567, vxk22x01234567);
1709
1710 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1711 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1712
1713 const __m128i vi23x01234567 = _mm_loadl_epi64((const __m128i*) i23);
1714 const __m128i vxi23x01234567 = _mm_cvtepi8_epi16(vi23x01234567);
1715 const __m128i vk23x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 184 * sizeof(int8_t)));
1716 const __m128i vxk23x01234567 = _mm_cvtepi8_epi16(vk23x01234567);
1717
1718
1719 vprod01234567 = _mm_mullo_epi16(vxi23x01234567, vxk23x01234567);
1720
1721 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1722 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1723
1724 const __m128i vi24x01234567 = _mm_loadl_epi64((const __m128i*) i24);
1725 const __m128i vxi24x01234567 = _mm_cvtepi8_epi16(vi24x01234567);
1726 const __m128i vk24x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 192 * sizeof(int8_t)));
1727 const __m128i vxk24x01234567 = _mm_cvtepi8_epi16(vk24x01234567);
1728
1729
1730 vprod01234567 = _mm_mullo_epi16(vxi24x01234567, vxk24x01234567);
1731
1732 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1733 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1734
1735
1736 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
1737 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
1738
1739 const __m128 vscale0123 = _mm_loadu_ps((const float*) ((uintptr_t) w + 8 * sizeof(int32_t) + 200 * sizeof(int8_t)));
1740 const __m128 vscale4567 = _mm_loadu_ps((const float*) ((uintptr_t) w + 8 * sizeof(int32_t) + 200 * sizeof(int8_t) + 4 * sizeof(float)));
1741 vscaled0123 = _mm_mul_ps(vscaled0123, vscale0123);
1742 vscaled4567 = _mm_mul_ps(vscaled4567, vscale4567);
1743
1744 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
1745 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
1746 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
1747
1748 vacc0123 = _mm_cvtps_epi32(vscaled0123);
1749 vacc4567 = _mm_cvtps_epi32(vscaled4567);
1750
1751
1752 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
1753 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
1754
1755
1756 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
1757
1758 vout0123456701234567 = _mm_max_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
1759
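      // Store the 1-7 valid output bytes by progressive narrowing: 4 bytes,
      // then 2, then 1, shifting consumed lanes out of the vector after each
      // store.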
1760 if (c & 4) {
1761 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
1762 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
1763 output += 4;
1764 }
1765 if (c & 2) {
1766 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
1767 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
1768 output += 2;
1769 }
1770 if (c & 1) {
1771 *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
1772 output += 1;
1773 }
1774 }
1775 }
1776
1777 output = (int8_t*) ((uintptr_t) output + output_increment);
1778 } while (--output_width != 0);
1779 }
1780
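// The multiply-accumulate step shared by the *_mul16 depthwise kernels in
// this file widens each 8-lane int16 product into two int32 vectors. A
// minimal sketch of that widening trick, using only intrinsics the kernels
// themselves use (the helper and its name are illustrative, not part of the
// generated code):
static void xnn_example_widen_prod_i16x8(const __m128i vprod, __m128i* vlo, __m128i* vhi) {
  // Lanes 0-3: direct SSE4.1 sign extension to 32 bits.
  *vlo = _mm_cvtepi16_epi32(vprod);
  // Lanes 4-7: duplicate each 16-bit lane into both halves of a 32-bit slot,
  // then arithmetic-shift right by 16 to sign-extend.
  *vhi = _mm_srai_epi32(_mm_unpackhi_epi16(vprod, vprod), 16);
}

// 3-tap depthwise convolution kernel: same structure as the 25-tap kernel
// above, processing 8 channels per iteration.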
1781 void xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__sse41_mul16(
1782 size_t channels,
1783 size_t output_width,
1784 const int8_t** input,
1785 const void* weights,
1786 int8_t* output,
1787 size_t input_stride,
1788 size_t output_increment,
1789 size_t input_offset,
1790 const int8_t* zero,
1791 const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1792 {
1793 assert(channels != 0);
1794 assert(output_width != 0);
1795
1796 do {
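    // Set up one input-row pointer per tap. Rows that point at the shared
    // zero buffer are left unadjusted, so padding taps read zeros instead of
    // applying input_offset.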
1797 const int8_t* i0 = input[0];
1798 assert(i0 != NULL);
1799 if XNN_UNPREDICTABLE(i0 != zero) {
1800 i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
1801 }
1802 const int8_t* i1 = input[1];
1803 assert(i1 != NULL);
1804 if XNN_UNPREDICTABLE(i1 != zero) {
1805 i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
1806 }
1807 const int8_t* i2 = input[2];
1808 assert(i2 != NULL);
1809 if XNN_UNPREDICTABLE(i2 != zero) {
1810 i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
1811 }
1812 input = (const int8_t**) ((uintptr_t) input + input_stride);
1813
1814 size_t c = channels;
1815 const void* w = weights;
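    // Packed weight layout per group of 8 channels: 8 int32 biases,
    // 3 taps x 8 int8 kernel values, then 8 float per-channel scales.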
1816 for (; c >= 8; c -= 8) {
1817 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
1818 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
1819
1820
1821 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
1822 const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
1823 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t)));
1824 const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
1825 i0 += 8;
1826
1827
1828 __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
1829
1830 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1831 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1832
1833 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
1834 const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
1835 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t)));
1836 const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
1837 i1 += 8;
1838
1839
1840 vprod01234567 = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567);
1841
1842 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1843 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1844
1845 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
1846 const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
1847 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t)));
1848 const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
1849 i2 += 8;
1850
1851
1852 vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
1853
1854 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1855 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1856
1857 w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t));
1858
1859 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
1860 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
1861
1862 const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
1863 const __m128 vscale4567 = _mm_loadu_ps((const float*) w + 4);
1864 w = (const void*) ((const float*) w + 8);
1865 vscaled0123 = _mm_mul_ps(vscaled0123, vscale0123);
1866 vscaled4567 = _mm_mul_ps(vscaled4567, vscale4567);
1867
1868 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
1869 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
1870 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
1871
1872 vacc0123 = _mm_cvtps_epi32(vscaled0123);
1873 vacc4567 = _mm_cvtps_epi32(vscaled4567);
1874
1875 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
1876 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
1877
1878
1879 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
1880
1881 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
1882 vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
1883
1884 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
1885 output += 8;
1886 }
1887 if XNN_UNLIKELY(c != 0) {
1888 {
1889 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
1890 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
1891
1892
1893 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
1894 const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
1895 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t)));
1896 const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
1897
1898
1899 __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
1900
1901 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1902 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1903
1904 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
1905 const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
1906 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t)));
1907 const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
1908
1909
1910 vprod01234567 = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567);
1911
1912 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1913 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1914
1915 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
1916 const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
1917 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t)));
1918 const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
1919
1920
1921 vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
1922
1923 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1924 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1925
1926
1927 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
1928 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
1929
1930 const __m128 vscale0123 = _mm_loadu_ps((const float*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t)));
1931 const __m128 vscale4567 = _mm_loadu_ps((const float*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t) + 4 * sizeof(float)));
1932 vscaled0123 = _mm_mul_ps(vscaled0123, vscale0123);
1933 vscaled4567 = _mm_mul_ps(vscaled4567, vscale4567);
1934
1935 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
1936 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
1937 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
1938
1939 vacc0123 = _mm_cvtps_epi32(vscaled0123);
1940 vacc4567 = _mm_cvtps_epi32(vscaled4567);
1941
1942
1943 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
1944 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
1945
1946
1947 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
1948
1949 vout0123456701234567 = _mm_max_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
1950
1951 if (c & 4) {
1952 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
1953 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
1954 output += 4;
1955 }
1956 if (c & 2) {
1957 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
1958 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
1959 output += 2;
1960 }
1961 if (c & 1) {
1962 *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
1963 output += 1;
1964 }
1965 }
1966 }
1967
1968 output = (int8_t*) ((uintptr_t) output + output_increment);
1969 } while (--output_width != 0);
1970 }
1971
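// 9-tap variant of the depthwise kernels above: identical structure, with
// each packed per-8-channel weight block holding 8 int32 biases, 9 x 8 int8
// taps, and 8 float scales.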
1972 void xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16(
1973 size_t channels,
1974 size_t output_width,
1975 const int8_t** input,
1976 const void* weights,
1977 int8_t* output,
1978 size_t input_stride,
1979 size_t output_increment,
1980 size_t input_offset,
1981 const int8_t* zero,
1982 const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1983 {
1984 assert(channels != 0);
1985 assert(output_width != 0);
1986
1987 do {
1988 const int8_t* i0 = input[0];
1989 assert(i0 != NULL);
1990 if XNN_UNPREDICTABLE(i0 != zero) {
1991 i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
1992 }
1993 const int8_t* i1 = input[1];
1994 assert(i1 != NULL);
1995 if XNN_UNPREDICTABLE(i1 != zero) {
1996 i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
1997 }
1998 const int8_t* i2 = input[2];
1999 assert(i2 != NULL);
2000 if XNN_UNPREDICTABLE(i2 != zero) {
2001 i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
2002 }
2003 const int8_t* i3 = input[3];
2004 assert(i3 != NULL);
2005 if XNN_UNPREDICTABLE(i3 != zero) {
2006 i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
2007 }
2008 const int8_t* i4 = input[4];
2009 assert(i4 != NULL);
2010 if XNN_UNPREDICTABLE(i4 != zero) {
2011 i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
2012 }
2013 const int8_t* i5 = input[5];
2014 assert(i5 != NULL);
2015 if XNN_UNPREDICTABLE(i5 != zero) {
2016 i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
2017 }
2018 const int8_t* i6 = input[6];
2019 assert(i6 != NULL);
2020 if XNN_UNPREDICTABLE(i6 != zero) {
2021 i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
2022 }
2023 const int8_t* i7 = input[7];
2024 assert(i7 != NULL);
2025 if XNN_UNPREDICTABLE(i7 != zero) {
2026 i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
2027 }
2028 const int8_t* i8 = input[8];
2029 assert(i8 != NULL);
2030 if XNN_UNPREDICTABLE(i8 != zero) {
2031 i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
2032 }
2033 input = (const int8_t**) ((uintptr_t) input + input_stride);
2034
2035 size_t c = channels;
2036 const void* w = weights;
2037 for (; c >= 8; c -= 8) {
2038 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
2039 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
2040
2041
2042 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
2043 const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
2044 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t)));
2045 const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
2046 i0 += 8;
2047
2048
2049 __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
2050
2051 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
2052 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
2053
2054 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
2055 const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
2056 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t)));
2057 const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
2058 i1 += 8;
2059
2060
2061 vprod01234567 = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567);
2062
2063 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
2064 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
2065
2066 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
2067 const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
2068 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t)));
2069 const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
2070 i2 += 8;
2071
2072
2073 vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
2074
2075 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
2076 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
2077
2078 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
2079 const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(vi3x01234567);
2080 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t)));
2081 const __m128i vxk3x01234567 = _mm_cvtepi8_epi16(vk3x01234567);
2082 i3 += 8;
2083
2084
2085 vprod01234567 = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567);
2086
2087 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
2088 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
2089
2090 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
2091 const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(vi4x01234567);
2092 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t)));
2093 const __m128i vxk4x01234567 = _mm_cvtepi8_epi16(vk4x01234567);
2094 i4 += 8;
2095
2096
2097 vprod01234567 = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
2098
2099 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
2100 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
2101
2102 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
2103 const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(vi5x01234567);
2104 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t)));
2105 const __m128i vxk5x01234567 = _mm_cvtepi8_epi16(vk5x01234567);
2106 i5 += 8;
2107
2108
2109 vprod01234567 = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567);
2110
2111 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
2112 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
2113
2114 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
2115 const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(vi6x01234567);
2116 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t)));
2117 const __m128i vxk6x01234567 = _mm_cvtepi8_epi16(vk6x01234567);
2118 i6 += 8;
2119
2120
2121 vprod01234567 = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
2122
2123 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
2124 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
2125
2126 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
2127 const __m128i vxi7x01234567 = _mm_cvtepi8_epi16(vi7x01234567);
2128 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t)));
2129 const __m128i vxk7x01234567 = _mm_cvtepi8_epi16(vk7x01234567);
2130 i7 += 8;
2131
2132
2133 vprod01234567 = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567);
2134
2135 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
2136 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
2137
2138 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
2139 const __m128i vxi8x01234567 = _mm_cvtepi8_epi16(vi8x01234567);
2140 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t)));
2141 const __m128i vxk8x01234567 = _mm_cvtepi8_epi16(vk8x01234567);
2142 i8 += 8;
2143
2144
2145 vprod01234567 = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
2146
2147 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
2148 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
2149
2150 w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t));
2151
2152 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
2153 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
2154
2155 const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
2156 const __m128 vscale4567 = _mm_loadu_ps((const float*) w + 4);
2157 w = (const void*) ((const float*) w + 8);
2158 vscaled0123 = _mm_mul_ps(vscaled0123, vscale0123);
2159 vscaled4567 = _mm_mul_ps(vscaled4567, vscale4567);
2160
2161 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
2162 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
2163 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
2164
2165 vacc0123 = _mm_cvtps_epi32(vscaled0123);
2166 vacc4567 = _mm_cvtps_epi32(vscaled4567);
2167
2168 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
2169 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
2170
2171
2172 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
2173
2174 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
2175 vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
2176
2177 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
2178 output += 8;
2179 }
2180 if XNN_UNLIKELY(c != 0) {
2181 {
2182 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
2183 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
2184
2185
2186 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
2187 const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
2188 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t)));
2189 const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
2190
2191
2192 __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
2193
2194 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
2195 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
2196
2197 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
2198 const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
2199 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t)));
2200 const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
2201
2202
2203 vprod01234567 = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567);
2204
2205 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
2206 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
2207
2208 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
2209 const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
2210 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t)));
2211 const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
2212
2213
2214 vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
2215
2216 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
2217 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
2218
2219 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
2220 const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(vi3x01234567);
2221 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t)));
2222 const __m128i vxk3x01234567 = _mm_cvtepi8_epi16(vk3x01234567);
2223
2224
2225 vprod01234567 = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567);
2226
2227 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
2228 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
2229
2230 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
2231 const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(vi4x01234567);
2232 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t)));
2233 const __m128i vxk4x01234567 = _mm_cvtepi8_epi16(vk4x01234567);
2234
2235
2236 vprod01234567 = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
2237
2238 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
2239 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
2240
2241 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
2242 const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(vi5x01234567);
2243 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t)));
2244 const __m128i vxk5x01234567 = _mm_cvtepi8_epi16(vk5x01234567);
2245
2246
2247 vprod01234567 = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567);
2248
2249 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
2250 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
2251
2252 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
2253 const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(vi6x01234567);
2254 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t)));
2255 const __m128i vxk6x01234567 = _mm_cvtepi8_epi16(vk6x01234567);
2256
2257
2258 vprod01234567 = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
2259
2260 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
2261 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
2262
2263 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
2264 const __m128i vxi7x01234567 = _mm_cvtepi8_epi16(vi7x01234567);
2265 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t)));
2266 const __m128i vxk7x01234567 = _mm_cvtepi8_epi16(vk7x01234567);
2267
2268
2269 vprod01234567 = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567);
2270
2271 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
2272 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
2273
2274 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
2275 const __m128i vxi8x01234567 = _mm_cvtepi8_epi16(vi8x01234567);
2276 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t)));
2277 const __m128i vxk8x01234567 = _mm_cvtepi8_epi16(vk8x01234567);
2278
2279
2280 vprod01234567 = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
2281
2282 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
2283 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
2284
2285
2286 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
2287 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
2288
2289 const __m128 vscale0123 = _mm_loadu_ps((const float*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t)));
2290 const __m128 vscale4567 = _mm_loadu_ps((const float*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t) + 4 * sizeof(float)));
2291 vscaled0123 = _mm_mul_ps(vscaled0123, vscale0123);
2292 vscaled4567 = _mm_mul_ps(vscaled4567, vscale4567);
2293
2294 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
2295 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
2296 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
2297
2298 vacc0123 = _mm_cvtps_epi32(vscaled0123);
2299 vacc4567 = _mm_cvtps_epi32(vscaled4567);
2300
2301
2302 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
2303 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
2304
2305
2306 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
2307
2308 vout0123456701234567 = _mm_max_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
2309
2310 if (c & 4) {
2311 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
2312 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
2313 output += 4;
2314 }
2315 if (c & 2) {
2316 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
2317 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
2318 output += 2;
2319 }
2320 if (c & 1) {
2321 *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
2322 output += 1;
2323 }
2324 }
2325 }
2326
2327 output = (int8_t*) ((uintptr_t) output + output_increment);
2328 } while (--output_width != 0);
2329 }
2330
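// QC8 GEMM microkernel, 1 row x 4 columns with K unrolled by 8 ("1x4c8",
// 64-bit B loads). Each B column keeps its own int32 accumulator, which is
// reduced at the end with two rounds of horizontal adds. A minimal sketch of
// that reduction (illustrative helper with a hypothetical name, not part of
// the generated code):
static __m128i xnn_example_reduce_4_accumulators(
    const __m128i vacc0, const __m128i vacc1, const __m128i vacc2, const __m128i vacc3)
{
  // The first round sums adjacent lane pairs within each accumulator; the
  // second produces [sum(vacc0), sum(vacc1), sum(vacc2), sum(vacc3)].
  return _mm_hadd_epi32(_mm_hadd_epi32(vacc0, vacc1), _mm_hadd_epi32(vacc2, vacc3));
}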
2331 void xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64(
2332 size_t mr,
2333 size_t nc,
2334 size_t kc,
2335 const int8_t* restrict a,
2336 size_t a_stride,
2337 const void* restrict w,
2338 int8_t* restrict c,
2339 size_t cm_stride,
2340 size_t cn_stride,
2341 const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2342 {
2343 assert(mr != 0);
2344 assert(mr <= 1);
2345 assert(nc != 0);
2346 assert(kc != 0);
2347 assert(kc % sizeof(int8_t) == 0);
2348 assert(a != NULL);
2349 assert(w != NULL);
2350 assert(c != NULL);
2351
2352 kc = round_up_po2(kc, 8);
2353 const int8_t* a0 = a;
2354 int8_t* c0 = c;
2355
2356 do {
2357 __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
2358 __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
2359 __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
2360 __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
2361 w = (const int32_t*) w + 4;
2362
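    // Main K loop: each iteration consumes 8 int8 values of A (sign-extended
    // to 16 bits) and an 8x4 block of B. _mm_madd_epi16 multiplies eight
    // 16-bit pairs and sums adjacent products into int32 lanes, so no
    // intermediate overflow is possible.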
2363 size_t k = 0;
2364 while (k < kc) {
2365 const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
2366 const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
2367 a0 += 8;
2368
2369 const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
2370 const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
2371
2372 vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
2373 const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 8));
2374 const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
2375
2376 vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
2377 const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 16));
2378 const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
2379
2380 vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
2381 const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 24));
2382 const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
2383
2384 vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
2385
2386 w = (const void*) ((const int8_t*) w + 32);
2387 k += 8 * sizeof(int8_t);
2388 }
2389
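    // Reduce the four per-column accumulators to one vector of column sums
    // (see the horizontal-add sketch above this function).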
2390 const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
2391 const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
2392
2393 __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
2394
2395 __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
2396
2397 const __m128 vscale0123 = _mm_load_ps((const float*) w);
2398 w = (const void*) ((const float*) w + 4);
2399 vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
2400
2401 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
2402 vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
2403
2404 vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
2405
2406 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
2407 __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
2408
2409
2410 __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
2411
2412 vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
2413
2414 if (nc >= 4) {
2415 unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
2416
2417 c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
2418
2419 a0 = (const int8_t*) ((uintptr_t) a0 - kc);
2420
2421 nc -= 4;
2422 } else {
2423 if (nc & 2) {
2424 unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
2425 c0 += 2;
2426 vout = _mm_srli_epi32(vout, 16);
2427 }
2428 if (nc & 1) {
2429 *c0 = (int8_t) _mm_extract_epi8(vout, 0);
2430 }
2431
2432 nc = 0;
2433 }
2434 } while (nc != 0);
2435 }
2436
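// 3-row variant of the GEMM kernel above ("3x4c8"). When mr is smaller than
// 3, the unused rows alias the previous row's A and C pointers, computing
// harmless duplicate results; all rows share one set of B loads per K step.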
2437 void xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64(
2438 size_t mr,
2439 size_t nc,
2440 size_t kc,
2441 const int8_t* restrict a,
2442 size_t a_stride,
2443 const void* restrict w,
2444 int8_t* restrict c,
2445 size_t cm_stride,
2446 size_t cn_stride,
2447 const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2448 {
2449 assert(mr != 0);
2450 assert(mr <= 3);
2451 assert(nc != 0);
2452 assert(kc != 0);
2453 assert(kc % sizeof(int8_t) == 0);
2454 assert(a != NULL);
2455 assert(w != NULL);
2456 assert(c != NULL);
2457
2458 kc = round_up_po2(kc, 8);
2459 const int8_t* a0 = a;
2460 int8_t* c0 = c;
2461 const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
2462 int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
2463 if XNN_UNPREDICTABLE(mr < 2) {
2464 a1 = a0;
2465 c1 = c0;
2466 }
2467 const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
2468 int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
2469 if XNN_UNPREDICTABLE(mr <= 2) {
2470 a2 = a1;
2471 c2 = c1;
2472 }
2473
2474 do {
2475 __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
2476 __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
2477 __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
2478 __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
2479 __m128i vacc1x0 = vacc0x0;
2480 __m128i vacc1x1 = vacc0x1;
2481 __m128i vacc1x2 = vacc0x2;
2482 __m128i vacc1x3 = vacc0x3;
2483 __m128i vacc2x0 = vacc0x0;
2484 __m128i vacc2x1 = vacc0x1;
2485 __m128i vacc2x2 = vacc0x2;
2486 __m128i vacc2x3 = vacc0x3;
2487 w = (const int32_t*) w + 4;
2488
2489 size_t k = 0;
2490 while (k < kc) {
2491 const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
2492 const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
2493 a0 += 8;
2494 const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
2495 const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
2496 a1 += 8;
2497 const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
2498 const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
2499 a2 += 8;
2500
2501 const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
2502 const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
2503
2504 vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
2505 vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
2506 vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
2507 const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 8));
2508 const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
2509
2510 vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
2511 vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
2512 vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
2513 const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 16));
2514 const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
2515
2516 vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
2517 vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
2518 vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
2519 const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 24));
2520 const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
2521
2522 vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
2523 vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
2524 vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
2525
2526 w = (const void*) ((const int8_t*) w + 32);
2527 k += 8 * sizeof(int8_t);
2528 }
2529
2530 const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
2531 const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
2532 const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
2533 const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
2534 const __m128i vacc2x01 = _mm_hadd_epi32(vacc2x0, vacc2x1);
2535 const __m128i vacc2x23 = _mm_hadd_epi32(vacc2x2, vacc2x3);
2536
2537 __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
2538 __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
2539 __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
2540
2541 __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
2542 __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
2543 __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
2544
2545 const __m128 vscale0123 = _mm_load_ps((const float*) w);
2546 w = (const void*) ((const float*) w + 4);
2547 vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
2548 vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
2549 vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
2550
2551 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
2552 vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
2553 vscaled1x0123 = _mm_min_ps(vscaled1x0123, voutput_max_less_zero_point);
2554 vscaled2x0123 = _mm_min_ps(vscaled2x0123, voutput_max_less_zero_point);
2555
2556 vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
2557 vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
2558 vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
2559
2560 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
2561 __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
2562 __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
2563
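    // vacc01x0123 packs rows 0 and 1 as eight int16 values; vacc22x0123
    // duplicates row 2. After _mm_packs_epi16, 32-bit lane 0 of vout holds
    // row 0, lane 1 row 1, and lane 2 row 2, which the stores below extract.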
2564
2565 __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
2566
2567 vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
2568
2569 if (nc >= 4) {
2570 unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
2571 unaligned_store_u32(c1, (uint32_t) _mm_extract_epi32(vout, 1));
2572 unaligned_store_u32(c2, (uint32_t) _mm_extract_epi32(vout, 2));
2573
2574 c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
2575 c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
2576 c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
2577
2578 a0 = (const int8_t*) ((uintptr_t) a0 - kc);
2579 a1 = (const int8_t*) ((uintptr_t) a1 - kc);
2580 a2 = (const int8_t*) ((uintptr_t) a2 - kc);
2581
2582 nc -= 4;
2583 } else {
2584 if (nc & 2) {
2585 unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
2586 c0 += 2;
2587 unaligned_store_u16(c1, (uint16_t) _mm_extract_epi16(vout, 2));
2588 c1 += 2;
2589 unaligned_store_u16(c2, (uint16_t) _mm_extract_epi16(vout, 4));
2590 c2 += 2;
2591 vout = _mm_srli_epi32(vout, 16);
2592 }
2593 if (nc & 1) {
2594 *c0 = (int8_t) _mm_extract_epi8(vout, 0);
2595 *c1 = (int8_t) _mm_extract_epi8(vout, 4);
2596 *c2 = (int8_t) _mm_extract_epi8(vout, 8);
2597 }
2598
2599 nc = 0;
2600 }
2601 } while (nc != 0);
2602 }
2603
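// QC8 IGEMM microkernel (indirect GEMM): instead of a dense A matrix, `a` is
// an indirection buffer of ks / sizeof(void*) row pointers per output pixel.
// Entries equal to `zero` (by convention a zero-filled padding row) skip the
// a_offset adjustment. The accumulators persist across all ks pointers;
// after a 4-column block, `a` is rewound by ks bytes so the same pointers
// are revisited for the next columns.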
2604 void xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64(
2605 size_t mr,
2606 size_t nc,
2607 size_t kc,
2608 size_t ks,
2609 const int8_t** restrict a,
2610 const void* restrict w,
2611 int8_t* restrict c,
2612 size_t cm_stride,
2613 size_t cn_stride,
2614 size_t a_offset,
2615 const int8_t* zero,
2616 const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2617 {
2618 assert(mr != 0);
2619 assert(mr <= 1);
2620 assert(nc != 0);
2621 assert(kc != 0);
2622 assert(ks != 0);
2623 assert(ks % (1 * sizeof(void*)) == 0);
2624 assert(a_offset % sizeof(int8_t) == 0);
2625 assert(a != NULL);
2626 assert(w != NULL);
2627 assert(c != NULL);
2628
2629 kc = round_up_po2(kc, 8);
2630 int8_t* c0 = c;
2631
2632 do {
2633 __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
2634 __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
2635 __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
2636 __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
2637 w = (const int32_t*) w + 4;
2638
2639 size_t p = ks;
2640 do {
2641 const int8_t* restrict a0 = a[0];
2642 if XNN_UNPREDICTABLE(a0 != zero) {
2643 a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
2644 }
2645 a += 1;
2646
2647 size_t k = 0;
2648 while (k < kc) {
2649 const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
2650 const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
2651 a0 += 8;
2652
2653 const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
2654 const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
2655
2656 vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
2657 const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 8));
2658 const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
2659
2660 vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
2661 const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 16));
2662 const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
2663
2664 vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
2665 const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 24));
2666 const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
2667
2668 vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
2669
2670 w = (const void*) ((const int8_t*) w + 32);
2671 k += 8 * sizeof(int8_t);
2672 }
2673 p -= 1 * sizeof(void*);
2674 } while (p != 0);
2675
2676 const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
2677 const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
2678
2679 __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
2680
2681 __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
2682
2683 const __m128 vscale0123 = _mm_load_ps((const float*) w);
2684 w = (const void*) ((const float*) w + 4);
2685 vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
2686
2687 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
2688 vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
2689
2690 vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
2691
2692 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
2693 __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
2694
2695
2696 __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
2697
2698 vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
2699
2700 if (nc >= 4) {
2701 unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
2702 c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
2703
2704 a = (const int8_t**restrict) ((uintptr_t) a - ks);
2705
2706 nc -= 4;
2707 } else {
2708 if (nc & 2) {
2709 unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
2710 c0 += 2;
2711 vout = _mm_srli_epi32(vout, 16);
2712 }
2713 if (nc & 1) {
2714 *c0 = (int8_t) _mm_extract_epi8(vout, 0);
2715 }
2716
2717 nc = 0;
2718 }
2719 } while (nc != 0);
2720 }
2721
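// 3-row variant of the QC8 IGEMM above: the indirection buffer supplies three
// row pointers per step. The reversed store order (c2, c1, c0) is cosmetic;
// the lanes extracted from vout match the 3x4c8 GEMM kernel.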
2722 void xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64(
2723 size_t mr,
2724 size_t nc,
2725 size_t kc,
2726 size_t ks,
2727 const int8_t** restrict a,
2728 const void* restrict w,
2729 int8_t* restrict c,
2730 size_t cm_stride,
2731 size_t cn_stride,
2732 size_t a_offset,
2733 const int8_t* zero,
2734 const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2735 {
2736 assert(mr != 0);
2737 assert(mr <= 3);
2738 assert(nc != 0);
2739 assert(kc != 0);
2740 assert(ks != 0);
2741 assert(ks % (3 * sizeof(void*)) == 0);
2742 assert(a_offset % sizeof(int8_t) == 0);
2743 assert(a != NULL);
2744 assert(w != NULL);
2745 assert(c != NULL);
2746
2747 kc = round_up_po2(kc, 8);
2748 int8_t* c0 = c;
2749 int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
2750 if XNN_UNPREDICTABLE(mr < 2) {
2751 c1 = c0;
2752 }
2753 int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
2754 if XNN_UNPREDICTABLE(mr <= 2) {
2755 c2 = c1;
2756 }
2757
2758 do {
2759 __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
2760 __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
2761 __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
2762 __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
2763 __m128i vacc1x0 = vacc0x0;
2764 __m128i vacc1x1 = vacc0x1;
2765 __m128i vacc1x2 = vacc0x2;
2766 __m128i vacc1x3 = vacc0x3;
2767 __m128i vacc2x0 = vacc0x0;
2768 __m128i vacc2x1 = vacc0x1;
2769 __m128i vacc2x2 = vacc0x2;
2770 __m128i vacc2x3 = vacc0x3;
2771 w = (const int32_t*) w + 4;
2772
2773 size_t p = ks;
2774 do {
2775 const int8_t* restrict a0 = a[0];
2776 if XNN_UNPREDICTABLE(a0 != zero) {
2777 a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
2778 }
2779 const int8_t* restrict a1 = a[1];
2780 if XNN_UNPREDICTABLE(a1 != zero) {
2781 a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
2782 }
2783 const int8_t* restrict a2 = a[2];
2784 if XNN_UNPREDICTABLE(a2 != zero) {
2785 a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
2786 }
2787 a += 3;
2788
2789 size_t k = 0;
2790 while (k < kc) {
2791 const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
2792 const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
2793 a0 += 8;
2794 const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
2795 const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
2796 a1 += 8;
2797 const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
2798 const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
2799 a2 += 8;
2800
2801 const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
2802 const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
2803
2804 vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
2805 vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
2806 vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
2807 const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 8));
2808 const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
2809
2810 vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
2811 vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
2812 vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
2813 const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 16));
2814 const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
2815
2816 vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
2817 vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
2818 vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
2819 const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 24));
2820 const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
2821
2822 vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
2823 vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
2824 vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
2825
2826 w = (const void*) ((const int8_t*) w + 32);
2827 k += 8 * sizeof(int8_t);
2828 }
2829 p -= 3 * sizeof(void*);
2830 } while (p != 0);
2831
2832 const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
2833 const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
2834 const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
2835 const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
2836 const __m128i vacc2x01 = _mm_hadd_epi32(vacc2x0, vacc2x1);
2837 const __m128i vacc2x23 = _mm_hadd_epi32(vacc2x2, vacc2x3);
2838
2839 __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
2840 __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
2841 __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
2842
2843 __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
2844 __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
2845 __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
2846
2847 const __m128 vscale0123 = _mm_load_ps((const float*) w);
2848 w = (const void*) ((const float*) w + 4);
2849 vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
2850 vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
2851 vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
2852
2853 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
2854 vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
2855 vscaled1x0123 = _mm_min_ps(vscaled1x0123, voutput_max_less_zero_point);
2856 vscaled2x0123 = _mm_min_ps(vscaled2x0123, voutput_max_less_zero_point);
2857
2858 vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
2859 vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
2860 vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
2861
2862 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
2863 __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
2864 __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
2865
2866
2867 __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
2868
2869 vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
2870
2871 if (nc >= 4) {
2872 unaligned_store_u32(c2, (uint32_t) _mm_extract_epi32(vout, 2));
2873 c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
2874 unaligned_store_u32(c1, (uint32_t) _mm_extract_epi32(vout, 1));
2875 c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
2876 unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
2877 c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
2878
2879 a = (const int8_t**restrict) ((uintptr_t) a - ks);
2880
2881 nc -= 4;
2882 } else {
2883 if (nc & 2) {
2884 unaligned_store_u16(c2, (uint16_t) _mm_extract_epi16(vout, 4));
2885 c2 += 2;
2886 unaligned_store_u16(c1, (uint16_t) _mm_extract_epi16(vout, 2));
2887 c1 += 2;
2888 unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
2889 c0 += 2;
2890 vout = _mm_srli_epi32(vout, 16);
2891 }
2892 if (nc & 1) {
2893 *c2 = (int8_t) _mm_extract_epi8(vout, 8);
2894 *c1 = (int8_t) _mm_extract_epi8(vout, 4);
2895 *c0 = (int8_t) _mm_extract_epi8(vout, 0);
2896 }
2897
2898 nc = 0;
2899 }
2900 } while (nc != 0);
2901 }
2902
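// QS8 depthwise convolution, 25 taps (e.g. a 5x5 kernel), 8 channels per
// inner iteration. `input` is an indirection array of 25 row pointers per
// output pixel, advanced by input_stride per output. Taps are processed in
// pairs: an even tap starts a fresh int16 product vector (_mm_mullo_epi16)
// and the following odd tap is added in int16 (the "mul16_add16" scheme), so
// widening into the int32 accumulators happens once per pair, via
// _mm_cvtepi16_epi32 for lanes 0-3 and unpackhi + _mm_srai_epi32 for 4-7.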
2903 void xnn_qs8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul16_add16(
2904 size_t channels,
2905 size_t output_width,
2906 const int8_t** input,
2907 const void* weights,
2908 int8_t* output,
2909 size_t input_stride,
2910 size_t output_increment,
2911 size_t input_offset,
2912 const int8_t* zero,
2913 const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2914 {
2915 assert(channels != 0);
2916 assert(output_width != 0);
2917
2918 do {
2919 const int8_t* i0 = input[0];
2920 assert(i0 != NULL);
2921 if XNN_UNPREDICTABLE(i0 != zero) {
2922 i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
2923 }
2924 const int8_t* i1 = input[1];
2925 assert(i1 != NULL);
2926 if XNN_UNPREDICTABLE(i1 != zero) {
2927 i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
2928 }
2929 const int8_t* i2 = input[2];
2930 assert(i2 != NULL);
2931 if XNN_UNPREDICTABLE(i2 != zero) {
2932 i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
2933 }
2934 const int8_t* i3 = input[3];
2935 assert(i3 != NULL);
2936 if XNN_UNPREDICTABLE(i3 != zero) {
2937 i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
2938 }
2939 const int8_t* i4 = input[4];
2940 assert(i4 != NULL);
2941 if XNN_UNPREDICTABLE(i4 != zero) {
2942 i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
2943 }
2944 const int8_t* i5 = input[5];
2945 assert(i5 != NULL);
2946 if XNN_UNPREDICTABLE(i5 != zero) {
2947 i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
2948 }
2949 const int8_t* i6 = input[6];
2950 assert(i6 != NULL);
2951 if XNN_UNPREDICTABLE(i6 != zero) {
2952 i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
2953 }
2954 const int8_t* i7 = input[7];
2955 assert(i7 != NULL);
2956 if XNN_UNPREDICTABLE(i7 != zero) {
2957 i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
2958 }
2959 const int8_t* i8 = input[8];
2960 assert(i8 != NULL);
2961 if XNN_UNPREDICTABLE(i8 != zero) {
2962 i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
2963 }
2964 const int8_t* i9 = input[9];
2965 assert(i9 != NULL);
2966 if XNN_UNPREDICTABLE(i9 != zero) {
2967 i9 = (const int8_t*) ((uintptr_t) i9 + input_offset);
2968 }
2969 const int8_t* i10 = input[10];
2970 assert(i10 != NULL);
2971 if XNN_UNPREDICTABLE(i10 != zero) {
2972 i10 = (const int8_t*) ((uintptr_t) i10 + input_offset);
2973 }
2974 const int8_t* i11 = input[11];
2975 assert(i11 != NULL);
2976 if XNN_UNPREDICTABLE(i11 != zero) {
2977 i11 = (const int8_t*) ((uintptr_t) i11 + input_offset);
2978 }
2979 const int8_t* i12 = input[12];
2980 assert(i12 != NULL);
2981 if XNN_UNPREDICTABLE(i12 != zero) {
2982 i12 = (const int8_t*) ((uintptr_t) i12 + input_offset);
2983 }
2984 const int8_t* i13 = input[13];
2985 assert(i13 != NULL);
2986 if XNN_UNPREDICTABLE(i13 != zero) {
2987 i13 = (const int8_t*) ((uintptr_t) i13 + input_offset);
2988 }
2989 const int8_t* i14 = input[14];
2990 assert(i14 != NULL);
2991 if XNN_UNPREDICTABLE(i14 != zero) {
2992 i14 = (const int8_t*) ((uintptr_t) i14 + input_offset);
2993 }
2994 const int8_t* i15 = input[15];
2995 assert(i15 != NULL);
2996 if XNN_UNPREDICTABLE(i15 != zero) {
2997 i15 = (const int8_t*) ((uintptr_t) i15 + input_offset);
2998 }
2999 const int8_t* i16 = input[16];
3000 assert(i16 != NULL);
3001 if XNN_UNPREDICTABLE(i16 != zero) {
3002 i16 = (const int8_t*) ((uintptr_t) i16 + input_offset);
3003 }
3004 const int8_t* i17 = input[17];
3005 assert(i17 != NULL);
3006 if XNN_UNPREDICTABLE(i17 != zero) {
3007 i17 = (const int8_t*) ((uintptr_t) i17 + input_offset);
3008 }
3009 const int8_t* i18 = input[18];
3010 assert(i18 != NULL);
3011 if XNN_UNPREDICTABLE(i18 != zero) {
3012 i18 = (const int8_t*) ((uintptr_t) i18 + input_offset);
3013 }
3014 const int8_t* i19 = input[19];
3015 assert(i19 != NULL);
3016 if XNN_UNPREDICTABLE(i19 != zero) {
3017 i19 = (const int8_t*) ((uintptr_t) i19 + input_offset);
3018 }
3019 const int8_t* i20 = input[20];
3020 assert(i20 != NULL);
3021 if XNN_UNPREDICTABLE(i20 != zero) {
3022 i20 = (const int8_t*) ((uintptr_t) i20 + input_offset);
3023 }
3024 const int8_t* i21 = input[21];
3025 assert(i21 != NULL);
3026 if XNN_UNPREDICTABLE(i21 != zero) {
3027 i21 = (const int8_t*) ((uintptr_t) i21 + input_offset);
3028 }
3029 const int8_t* i22 = input[22];
3030 assert(i22 != NULL);
3031 if XNN_UNPREDICTABLE(i22 != zero) {
3032 i22 = (const int8_t*) ((uintptr_t) i22 + input_offset);
3033 }
3034 const int8_t* i23 = input[23];
3035 assert(i23 != NULL);
3036 if XNN_UNPREDICTABLE(i23 != zero) {
3037 i23 = (const int8_t*) ((uintptr_t) i23 + input_offset);
3038 }
3039 const int8_t* i24 = input[24];
3040 assert(i24 != NULL);
3041 if XNN_UNPREDICTABLE(i24 != zero) {
3042 i24 = (const int8_t*) ((uintptr_t) i24 + input_offset);
3043 }
3044 input = (const int8_t**) ((uintptr_t) input + input_stride);
3045
3046 size_t c = channels;
3047 const void* w = weights;
3048 for (; c >= 8; c -= 8) {
3049 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
3050 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
3051
3052
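      // Packed weight layout per 8-channel group: 8 int32 biases followed by
      // 25 groups of 8 int8 taps; w therefore advances by
      // 8 * sizeof(int32_t) + 200 * sizeof(int8_t) bytes per group (below).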
3053 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
3054 const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
3055 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t)));
3056 const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
3057 i0 += 8;
3058
3059
3060 __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
3061
3062
3063 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
3064 const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
3065 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t)));
3066 const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
3067 i1 += 8;
3068
3069
3070 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi1x01234567, vxk1x01234567));
3071
3072 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3073 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3074
3075 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
3076 const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
3077 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t)));
3078 const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
3079 i2 += 8;
3080
3081
3082 vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
3083
3084
3085 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
3086 const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(vi3x01234567);
3087 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t)));
3088 const __m128i vxk3x01234567 = _mm_cvtepi8_epi16(vk3x01234567);
3089 i3 += 8;
3090
3091
3092 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi3x01234567, vxk3x01234567));
3093
3094 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3095 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3096
3097 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
3098 const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(vi4x01234567);
3099 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t)));
3100 const __m128i vxk4x01234567 = _mm_cvtepi8_epi16(vk4x01234567);
3101 i4 += 8;
3102
3103
3104 vprod01234567 = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
3105
3106
3107 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
3108 const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(vi5x01234567);
3109 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t)));
3110 const __m128i vxk5x01234567 = _mm_cvtepi8_epi16(vk5x01234567);
3111 i5 += 8;
3112
3113
3114 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi5x01234567, vxk5x01234567));
3115
3116 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3117 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3118
3119 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
3120 const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(vi6x01234567);
3121 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t)));
3122 const __m128i vxk6x01234567 = _mm_cvtepi8_epi16(vk6x01234567);
3123 i6 += 8;
3124
3125
3126 vprod01234567 = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
3127
3128
3129 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
3130 const __m128i vxi7x01234567 = _mm_cvtepi8_epi16(vi7x01234567);
3131 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t)));
3132 const __m128i vxk7x01234567 = _mm_cvtepi8_epi16(vk7x01234567);
3133 i7 += 8;
3134
3135
3136 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi7x01234567, vxk7x01234567));
3137
3138 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3139 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3140
3141 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
3142 const __m128i vxi8x01234567 = _mm_cvtepi8_epi16(vi8x01234567);
3143 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t)));
3144 const __m128i vxk8x01234567 = _mm_cvtepi8_epi16(vk8x01234567);
3145 i8 += 8;
3146
3147
3148 vprod01234567 = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
3149
3150
3151 const __m128i vi9x01234567 = _mm_loadl_epi64((const __m128i*) i9);
3152 const __m128i vxi9x01234567 = _mm_cvtepi8_epi16(vi9x01234567);
3153 const __m128i vk9x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t)));
3154 const __m128i vxk9x01234567 = _mm_cvtepi8_epi16(vk9x01234567);
3155 i9 += 8;
3156
3157
3158 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi9x01234567, vxk9x01234567));
3159
3160 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3161 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3162
3163 const __m128i vi10x01234567 = _mm_loadl_epi64((const __m128i*) i10);
3164 const __m128i vxi10x01234567 = _mm_cvtepi8_epi16(vi10x01234567);
3165 const __m128i vk10x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 80 * sizeof(int8_t)));
3166 const __m128i vxk10x01234567 = _mm_cvtepi8_epi16(vk10x01234567);
3167 i10 += 8;
3168
3169
3170 vprod01234567 = _mm_mullo_epi16(vxi10x01234567, vxk10x01234567);
3171
3172
3173 const __m128i vi11x01234567 = _mm_loadl_epi64((const __m128i*) i11);
3174 const __m128i vxi11x01234567 = _mm_cvtepi8_epi16(vi11x01234567);
3175 const __m128i vk11x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 88 * sizeof(int8_t)));
3176 const __m128i vxk11x01234567 = _mm_cvtepi8_epi16(vk11x01234567);
3177 i11 += 8;
3178
3179
3180 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi11x01234567, vxk11x01234567));
3181
3182 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3183 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3184
3185 const __m128i vi12x01234567 = _mm_loadl_epi64((const __m128i*) i12);
3186 const __m128i vxi12x01234567 = _mm_cvtepi8_epi16(vi12x01234567);
3187 const __m128i vk12x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 96 * sizeof(int8_t)));
3188 const __m128i vxk12x01234567 = _mm_cvtepi8_epi16(vk12x01234567);
3189 i12 += 8;
3190
3191
3192 vprod01234567 = _mm_mullo_epi16(vxi12x01234567, vxk12x01234567);
3193
3194
3195 const __m128i vi13x01234567 = _mm_loadl_epi64((const __m128i*) i13);
3196 const __m128i vxi13x01234567 = _mm_cvtepi8_epi16(vi13x01234567);
3197 const __m128i vk13x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 104 * sizeof(int8_t)));
3198 const __m128i vxk13x01234567 = _mm_cvtepi8_epi16(vk13x01234567);
3199 i13 += 8;
3200
3201
3202 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi13x01234567, vxk13x01234567));
3203
3204 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3205 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3206
3207 const __m128i vi14x01234567 = _mm_loadl_epi64((const __m128i*) i14);
3208 const __m128i vxi14x01234567 = _mm_cvtepi8_epi16(vi14x01234567);
3209 const __m128i vk14x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 112 * sizeof(int8_t)));
3210 const __m128i vxk14x01234567 = _mm_cvtepi8_epi16(vk14x01234567);
3211 i14 += 8;
3212
3213
3214 vprod01234567 = _mm_mullo_epi16(vxi14x01234567, vxk14x01234567);
3215
3216
3217 const __m128i vi15x01234567 = _mm_loadl_epi64((const __m128i*) i15);
3218 const __m128i vxi15x01234567 = _mm_cvtepi8_epi16(vi15x01234567);
3219 const __m128i vk15x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 120 * sizeof(int8_t)));
3220 const __m128i vxk15x01234567 = _mm_cvtepi8_epi16(vk15x01234567);
3221 i15 += 8;
3222
3223
3224 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi15x01234567, vxk15x01234567));
3225
3226 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3227 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3228
3229 const __m128i vi16x01234567 = _mm_loadl_epi64((const __m128i*) i16);
3230 const __m128i vxi16x01234567 = _mm_cvtepi8_epi16(vi16x01234567);
3231 const __m128i vk16x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 128 * sizeof(int8_t)));
3232 const __m128i vxk16x01234567 = _mm_cvtepi8_epi16(vk16x01234567);
3233 i16 += 8;
3234
3235
3236 vprod01234567 = _mm_mullo_epi16(vxi16x01234567, vxk16x01234567);
3237
3238
3239 const __m128i vi17x01234567 = _mm_loadl_epi64((const __m128i*) i17);
3240 const __m128i vxi17x01234567 = _mm_cvtepi8_epi16(vi17x01234567);
3241 const __m128i vk17x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 136 * sizeof(int8_t)));
3242 const __m128i vxk17x01234567 = _mm_cvtepi8_epi16(vk17x01234567);
3243 i17 += 8;
3244
3245
3246 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi17x01234567, vxk17x01234567));
3247
3248 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3249 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3250
3251 const __m128i vi18x01234567 = _mm_loadl_epi64((const __m128i*) i18);
3252 const __m128i vxi18x01234567 = _mm_cvtepi8_epi16(vi18x01234567);
3253 const __m128i vk18x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 144 * sizeof(int8_t)));
3254 const __m128i vxk18x01234567 = _mm_cvtepi8_epi16(vk18x01234567);
3255 i18 += 8;
3256
3257
3258 vprod01234567 = _mm_mullo_epi16(vxi18x01234567, vxk18x01234567);
3259
3260
3261 const __m128i vi19x01234567 = _mm_loadl_epi64((const __m128i*) i19);
3262 const __m128i vxi19x01234567 = _mm_cvtepi8_epi16(vi19x01234567);
3263 const __m128i vk19x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 152 * sizeof(int8_t)));
3264 const __m128i vxk19x01234567 = _mm_cvtepi8_epi16(vk19x01234567);
3265 i19 += 8;
3266
3267
3268 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi19x01234567, vxk19x01234567));
3269
3270 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3271 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3272
3273 const __m128i vi20x01234567 = _mm_loadl_epi64((const __m128i*) i20);
3274 const __m128i vxi20x01234567 = _mm_cvtepi8_epi16(vi20x01234567);
3275 const __m128i vk20x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 160 * sizeof(int8_t)));
3276 const __m128i vxk20x01234567 = _mm_cvtepi8_epi16(vk20x01234567);
3277 i20 += 8;
3278
3279
3280 vprod01234567 = _mm_mullo_epi16(vxi20x01234567, vxk20x01234567);
3281
3282
3283 const __m128i vi21x01234567 = _mm_loadl_epi64((const __m128i*) i21);
3284 const __m128i vxi21x01234567 = _mm_cvtepi8_epi16(vi21x01234567);
3285 const __m128i vk21x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 168 * sizeof(int8_t)));
3286 const __m128i vxk21x01234567 = _mm_cvtepi8_epi16(vk21x01234567);
3287 i21 += 8;
3288
3289
3290 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi21x01234567, vxk21x01234567));
3291
3292 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3293 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3294
3295 const __m128i vi22x01234567 = _mm_loadl_epi64((const __m128i*) i22);
3296 const __m128i vxi22x01234567 = _mm_cvtepi8_epi16(vi22x01234567);
3297 const __m128i vk22x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 176 * sizeof(int8_t)));
3298 const __m128i vxk22x01234567 = _mm_cvtepi8_epi16(vk22x01234567);
3299 i22 += 8;
3300
3301
3302 vprod01234567 = _mm_mullo_epi16(vxi22x01234567, vxk22x01234567);
3303
3304
3305 const __m128i vi23x01234567 = _mm_loadl_epi64((const __m128i*) i23);
3306 const __m128i vxi23x01234567 = _mm_cvtepi8_epi16(vi23x01234567);
3307 const __m128i vk23x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 184 * sizeof(int8_t)));
3308 const __m128i vxk23x01234567 = _mm_cvtepi8_epi16(vk23x01234567);
3309 i23 += 8;
3310
3311
3312 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi23x01234567, vxk23x01234567));
3313
3314 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3315 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3316
3317 const __m128i vi24x01234567 = _mm_loadl_epi64((const __m128i*) i24);
3318 const __m128i vxi24x01234567 = _mm_cvtepi8_epi16(vi24x01234567);
3319 const __m128i vk24x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 192 * sizeof(int8_t)));
3320 const __m128i vxk24x01234567 = _mm_cvtepi8_epi16(vk24x01234567);
3321 i24 += 8;
3322
3323
3324 vprod01234567 = _mm_mullo_epi16(vxi24x01234567, vxk24x01234567);
3325
3326 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3327 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3328
3329 w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 200 * sizeof(int8_t));
3330
3331 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
3332 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
3333
3334 const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
3335 vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
3336 vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
3337
3338 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
3339 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
3340 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
3341
3342 vacc0123 = _mm_cvtps_epi32(vscaled0123);
3343 vacc4567 = _mm_cvtps_epi32(vscaled4567);
3344
3345 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
3346 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
3347
3348
3349 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
3350
3351 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
3352 vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
3353
3354 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
3355 output += 8;
3356 }
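    // Channel remainder (1-7 channels): recompute one 8-lane group without
    // advancing the input pointers; XNN_OOB_READS permits loading the full
    // 8 bytes past the valid channels. The result is then stored 4/2/1 bytes
    // at a time.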
3357 if XNN_UNLIKELY(c != 0) {
3358 {
3359 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
3360 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
3361
3362
3363 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
3364 const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
3365 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t)));
3366 const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
3367
3368
3369 __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
3370
3371
3372 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
3373 const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
3374 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t)));
3375 const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
3376
3377
3378 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi1x01234567, vxk1x01234567));
3379
3380 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3381 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3382
3383 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
3384 const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
3385 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t)));
3386 const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
3387
3388
3389 vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
3390
3391
3392 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
3393 const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(vi3x01234567);
3394 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t)));
3395 const __m128i vxk3x01234567 = _mm_cvtepi8_epi16(vk3x01234567);
3396
3397
3398 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi3x01234567, vxk3x01234567));
3399
3400 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3401 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3402
3403 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
3404 const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(vi4x01234567);
3405 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t)));
3406 const __m128i vxk4x01234567 = _mm_cvtepi8_epi16(vk4x01234567);
3407
3408
3409 vprod01234567 = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
3410
3411
3412 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
3413 const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(vi5x01234567);
3414 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t)));
3415 const __m128i vxk5x01234567 = _mm_cvtepi8_epi16(vk5x01234567);
3416
3417
3418 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi5x01234567, vxk5x01234567));
3419
3420 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3421 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3422
3423 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
3424 const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(vi6x01234567);
3425 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t)));
3426 const __m128i vxk6x01234567 = _mm_cvtepi8_epi16(vk6x01234567);
3427
3428
3429 vprod01234567 = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
3430
3431
3432 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
3433 const __m128i vxi7x01234567 = _mm_cvtepi8_epi16(vi7x01234567);
3434 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t)));
3435 const __m128i vxk7x01234567 = _mm_cvtepi8_epi16(vk7x01234567);
3436
3437
3438 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi7x01234567, vxk7x01234567));
3439
3440 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3441 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3442
3443 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
3444 const __m128i vxi8x01234567 = _mm_cvtepi8_epi16(vi8x01234567);
3445 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t)));
3446 const __m128i vxk8x01234567 = _mm_cvtepi8_epi16(vk8x01234567);
3447
3448
3449 vprod01234567 = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
3450
3451
3452 const __m128i vi9x01234567 = _mm_loadl_epi64((const __m128i*) i9);
3453 const __m128i vxi9x01234567 = _mm_cvtepi8_epi16(vi9x01234567);
3454 const __m128i vk9x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t)));
3455 const __m128i vxk9x01234567 = _mm_cvtepi8_epi16(vk9x01234567);
3456
3457
3458 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi9x01234567, vxk9x01234567));
3459
3460 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3461 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3462
3463 const __m128i vi10x01234567 = _mm_loadl_epi64((const __m128i*) i10);
3464 const __m128i vxi10x01234567 = _mm_cvtepi8_epi16(vi10x01234567);
3465 const __m128i vk10x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 80 * sizeof(int8_t)));
3466 const __m128i vxk10x01234567 = _mm_cvtepi8_epi16(vk10x01234567);
3467
3468
3469 vprod01234567 = _mm_mullo_epi16(vxi10x01234567, vxk10x01234567);
3470
3471
3472 const __m128i vi11x01234567 = _mm_loadl_epi64((const __m128i*) i11);
3473 const __m128i vxi11x01234567 = _mm_cvtepi8_epi16(vi11x01234567);
3474 const __m128i vk11x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 88 * sizeof(int8_t)));
3475 const __m128i vxk11x01234567 = _mm_cvtepi8_epi16(vk11x01234567);
3476
3477
3478 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi11x01234567, vxk11x01234567));
3479
3480 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3481 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3482
3483 const __m128i vi12x01234567 = _mm_loadl_epi64((const __m128i*) i12);
3484 const __m128i vxi12x01234567 = _mm_cvtepi8_epi16(vi12x01234567);
3485 const __m128i vk12x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 96 * sizeof(int8_t)));
3486 const __m128i vxk12x01234567 = _mm_cvtepi8_epi16(vk12x01234567);
3487
3488
3489 vprod01234567 = _mm_mullo_epi16(vxi12x01234567, vxk12x01234567);
3490
3491
3492 const __m128i vi13x01234567 = _mm_loadl_epi64((const __m128i*) i13);
3493 const __m128i vxi13x01234567 = _mm_cvtepi8_epi16(vi13x01234567);
3494 const __m128i vk13x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 104 * sizeof(int8_t)));
3495 const __m128i vxk13x01234567 = _mm_cvtepi8_epi16(vk13x01234567);
3496
3497
3498 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi13x01234567, vxk13x01234567));
3499
3500 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3501 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3502
3503 const __m128i vi14x01234567 = _mm_loadl_epi64((const __m128i*) i14);
3504 const __m128i vxi14x01234567 = _mm_cvtepi8_epi16(vi14x01234567);
3505 const __m128i vk14x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 112 * sizeof(int8_t)));
3506 const __m128i vxk14x01234567 = _mm_cvtepi8_epi16(vk14x01234567);
3507
3508
3509 vprod01234567 = _mm_mullo_epi16(vxi14x01234567, vxk14x01234567);
3510
3511
3512 const __m128i vi15x01234567 = _mm_loadl_epi64((const __m128i*) i15);
3513 const __m128i vxi15x01234567 = _mm_cvtepi8_epi16(vi15x01234567);
3514 const __m128i vk15x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 120 * sizeof(int8_t)));
3515 const __m128i vxk15x01234567 = _mm_cvtepi8_epi16(vk15x01234567);
3516
3517
3518 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi15x01234567, vxk15x01234567));
3519
3520 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3521 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3522
3523 const __m128i vi16x01234567 = _mm_loadl_epi64((const __m128i*) i16);
3524 const __m128i vxi16x01234567 = _mm_cvtepi8_epi16(vi16x01234567);
3525 const __m128i vk16x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 128 * sizeof(int8_t)));
3526 const __m128i vxk16x01234567 = _mm_cvtepi8_epi16(vk16x01234567);
3527
3528
3529 vprod01234567 = _mm_mullo_epi16(vxi16x01234567, vxk16x01234567);
3530
3531
3532 const __m128i vi17x01234567 = _mm_loadl_epi64((const __m128i*) i17);
3533 const __m128i vxi17x01234567 = _mm_cvtepi8_epi16(vi17x01234567);
3534 const __m128i vk17x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 136 * sizeof(int8_t)));
3535 const __m128i vxk17x01234567 = _mm_cvtepi8_epi16(vk17x01234567);
3536
3537
3538 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi17x01234567, vxk17x01234567));
3539
3540 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3541 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3542
3543 const __m128i vi18x01234567 = _mm_loadl_epi64((const __m128i*) i18);
3544 const __m128i vxi18x01234567 = _mm_cvtepi8_epi16(vi18x01234567);
3545 const __m128i vk18x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 144 * sizeof(int8_t)));
3546 const __m128i vxk18x01234567 = _mm_cvtepi8_epi16(vk18x01234567);
3547
3548
3549 vprod01234567 = _mm_mullo_epi16(vxi18x01234567, vxk18x01234567);
3550
3551
3552 const __m128i vi19x01234567 = _mm_loadl_epi64((const __m128i*) i19);
3553 const __m128i vxi19x01234567 = _mm_cvtepi8_epi16(vi19x01234567);
3554 const __m128i vk19x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 152 * sizeof(int8_t)));
3555 const __m128i vxk19x01234567 = _mm_cvtepi8_epi16(vk19x01234567);
3556
3557
3558 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi19x01234567, vxk19x01234567));
3559
3560 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3561 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3562
3563 const __m128i vi20x01234567 = _mm_loadl_epi64((const __m128i*) i20);
3564 const __m128i vxi20x01234567 = _mm_cvtepi8_epi16(vi20x01234567);
3565 const __m128i vk20x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 160 * sizeof(int8_t)));
3566 const __m128i vxk20x01234567 = _mm_cvtepi8_epi16(vk20x01234567);
3567
3568
3569 vprod01234567 = _mm_mullo_epi16(vxi20x01234567, vxk20x01234567);
3570
3571
3572 const __m128i vi21x01234567 = _mm_loadl_epi64((const __m128i*) i21);
3573 const __m128i vxi21x01234567 = _mm_cvtepi8_epi16(vi21x01234567);
3574 const __m128i vk21x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 168 * sizeof(int8_t)));
3575 const __m128i vxk21x01234567 = _mm_cvtepi8_epi16(vk21x01234567);
3576
3577
3578 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi21x01234567, vxk21x01234567));
3579
3580 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3581 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3582
3583 const __m128i vi22x01234567 = _mm_loadl_epi64((const __m128i*) i22);
3584 const __m128i vxi22x01234567 = _mm_cvtepi8_epi16(vi22x01234567);
3585 const __m128i vk22x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 176 * sizeof(int8_t)));
3586 const __m128i vxk22x01234567 = _mm_cvtepi8_epi16(vk22x01234567);
3587
3588
3589 vprod01234567 = _mm_mullo_epi16(vxi22x01234567, vxk22x01234567);
3590
3591
3592 const __m128i vi23x01234567 = _mm_loadl_epi64((const __m128i*) i23);
3593 const __m128i vxi23x01234567 = _mm_cvtepi8_epi16(vi23x01234567);
3594 const __m128i vk23x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 184 * sizeof(int8_t)));
3595 const __m128i vxk23x01234567 = _mm_cvtepi8_epi16(vk23x01234567);
3596
3597
3598 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi23x01234567, vxk23x01234567));
3599
3600 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3601 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3602
3603 const __m128i vi24x01234567 = _mm_loadl_epi64((const __m128i*) i24);
3604 const __m128i vxi24x01234567 = _mm_cvtepi8_epi16(vi24x01234567);
3605 const __m128i vk24x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 192 * sizeof(int8_t)));
3606 const __m128i vxk24x01234567 = _mm_cvtepi8_epi16(vk24x01234567);
3607
3608
3609 vprod01234567 = _mm_mullo_epi16(vxi24x01234567, vxk24x01234567);
3610
3611 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3612 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3613
3614
3615 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
3616 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
3617
3618 const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
3619 vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
3620 vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
3621
3622 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
3623 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
3624 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
3625
3626 vacc0123 = _mm_cvtps_epi32(vscaled0123);
3627 vacc4567 = _mm_cvtps_epi32(vscaled4567);
3628
3629
3630 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
3631 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
3632
3633
3634 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
3635
3636 vout0123456701234567 = _mm_max_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
3637
3638 if (c & 4) {
3639 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
3640 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
3641 output += 4;
3642 }
3643 if (c & 2) {
3644 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
3645 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
3646 output += 2;
3647 }
3648 if (c & 1) {
3649 *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
3650 output += 1;
3651 }
3652 }
3653 }
3654
3655 output = (int8_t*) ((uintptr_t) output + output_increment);
3656 } while (--output_width != 0);
3657 }
3658
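// 9-tap (e.g. 3x3) variant of the QS8 mul16_add16 depthwise microkernel
// above; the structure is identical, with 9 indirection pointers per output
// and a weight stride of 8 * sizeof(int32_t) + 72 * sizeof(int8_t) per
// 8-channel group.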
3659 void xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16_add16(
3660 size_t channels,
3661 size_t output_width,
3662 const int8_t** input,
3663 const void* weights,
3664 int8_t* output,
3665 size_t input_stride,
3666 size_t output_increment,
3667 size_t input_offset,
3668 const int8_t* zero,
3669 const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
3670 {
3671 assert(channels != 0);
3672 assert(output_width != 0);
3673
3674 do {
3675 const int8_t* i0 = input[0];
3676 assert(i0 != NULL);
3677 if XNN_UNPREDICTABLE(i0 != zero) {
3678 i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
3679 }
3680 const int8_t* i1 = input[1];
3681 assert(i1 != NULL);
3682 if XNN_UNPREDICTABLE(i1 != zero) {
3683 i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
3684 }
3685 const int8_t* i2 = input[2];
3686 assert(i2 != NULL);
3687 if XNN_UNPREDICTABLE(i2 != zero) {
3688 i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
3689 }
3690 const int8_t* i3 = input[3];
3691 assert(i3 != NULL);
3692 if XNN_UNPREDICTABLE(i3 != zero) {
3693 i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
3694 }
3695 const int8_t* i4 = input[4];
3696 assert(i4 != NULL);
3697 if XNN_UNPREDICTABLE(i4 != zero) {
3698 i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
3699 }
3700 const int8_t* i5 = input[5];
3701 assert(i5 != NULL);
3702 if XNN_UNPREDICTABLE(i5 != zero) {
3703 i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
3704 }
3705 const int8_t* i6 = input[6];
3706 assert(i6 != NULL);
3707 if XNN_UNPREDICTABLE(i6 != zero) {
3708 i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
3709 }
3710 const int8_t* i7 = input[7];
3711 assert(i7 != NULL);
3712 if XNN_UNPREDICTABLE(i7 != zero) {
3713 i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
3714 }
3715 const int8_t* i8 = input[8];
3716 assert(i8 != NULL);
3717 if XNN_UNPREDICTABLE(i8 != zero) {
3718 i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
3719 }
3720 input = (const int8_t**) ((uintptr_t) input + input_stride);
3721
3722 size_t c = channels;
3723 const void* w = weights;
3724 for (; c >= 8; c -= 8) {
3725 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
3726 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
3727
3728
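      // In scalar terms, the tap loop below computes, per 8-channel group
      // (illustrative sketch; names are placeholders, and the SIMD path sums
      // each pair of taps in int16 before widening to int32):
      //
      //   for (size_t ch = 0; ch < 8; ch++) {
      //     int32_t acc = bias[ch];
      //     for (size_t tap = 0; tap < 9; tap++) {
      //       acc += (int32_t) input_row[tap][ch] * (int32_t) kernel[tap][ch];
      //     }
      //     // acc is then requantized to int8 exactly as in the GEMM kernels
      //   }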
3729 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
3730 const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
3731 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t)));
3732 const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
3733 i0 += 8;
3734
3735
3736 __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
3737
3738
3739 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
3740 const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
3741 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t)));
3742 const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
3743 i1 += 8;
3744
3745
3746 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi1x01234567, vxk1x01234567));
3747
3748 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3749 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3750
3751 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
3752 const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
3753 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t)));
3754 const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
3755 i2 += 8;
3756
3757
3758 vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
3759
3760
3761 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
3762 const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(vi3x01234567);
3763 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t)));
3764 const __m128i vxk3x01234567 = _mm_cvtepi8_epi16(vk3x01234567);
3765 i3 += 8;
3766
3767
3768 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi3x01234567, vxk3x01234567));
3769
3770 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3771 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3772
3773 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
3774 const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(vi4x01234567);
3775 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t)));
3776 const __m128i vxk4x01234567 = _mm_cvtepi8_epi16(vk4x01234567);
3777 i4 += 8;
3778
3779
3780 vprod01234567 = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
3781
3782
3783 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
3784 const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(vi5x01234567);
3785 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t)));
3786 const __m128i vxk5x01234567 = _mm_cvtepi8_epi16(vk5x01234567);
3787 i5 += 8;
3788
3789
3790 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi5x01234567, vxk5x01234567));
3791
3792 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3793 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3794
3795 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
3796 const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(vi6x01234567);
3797 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t)));
3798 const __m128i vxk6x01234567 = _mm_cvtepi8_epi16(vk6x01234567);
3799 i6 += 8;
3800
3801
3802 vprod01234567 = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
3803
3804
3805 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
3806 const __m128i vxi7x01234567 = _mm_cvtepi8_epi16(vi7x01234567);
3807 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t)));
3808 const __m128i vxk7x01234567 = _mm_cvtepi8_epi16(vk7x01234567);
3809 i7 += 8;
3810
3811
3812 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi7x01234567, vxk7x01234567));
3813
3814 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3815 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3816
3817 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
3818 const __m128i vxi8x01234567 = _mm_cvtepi8_epi16(vi8x01234567);
3819 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t)));
3820 const __m128i vxk8x01234567 = _mm_cvtepi8_epi16(vk8x01234567);
3821 i8 += 8;
3822
3823
3824 vprod01234567 = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
3825
3826 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3827 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3828
3829 w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t));
3830
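      // Requantization (fp32 variant): widen the int32 accumulators to float,
      // multiply by the per-layer scale, and clamp against
      // output_max - output_zero_point while still in the float domain, so the
      // value cannot exceed output_max once the zero point is added.
      // _mm_cvtps_epi32 rounds back to int32 (to nearest, ties to even); the
      // zero point is added with a saturating int16 add, the saturating
      // int16 -> int8 pack narrows the result, and output_min is applied last.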
3831 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
3832 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
3833
3834 const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
3835 vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
3836 vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
3837
3838 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
3839 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
3840 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
3841
3842 vacc0123 = _mm_cvtps_epi32(vscaled0123);
3843 vacc4567 = _mm_cvtps_epi32(vscaled4567);
3844
3845 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
3846 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
3847
3848
3849 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
3850
3851 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
3852 vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
3853
3854 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
3855 output += 8;
3856 }
3857 if XNN_UNLIKELY(c != 0) {
3858 {
3859 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
3860 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
3861
3862
3863 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
3864 const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
3865 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t)));
3866 const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
3867
3868
3869 __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
3870
3871
3872 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
3873 const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
3874 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t)));
3875 const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
3876
3877
3878 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi1x01234567, vxk1x01234567));
3879
3880 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3881 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3882
3883 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
3884 const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
3885 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t)));
3886 const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
3887
3888
3889 vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
3890
3891
3892 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
3893 const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(vi3x01234567);
3894 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t)));
3895 const __m128i vxk3x01234567 = _mm_cvtepi8_epi16(vk3x01234567);
3896
3897
3898 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi3x01234567, vxk3x01234567));
3899
3900 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3901 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3902
3903 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
3904 const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(vi4x01234567);
3905 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t)));
3906 const __m128i vxk4x01234567 = _mm_cvtepi8_epi16(vk4x01234567);
3907
3908
3909 vprod01234567 = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
3910
3911
3912 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
3913 const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(vi5x01234567);
3914 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t)));
3915 const __m128i vxk5x01234567 = _mm_cvtepi8_epi16(vk5x01234567);
3916
3917
3918 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi5x01234567, vxk5x01234567));
3919
3920 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3921 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3922
3923 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
3924 const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(vi6x01234567);
3925 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t)));
3926 const __m128i vxk6x01234567 = _mm_cvtepi8_epi16(vk6x01234567);
3927
3928
3929 vprod01234567 = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
3930
3931
3932 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
3933 const __m128i vxi7x01234567 = _mm_cvtepi8_epi16(vi7x01234567);
3934 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t)));
3935 const __m128i vxk7x01234567 = _mm_cvtepi8_epi16(vk7x01234567);
3936
3937
3938 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi7x01234567, vxk7x01234567));
3939
3940 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3941 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3942
3943 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
3944 const __m128i vxi8x01234567 = _mm_cvtepi8_epi16(vi8x01234567);
3945 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t)));
3946 const __m128i vxk8x01234567 = _mm_cvtepi8_epi16(vk8x01234567);
3947
3948
3949 vprod01234567 = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
3950
3951 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3952 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3953
3954
3955 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
3956 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
3957
3958 const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
3959 vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
3960 vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
3961
3962 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
3963 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
3964 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
3965
3966 vacc0123 = _mm_cvtps_epi32(vscaled0123);
3967 vacc4567 = _mm_cvtps_epi32(vscaled4567);
3968
3969
3970 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
3971 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
3972
3973
3974 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
3975
3976 vout0123456701234567 = _mm_max_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
3977
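        // Store the 1-7 leftover channels: the bits of c select 4-, 2-, and
        // 1-byte partial stores, with the packed vector shifted right after
        // each store so the next bytes move into lane 0.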
3978 if (c & 4) {
3979 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
3980 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
3981 output += 4;
3982 }
3983 if (c & 2) {
3984 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
3985 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
3986 output += 2;
3987 }
3988 if (c & 1) {
3989 *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
3990 output += 1;
3991 }
3992 }
3993 }
3994
3995 output = (int8_t*) ((uintptr_t) output + output_increment);
3996 } while (--output_width != 0);
3997 }
3998
3999 void xnn_qs8_f32_vcvt_ukernel__sse41_x16(
4000 size_t n,
4001 const int8_t* x,
4002 float* y,
4003 const union xnn_qs8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4004 {
4005 assert(n != 0);
4006 assert(n % sizeof(int8_t) == 0);
4007 assert(x != NULL);
4008 assert(y != NULL);
4009
4010 const __m128i vminus_zero_point = _mm_load_si128((const __m128i*) params->sse4.minus_zero_point);
4011 const __m128 vscale = _mm_load_ps(params->sse4.scale);
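  // Dequantization computes y = (x - zero_point) * scale: each group of int8
  // inputs is sign-extended to int32, the pre-negated zero point is added,
  // and the sum is converted to float and scaled. Scalar sketch of the same
  // computation (illustration only, with minus_zero_point and scale taken as
  // plain scalars rather than the broadcast params fields):
  //
  //   for (size_t i = 0; i < n; i++) {
  //     y[i] = (float) ((int32_t) x[i] + minus_zero_point) * scale;
  //   }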
4012 for (; n >= 16 * sizeof(int8_t); n -= 16 * sizeof(int8_t)) {
4013 __m128i vx0123 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x)));
4014 __m128i vx4567 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 4)));
4015 __m128i vx89AB = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 8)));
4016 __m128i vxCDEF = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 12)));
4017 x += 16;
4018
4019 vx0123 = _mm_add_epi32(vx0123, vminus_zero_point);
4020 vx4567 = _mm_add_epi32(vx4567, vminus_zero_point);
4021 vx89AB = _mm_add_epi32(vx89AB, vminus_zero_point);
4022 vxCDEF = _mm_add_epi32(vxCDEF, vminus_zero_point);
4023
4024 __m128 vy0123 = _mm_cvtepi32_ps(vx0123);
4025 __m128 vy4567 = _mm_cvtepi32_ps(vx4567);
4026 __m128 vy89AB = _mm_cvtepi32_ps(vx89AB);
4027 __m128 vyCDEF = _mm_cvtepi32_ps(vxCDEF);
4028
4029 vy0123 = _mm_mul_ps(vy0123, vscale);
4030 vy4567 = _mm_mul_ps(vy4567, vscale);
4031 vy89AB = _mm_mul_ps(vy89AB, vscale);
4032 vyCDEF = _mm_mul_ps(vyCDEF, vscale);
4033
4034 _mm_storeu_ps(y, vy0123);
4035 _mm_storeu_ps(y + 4, vy4567);
4036 _mm_storeu_ps(y + 8, vy89AB);
4037 _mm_storeu_ps(y + 12, vyCDEF);
4038 y += 16;
4039 }
4040 for (; n >= 4 * sizeof(int8_t); n -= 4 * sizeof(int8_t)) {
4041 __m128i vx = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x)));
4042 vx = _mm_add_epi32(vx, vminus_zero_point);
4043 x += 4;
4044
4045 __m128 vy = _mm_cvtepi32_ps(vx);
4046 vy = _mm_mul_ps(vy, vscale);
4047
4048 _mm_storeu_ps(y, vy);
4049 y += 4;
4050 }
4051 if XNN_UNLIKELY(n != 0) {
4052 assert(n >= 1 * sizeof(int8_t));
4053 assert(n <= 3 * sizeof(int8_t));
4054
4055 __m128i vx = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x)));
4056 vx = _mm_add_epi32(vx, vminus_zero_point);
4057
4058 __m128 vy = _mm_cvtepi32_ps(vx);
4059 vy = _mm_mul_ps(vy, vscale);
4060
4061 if (n & (2 * sizeof(int8_t))) {
4062 _mm_storel_pi((__m64*) y, vy);
4063 vy = _mm_movehl_ps(vy, vy);
4064 y += 2;
4065 }
4066 if (n & (1 * sizeof(int8_t))) {
4067 _mm_store_ss(y, vy);
4068 }
4069 }
4070 }
4071
4072 void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8(
4073 size_t rows,
4074 size_t channels,
4075 const int8_t* input,
4076 size_t input_stride,
4077 const int8_t* zero,
4078 int32_t* buffer,
4079 int8_t* output,
4080 const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4081 {
4082 assert(rows > 7);
4083 assert(channels != 0);
4084
4085 const int8_t* i0 = input;
4086 const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
4087 const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
4088 const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
4089 const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
4090 const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
4091 const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
4092 const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
4093
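  // First pass: sum 7 rows per channel into the int32 buffer, seeded with
  // init_bias. Seven int8 values sum to at most 7*127 = 889 (7*128 = 896 at
  // the negative end), so the row sums are formed safely in int16 before
  // being widened to int32. input_increment advances each row pointer by 7
  // rows net, since the inner loop itself consumes round_up_po2(channels, 8)
  // bytes per pass.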
4094 const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias);
4095 int32_t* b = buffer;
4096 size_t c = channels;
4097 for (; c != 0; c = doz(c, 8)) {
4098 const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0));
4099 i0 += 8;
4100 const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1));
4101 i1 += 8;
4102
4103 __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
4104 const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
4105 i2 += 8;
4106
4107 vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
4108 const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3));
4109 i3 += 8;
4110 vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
4111 const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4));
4112 i4 += 8;
4113 vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
4114 const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5));
4115 i5 += 8;
4116 vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
4117 const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6));
4118 i6 += 8;
4119
4120 vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
4121
4122 __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
4123 __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
4124
4125 vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
4126 vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
4127
4128 _mm_store_si128((__m128i*) b, vacc0123);
4129 _mm_store_si128((__m128i*) (b + 4), vacc4567);
4130 b += 8;
4131 }
4132
4133 for (rows -= 7; rows > 7; rows -= 7) {
4134 i0 = (const int8_t*) ((uintptr_t) i0 + input_increment);
4135 i1 = (const int8_t*) ((uintptr_t) i1 + input_increment);
4136 i2 = (const int8_t*) ((uintptr_t) i2 + input_increment);
4137 i3 = (const int8_t*) ((uintptr_t) i3 + input_increment);
4138 i4 = (const int8_t*) ((uintptr_t) i4 + input_increment);
4139 i5 = (const int8_t*) ((uintptr_t) i5 + input_increment);
4140 i6 = (const int8_t*) ((uintptr_t) i6 + input_increment);
4141
4142 int32_t* b = buffer;
4143 size_t c = channels;
4144 for (; c != 0; c = doz(c, 8)) {
4145 const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0));
4146 i0 += 8;
4147 const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1));
4148 i1 += 8;
4149
4150 __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
4151 const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
4152 i2 += 8;
4153
4154 vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
4155 const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3));
4156 i3 += 8;
4157 vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
4158 const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4));
4159 i4 += 8;
4160 vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
4161 const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5));
4162 i5 += 8;
4163 vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
4164 const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6));
4165 i6 += 8;
4166
4167 vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
4168
4169 __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
4170 __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
4171
4172 vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b));
4173 vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4)));
4174
4175 _mm_store_si128((__m128i*) b, vacc0123);
4176 _mm_store_si128((__m128i*) (b + 4), vacc4567);
4177 b += 8;
4178 }
4179 }
4180
4181 i0 = (const int8_t*) ((uintptr_t) i0 + input_increment);
4182 i1 = (const int8_t*) ((uintptr_t) i1 + input_increment);
4183 if XNN_UNPREDICTABLE(rows < 2) {
4184 i1 = zero;
4185 }
4186 i2 = (const int8_t*) ((uintptr_t) i2 + input_increment);
4187 if XNN_UNPREDICTABLE(rows <= 2) {
4188 i2 = zero;
4189 }
4190 i3 = (const int8_t*) ((uintptr_t) i3 + input_increment);
4191 if XNN_UNPREDICTABLE(rows < 4) {
4192 i3 = zero;
4193 }
4194 i4 = (const int8_t*) ((uintptr_t) i4 + input_increment);
4195 if XNN_UNPREDICTABLE(rows <= 4) {
4196 i4 = zero;
4197 }
4198 i5 = (const int8_t*) ((uintptr_t) i5 + input_increment);
4199 if XNN_UNPREDICTABLE(rows < 6) {
4200 i5 = zero;
4201 }
4202 i6 = (const int8_t*) ((uintptr_t) i6 + input_increment);
4203 if XNN_UNPREDICTABLE(rows <= 6) {
4204 i6 = zero;
4205 }
4206
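  // Final pass: at most 7 rows remain. Each pointer i1..i6 was redirected to
  // the zero buffer above unless enough rows remain for it, so rows past the
  // end contribute nothing; the buffered partial sums are then completed and
  // requantized with the same fp32 scheme used by the other qs8 kernels.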
4207 const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
4208 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
4209 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
4210 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
4211 for (; channels >= 8; channels -= 8) {
4212 const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0));
4213 i0 += 8;
4214 const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1));
4215 i1 += 8;
4216
4217 __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
4218 const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
4219 i2 += 8;
4220
4221 vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
4222 const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3));
4223 i3 += 8;
4224 vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
4225 const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4));
4226 i4 += 8;
4227 vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
4228 const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5));
4229 i5 += 8;
4230 vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
4231 const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6));
4232 i6 += 8;
4233
4234 vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
4235
4236 __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
4237 __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
4238
4239 vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
4240 vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
4241 buffer += 8;
4242
4243 __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
4244 __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
4245
4246 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
4247 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
4248
4249 vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
4250 vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
4251
4252 vacc0123 = _mm_cvtps_epi32(vfpacc0123);
4253 vacc4567 = _mm_cvtps_epi32(vfpacc4567);
4254
4255 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
4256
4257 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
4258
4259 vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
4260
4261 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
4262 output += 8;
4263 }
4264 if XNN_UNLIKELY(channels != 0) {
4265 {
4266 const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0));
4267 i0 += 8;
4268 const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1));
4269 i1 += 8;
4270
4271 __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
4272 const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
4273 i2 += 8;
4274
4275 vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
4276 const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3));
4277 i3 += 8;
4278 vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
4279 const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4));
4280 i4 += 8;
4281 vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
4282 const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5));
4283 i5 += 8;
4284 vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
4285 const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6));
4286 i6 += 8;
4287
4288 vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
4289
4290 __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
4291 __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
4292
4293 vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
4294 vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
4295 buffer += 8;
4296
4297 __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
4298 __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
4299
4300 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
4301 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
4302
4303 vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
4304 vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
4305
4306 vacc0123 = _mm_cvtps_epi32(vfpacc0123);
4307 vacc4567 = _mm_cvtps_epi32(vfpacc4567);
4308
4309 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
4310
4311 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
4312 vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
4313
4314 if (channels & 4) {
4315 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
4316 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
4317 output += 4;
4318 }
4319 if (channels & 2) {
4320 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
4321 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
4322 output += 2;
4323 }
4324 if (channels & 1) {
4325 *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
4326 }
4327 }
4328 }
4329 }
4330
4331 void xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8(
4332 size_t rows,
4333 size_t channels,
4334 const int8_t* input,
4335 size_t input_stride,
4336 const int8_t* zero,
4337 int8_t* output,
4338 const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4339 {
4340 assert(rows != 0);
4341 assert(rows <= 7);
4342 assert(channels != 0);
4343
4344 const int8_t* i0 = input;
4345 const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
4346 if XNN_UNPREDICTABLE(rows < 2) {
4347 i1 = zero;
4348 }
4349 const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
4350 if XNN_UNPREDICTABLE(rows <= 2) {
4351 i2 = zero;
4352 }
4353 const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
4354 if XNN_UNPREDICTABLE(rows < 4) {
4355 i3 = zero;
4356 }
4357 const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
4358 if XNN_UNPREDICTABLE(rows <= 4) {
4359 i4 = zero;
4360 }
4361 const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
4362 if XNN_UNPREDICTABLE(rows < 6) {
4363 i5 = zero;
4364 }
4365 const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
4366 if XNN_UNPREDICTABLE(rows <= 6) {
4367 i6 = zero;
4368 }
4369
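  // Single-pass variant for rows <= 7: the same row-pointer zeroing, 16-bit
  // row summation, and fp32 requantization as the multipass kernel above,
  // but the accumulators go straight from init_bias to the output with no
  // intermediate int32 buffer.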
4370 const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias);
4371 const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
4372 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
4373 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
4374 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
4375 for (; channels >= 8; channels -= 8) {
4376 const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0));
4377 i0 += 8;
4378 const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1));
4379 i1 += 8;
4380
4381 __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
4382 const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
4383 i2 += 8;
4384
4385 vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
4386 const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3));
4387 i3 += 8;
4388 vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
4389 const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4));
4390 i4 += 8;
4391 vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
4392 const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5));
4393 i5 += 8;
4394 vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
4395 const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6));
4396 i6 += 8;
4397
4398 vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
4399
4400 __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
4401 __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
4402
4403 vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
4404 vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
4405
4406 __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
4407 __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
4408
4409 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
4410 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
4411
4412 vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
4413 vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
4414
4415 vacc0123 = _mm_cvtps_epi32(vfpacc0123);
4416 vacc4567 = _mm_cvtps_epi32(vfpacc4567);
4417
4418 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
4419
4420 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
4421
4422 vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
4423
4424 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
4425 output += 8;
4426 }
4427 if XNN_UNLIKELY(channels != 0) {
4428 {
4429 const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0));
4430 i0 += 8;
4431 const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1));
4432 i1 += 8;
4433
4434 __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
4435 const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
4436 i2 += 8;
4437
4438 vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
4439 const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3));
4440 i3 += 8;
4441 vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
4442 const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4));
4443 i4 += 8;
4444 vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
4445 const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5));
4446 i5 += 8;
4447 vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
4448 const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6));
4449 i6 += 8;
4450
4451 vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
4452
4453 __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
4454 __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
4455
4456 vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
4457 vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
4458
4459 __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
4460 __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
4461
4462 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
4463 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
4464
4465 vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
4466 vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
4467
4468 vacc0123 = _mm_cvtps_epi32(vfpacc0123);
4469 vacc4567 = _mm_cvtps_epi32(vfpacc4567);
4470
4471 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
4472
4473 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
4474 vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
4475
4476 if (channels & 4) {
4477 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
4478 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
4479 output += 4;
4480 }
4481 if (channels & 2) {
4482 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
4483 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
4484 output += 2;
4485 }
4486 if (channels & 1) {
4487 *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
4488 }
4489 }
4490 }
4491 }
4492
4493 void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64(
4494 size_t mr,
4495 size_t nc,
4496 size_t kc,
4497 const int8_t* restrict a,
4498 size_t a_stride,
4499 const void* restrict w,
4500 int8_t* restrict c,
4501 size_t cm_stride,
4502 size_t cn_stride,
4503 const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4504 {
4505 assert(mr != 0);
4506 assert(mr <= 1);
4507 assert(nc != 0);
4508 assert(kc != 0);
4509 assert(kc % sizeof(int8_t) == 0);
4510 assert(a != NULL);
4511 assert(w != NULL);
4512 assert(c != NULL);
4513
4514 kc = round_up_po2(kc, 8);
4515 const int8_t* a0 = a;
4516 int8_t* c0 = c;
4517
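  // 1x4c8 GEMM: kc is padded to a multiple of 8 to match the packed weights,
  // which hold 4 int32 biases followed, per 8-element K block, by 4 columns
  // of 8 int8 values. Each _mm_madd_epi16 multiplies eight sign-extended
  // int8 pairs and adds adjacent 32-bit products, so every vacc0xN ends the
  // K loop holding four partial sums for output column N.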
4518 do {
4519 __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
4520 __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
4521 __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
4522 __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
4523 w = (const int32_t*) w + 4;
4524
4525 size_t k = 0;
4526 while (k < kc) {
4527 const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
4528 const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
4529 a0 += 8;
4530
4531 const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
4532 const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
4533
4534 vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
4535 const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 8));
4536 const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
4537
4538 vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
4539 const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 16));
4540 const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
4541
4542 vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
4543 const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 24));
4544 const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
4545
4546 vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
4547
4548 w = (const void*) ((const int8_t*) w + 32);
4549 k += 8 * sizeof(int8_t);
4550 }
4551
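    // Reduce the four partial sums per column: the first pair of
    // _mm_hadd_epi32 calls combines columns 0/1 and 2/3, and the final hadd
    // leaves the totals for columns 0..3 in a single vector, in order.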
4552 const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
4553 const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
4554
4555 __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
4556
4557 __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
4558
4559 const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
4560 vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
4561
4562 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
4563 vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
4564
4565 vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
4566
4567 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
4568 __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
4569
4570
4571 __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
4572
4573 vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
4574
4575 if (nc >= 4) {
4576 unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
4577
4578 c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
4579
4580 a0 = (const int8_t*) ((uintptr_t) a0 - kc);
4581
4582 nc -= 4;
4583 } else {
4584 if (nc & 2) {
4585 unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
4586 c0 += 2;
4587 vout = _mm_srli_epi32(vout, 16);
4588 }
4589 if (nc & 1) {
4590 *c0 = (int8_t) _mm_extract_epi8(vout, 0);
4591 }
4592
4593 nc = 0;
4594 }
4595 } while (nc != 0);
4596 }
4597
4598 void xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64(
4599 size_t mr,
4600 size_t nc,
4601 size_t kc,
4602 const int8_t* restrict a,
4603 size_t a_stride,
4604 const void* restrict w,
4605 int8_t* restrict c,
4606 size_t cm_stride,
4607 size_t cn_stride,
4608 const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4609 {
4610 assert(mr != 0);
4611 assert(mr <= 3);
4612 assert(nc != 0);
4613 assert(kc != 0);
4614 assert(kc % sizeof(int8_t) == 0);
4615 assert(a != NULL);
4616 assert(w != NULL);
4617 assert(c != NULL);
4618
4619 kc = round_up_po2(kc, 8);
4620 const int8_t* a0 = a;
4621 int8_t* c0 = c;
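  // Rows beyond mr alias the previous row: when mr < 3 the extra a/c
  // pointers are remapped onto already-valid rows, so their loads stay in
  // bounds and their stores merely rewrite the same values, which keeps the
  // inner loops free of row-count branches.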
4622 const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
4623 int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
4624 if XNN_UNPREDICTABLE(mr < 2) {
4625 a1 = a0;
4626 c1 = c0;
4627 }
4628 const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
4629 int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
4630 if XNN_UNPREDICTABLE(mr <= 2) {
4631 a2 = a1;
4632 c2 = c1;
4633 }
4634
4635 do {
4636 __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
4637 __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
4638 __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
4639 __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
4640 __m128i vacc1x0 = vacc0x0;
4641 __m128i vacc1x1 = vacc0x1;
4642 __m128i vacc1x2 = vacc0x2;
4643 __m128i vacc1x3 = vacc0x3;
4644 __m128i vacc2x0 = vacc0x0;
4645 __m128i vacc2x1 = vacc0x1;
4646 __m128i vacc2x2 = vacc0x2;
4647 __m128i vacc2x3 = vacc0x3;
4648 w = (const int32_t*) w + 4;
4649
4650 size_t k = 0;
4651 while (k < kc) {
4652 const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
4653 const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
4654 a0 += 8;
4655 const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
4656 const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
4657 a1 += 8;
4658 const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
4659 const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
4660 a2 += 8;
4661
4662 const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
4663 const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
4664
4665 vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
4666 vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
4667 vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
4668 const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 8));
4669 const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
4670
4671 vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
4672 vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
4673 vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
4674 const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 16));
4675 const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
4676
4677 vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
4678 vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
4679 vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
4680 const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 24));
4681 const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
4682
4683 vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
4684 vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
4685 vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
4686
4687 w = (const void*) ((const int8_t*) w + 32);
4688 k += 8 * sizeof(int8_t);
4689 }
4690
4691 const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
4692 const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
4693 const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
4694 const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
4695 const __m128i vacc2x01 = _mm_hadd_epi32(vacc2x0, vacc2x1);
4696 const __m128i vacc2x23 = _mm_hadd_epi32(vacc2x2, vacc2x3);
4697
4698 __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
4699 __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
4700 __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
4701
4702 __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
4703 __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
4704 __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
4705
4706 const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
4707 vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
4708 vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
4709 vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale);
4710
4711 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
4712 vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
4713 vscaled1x0123 = _mm_min_ps(vscaled1x0123, voutput_max_less_zero_point);
4714 vscaled2x0123 = _mm_min_ps(vscaled2x0123, voutput_max_less_zero_point);
4715
4716 vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
4717 vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
4718 vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
4719
4720 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
4721 __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
4722 __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
4723
4724
4725 __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
4726
4727 vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
4728
4729 if (nc >= 4) {
4730 unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
4731 unaligned_store_u32(c1, (uint32_t) _mm_extract_epi32(vout, 1));
4732 unaligned_store_u32(c2, (uint32_t) _mm_extract_epi32(vout, 2));
4733
4734 c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
4735 c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
4736 c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
4737
4738 a0 = (const int8_t*) ((uintptr_t) a0 - kc);
4739 a1 = (const int8_t*) ((uintptr_t) a1 - kc);
4740 a2 = (const int8_t*) ((uintptr_t) a2 - kc);
4741
4742 nc -= 4;
4743 } else {
4744 if (nc & 2) {
4745 unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
4746 c0 += 2;
4747 unaligned_store_u16(c1, (uint16_t) _mm_extract_epi16(vout, 2));
4748 c1 += 2;
4749 unaligned_store_u16(c2, (uint16_t) _mm_extract_epi16(vout, 4));
4750 c2 += 2;
4751 vout = _mm_srli_epi32(vout, 16);
4752 }
4753 if (nc & 1) {
4754 *c0 = (int8_t) _mm_extract_epi8(vout, 0);
4755 *c1 = (int8_t) _mm_extract_epi8(vout, 4);
4756 *c2 = (int8_t) _mm_extract_epi8(vout, 8);
4757 }
4758
4759 nc = 0;
4760 }
4761 } while (nc != 0);
4762 }
4763
4764 void xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64(
4765 size_t mr,
4766 size_t nc,
4767 size_t kc,
4768 size_t ks,
4769 const int8_t** restrict a,
4770 const void* restrict w,
4771 int8_t* restrict c,
4772 size_t cm_stride,
4773 size_t cn_stride,
4774 size_t a_offset,
4775 const int8_t* zero,
4776 const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4777 {
4778 assert(mr != 0);
4779 assert(mr <= 1);
4780 assert(nc != 0);
4781 assert(kc != 0);
4782 assert(ks != 0);
4783 assert(ks % (1 * sizeof(void*)) == 0);
4784 assert(a_offset % sizeof(int8_t) == 0);
4785 assert(a != NULL);
4786 assert(w != NULL);
4787 assert(c != NULL);
4788
4789 kc = round_up_po2(kc, 8);
4790 int8_t* c0 = c;
4791
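  // Indirect GEMM: a[] supplies ks input-row pointers per step. A pointer
  // equal to the shared zero buffer is used as-is (it reads zero padding,
  // which adds nothing to the accumulators); any other pointer is rebased by
  // a_offset to address the current batch.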
4792 do {
4793 __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
4794 __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
4795 __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
4796 __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
4797 w = (const int32_t*) w + 4;
4798
4799 size_t p = ks;
4800 do {
4801 const int8_t* restrict a0 = a[0];
4802 if XNN_UNPREDICTABLE(a0 != zero) {
4803 a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
4804 }
4805 a += 1;
4806
4807 size_t k = 0;
4808 while (k < kc) {
4809 const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
4810 const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
4811 a0 += 8;
4812
4813 const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
4814 const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
4815
4816 vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
4817 const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 8));
4818 const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
4819
4820 vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
4821 const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 16));
4822 const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
4823
4824 vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
4825 const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 24));
4826 const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
4827
4828 vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
4829
4830 w = (const void*) ((const int8_t*) w + 32);
4831 k += 8 * sizeof(int8_t);
4832 }
4833 p -= 1 * sizeof(void*);
4834 } while (p != 0);
4835
4836 const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
4837 const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
4838
4839 __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
4840
4841 __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
4842
4843 const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
4844 vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
4845
4846 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
4847 vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
4848
4849 vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
4850
4851 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
4852 __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
4853
4854
4855 __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
4856
4857 vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
4858
4859 if (nc >= 4) {
4860 unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
4861 c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
4862
4863 a = (const int8_t**restrict) ((uintptr_t) a - ks);
4864
4865 nc -= 4;
4866 } else {
4867 if (nc & 2) {
4868 unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
4869 c0 += 2;
4870 vout = _mm_srli_epi32(vout, 16);
4871 }
4872 if (nc & 1) {
4873 *c0 = (int8_t) _mm_extract_epi8(vout, 0);
4874 }
4875
4876 nc = 0;
4877 }
4878 } while (nc != 0);
4879 }
4880
4881 void xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64(
4882 size_t mr,
4883 size_t nc,
4884 size_t kc,
4885 size_t ks,
4886 const int8_t** restrict a,
4887 const void* restrict w,
4888 int8_t* restrict c,
4889 size_t cm_stride,
4890 size_t cn_stride,
4891 size_t a_offset,
4892 const int8_t* zero,
4893 const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4894 {
4895 assert(mr != 0);
4896 assert(mr <= 3);
4897 assert(nc != 0);
4898 assert(kc != 0);
4899 assert(ks != 0);
4900 assert(ks % (3 * sizeof(void*)) == 0);
4901 assert(a_offset % sizeof(int8_t) == 0);
4902 assert(a != NULL);
4903 assert(w != NULL);
4904 assert(c != NULL);
4905
4906 kc = round_up_po2(kc, 8);
4907 int8_t* c0 = c;
4908 int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
4909 if XNN_UNPREDICTABLE(mr < 2) {
4910 c1 = c0;
4911 }
4912 int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
4913 if XNN_UNPREDICTABLE(mr <= 2) {
4914 c2 = c1;
4915 }
4916
4917 do {
4918 __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
4919 __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
4920 __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
4921 __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
4922 __m128i vacc1x0 = vacc0x0;
4923 __m128i vacc1x1 = vacc0x1;
4924 __m128i vacc1x2 = vacc0x2;
4925 __m128i vacc1x3 = vacc0x3;
4926 __m128i vacc2x0 = vacc0x0;
4927 __m128i vacc2x1 = vacc0x1;
4928 __m128i vacc2x2 = vacc0x2;
4929 __m128i vacc2x3 = vacc0x3;
4930 w = (const int32_t*) w + 4;
4931
4932 size_t p = ks;
4933 do {
4934 const int8_t* restrict a0 = a[0];
4935 if XNN_UNPREDICTABLE(a0 != zero) {
4936 a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
4937 }
4938 const int8_t* restrict a1 = a[1];
4939 if XNN_UNPREDICTABLE(a1 != zero) {
4940 a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
4941 }
4942 const int8_t* restrict a2 = a[2];
4943 if XNN_UNPREDICTABLE(a2 != zero) {
4944 a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
4945 }
4946 a += 3;
4947
4948 size_t k = 0;
4949 while (k < kc) {
4950 const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
4951 const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
4952 a0 += 8;
4953 const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
4954 const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
4955 a1 += 8;
4956 const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
4957 const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
4958 a2 += 8;
4959
4960 const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
4961 const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
4962
4963 vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
4964 vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
4965 vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
4966 const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 8));
4967 const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
4968
4969 vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
4970 vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
4971 vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
4972 const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 16));
4973 const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
4974
4975 vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
4976 vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
4977 vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
4978 const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 24));
4979 const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
4980
4981 vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
4982 vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
4983 vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
4984
4985 w = (const void*) ((const int8_t*) w + 32);
4986 k += 8 * sizeof(int8_t);
4987 }
4988 p -= 3 * sizeof(void*);
4989 } while (p != 0);
4990
4991 const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
4992 const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
4993 const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
4994 const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
4995 const __m128i vacc2x01 = _mm_hadd_epi32(vacc2x0, vacc2x1);
4996 const __m128i vacc2x23 = _mm_hadd_epi32(vacc2x2, vacc2x3);
4997
4998 __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
4999 __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
5000 __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
5001
5002 __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
5003 __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
5004 __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
5005
5006 const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
5007 vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
5008 vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
5009 vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale);
5010
5011 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
5012 vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
5013 vscaled1x0123 = _mm_min_ps(vscaled1x0123, voutput_max_less_zero_point);
5014 vscaled2x0123 = _mm_min_ps(vscaled2x0123, voutput_max_less_zero_point);
5015
5016 vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
5017 vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
5018 vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
5019
5020 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
5021 __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
5022 __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
5023
5024
5025 __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
5026
5027 vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
5028
5029 if (nc >= 4) {
5030 unaligned_store_u32(c2, (uint32_t) _mm_extract_epi32(vout, 2));
5031 c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
5032 unaligned_store_u32(c1, (uint32_t) _mm_extract_epi32(vout, 1));
5033 c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
5034 unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
5035 c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
5036
5037 a = (const int8_t**restrict) ((uintptr_t) a - ks);
5038
5039 nc -= 4;
5040 } else {
5041 if (nc & 2) {
5042 unaligned_store_u16(c2, (uint16_t) _mm_extract_epi16(vout, 4));
5043 c2 += 2;
5044 unaligned_store_u16(c1, (uint16_t) _mm_extract_epi16(vout, 2));
5045 c1 += 2;
5046 unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
5047 c0 += 2;
5048 vout = _mm_srli_epi32(vout, 16);
5049 }
5050 if (nc & 1) {
5051 *c2 = (int8_t) _mm_extract_epi8(vout, 8);
5052 *c1 = (int8_t) _mm_extract_epi8(vout, 4);
5053 *c0 = (int8_t) _mm_extract_epi8(vout, 0);
5054 }
5055
5056 nc = 0;
5057 }
5058 } while (nc != 0);
5059 }
5060
5061 void xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8(
5062 size_t n,
5063 const int8_t* input_a,
5064 const int8_t* input_b,
5065 int8_t* output,
5066 const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5067 {
5068 const __m128i vbias = _mm_load_si128((const __m128i*) params->sse4_mul16.bias);
5069 const __m128i va_multiplier_lo = _mm_load_si128((const __m128i*) params->sse4_mul16.a_multiplier_lo);
5070 const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse4_mul16.a_multiplier_hi);
5071 const __m128i vb_multiplier_lo = _mm_load_si128((const __m128i*) params->sse4_mul16.b_multiplier_lo);
5072 const __m128i vb_multiplier_hi = _mm_load_si128((const __m128i*) params->sse4_mul16.b_multiplier_hi);
5073 const __m128i vshift = _mm_cvtsi32_si128((int) params->sse4_mul16.shift);
5074 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul16.output_zero_point);
5075 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul16.output_min);
5076 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4_mul16.output_max);
5077
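  // Fixed-point weighted add: each operand is sign-extended to int16 and
  // multiplied by a 32-bit multiplier split into 16-bit halves. The high
  // half of each 32-bit product is assembled as
  // mulhi_epu16(v, m_lo) + mullo_epi16(v, m_hi), minus (v >> 15) & m_lo to
  // correct the unsigned mulhi for negative lanes; interleaving the low and
  // high halves then yields the int32 products. These are summed with the
  // precomputed bias, shifted right arithmetically, and requantized.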
5078 for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
5079 const __m128i va01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
5080 const __m128i vb01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_b));
5081 input_a += 8;
5082 input_b += 8;
5083
5084
5085 __m128i vaprod01234567hi = _mm_mulhi_epu16(va01234567, va_multiplier_lo);
5086 __m128i vbprod01234567hi = _mm_mulhi_epu16(vb01234567, vb_multiplier_lo);
5087 const __m128i vaprod01234567lo = _mm_mullo_epi16(va01234567, va_multiplier_lo);
5088 const __m128i vbprod01234567lo = _mm_mullo_epi16(vb01234567, vb_multiplier_lo);
5089
5090 vaprod01234567hi = _mm_add_epi16(vaprod01234567hi, _mm_mullo_epi16(va01234567, va_multiplier_hi));
5091 vbprod01234567hi = _mm_add_epi16(vbprod01234567hi, _mm_mullo_epi16(vb01234567, vb_multiplier_hi));
5092
5093 vaprod01234567hi = _mm_sub_epi16(vaprod01234567hi, _mm_and_si128(_mm_srai_epi16(va01234567, 15), va_multiplier_lo));
5094 vbprod01234567hi = _mm_sub_epi16(vbprod01234567hi, _mm_and_si128(_mm_srai_epi16(vb01234567, 15), vb_multiplier_lo));
5095
5096 __m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
5097 __m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
5098
5099 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
5100 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
5101
5102 vacc0123 = _mm_sra_epi32(vacc0123, vshift);
5103 vacc4567 = _mm_sra_epi32(vacc4567, vshift);
5104
5105 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
5106
5107
5108 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
5109
5110 vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
5111
5112 vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
5113
5114 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
5115 output += 8;
5116 }
5117 if XNN_UNLIKELY(n != 0) {
5118 {
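      // Remainder of 1-7 elements: a full 8-lane vector is still loaded (the kernel
      // is declared XNN_OOB_READS), but only the valid low bytes are stored, peeled
      // off 4/2/1 at a time using the bits of n.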
5119 const __m128i va01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
5120 const __m128i vb01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_b));
5121
5122
5123 __m128i vaprod01234567hi = _mm_mulhi_epu16(va01234567, va_multiplier_lo);
5124 __m128i vbprod01234567hi = _mm_mulhi_epu16(vb01234567, vb_multiplier_lo);
5125 const __m128i vaprod01234567lo = _mm_mullo_epi16(va01234567, va_multiplier_lo);
5126 const __m128i vbprod01234567lo = _mm_mullo_epi16(vb01234567, vb_multiplier_lo);
5127
5128 vaprod01234567hi = _mm_add_epi16(vaprod01234567hi, _mm_mullo_epi16(va01234567, va_multiplier_hi));
5129 vbprod01234567hi = _mm_add_epi16(vbprod01234567hi, _mm_mullo_epi16(vb01234567, vb_multiplier_hi));
5130
5131 vaprod01234567hi = _mm_sub_epi16(vaprod01234567hi, _mm_and_si128(_mm_srai_epi16(va01234567, 15), va_multiplier_lo));
5132 vbprod01234567hi = _mm_sub_epi16(vbprod01234567hi, _mm_and_si128(_mm_srai_epi16(vb01234567, 15), vb_multiplier_lo));
5133
5134 __m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
5135 __m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
5136
5137 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
5138 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
5139
5140 vacc0123 = _mm_sra_epi32(vacc0123, vshift);
5141 vacc4567 = _mm_sra_epi32(vacc4567, vshift);
5142
5143 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
5144
5145 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
5146 vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
5147 vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
5148
5149 if (n & (4 * sizeof(int8_t))) {
5150 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
5151 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
5152 output += 4;
5153 }
5154 if (n & (2 * sizeof(int8_t))) {
5155 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
5156 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
5157 output += 2;
5158 }
5159 if (n & (1 * sizeof(int8_t))) {
5160 *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
5161 }
5162 }
5163 }
5164 }
5165
5166 void xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8(
5167 size_t n,
5168 const int8_t* input_a,
5169 const int8_t* input_b,
5170 int8_t* output,
5171 const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5172 {
5173 const __m128i vbias = _mm_add_epi32(
5174 _mm_shuffle_epi32(_mm_cvtsi32_si128(params->sse4_mul16.b_multiplier * (int32_t) *input_b), _MM_SHUFFLE(0, 0, 0, 0)),
5175 _mm_load_si128((const __m128i*) params->sse4_mul16.bias));
5176 const __m128i va_multiplier_lo = _mm_load_si128((const __m128i*) params->sse4_mul16.a_multiplier_lo);
5177 const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse4_mul16.a_multiplier_hi);
5178 const __m128i vshift = _mm_cvtsi32_si128((int) params->sse4_mul16.shift);
5179 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul16.output_zero_point);
5180 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul16.output_min);
5181 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4_mul16.output_max);
5182
5183 for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
5184 const __m128i va01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
5185 input_a += 8;
5186
5187
5188 __m128i vaprod01234567hi = _mm_mulhi_epu16(va01234567, va_multiplier_lo);
5189 const __m128i vaprod01234567lo = _mm_mullo_epi16(va01234567, va_multiplier_lo);
5190
5191 vaprod01234567hi = _mm_add_epi16(vaprod01234567hi, _mm_mullo_epi16(va01234567, va_multiplier_hi));
5192
5193 vaprod01234567hi = _mm_sub_epi16(vaprod01234567hi, _mm_and_si128(_mm_srai_epi16(va01234567, 15), va_multiplier_lo));
5194
5195 __m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
5196 __m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
5197
5198 vacc0123 = _mm_sra_epi32(vacc0123, vshift);
5199 vacc4567 = _mm_sra_epi32(vacc4567, vshift);
5200
5201 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
5202
5203
5204 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
5205
5206 vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
5207
5208 vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
5209
5210 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
5211 output += 8;
5212 }
5213 if XNN_UNLIKELY(n != 0) {
5214 {
5215 const __m128i va01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
5216
5217
5218 __m128i vaprod01234567hi = _mm_mulhi_epu16(va01234567, va_multiplier_lo);
5219 const __m128i vaprod01234567lo = _mm_mullo_epi16(va01234567, va_multiplier_lo);
5220
5221 vaprod01234567hi = _mm_add_epi16(vaprod01234567hi, _mm_mullo_epi16(va01234567, va_multiplier_hi));
5222
5223 vaprod01234567hi = _mm_sub_epi16(vaprod01234567hi, _mm_and_si128(_mm_srai_epi16(va01234567, 15), va_multiplier_lo));
5224
5225 __m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
5226 __m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
5227
5228 vacc0123 = _mm_sra_epi32(vacc0123, vshift);
5229 vacc4567 = _mm_sra_epi32(vacc4567, vshift);
5230
5231 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
5232
5233 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
5234 vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
5235 vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
5236
5237 if (n & (4 * sizeof(int8_t))) {
5238 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
5239 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
5240 output += 4;
5241 }
5242 if (n & (2 * sizeof(int8_t))) {
5243 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
5244 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
5245 output += 2;
5246 }
5247 if (n & (1 * sizeof(int8_t))) {
5248 *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
5249 }
5250 }
5251 }
5252 }
5253
5254 void xnn_qs8_vcvt_ukernel__sse41_x32(
5255 size_t n,
5256 const int8_t* x,
5257 int8_t* y,
5258 const union xnn_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5259 {
5260 assert(n != 0);
5261 assert(n % sizeof(int8_t) == 0);
5262 assert(x != NULL);
5263 assert(y != NULL);
5264
5265 const __m128i vinput_zero_point = _mm_load_si128((const __m128i*) params->ssse3.input_zero_point);
5266 const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->ssse3.multiplier);
5267 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->ssse3.output_zero_point);
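  // Requantization core: acc = (input_zero_point - x) << 7, then a Q15 rounding
  // multiply via _mm_mulhrs_epi16; the sign flip from the reversed subtraction is
  // absorbed by the precomputed multiplier, and the output zero point is added with
  // saturation before the final int8 pack.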
5268 for (; n >= 32 * sizeof(int8_t); n -= 32 * sizeof(int8_t)) {
5269 __m128i vacc0 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) x));
5270 __m128i vacc1 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (x + 8)));
5271 __m128i vacc2 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (x + 16)));
5272 __m128i vacc3 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (x + 24)));
5273 x += 32;
5274
5275 vacc0 = _mm_sub_epi16(vinput_zero_point, vacc0);
5276 vacc1 = _mm_sub_epi16(vinput_zero_point, vacc1);
5277 vacc2 = _mm_sub_epi16(vinput_zero_point, vacc2);
5278 vacc3 = _mm_sub_epi16(vinput_zero_point, vacc3);
5279
5280 vacc0 = _mm_slli_epi16(vacc0, 7);
5281 vacc1 = _mm_slli_epi16(vacc1, 7);
5282 vacc2 = _mm_slli_epi16(vacc2, 7);
5283 vacc3 = _mm_slli_epi16(vacc3, 7);
5284
5285 vacc0 = _mm_mulhrs_epi16(vacc0, vmultiplier);
5286 vacc1 = _mm_mulhrs_epi16(vacc1, vmultiplier);
5287 vacc2 = _mm_mulhrs_epi16(vacc2, vmultiplier);
5288 vacc3 = _mm_mulhrs_epi16(vacc3, vmultiplier);
5289
5290 vacc0 = _mm_adds_epi16(vacc0, voutput_zero_point);
5291 vacc1 = _mm_adds_epi16(vacc1, voutput_zero_point);
5292 vacc2 = _mm_adds_epi16(vacc2, voutput_zero_point);
5293 vacc3 = _mm_adds_epi16(vacc3, voutput_zero_point);
5294
5295 const __m128i vy0 = _mm_packs_epi16(vacc0, vacc1);
5296 const __m128i vy1 = _mm_packs_epi16(vacc2, vacc3);
5297
5298 _mm_storeu_si128((__m128i*) y, vy0);
5299 _mm_storeu_si128((__m128i*) (y + 16), vy1);
5300 y += 32;
5301 }
5302 for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
5303 __m128i vacc = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) x));
5304 vacc = _mm_sub_epi16(vinput_zero_point, vacc);
5305 vacc = _mm_slli_epi16(vacc, 7);
5306 vacc = _mm_mulhrs_epi16(vacc, vmultiplier);
5307 vacc = _mm_adds_epi16(vacc, voutput_zero_point);
5308 x += 8;
5309
5310 const __m128i vy = _mm_packs_epi16(vacc, vacc);
5311 _mm_storel_epi64((__m128i*) y, vy);
5312 y += 8;
5313 }
5314 if XNN_UNLIKELY(n != 0) {
5315 assert(n >= 1 * sizeof(int8_t));
5316 assert(n <= 7 * sizeof(int8_t));
5317
5318 __m128i vacc = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) x));
5319 vacc = _mm_sub_epi16(vinput_zero_point, vacc);
5320 vacc = _mm_slli_epi16(vacc, 7);
5321 vacc = _mm_mulhrs_epi16(vacc, vmultiplier);
5322 vacc = _mm_adds_epi16(vacc, voutput_zero_point);
5323
5324 __m128i vy = _mm_packs_epi16(vacc, vacc);
5325 if (n & (4 * sizeof(int8_t))) {
5326 _mm_storeu_si32(y, vy);
5327 vy = _mm_srli_epi64(vy, 32);
5328 y += 4;
5329 }
5330 if (n & (2 * sizeof(int8_t))) {
5331 _mm_storeu_si16(y, vy);
5332 vy = _mm_srli_epi32(vy, 16);
5333 y += 2;
5334 }
5335 if (n & (1 * sizeof(int8_t))) {
5336 *y = (int8_t) _mm_extract_epi8(vy, 0);
5337 }
5338 }
5339 }
5340
5341 void xnn_qs8_vlrelu_ukernel__sse41_x32(
5342 size_t n,
5343 const int8_t* x,
5344 int8_t* y,
5345 const union xnn_qs8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5346 {
5347 assert(n != 0);
5348 assert(n % sizeof(int8_t) == 0);
5349 assert(x != NULL);
5350 assert(y != NULL);
5351
5352 const __m128i vinput_zero_point = _mm_load_si128((const __m128i*) params->sse2.input_zero_point);
5353 const __m128i vmultiplier_diff = _mm_load_si128((const __m128i*) params->sse2.multiplier_diff);
5354 const __m128i vmultiplier_base = _mm_load_si128((const __m128i*) params->sse2.multiplier_base);
5355 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
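  // Leaky ReLU as a branchless multiplier select: _mm_cmpgt_epi16 produces an
  // all-ones mask for lanes above the input zero point, so multiplier_base ^
  // (mask & multiplier_diff) picks one of two slopes per lane; the rest mirrors
  // the vcvt requantization above.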
5356 for (; n >= 32 * sizeof(int8_t); n -= 32 * sizeof(int8_t)) {
5357 __m128i vacc0 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) x));
5358 __m128i vacc1 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (x + 8)));
5359 __m128i vacc2 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (x + 16)));
5360 __m128i vacc3 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (x + 24)));
5361 x += 32;
5362
5363 __m128i vmultiplier0 = _mm_cmpgt_epi16(vacc0, vinput_zero_point);
5364 vacc0 = _mm_sub_epi16(vinput_zero_point, vacc0);
5365 __m128i vmultiplier1 = _mm_cmpgt_epi16(vacc1, vinput_zero_point);
5366 vacc1 = _mm_sub_epi16(vinput_zero_point, vacc1);
5367 __m128i vmultiplier2 = _mm_cmpgt_epi16(vacc2, vinput_zero_point);
5368 vacc2 = _mm_sub_epi16(vinput_zero_point, vacc2);
5369 __m128i vmultiplier3 = _mm_cmpgt_epi16(vacc3, vinput_zero_point);
5370 vacc3 = _mm_sub_epi16(vinput_zero_point, vacc3);
5371
5372 vmultiplier0 = _mm_and_si128(vmultiplier0, vmultiplier_diff);
5373 vacc0 = _mm_slli_epi16(vacc0, 7);
5374 vmultiplier0 = _mm_xor_si128(vmultiplier0, vmultiplier_base);
5375 vmultiplier1 = _mm_and_si128(vmultiplier1, vmultiplier_diff);
5376 vacc1 = _mm_slli_epi16(vacc1, 7);
5377 vmultiplier1 = _mm_xor_si128(vmultiplier1, vmultiplier_base);
5378 vmultiplier2 = _mm_and_si128(vmultiplier2, vmultiplier_diff);
5379 vacc2 = _mm_slli_epi16(vacc2, 7);
5380 vmultiplier2 = _mm_xor_si128(vmultiplier2, vmultiplier_base);
5381 vmultiplier3 = _mm_and_si128(vmultiplier3, vmultiplier_diff);
5382 vacc3 = _mm_slli_epi16(vacc3, 7);
5383 vmultiplier3 = _mm_xor_si128(vmultiplier3, vmultiplier_base);
5384
5385 vacc0 = _mm_mulhrs_epi16(vacc0, vmultiplier0);
5386 vacc1 = _mm_mulhrs_epi16(vacc1, vmultiplier1);
5387 vacc2 = _mm_mulhrs_epi16(vacc2, vmultiplier2);
5388 vacc3 = _mm_mulhrs_epi16(vacc3, vmultiplier3);
5389
5390 vacc0 = _mm_adds_epi16(vacc0, voutput_zero_point);
5391 vacc1 = _mm_adds_epi16(vacc1, voutput_zero_point);
5392 vacc2 = _mm_adds_epi16(vacc2, voutput_zero_point);
5393 vacc3 = _mm_adds_epi16(vacc3, voutput_zero_point);
5394
5395 const __m128i vy0 = _mm_packs_epi16(vacc0, vacc1);
5396 const __m128i vy1 = _mm_packs_epi16(vacc2, vacc3);
5397
5398 _mm_storeu_si128((__m128i*) y, vy0);
5399 _mm_storeu_si128((__m128i*) (y + 16), vy1);
5400 y += 32;
5401 }
5402 for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
5403 __m128i vacc = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) x));
5404 __m128i vmultiplier = _mm_cmpgt_epi16(vacc, vinput_zero_point);
5405 vacc = _mm_sub_epi16(vinput_zero_point, vacc);
5406 vmultiplier = _mm_and_si128(vmultiplier, vmultiplier_diff);
5407 vacc = _mm_slli_epi16(vacc, 7);
5408 vmultiplier = _mm_xor_si128(vmultiplier, vmultiplier_base);
5409 vacc = _mm_mulhrs_epi16(vacc, vmultiplier);
5410 vacc = _mm_adds_epi16(vacc, voutput_zero_point);
5411 x += 8;
5412
5413 const __m128i vy = _mm_packs_epi16(vacc, vacc);
5414 _mm_storel_epi64((__m128i*) y, vy);
5415 y += 8;
5416 }
5417 if XNN_UNLIKELY(n != 0) {
5418 assert(n >= 1 * sizeof(int8_t));
5419 assert(n <= 7 * sizeof(int8_t));
5420
5421 __m128i vacc = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) x));
5422 __m128i vmultiplier = _mm_cmpgt_epi16(vacc, vinput_zero_point);
5423 vacc = _mm_sub_epi16(vinput_zero_point, vacc);
5424 vmultiplier = _mm_and_si128(vmultiplier, vmultiplier_diff);
5425 vacc = _mm_slli_epi16(vacc, 7);
5426 vmultiplier = _mm_xor_si128(vmultiplier, vmultiplier_base);
5427 vacc = _mm_mulhrs_epi16(vacc, vmultiplier);
5428 vacc = _mm_adds_epi16(vacc, voutput_zero_point);
5429
5430 __m128i vy = _mm_packs_epi16(vacc, vacc);
5431 if (n & (4 * sizeof(int8_t))) {
5432 _mm_storeu_si32(y, vy);
5433 vy = _mm_srli_epi64(vy, 32);
5434 y += 4;
5435 }
5436 if (n & (2 * sizeof(int8_t))) {
5437 _mm_storeu_si16(y, vy);
5438 vy = _mm_srli_epi32(vy, 16);
5439 y += 2;
5440 }
5441 if (n & (1 * sizeof(int8_t))) {
5442 *y = (int8_t) _mm_extract_epi8(vy, 0);
5443 }
5444 }
5445 }
5446
5447 void xnn_qs8_vmul_minmax_fp32_ukernel__sse41_mul16_ld64_x16(
5448 size_t n,
5449 const int8_t* input_a,
5450 const int8_t* input_b,
5451 int8_t* output,
5452 const union xnn_qs8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5453
5454 {
5455 const __m128i va_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.a_zero_point);
5456 const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.b_zero_point);
5457 const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
5458 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
5459 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
5460 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse4.output_max);
5461
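  // fp32 requantization: the 16-bit zero-point-adjusted products are widened to
  // 32 bits via the mullo/mulhi interleave, converted to float, scaled, and rounded
  // back with _mm_cvtps_epi32 (round-to-nearest-even), then clamped to the output
  // range around the saturating int8 packs.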
5462 for (; n >= 16 * sizeof(int8_t); n -= 16 * sizeof(int8_t)) {
5463 const __m128i va01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
5464 const __m128i vb01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_b));
5465 const __m128i va89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (input_a + 8)));
5466 const __m128i vb89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (input_b + 8)));
5467 input_a += 16;
5468 input_b += 16;
5469
5470
5471 const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
5472 const __m128i vxb01234567 = _mm_sub_epi16(vb01234567, vb_zero_point);
5473 const __m128i vxa89ABCDEF = _mm_sub_epi16(va89ABCDEF, va_zero_point);
5474 const __m128i vxb89ABCDEF = _mm_sub_epi16(vb89ABCDEF, vb_zero_point);
5475
5476 const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb01234567);
5477 const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb01234567);
5478 const __m128i vprod89ABCDEFlo = _mm_mullo_epi16(vxa89ABCDEF, vxb89ABCDEF);
5479 const __m128i vprod89ABCDEFhi = _mm_mulhi_epi16(vxa89ABCDEF, vxb89ABCDEF);
5480
5481 const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
5482 const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
5483 const __m128i vprod89AB = _mm_unpacklo_epi16(vprod89ABCDEFlo, vprod89ABCDEFhi);
5484 const __m128i vprodCDEF = _mm_unpackhi_epi16(vprod89ABCDEFlo, vprod89ABCDEFhi);
5485
5486 __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
5487 __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
5488 __m128 vfpacc89AB = _mm_cvtepi32_ps(vprod89AB);
5489 __m128 vfpaccCDEF = _mm_cvtepi32_ps(vprodCDEF);
5490
5491 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
5492 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
5493 vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale);
5494 vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale);
5495
5496 const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
5497 const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
5498 const __m128i vacc89AB = _mm_cvtps_epi32(vfpacc89AB);
5499 const __m128i vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF);
5500
5501 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
5502 __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
5503
5504
5505 __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
5506
5507 vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
5508
5509 vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
5510
5511 _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
5512 output += 16;
5513 }
5514 if XNN_UNLIKELY(n != 0) {
5515 do {
5516 const __m128i va01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
5517 const __m128i vb01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_b));
5518 input_a += 8;
5519 input_b += 8;
5520
5521
5522 const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
5523 const __m128i vxb01234567 = _mm_sub_epi16(vb01234567, vb_zero_point);
5524
5525 const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb01234567);
5526 const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb01234567);
5527
5528 const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
5529 const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
5530
5531 __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
5532 __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
5533
5534 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
5535 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
5536
5537 const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
5538 const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
5539
5540 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
5541
5542 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
5543 vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
5544 vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
5545
5546 if XNN_LIKELY(n >= (8 * sizeof(int8_t))) {
5547 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
5548 output += 8;
5549 n -= 8 * sizeof(int8_t);
5550 } else {
5551 if (n & (4 * sizeof(int8_t))) {
5552 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
5553 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
5554 output += 4;
5555 }
5556 if (n & (2 * sizeof(int8_t))) {
5557 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
5558 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
5559 output += 2;
5560 }
5561 if (n & (1 * sizeof(int8_t))) {
5562 *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
5563 }
5564 n = 0;
5565 }
5566 } while (n != 0);
5567 }
5568 }
5569
5570 void xnn_qs8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16(
5571 size_t n,
5572 const int8_t* input_a,
5573 const int8_t* input_b,
5574 int8_t* output,
5575 const union xnn_qs8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5576
5577 {
5578 const __m128i va_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.a_zero_point);
5579 const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
5580 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
5581 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
5582 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse4.output_max);
5583
5584 __m128i vxb = _mm_sub_epi16(
5585 _mm_shuffle_epi32(_mm_cvtsi32_si128(UINT32_C(0x00010001) * (uint32_t) (uint16_t) (int16_t) *input_b), 0),
5586 _mm_load_si128((const __m128i*) params->fp32_sse4.b_zero_point));
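  // Broadcast trick: multiplying the 16-bit value of *input_b by 0x00010001
  // duplicates it into both halves of a 32-bit lane, _mm_shuffle_epi32(..., 0)
  // splats that lane across the register, and the b zero point is subtracted
  // once up front since the operand is constant for the whole row.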
5587 for (; n >= 16 * sizeof(int8_t); n -= 16 * sizeof(int8_t)) {
5588 const __m128i va01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
5589 const __m128i va89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (input_a + 8)));
5590 input_a += 16;
5591
5592
5593 const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
5594 const __m128i vxa89ABCDEF = _mm_sub_epi16(va89ABCDEF, va_zero_point);
5595
5596 const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb);
5597 const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb);
5598 const __m128i vprod89ABCDEFlo = _mm_mullo_epi16(vxa89ABCDEF, vxb);
5599 const __m128i vprod89ABCDEFhi = _mm_mulhi_epi16(vxa89ABCDEF, vxb);
5600
5601 const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
5602 const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
5603 const __m128i vprod89AB = _mm_unpacklo_epi16(vprod89ABCDEFlo, vprod89ABCDEFhi);
5604 const __m128i vprodCDEF = _mm_unpackhi_epi16(vprod89ABCDEFlo, vprod89ABCDEFhi);
5605
5606 __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
5607 __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
5608 __m128 vfpacc89AB = _mm_cvtepi32_ps(vprod89AB);
5609 __m128 vfpaccCDEF = _mm_cvtepi32_ps(vprodCDEF);
5610
5611 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
5612 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
5613 vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale);
5614 vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale);
5615
5616 const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
5617 const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
5618 const __m128i vacc89AB = _mm_cvtps_epi32(vfpacc89AB);
5619 const __m128i vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF);
5620
5621 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
5622 __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
5623
5624
5625 __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
5626
5627 vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
5628
5629 vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
5630
5631 _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
5632 output += 16;
5633 }
5634 if XNN_UNLIKELY(n != 0) {
5635 do {
5636 const __m128i va01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
5637 input_a += 8;
5638
5639
5640 const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
5641
5642 const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb);
5643 const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb);
5644
5645 const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
5646 const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
5647
5648 __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
5649 __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
5650
5651 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
5652 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
5653
5654 const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
5655 const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
5656
5657 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
5658
5659 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
5660 vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
5661 vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
5662
5663 if XNN_LIKELY(n >= (8 * sizeof(int8_t))) {
5664 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
5665 output += 8;
5666 n -= 8 * sizeof(int8_t);
5667 } else {
5668 if (n & (4 * sizeof(int8_t))) {
5669 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
5670 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
5671 output += 4;
5672 }
5673 if (n & (2 * sizeof(int8_t))) {
5674 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
5675 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
5676 output += 2;
5677 }
5678 if (n & (1 * sizeof(int8_t))) {
5679 *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
5680 }
5681 n = 0;
5682 }
5683 } while (n != 0);
5684 }
5685 }
5686
5687 void xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul16(
5688 size_t channels,
5689 size_t output_width,
5690 const uint8_t** input,
5691 const void* weights,
5692 uint8_t* output,
5693 size_t input_stride,
5694 size_t output_increment,
5695 size_t input_offset,
5696 const uint8_t* zero,
5697 const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5698 {
5699 assert(channels != 0);
5700 assert(output_width != 0);
5701
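  // 25-tap (e.g. 5x5) depthwise convolution, 8 channels per step. The packed
  // weights start with 8 int32 bias values followed by 25 groups of 8 uint8
  // kernel taps; input rows outside the image are passed in as the shared
  // `zero` buffer and are deliberately not offset.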
5702 do {
5703 const uint8_t* i0 = input[0];
5704 assert(i0 != NULL);
5705 if XNN_UNPREDICTABLE(i0 != zero) {
5706 i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
5707 }
5708 const uint8_t* i1 = input[1];
5709 assert(i1 != NULL);
5710 if XNN_UNPREDICTABLE(i1 != zero) {
5711 i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
5712 }
5713 const uint8_t* i2 = input[2];
5714 assert(i2 != NULL);
5715 if XNN_UNPREDICTABLE(i2 != zero) {
5716 i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
5717 }
5718 const uint8_t* i3 = input[3];
5719 assert(i3 != NULL);
5720 if XNN_UNPREDICTABLE(i3 != zero) {
5721 i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
5722 }
5723 const uint8_t* i4 = input[4];
5724 assert(i4 != NULL);
5725 if XNN_UNPREDICTABLE(i4 != zero) {
5726 i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
5727 }
5728 const uint8_t* i5 = input[5];
5729 assert(i5 != NULL);
5730 if XNN_UNPREDICTABLE(i5 != zero) {
5731 i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
5732 }
5733 const uint8_t* i6 = input[6];
5734 assert(i6 != NULL);
5735 if XNN_UNPREDICTABLE(i6 != zero) {
5736 i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
5737 }
5738 const uint8_t* i7 = input[7];
5739 assert(i7 != NULL);
5740 if XNN_UNPREDICTABLE(i7 != zero) {
5741 i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
5742 }
5743 const uint8_t* i8 = input[8];
5744 assert(i8 != NULL);
5745 if XNN_UNPREDICTABLE(i8 != zero) {
5746 i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
5747 }
5748 const uint8_t* i9 = input[9];
5749 assert(i9 != NULL);
5750 if XNN_UNPREDICTABLE(i9 != zero) {
5751 i9 = (const uint8_t*) ((uintptr_t) i9 + input_offset);
5752 }
5753 const uint8_t* i10 = input[10];
5754 assert(i10 != NULL);
5755 if XNN_UNPREDICTABLE(i10 != zero) {
5756 i10 = (const uint8_t*) ((uintptr_t) i10 + input_offset);
5757 }
5758 const uint8_t* i11 = input[11];
5759 assert(i11 != NULL);
5760 if XNN_UNPREDICTABLE(i11 != zero) {
5761 i11 = (const uint8_t*) ((uintptr_t) i11 + input_offset);
5762 }
5763 const uint8_t* i12 = input[12];
5764 assert(i12 != NULL);
5765 if XNN_UNPREDICTABLE(i12 != zero) {
5766 i12 = (const uint8_t*) ((uintptr_t) i12 + input_offset);
5767 }
5768 const uint8_t* i13 = input[13];
5769 assert(i13 != NULL);
5770 if XNN_UNPREDICTABLE(i13 != zero) {
5771 i13 = (const uint8_t*) ((uintptr_t) i13 + input_offset);
5772 }
5773 const uint8_t* i14 = input[14];
5774 assert(i14 != NULL);
5775 if XNN_UNPREDICTABLE(i14 != zero) {
5776 i14 = (const uint8_t*) ((uintptr_t) i14 + input_offset);
5777 }
5778 const uint8_t* i15 = input[15];
5779 assert(i15 != NULL);
5780 if XNN_UNPREDICTABLE(i15 != zero) {
5781 i15 = (const uint8_t*) ((uintptr_t) i15 + input_offset);
5782 }
5783 const uint8_t* i16 = input[16];
5784 assert(i16 != NULL);
5785 if XNN_UNPREDICTABLE(i16 != zero) {
5786 i16 = (const uint8_t*) ((uintptr_t) i16 + input_offset);
5787 }
5788 const uint8_t* i17 = input[17];
5789 assert(i17 != NULL);
5790 if XNN_UNPREDICTABLE(i17 != zero) {
5791 i17 = (const uint8_t*) ((uintptr_t) i17 + input_offset);
5792 }
5793 const uint8_t* i18 = input[18];
5794 assert(i18 != NULL);
5795 if XNN_UNPREDICTABLE(i18 != zero) {
5796 i18 = (const uint8_t*) ((uintptr_t) i18 + input_offset);
5797 }
5798 const uint8_t* i19 = input[19];
5799 assert(i19 != NULL);
5800 if XNN_UNPREDICTABLE(i19 != zero) {
5801 i19 = (const uint8_t*) ((uintptr_t) i19 + input_offset);
5802 }
5803 const uint8_t* i20 = input[20];
5804 assert(i20 != NULL);
5805 if XNN_UNPREDICTABLE(i20 != zero) {
5806 i20 = (const uint8_t*) ((uintptr_t) i20 + input_offset);
5807 }
5808 const uint8_t* i21 = input[21];
5809 assert(i21 != NULL);
5810 if XNN_UNPREDICTABLE(i21 != zero) {
5811 i21 = (const uint8_t*) ((uintptr_t) i21 + input_offset);
5812 }
5813 const uint8_t* i22 = input[22];
5814 assert(i22 != NULL);
5815 if XNN_UNPREDICTABLE(i22 != zero) {
5816 i22 = (const uint8_t*) ((uintptr_t) i22 + input_offset);
5817 }
5818 const uint8_t* i23 = input[23];
5819 assert(i23 != NULL);
5820 if XNN_UNPREDICTABLE(i23 != zero) {
5821 i23 = (const uint8_t*) ((uintptr_t) i23 + input_offset);
5822 }
5823 const uint8_t* i24 = input[24];
5824 assert(i24 != NULL);
5825 if XNN_UNPREDICTABLE(i24 != zero) {
5826 i24 = (const uint8_t*) ((uintptr_t) i24 + input_offset);
5827 }
5828 input = (const uint8_t**) ((uintptr_t) input + input_stride);
5829
5830 size_t c = channels;
5831 const void* w = weights;
5832 const __m128i vk_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point);
5833 for (; c >= 8; c -= 8) {
5834 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
5835 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
5836
5837
5838 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
5839 const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(vi0x01234567);
5840 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(uint8_t)));
5841 const __m128i vxk0x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk0x01234567), vk_zero_point);
5842 i0 += 8;
5843
5844
5845 const __m128i vprod0x01234567lo = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
5846 const __m128i vprod0x01234567hi = _mm_mulhi_epi16(vxi0x01234567, vxk0x01234567);
5847
5848 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod0x01234567lo, vprod0x01234567hi));
5849 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod0x01234567lo, vprod0x01234567hi));
5850
5851 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
5852 const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(vi1x01234567);
5853 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(uint8_t)));
5854 const __m128i vxk1x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk1x01234567), vk_zero_point);
5855 i1 += 8;
5856
5857
5858 const __m128i vprod1x01234567lo = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567);
5859 const __m128i vprod1x01234567hi = _mm_mulhi_epi16(vxi1x01234567, vxk1x01234567);
5860
5861 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod1x01234567lo, vprod1x01234567hi));
5862 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod1x01234567lo, vprod1x01234567hi));
5863
5864 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
5865 const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(vi2x01234567);
5866 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(uint8_t)));
5867 const __m128i vxk2x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk2x01234567), vk_zero_point);
5868 i2 += 8;
5869
5870
5871 const __m128i vprod2x01234567lo = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
5872 const __m128i vprod2x01234567hi = _mm_mulhi_epi16(vxi2x01234567, vxk2x01234567);
5873
5874 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod2x01234567lo, vprod2x01234567hi));
5875 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod2x01234567lo, vprod2x01234567hi));
5876
5877 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
5878 const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(vi3x01234567);
5879 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(uint8_t)));
5880 const __m128i vxk3x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk3x01234567), vk_zero_point);
5881 i3 += 8;
5882
5883
5884 const __m128i vprod3x01234567lo = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567);
5885 const __m128i vprod3x01234567hi = _mm_mulhi_epi16(vxi3x01234567, vxk3x01234567);
5886
5887 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod3x01234567lo, vprod3x01234567hi));
5888 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod3x01234567lo, vprod3x01234567hi));
5889
5890 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
5891 const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(vi4x01234567);
5892 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(uint8_t)));
5893 const __m128i vxk4x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk4x01234567), vk_zero_point);
5894 i4 += 8;
5895
5896
5897 const __m128i vprod4x01234567lo = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
5898 const __m128i vprod4x01234567hi = _mm_mulhi_epi16(vxi4x01234567, vxk4x01234567);
5899
5900 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod4x01234567lo, vprod4x01234567hi));
5901 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod4x01234567lo, vprod4x01234567hi));
5902
5903 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
5904 const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(vi5x01234567);
5905 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(uint8_t)));
5906 const __m128i vxk5x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk5x01234567), vk_zero_point);
5907 i5 += 8;
5908
5909
5910 const __m128i vprod5x01234567lo = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567);
5911 const __m128i vprod5x01234567hi = _mm_mulhi_epi16(vxi5x01234567, vxk5x01234567);
5912
5913 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod5x01234567lo, vprod5x01234567hi));
5914 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod5x01234567lo, vprod5x01234567hi));
5915
5916 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
5917 const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(vi6x01234567);
5918 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(uint8_t)));
5919 const __m128i vxk6x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk6x01234567), vk_zero_point);
5920 i6 += 8;
5921
5922
5923 const __m128i vprod6x01234567lo = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
5924 const __m128i vprod6x01234567hi = _mm_mulhi_epi16(vxi6x01234567, vxk6x01234567);
5925
5926 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod6x01234567lo, vprod6x01234567hi));
5927 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod6x01234567lo, vprod6x01234567hi));
5928
5929 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
5930 const __m128i vxi7x01234567 = _mm_cvtepu8_epi16(vi7x01234567);
5931 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(uint8_t)));
5932 const __m128i vxk7x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk7x01234567), vk_zero_point);
5933 i7 += 8;
5934
5935
5936 const __m128i vprod7x01234567lo = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567);
5937 const __m128i vprod7x01234567hi = _mm_mulhi_epi16(vxi7x01234567, vxk7x01234567);
5938
5939 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod7x01234567lo, vprod7x01234567hi));
5940 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod7x01234567lo, vprod7x01234567hi));
5941
5942 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
5943 const __m128i vxi8x01234567 = _mm_cvtepu8_epi16(vi8x01234567);
5944 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(uint8_t)));
5945 const __m128i vxk8x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk8x01234567), vk_zero_point);
5946 i8 += 8;
5947
5948
5949 const __m128i vprod8x01234567lo = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
5950 const __m128i vprod8x01234567hi = _mm_mulhi_epi16(vxi8x01234567, vxk8x01234567);
5951
5952 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod8x01234567lo, vprod8x01234567hi));
5953 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod8x01234567lo, vprod8x01234567hi));
5954
5955 const __m128i vi9x01234567 = _mm_loadl_epi64((const __m128i*) i9);
5956 const __m128i vxi9x01234567 = _mm_cvtepu8_epi16(vi9x01234567);
5957 const __m128i vk9x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(uint8_t)));
5958 const __m128i vxk9x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk9x01234567), vk_zero_point);
5959 i9 += 8;
5960
5961
5962 const __m128i vprod9x01234567lo = _mm_mullo_epi16(vxi9x01234567, vxk9x01234567);
5963 const __m128i vprod9x01234567hi = _mm_mulhi_epi16(vxi9x01234567, vxk9x01234567);
5964
5965 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod9x01234567lo, vprod9x01234567hi));
5966 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod9x01234567lo, vprod9x01234567hi));
5967
5968 const __m128i vi10x01234567 = _mm_loadl_epi64((const __m128i*) i10);
5969 const __m128i vxi10x01234567 = _mm_cvtepu8_epi16(vi10x01234567);
5970 const __m128i vk10x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 80 * sizeof(uint8_t)));
5971 const __m128i vxk10x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk10x01234567), vk_zero_point);
5972 i10 += 8;
5973
5974
5975 const __m128i vprod10x01234567lo = _mm_mullo_epi16(vxi10x01234567, vxk10x01234567);
5976 const __m128i vprod10x01234567hi = _mm_mulhi_epi16(vxi10x01234567, vxk10x01234567);
5977
5978 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod10x01234567lo, vprod10x01234567hi));
5979 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod10x01234567lo, vprod10x01234567hi));
5980
5981 const __m128i vi11x01234567 = _mm_loadl_epi64((const __m128i*) i11);
5982 const __m128i vxi11x01234567 = _mm_cvtepu8_epi16(vi11x01234567);
5983 const __m128i vk11x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 88 * sizeof(uint8_t)));
5984 const __m128i vxk11x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk11x01234567), vk_zero_point);
5985 i11 += 8;
5986
5987
5988 const __m128i vprod11x01234567lo = _mm_mullo_epi16(vxi11x01234567, vxk11x01234567);
5989 const __m128i vprod11x01234567hi = _mm_mulhi_epi16(vxi11x01234567, vxk11x01234567);
5990
5991 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod11x01234567lo, vprod11x01234567hi));
5992 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod11x01234567lo, vprod11x01234567hi));
5993
5994 const __m128i vi12x01234567 = _mm_loadl_epi64((const __m128i*) i12);
5995 const __m128i vxi12x01234567 = _mm_cvtepu8_epi16(vi12x01234567);
5996 const __m128i vk12x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 96 * sizeof(uint8_t)));
5997 const __m128i vxk12x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk12x01234567), vk_zero_point);
5998 i12 += 8;
5999
6000
6001 const __m128i vprod12x01234567lo = _mm_mullo_epi16(vxi12x01234567, vxk12x01234567);
6002 const __m128i vprod12x01234567hi = _mm_mulhi_epi16(vxi12x01234567, vxk12x01234567);
6003
6004 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod12x01234567lo, vprod12x01234567hi));
6005 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod12x01234567lo, vprod12x01234567hi));
6006
6007 const __m128i vi13x01234567 = _mm_loadl_epi64((const __m128i*) i13);
6008 const __m128i vxi13x01234567 = _mm_cvtepu8_epi16(vi13x01234567);
6009 const __m128i vk13x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 104 * sizeof(uint8_t)));
6010 const __m128i vxk13x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk13x01234567), vk_zero_point);
6011 i13 += 8;
6012
6013
6014 const __m128i vprod13x01234567lo = _mm_mullo_epi16(vxi13x01234567, vxk13x01234567);
6015 const __m128i vprod13x01234567hi = _mm_mulhi_epi16(vxi13x01234567, vxk13x01234567);
6016
6017 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod13x01234567lo, vprod13x01234567hi));
6018 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod13x01234567lo, vprod13x01234567hi));
6019
6020 const __m128i vi14x01234567 = _mm_loadl_epi64((const __m128i*) i14);
6021 const __m128i vxi14x01234567 = _mm_cvtepu8_epi16(vi14x01234567);
6022 const __m128i vk14x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 112 * sizeof(uint8_t)));
6023 const __m128i vxk14x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk14x01234567), vk_zero_point);
6024 i14 += 8;
6025
6026
6027 const __m128i vprod14x01234567lo = _mm_mullo_epi16(vxi14x01234567, vxk14x01234567);
6028 const __m128i vprod14x01234567hi = _mm_mulhi_epi16(vxi14x01234567, vxk14x01234567);
6029
6030 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod14x01234567lo, vprod14x01234567hi));
6031 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod14x01234567lo, vprod14x01234567hi));
6032
6033 const __m128i vi15x01234567 = _mm_loadl_epi64((const __m128i*) i15);
6034 const __m128i vxi15x01234567 = _mm_cvtepu8_epi16(vi15x01234567);
6035 const __m128i vk15x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 120 * sizeof(uint8_t)));
6036 const __m128i vxk15x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk15x01234567), vk_zero_point);
6037 i15 += 8;
6038
6039
6040 const __m128i vprod15x01234567lo = _mm_mullo_epi16(vxi15x01234567, vxk15x01234567);
6041 const __m128i vprod15x01234567hi = _mm_mulhi_epi16(vxi15x01234567, vxk15x01234567);
6042
6043 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod15x01234567lo, vprod15x01234567hi));
6044 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod15x01234567lo, vprod15x01234567hi));
6045
6046 const __m128i vi16x01234567 = _mm_loadl_epi64((const __m128i*) i16);
6047 const __m128i vxi16x01234567 = _mm_cvtepu8_epi16(vi16x01234567);
6048 const __m128i vk16x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 128 * sizeof(uint8_t)));
6049 const __m128i vxk16x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk16x01234567), vk_zero_point);
6050 i16 += 8;
6051
6052
6053 const __m128i vprod16x01234567lo = _mm_mullo_epi16(vxi16x01234567, vxk16x01234567);
6054 const __m128i vprod16x01234567hi = _mm_mulhi_epi16(vxi16x01234567, vxk16x01234567);
6055
6056 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod16x01234567lo, vprod16x01234567hi));
6057 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod16x01234567lo, vprod16x01234567hi));
6058
6059 const __m128i vi17x01234567 = _mm_loadl_epi64((const __m128i*) i17);
6060 const __m128i vxi17x01234567 = _mm_cvtepu8_epi16(vi17x01234567);
6061 const __m128i vk17x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 136 * sizeof(uint8_t)));
6062 const __m128i vxk17x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk17x01234567), vk_zero_point);
6063 i17 += 8;
6064
6065
6066 const __m128i vprod17x01234567lo = _mm_mullo_epi16(vxi17x01234567, vxk17x01234567);
6067 const __m128i vprod17x01234567hi = _mm_mulhi_epi16(vxi17x01234567, vxk17x01234567);
6068
6069 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod17x01234567lo, vprod17x01234567hi));
6070 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod17x01234567lo, vprod17x01234567hi));
6071
6072 const __m128i vi18x01234567 = _mm_loadl_epi64((const __m128i*) i18);
6073 const __m128i vxi18x01234567 = _mm_cvtepu8_epi16(vi18x01234567);
6074 const __m128i vk18x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 144 * sizeof(uint8_t)));
6075 const __m128i vxk18x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk18x01234567), vk_zero_point);
6076 i18 += 8;
6077
6078
6079 const __m128i vprod18x01234567lo = _mm_mullo_epi16(vxi18x01234567, vxk18x01234567);
6080 const __m128i vprod18x01234567hi = _mm_mulhi_epi16(vxi18x01234567, vxk18x01234567);
6081
6082 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod18x01234567lo, vprod18x01234567hi));
6083 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod18x01234567lo, vprod18x01234567hi));
6084
6085 const __m128i vi19x01234567 = _mm_loadl_epi64((const __m128i*) i19);
6086 const __m128i vxi19x01234567 = _mm_cvtepu8_epi16(vi19x01234567);
6087 const __m128i vk19x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 152 * sizeof(uint8_t)));
6088 const __m128i vxk19x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk19x01234567), vk_zero_point);
6089 i19 += 8;
6090
6091
6092 const __m128i vprod19x01234567lo = _mm_mullo_epi16(vxi19x01234567, vxk19x01234567);
6093 const __m128i vprod19x01234567hi = _mm_mulhi_epi16(vxi19x01234567, vxk19x01234567);
6094
6095 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod19x01234567lo, vprod19x01234567hi));
6096 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod19x01234567lo, vprod19x01234567hi));
6097
6098 const __m128i vi20x01234567 = _mm_loadl_epi64((const __m128i*) i20);
6099 const __m128i vxi20x01234567 = _mm_cvtepu8_epi16(vi20x01234567);
6100 const __m128i vk20x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 160 * sizeof(uint8_t)));
6101 const __m128i vxk20x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk20x01234567), vk_zero_point);
6102 i20 += 8;
6103
6104
6105 const __m128i vprod20x01234567lo = _mm_mullo_epi16(vxi20x01234567, vxk20x01234567);
6106 const __m128i vprod20x01234567hi = _mm_mulhi_epi16(vxi20x01234567, vxk20x01234567);
6107
6108 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod20x01234567lo, vprod20x01234567hi));
6109 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod20x01234567lo, vprod20x01234567hi));
6110
6111 const __m128i vi21x01234567 = _mm_loadl_epi64((const __m128i*) i21);
6112 const __m128i vxi21x01234567 = _mm_cvtepu8_epi16(vi21x01234567);
6113 const __m128i vk21x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 168 * sizeof(uint8_t)));
6114 const __m128i vxk21x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk21x01234567), vk_zero_point);
6115 i21 += 8;
6116
6117
6118 const __m128i vprod21x01234567lo = _mm_mullo_epi16(vxi21x01234567, vxk21x01234567);
6119 const __m128i vprod21x01234567hi = _mm_mulhi_epi16(vxi21x01234567, vxk21x01234567);
6120
6121 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod21x01234567lo, vprod21x01234567hi));
6122 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod21x01234567lo, vprod21x01234567hi));
6123
6124 const __m128i vi22x01234567 = _mm_loadl_epi64((const __m128i*) i22);
6125 const __m128i vxi22x01234567 = _mm_cvtepu8_epi16(vi22x01234567);
6126 const __m128i vk22x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 176 * sizeof(uint8_t)));
6127 const __m128i vxk22x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk22x01234567), vk_zero_point);
6128 i22 += 8;
6129
6130
6131 const __m128i vprod22x01234567lo = _mm_mullo_epi16(vxi22x01234567, vxk22x01234567);
6132 const __m128i vprod22x01234567hi = _mm_mulhi_epi16(vxi22x01234567, vxk22x01234567);
6133
6134 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod22x01234567lo, vprod22x01234567hi));
6135 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod22x01234567lo, vprod22x01234567hi));
6136
6137 const __m128i vi23x01234567 = _mm_loadl_epi64((const __m128i*) i23);
6138 const __m128i vxi23x01234567 = _mm_cvtepu8_epi16(vi23x01234567);
6139 const __m128i vk23x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 184 * sizeof(uint8_t)));
6140 const __m128i vxk23x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk23x01234567), vk_zero_point);
6141 i23 += 8;
6142
6143
6144 const __m128i vprod23x01234567lo = _mm_mullo_epi16(vxi23x01234567, vxk23x01234567);
6145 const __m128i vprod23x01234567hi = _mm_mulhi_epi16(vxi23x01234567, vxk23x01234567);
6146
6147 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod23x01234567lo, vprod23x01234567hi));
6148 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod23x01234567lo, vprod23x01234567hi));
6149
6150 const __m128i vi24x01234567 = _mm_loadl_epi64((const __m128i*) i24);
6151 const __m128i vxi24x01234567 = _mm_cvtepu8_epi16(vi24x01234567);
6152 const __m128i vk24x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 192 * sizeof(uint8_t)));
6153 const __m128i vxk24x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk24x01234567), vk_zero_point);
6154 i24 += 8;
6155
6156
6157 const __m128i vprod24x01234567lo = _mm_mullo_epi16(vxi24x01234567, vxk24x01234567);
6158 const __m128i vprod24x01234567hi = _mm_mulhi_epi16(vxi24x01234567, vxk24x01234567);
6159
6160 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod24x01234567lo, vprod24x01234567hi));
6161 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod24x01234567lo, vprod24x01234567hi));
6162
6163 w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 200 * sizeof(uint8_t));
6164
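      // Requantize in float: scale the int32 accumulators, clamp the upper bound
      // while still in float (output_max_less_zero_point), round with
      // _mm_cvtps_epi32, add the output zero point with saturation, and apply the
      // lower clamp after the unsigned pack.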
6165 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
6166 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
6167
6168 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
6169 vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
6170 vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
6171
6172 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
6173 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
6174 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
6175
6176 vacc0123 = _mm_cvtps_epi32(vscaled0123);
6177 vacc4567 = _mm_cvtps_epi32(vscaled4567);
6178
6179 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
6180 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
6181
6182 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
6183
6184 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
6185 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
6186
6187 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
6188 output += 8;
6189 }
6190 if XNN_UNLIKELY(c != 0) {
6191 {
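      // Remainder of 1-7 channels: one more full 8-channel pass over all 25 taps
      // (reading past the valid channels is permitted under XNN_OOB_READS), with
      // only the valid low bytes of the result stored afterwards.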
6192 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
6193 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
6194
6195
6196 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
6197 const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(vi0x01234567);
6198 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(uint8_t)));
6199 const __m128i vxk0x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk0x01234567), vk_zero_point);
6200
6201
6202 const __m128i vprod0x01234567lo = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
6203 const __m128i vprod0x01234567hi = _mm_mulhi_epi16(vxi0x01234567, vxk0x01234567);
6204
6205 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod0x01234567lo, vprod0x01234567hi));
6206 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod0x01234567lo, vprod0x01234567hi));
6207
6208 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
6209 const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(vi1x01234567);
6210 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(uint8_t)));
6211 const __m128i vxk1x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk1x01234567), vk_zero_point);
6212
6213
6214 const __m128i vprod1x01234567lo = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567);
6215 const __m128i vprod1x01234567hi = _mm_mulhi_epi16(vxi1x01234567, vxk1x01234567);
6216
6217 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod1x01234567lo, vprod1x01234567hi));
6218 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod1x01234567lo, vprod1x01234567hi));
6219
6220 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
6221 const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(vi2x01234567);
6222 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(uint8_t)));
6223 const __m128i vxk2x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk2x01234567), vk_zero_point);
6224
6225
6226 const __m128i vprod2x01234567lo = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
6227 const __m128i vprod2x01234567hi = _mm_mulhi_epi16(vxi2x01234567, vxk2x01234567);
6228
6229 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod2x01234567lo, vprod2x01234567hi));
6230 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod2x01234567lo, vprod2x01234567hi));
6231
6232 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
6233 const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(vi3x01234567);
6234 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(uint8_t)));
6235 const __m128i vxk3x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk3x01234567), vk_zero_point);
6236
6237
6238 const __m128i vprod3x01234567lo = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567);
6239 const __m128i vprod3x01234567hi = _mm_mulhi_epi16(vxi3x01234567, vxk3x01234567);
6240
6241 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod3x01234567lo, vprod3x01234567hi));
6242 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod3x01234567lo, vprod3x01234567hi));
6243
6244 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
6245 const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(vi4x01234567);
6246 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(uint8_t)));
6247 const __m128i vxk4x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk4x01234567), vk_zero_point);
6248
6249
6250 const __m128i vprod4x01234567lo = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
6251 const __m128i vprod4x01234567hi = _mm_mulhi_epi16(vxi4x01234567, vxk4x01234567);
6252
6253 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod4x01234567lo, vprod4x01234567hi));
6254 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod4x01234567lo, vprod4x01234567hi));
6255
6256 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
6257 const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(vi5x01234567);
6258 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(uint8_t)));
6259 const __m128i vxk5x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk5x01234567), vk_zero_point);
6260
6261
6262 const __m128i vprod5x01234567lo = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567);
6263 const __m128i vprod5x01234567hi = _mm_mulhi_epi16(vxi5x01234567, vxk5x01234567);
6264
6265 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod5x01234567lo, vprod5x01234567hi));
6266 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod5x01234567lo, vprod5x01234567hi));
6267
6268 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
6269 const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(vi6x01234567);
6270 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(uint8_t)));
6271 const __m128i vxk6x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk6x01234567), vk_zero_point);
6272
6273
6274 const __m128i vprod6x01234567lo = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
6275 const __m128i vprod6x01234567hi = _mm_mulhi_epi16(vxi6x01234567, vxk6x01234567);
6276
6277 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod6x01234567lo, vprod6x01234567hi));
6278 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod6x01234567lo, vprod6x01234567hi));
6279
6280 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
6281 const __m128i vxi7x01234567 = _mm_cvtepu8_epi16(vi7x01234567);
6282 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(uint8_t)));
6283 const __m128i vxk7x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk7x01234567), vk_zero_point);
6284
6285
6286 const __m128i vprod7x01234567lo = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567);
6287 const __m128i vprod7x01234567hi = _mm_mulhi_epi16(vxi7x01234567, vxk7x01234567);
6288
6289 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod7x01234567lo, vprod7x01234567hi));
6290 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod7x01234567lo, vprod7x01234567hi));
6291
6292 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
6293 const __m128i vxi8x01234567 = _mm_cvtepu8_epi16(vi8x01234567);
6294 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(uint8_t)));
6295 const __m128i vxk8x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk8x01234567), vk_zero_point);
6296
6297
6298 const __m128i vprod8x01234567lo = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
6299 const __m128i vprod8x01234567hi = _mm_mulhi_epi16(vxi8x01234567, vxk8x01234567);
6300
6301 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod8x01234567lo, vprod8x01234567hi));
6302 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod8x01234567lo, vprod8x01234567hi));
6303
6304 const __m128i vi9x01234567 = _mm_loadl_epi64((const __m128i*) i9);
6305 const __m128i vxi9x01234567 = _mm_cvtepu8_epi16(vi9x01234567);
6306 const __m128i vk9x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(uint8_t)));
6307 const __m128i vxk9x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk9x01234567), vk_zero_point);
6308
6309
6310 const __m128i vprod9x01234567lo = _mm_mullo_epi16(vxi9x01234567, vxk9x01234567);
6311 const __m128i vprod9x01234567hi = _mm_mulhi_epi16(vxi9x01234567, vxk9x01234567);
6312
6313 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod9x01234567lo, vprod9x01234567hi));
6314 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod9x01234567lo, vprod9x01234567hi));
6315
6316 const __m128i vi10x01234567 = _mm_loadl_epi64((const __m128i*) i10);
6317 const __m128i vxi10x01234567 = _mm_cvtepu8_epi16(vi10x01234567);
6318 const __m128i vk10x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 80 * sizeof(uint8_t)));
6319 const __m128i vxk10x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk10x01234567), vk_zero_point);
6320
6321
6322 const __m128i vprod10x01234567lo = _mm_mullo_epi16(vxi10x01234567, vxk10x01234567);
6323 const __m128i vprod10x01234567hi = _mm_mulhi_epi16(vxi10x01234567, vxk10x01234567);
6324
6325 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod10x01234567lo, vprod10x01234567hi));
6326 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod10x01234567lo, vprod10x01234567hi));
6327
6328 const __m128i vi11x01234567 = _mm_loadl_epi64((const __m128i*) i11);
6329 const __m128i vxi11x01234567 = _mm_cvtepu8_epi16(vi11x01234567);
6330 const __m128i vk11x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 88 * sizeof(uint8_t)));
6331 const __m128i vxk11x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk11x01234567), vk_zero_point);
6332
6333
6334 const __m128i vprod11x01234567lo = _mm_mullo_epi16(vxi11x01234567, vxk11x01234567);
6335 const __m128i vprod11x01234567hi = _mm_mulhi_epi16(vxi11x01234567, vxk11x01234567);
6336
6337 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod11x01234567lo, vprod11x01234567hi));
6338 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod11x01234567lo, vprod11x01234567hi));
6339
6340 const __m128i vi12x01234567 = _mm_loadl_epi64((const __m128i*) i12);
6341 const __m128i vxi12x01234567 = _mm_cvtepu8_epi16(vi12x01234567);
6342 const __m128i vk12x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 96 * sizeof(uint8_t)));
6343 const __m128i vxk12x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk12x01234567), vk_zero_point);
6344
6345
6346 const __m128i vprod12x01234567lo = _mm_mullo_epi16(vxi12x01234567, vxk12x01234567);
6347 const __m128i vprod12x01234567hi = _mm_mulhi_epi16(vxi12x01234567, vxk12x01234567);
6348
6349 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod12x01234567lo, vprod12x01234567hi));
6350 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod12x01234567lo, vprod12x01234567hi));
6351
6352 const __m128i vi13x01234567 = _mm_loadl_epi64((const __m128i*) i13);
6353 const __m128i vxi13x01234567 = _mm_cvtepu8_epi16(vi13x01234567);
6354 const __m128i vk13x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 104 * sizeof(uint8_t)));
6355 const __m128i vxk13x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk13x01234567), vk_zero_point);
6356
6357
6358 const __m128i vprod13x01234567lo = _mm_mullo_epi16(vxi13x01234567, vxk13x01234567);
6359 const __m128i vprod13x01234567hi = _mm_mulhi_epi16(vxi13x01234567, vxk13x01234567);
6360
6361 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod13x01234567lo, vprod13x01234567hi));
6362 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod13x01234567lo, vprod13x01234567hi));
6363
6364 const __m128i vi14x01234567 = _mm_loadl_epi64((const __m128i*) i14);
6365 const __m128i vxi14x01234567 = _mm_cvtepu8_epi16(vi14x01234567);
6366 const __m128i vk14x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 112 * sizeof(uint8_t)));
6367 const __m128i vxk14x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk14x01234567), vk_zero_point);
6368
6369
6370 const __m128i vprod14x01234567lo = _mm_mullo_epi16(vxi14x01234567, vxk14x01234567);
6371 const __m128i vprod14x01234567hi = _mm_mulhi_epi16(vxi14x01234567, vxk14x01234567);
6372
6373 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod14x01234567lo, vprod14x01234567hi));
6374 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod14x01234567lo, vprod14x01234567hi));
6375
6376 const __m128i vi15x01234567 = _mm_loadl_epi64((const __m128i*) i15);
6377 const __m128i vxi15x01234567 = _mm_cvtepu8_epi16(vi15x01234567);
6378 const __m128i vk15x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 120 * sizeof(uint8_t)));
6379 const __m128i vxk15x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk15x01234567), vk_zero_point);
6380
6381
6382 const __m128i vprod15x01234567lo = _mm_mullo_epi16(vxi15x01234567, vxk15x01234567);
6383 const __m128i vprod15x01234567hi = _mm_mulhi_epi16(vxi15x01234567, vxk15x01234567);
6384
6385 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod15x01234567lo, vprod15x01234567hi));
6386 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod15x01234567lo, vprod15x01234567hi));
6387
6388 const __m128i vi16x01234567 = _mm_loadl_epi64((const __m128i*) i16);
6389 const __m128i vxi16x01234567 = _mm_cvtepu8_epi16(vi16x01234567);
6390 const __m128i vk16x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 128 * sizeof(uint8_t)));
6391 const __m128i vxk16x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk16x01234567), vk_zero_point);
6392
6393
6394 const __m128i vprod16x01234567lo = _mm_mullo_epi16(vxi16x01234567, vxk16x01234567);
6395 const __m128i vprod16x01234567hi = _mm_mulhi_epi16(vxi16x01234567, vxk16x01234567);
6396
6397 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod16x01234567lo, vprod16x01234567hi));
6398 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod16x01234567lo, vprod16x01234567hi));
6399
6400 const __m128i vi17x01234567 = _mm_loadl_epi64((const __m128i*) i17);
6401 const __m128i vxi17x01234567 = _mm_cvtepu8_epi16(vi17x01234567);
6402 const __m128i vk17x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 136 * sizeof(uint8_t)));
6403 const __m128i vxk17x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk17x01234567), vk_zero_point);
6404
6405
6406 const __m128i vprod17x01234567lo = _mm_mullo_epi16(vxi17x01234567, vxk17x01234567);
6407 const __m128i vprod17x01234567hi = _mm_mulhi_epi16(vxi17x01234567, vxk17x01234567);
6408
6409 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod17x01234567lo, vprod17x01234567hi));
6410 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod17x01234567lo, vprod17x01234567hi));
6411
6412 const __m128i vi18x01234567 = _mm_loadl_epi64((const __m128i*) i18);
6413 const __m128i vxi18x01234567 = _mm_cvtepu8_epi16(vi18x01234567);
6414 const __m128i vk18x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 144 * sizeof(uint8_t)));
6415 const __m128i vxk18x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk18x01234567), vk_zero_point);
6416
6417
6418 const __m128i vprod18x01234567lo = _mm_mullo_epi16(vxi18x01234567, vxk18x01234567);
6419 const __m128i vprod18x01234567hi = _mm_mulhi_epi16(vxi18x01234567, vxk18x01234567);
6420
6421 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod18x01234567lo, vprod18x01234567hi));
6422 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod18x01234567lo, vprod18x01234567hi));
6423
6424 const __m128i vi19x01234567 = _mm_loadl_epi64((const __m128i*) i19);
6425 const __m128i vxi19x01234567 = _mm_cvtepu8_epi16(vi19x01234567);
6426 const __m128i vk19x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 152 * sizeof(uint8_t)));
6427 const __m128i vxk19x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk19x01234567), vk_zero_point);
6428
6429
6430 const __m128i vprod19x01234567lo = _mm_mullo_epi16(vxi19x01234567, vxk19x01234567);
6431 const __m128i vprod19x01234567hi = _mm_mulhi_epi16(vxi19x01234567, vxk19x01234567);
6432
6433 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod19x01234567lo, vprod19x01234567hi));
6434 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod19x01234567lo, vprod19x01234567hi));
6435
6436 const __m128i vi20x01234567 = _mm_loadl_epi64((const __m128i*) i20);
6437 const __m128i vxi20x01234567 = _mm_cvtepu8_epi16(vi20x01234567);
6438 const __m128i vk20x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 160 * sizeof(uint8_t)));
6439 const __m128i vxk20x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk20x01234567), vk_zero_point);
6440
6441
6442 const __m128i vprod20x01234567lo = _mm_mullo_epi16(vxi20x01234567, vxk20x01234567);
6443 const __m128i vprod20x01234567hi = _mm_mulhi_epi16(vxi20x01234567, vxk20x01234567);
6444
6445 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod20x01234567lo, vprod20x01234567hi));
6446 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod20x01234567lo, vprod20x01234567hi));
6447
6448 const __m128i vi21x01234567 = _mm_loadl_epi64((const __m128i*) i21);
6449 const __m128i vxi21x01234567 = _mm_cvtepu8_epi16(vi21x01234567);
6450 const __m128i vk21x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 168 * sizeof(uint8_t)));
6451 const __m128i vxk21x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk21x01234567), vk_zero_point);
6452
6453
6454 const __m128i vprod21x01234567lo = _mm_mullo_epi16(vxi21x01234567, vxk21x01234567);
6455 const __m128i vprod21x01234567hi = _mm_mulhi_epi16(vxi21x01234567, vxk21x01234567);
6456
6457 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod21x01234567lo, vprod21x01234567hi));
6458 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod21x01234567lo, vprod21x01234567hi));
6459
6460 const __m128i vi22x01234567 = _mm_loadl_epi64((const __m128i*) i22);
6461 const __m128i vxi22x01234567 = _mm_cvtepu8_epi16(vi22x01234567);
6462 const __m128i vk22x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 176 * sizeof(uint8_t)));
6463 const __m128i vxk22x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk22x01234567), vk_zero_point);
6464
6465
6466 const __m128i vprod22x01234567lo = _mm_mullo_epi16(vxi22x01234567, vxk22x01234567);
6467 const __m128i vprod22x01234567hi = _mm_mulhi_epi16(vxi22x01234567, vxk22x01234567);
6468
6469 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod22x01234567lo, vprod22x01234567hi));
6470 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod22x01234567lo, vprod22x01234567hi));
6471
6472 const __m128i vi23x01234567 = _mm_loadl_epi64((const __m128i*) i23);
6473 const __m128i vxi23x01234567 = _mm_cvtepu8_epi16(vi23x01234567);
6474 const __m128i vk23x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 184 * sizeof(uint8_t)));
6475 const __m128i vxk23x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk23x01234567), vk_zero_point);
6476
6477
6478 const __m128i vprod23x01234567lo = _mm_mullo_epi16(vxi23x01234567, vxk23x01234567);
6479 const __m128i vprod23x01234567hi = _mm_mulhi_epi16(vxi23x01234567, vxk23x01234567);
6480
6481 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod23x01234567lo, vprod23x01234567hi));
6482 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod23x01234567lo, vprod23x01234567hi));
6483
6484 const __m128i vi24x01234567 = _mm_loadl_epi64((const __m128i*) i24);
6485 const __m128i vxi24x01234567 = _mm_cvtepu8_epi16(vi24x01234567);
6486 const __m128i vk24x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 192 * sizeof(uint8_t)));
6487 const __m128i vxk24x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk24x01234567), vk_zero_point);
6488
6489
6490 const __m128i vprod24x01234567lo = _mm_mullo_epi16(vxi24x01234567, vxk24x01234567);
6491 const __m128i vprod24x01234567hi = _mm_mulhi_epi16(vxi24x01234567, vxk24x01234567);
6492
6493 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod24x01234567lo, vprod24x01234567hi));
6494 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod24x01234567lo, vprod24x01234567hi));
6495
6496
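// Requantization: convert the int32 accumulators to float, apply the fp32
// scale, clamp to output_max - output_zero_point, round back to int32, then
// add the zero point and pack down to uint8 with saturation. The lower bound
// (output_min) is enforced on the packed bytes.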
6497 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
6498 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
6499
6500 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
6501 vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
6502 vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
6503
6504 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
6505 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
6506 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
6507
6508 vacc0123 = _mm_cvtps_epi32(vscaled0123);
6509 vacc4567 = _mm_cvtps_epi32(vscaled4567);
6510
6511
6512 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
6513 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
6514
6515 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
6516
6517 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
6518
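// Store the remaining 1-7 bytes: 4-, 2-, and 1-byte pieces are written in
// turn, shifting the consumed lanes out of the vector after each store.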
6519 if (c & 4) {
6520 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
6521 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
6522 output += 4;
6523 }
6524 if (c & 2) {
6525 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
6526 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
6527 output += 2;
6528 }
6529 if (c & 1) {
6530 *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
6531 output += 1;
6532 }
6533 }
6534 }
6535
6536 output = (uint8_t*) ((uintptr_t) output + output_increment);
6537 } while (--output_width != 0);
6538 }
6539
6540 void xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16(
6541 size_t channels,
6542 size_t output_width,
6543 const uint8_t** input,
6544 const void* weights,
6545 uint8_t* output,
6546 size_t input_stride,
6547 size_t output_increment,
6548 size_t input_offset,
6549 const uint8_t* zero,
6550 const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6551 {
6552 assert(channels != 0);
6553 assert(output_width != 0);
6554
6555 do {
6556 const uint8_t* i0 = input[0];
6557 assert(i0 != NULL);
6558 if XNN_UNPREDICTABLE(i0 != zero) {
6559 i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
6560 }
6561 const uint8_t* i1 = input[1];
6562 assert(i1 != NULL);
6563 if XNN_UNPREDICTABLE(i1 != zero) {
6564 i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
6565 }
6566 const uint8_t* i2 = input[2];
6567 assert(i2 != NULL);
6568 if XNN_UNPREDICTABLE(i2 != zero) {
6569 i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
6570 }
6571 const uint8_t* i3 = input[3];
6572 assert(i3 != NULL);
6573 if XNN_UNPREDICTABLE(i3 != zero) {
6574 i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
6575 }
6576 const uint8_t* i4 = input[4];
6577 assert(i4 != NULL);
6578 if XNN_UNPREDICTABLE(i4 != zero) {
6579 i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
6580 }
6581 const uint8_t* i5 = input[5];
6582 assert(i5 != NULL);
6583 if XNN_UNPREDICTABLE(i5 != zero) {
6584 i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
6585 }
6586 const uint8_t* i6 = input[6];
6587 assert(i6 != NULL);
6588 if XNN_UNPREDICTABLE(i6 != zero) {
6589 i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
6590 }
6591 const uint8_t* i7 = input[7];
6592 assert(i7 != NULL);
6593 if XNN_UNPREDICTABLE(i7 != zero) {
6594 i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
6595 }
6596 const uint8_t* i8 = input[8];
6597 assert(i8 != NULL);
6598 if XNN_UNPREDICTABLE(i8 != zero) {
6599 i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
6600 }
6601 input = (const uint8_t**) ((uintptr_t) input + input_stride);
6602
6603 size_t c = channels;
6604 const void* w = weights;
6605 const __m128i vk_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point);
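// Main loop: 8 channels per iteration. Each of the 9 taps loads 8 input
// bytes and 8 kernel bytes, widens them to 16 bits (subtracting the kernel
// zero point), and accumulates 32-bit products into vacc0123/vacc4567.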
6606 for (; c >= 8; c -= 8) {
6607 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
6608 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
6609
6610
6611 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
6612 const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(vi0x01234567);
6613 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(uint8_t)));
6614 const __m128i vxk0x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk0x01234567), vk_zero_point);
6615 i0 += 8;
6616
6617
6618 const __m128i vprod0x01234567lo = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
6619 const __m128i vprod0x01234567hi = _mm_mulhi_epi16(vxi0x01234567, vxk0x01234567);
6620
6621 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod0x01234567lo, vprod0x01234567hi));
6622 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod0x01234567lo, vprod0x01234567hi));
6623
6624 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
6625 const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(vi1x01234567);
6626 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(uint8_t)));
6627 const __m128i vxk1x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk1x01234567), vk_zero_point);
6628 i1 += 8;
6629
6630
6631 const __m128i vprod1x01234567lo = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567);
6632 const __m128i vprod1x01234567hi = _mm_mulhi_epi16(vxi1x01234567, vxk1x01234567);
6633
6634 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod1x01234567lo, vprod1x01234567hi));
6635 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod1x01234567lo, vprod1x01234567hi));
6636
6637 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
6638 const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(vi2x01234567);
6639 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(uint8_t)));
6640 const __m128i vxk2x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk2x01234567), vk_zero_point);
6641 i2 += 8;
6642
6643
6644 const __m128i vprod2x01234567lo = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
6645 const __m128i vprod2x01234567hi = _mm_mulhi_epi16(vxi2x01234567, vxk2x01234567);
6646
6647 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod2x01234567lo, vprod2x01234567hi));
6648 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod2x01234567lo, vprod2x01234567hi));
6649
6650 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
6651 const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(vi3x01234567);
6652 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(uint8_t)));
6653 const __m128i vxk3x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk3x01234567), vk_zero_point);
6654 i3 += 8;
6655
6656
6657 const __m128i vprod3x01234567lo = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567);
6658 const __m128i vprod3x01234567hi = _mm_mulhi_epi16(vxi3x01234567, vxk3x01234567);
6659
6660 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod3x01234567lo, vprod3x01234567hi));
6661 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod3x01234567lo, vprod3x01234567hi));
6662
6663 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
6664 const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(vi4x01234567);
6665 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(uint8_t)));
6666 const __m128i vxk4x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk4x01234567), vk_zero_point);
6667 i4 += 8;
6668
6669
6670 const __m128i vprod4x01234567lo = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
6671 const __m128i vprod4x01234567hi = _mm_mulhi_epi16(vxi4x01234567, vxk4x01234567);
6672
6673 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod4x01234567lo, vprod4x01234567hi));
6674 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod4x01234567lo, vprod4x01234567hi));
6675
6676 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
6677 const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(vi5x01234567);
6678 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(uint8_t)));
6679 const __m128i vxk5x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk5x01234567), vk_zero_point);
6680 i5 += 8;
6681
6682
6683 const __m128i vprod5x01234567lo = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567);
6684 const __m128i vprod5x01234567hi = _mm_mulhi_epi16(vxi5x01234567, vxk5x01234567);
6685
6686 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod5x01234567lo, vprod5x01234567hi));
6687 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod5x01234567lo, vprod5x01234567hi));
6688
6689 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
6690 const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(vi6x01234567);
6691 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(uint8_t)));
6692 const __m128i vxk6x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk6x01234567), vk_zero_point);
6693 i6 += 8;
6694
6695
6696 const __m128i vprod6x01234567lo = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
6697 const __m128i vprod6x01234567hi = _mm_mulhi_epi16(vxi6x01234567, vxk6x01234567);
6698
6699 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod6x01234567lo, vprod6x01234567hi));
6700 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod6x01234567lo, vprod6x01234567hi));
6701
6702 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
6703 const __m128i vxi7x01234567 = _mm_cvtepu8_epi16(vi7x01234567);
6704 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(uint8_t)));
6705 const __m128i vxk7x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk7x01234567), vk_zero_point);
6706 i7 += 8;
6707
6708
6709 const __m128i vprod7x01234567lo = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567);
6710 const __m128i vprod7x01234567hi = _mm_mulhi_epi16(vxi7x01234567, vxk7x01234567);
6711
6712 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod7x01234567lo, vprod7x01234567hi));
6713 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod7x01234567lo, vprod7x01234567hi));
6714
6715 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
6716 const __m128i vxi8x01234567 = _mm_cvtepu8_epi16(vi8x01234567);
6717 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(uint8_t)));
6718 const __m128i vxk8x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk8x01234567), vk_zero_point);
6719 i8 += 8;
6720
6721
6722 const __m128i vprod8x01234567lo = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
6723 const __m128i vprod8x01234567hi = _mm_mulhi_epi16(vxi8x01234567, vxk8x01234567);
6724
6725 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod8x01234567lo, vprod8x01234567hi));
6726 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod8x01234567lo, vprod8x01234567hi));
6727
6728 w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(uint8_t));
6729
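// Requantize: int32 accumulators -> float -> scale -> clamp -> int32 ->
// pack with saturation to uint8, then apply the output_min bound.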
6730 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
6731 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
6732
6733 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
6734 vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
6735 vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
6736
6737 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
6738 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
6739 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
6740
6741 vacc0123 = _mm_cvtps_epi32(vscaled0123);
6742 vacc4567 = _mm_cvtps_epi32(vscaled4567);
6743
6744 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
6745 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
6746
6747 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
6748
6749 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
6750 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
6751
6752 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
6753 output += 8;
6754 }
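// Remainder: the final 1-7 channels reuse the 9-tap sequence above and
// finish with byte-granular stores.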
6755 if XNN_UNLIKELY(c != 0) {
6756 {
6757 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
6758 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
6759
6760
6761 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
6762 const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(vi0x01234567);
6763 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(uint8_t)));
6764 const __m128i vxk0x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk0x01234567), vk_zero_point);
6765
6766
6767 const __m128i vprod0x01234567lo = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
6768 const __m128i vprod0x01234567hi = _mm_mulhi_epi16(vxi0x01234567, vxk0x01234567);
6769
6770 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod0x01234567lo, vprod0x01234567hi));
6771 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod0x01234567lo, vprod0x01234567hi));
6772
6773 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
6774 const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(vi1x01234567);
6775 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(uint8_t)));
6776 const __m128i vxk1x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk1x01234567), vk_zero_point);
6777
6778
6779 const __m128i vprod1x01234567lo = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567);
6780 const __m128i vprod1x01234567hi = _mm_mulhi_epi16(vxi1x01234567, vxk1x01234567);
6781
6782 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod1x01234567lo, vprod1x01234567hi));
6783 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod1x01234567lo, vprod1x01234567hi));
6784
6785 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
6786 const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(vi2x01234567);
6787 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(uint8_t)));
6788 const __m128i vxk2x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk2x01234567), vk_zero_point);
6789
6790
6791 const __m128i vprod2x01234567lo = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
6792 const __m128i vprod2x01234567hi = _mm_mulhi_epi16(vxi2x01234567, vxk2x01234567);
6793
6794 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod2x01234567lo, vprod2x01234567hi));
6795 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod2x01234567lo, vprod2x01234567hi));
6796
6797 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
6798 const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(vi3x01234567);
6799 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(uint8_t)));
6800 const __m128i vxk3x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk3x01234567), vk_zero_point);
6801
6802
6803 const __m128i vprod3x01234567lo = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567);
6804 const __m128i vprod3x01234567hi = _mm_mulhi_epi16(vxi3x01234567, vxk3x01234567);
6805
6806 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod3x01234567lo, vprod3x01234567hi));
6807 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod3x01234567lo, vprod3x01234567hi));
6808
6809 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
6810 const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(vi4x01234567);
6811 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(uint8_t)));
6812 const __m128i vxk4x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk4x01234567), vk_zero_point);
6813
6814
6815 const __m128i vprod4x01234567lo = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
6816 const __m128i vprod4x01234567hi = _mm_mulhi_epi16(vxi4x01234567, vxk4x01234567);
6817
6818 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod4x01234567lo, vprod4x01234567hi));
6819 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod4x01234567lo, vprod4x01234567hi));
6820
6821 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
6822 const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(vi5x01234567);
6823 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(uint8_t)));
6824 const __m128i vxk5x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk5x01234567), vk_zero_point);
6825
6826
6827 const __m128i vprod5x01234567lo = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567);
6828 const __m128i vprod5x01234567hi = _mm_mulhi_epi16(vxi5x01234567, vxk5x01234567);
6829
6830 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod5x01234567lo, vprod5x01234567hi));
6831 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod5x01234567lo, vprod5x01234567hi));
6832
6833 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
6834 const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(vi6x01234567);
6835 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(uint8_t)));
6836 const __m128i vxk6x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk6x01234567), vk_zero_point);
6837
6838
6839 const __m128i vprod6x01234567lo = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
6840 const __m128i vprod6x01234567hi = _mm_mulhi_epi16(vxi6x01234567, vxk6x01234567);
6841
6842 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod6x01234567lo, vprod6x01234567hi));
6843 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod6x01234567lo, vprod6x01234567hi));
6844
6845 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
6846 const __m128i vxi7x01234567 = _mm_cvtepu8_epi16(vi7x01234567);
6847 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(uint8_t)));
6848 const __m128i vxk7x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk7x01234567), vk_zero_point);
6849
6850
6851 const __m128i vprod7x01234567lo = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567);
6852 const __m128i vprod7x01234567hi = _mm_mulhi_epi16(vxi7x01234567, vxk7x01234567);
6853
6854 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod7x01234567lo, vprod7x01234567hi));
6855 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod7x01234567lo, vprod7x01234567hi));
6856
6857 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
6858 const __m128i vxi8x01234567 = _mm_cvtepu8_epi16(vi8x01234567);
6859 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(uint8_t)));
6860 const __m128i vxk8x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk8x01234567), vk_zero_point);
6861
6862
6863 const __m128i vprod8x01234567lo = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
6864 const __m128i vprod8x01234567hi = _mm_mulhi_epi16(vxi8x01234567, vxk8x01234567);
6865
6866 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod8x01234567lo, vprod8x01234567hi));
6867 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod8x01234567lo, vprod8x01234567hi));
6868
6869
6870 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
6871 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
6872
6873 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
6874 vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
6875 vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
6876
6877 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
6878 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
6879 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
6880
6881 vacc0123 = _mm_cvtps_epi32(vscaled0123);
6882 vacc4567 = _mm_cvtps_epi32(vscaled4567);
6883
6884
6885 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
6886 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
6887
6888 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
6889
6890 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
6891
6892 if (c & 4) {
6893 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
6894 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
6895 output += 4;
6896 }
6897 if (c & 2) {
6898 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
6899 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
6900 output += 2;
6901 }
6902 if (c & 1) {
6903 *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
6904 output += 1;
6905 }
6906 }
6907 }
6908
6909 output = (uint8_t*) ((uintptr_t) output + output_increment);
6910 } while (--output_width != 0);
6911 }
6912
6913 void xnn_qu8_f32_vcvt_ukernel__sse41_x16(
6914 size_t n,
6915 const uint8_t* x,
6916 float* y,
6917 const union xnn_qu8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6918 {
6919 assert(n != 0);
6920 assert(n % sizeof(uint8_t) == 0);
6921 assert(x != NULL);
6922 assert(y != NULL);
6923
6924 const __m128i vminus_zero_point = _mm_load_si128((const __m128i*) params->sse4.minus_zero_point);
6925 const __m128 vscale = _mm_load_ps(params->sse4.scale);
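// Dequantize y = scale * (x - zero_point); the zero-point subtraction is
// folded into an add of the precomputed minus_zero_point. The main loop
// converts 16 elements per iteration.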
6926 for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
6927 __m128i vx0123 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x)));
6928 __m128i vx4567 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 4)));
6929 __m128i vx89AB = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 8)));
6930 __m128i vxCDEF = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 12)));
6931 x += 16;
6932
6933 vx0123 = _mm_add_epi32(vx0123, vminus_zero_point);
6934 vx4567 = _mm_add_epi32(vx4567, vminus_zero_point);
6935 vx89AB = _mm_add_epi32(vx89AB, vminus_zero_point);
6936 vxCDEF = _mm_add_epi32(vxCDEF, vminus_zero_point);
6937
6938 __m128 vy0123 = _mm_cvtepi32_ps(vx0123);
6939 __m128 vy4567 = _mm_cvtepi32_ps(vx4567);
6940 __m128 vy89AB = _mm_cvtepi32_ps(vx89AB);
6941 __m128 vyCDEF = _mm_cvtepi32_ps(vxCDEF);
6942
6943 vy0123 = _mm_mul_ps(vy0123, vscale);
6944 vy4567 = _mm_mul_ps(vy4567, vscale);
6945 vy89AB = _mm_mul_ps(vy89AB, vscale);
6946 vyCDEF = _mm_mul_ps(vyCDEF, vscale);
6947
6948 _mm_storeu_ps(y, vy0123);
6949 _mm_storeu_ps(y + 4, vy4567);
6950 _mm_storeu_ps(y + 8, vy89AB);
6951 _mm_storeu_ps(y + 12, vyCDEF);
6952 y += 16;
6953 }
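// Process remaining groups of 4 elements.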
6954 for (; n >= 4 * sizeof(uint8_t); n -= 4 * sizeof(uint8_t)) {
6955 __m128i vx = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x)));
6956 vx = _mm_add_epi32(vx, vminus_zero_point);
6957 x += 4;
6958
6959 __m128 vy = _mm_cvtepi32_ps(vx);
6960 vy = _mm_mul_ps(vy, vscale);
6961
6962 _mm_storeu_ps(y, vy);
6963 y += 4;
6964 }
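// Tail of 1-3 elements: a full 4-byte load is used (covered by the
// XNN_OOB_READS annotation) and only the valid lanes are stored.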
6965 if XNN_UNLIKELY(n != 0) {
6966 assert(n >= 1 * sizeof(uint8_t));
6967 assert(n <= 3 * sizeof(uint8_t));
6968
6969 __m128i vx = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x)));
6970 vx = _mm_add_epi32(vx, vminus_zero_point);
6971
6972 __m128 vy = _mm_cvtepi32_ps(vx);
6973 vy = _mm_mul_ps(vy, vscale);
6974
6975 if (n & (2 * sizeof(uint8_t))) {
6976 _mm_storel_pi((__m64*) y, vy);
6977 vy = _mm_movehl_ps(vy, vy);
6978 y += 2;
6979 }
6980 if (n & (1 * sizeof(uint8_t))) {
6981 _mm_store_ss(y, vy);
6982 }
6983 }
6984 }
6985
6986 void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8(
6987 size_t rows,
6988 size_t channels,
6989 const uint8_t* input,
6990 size_t input_stride,
6991 const uint8_t* zero,
6992 int32_t* buffer,
6993 uint8_t* output,
6994 const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6995 {
6996 assert(rows > 7);
6997 assert(channels != 0);
6998
6999 const uint8_t* i0 = input;
7000 const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
7001 const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
7002 const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
7003 const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
7004 const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
7005 const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
7006 const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
7007
7008 const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias);
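// First pass: sum the first 7 rows, 8 channels at a time, and store the
// init_bias-seeded 32-bit partial sums into the buffer.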
7009 int32_t* b = buffer;
7010 size_t c = channels;
7011 for (; c != 0; c = doz(c, 8)) {
7012 const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
7013 i0 += 8;
7014 const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
7015 i1 += 8;
7016
7017 __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
7018 const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
7019 i2 += 8;
7020
7021 vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
7022 const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
7023 i3 += 8;
7024 vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
7025 const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
7026 i4 += 8;
7027 vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
7028 const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
7029 i5 += 8;
7030 vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
7031 const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
7032 i6 += 8;
7033
7034 vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
7035
7036 const __m128i vzero = _mm_setzero_si128();
7037 __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
7038 __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
7039
7040 vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
7041 vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
7042
7043 _mm_store_si128((__m128i*) b, vacc0123);
7044 _mm_store_si128((__m128i*) (b + 4), vacc4567);
7045 b += 8;
7046 }
7047
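// Middle passes: each iteration accumulates 7 more rows into the buffer.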
7048 for (rows -= 7; rows > 7; rows -= 7) {
7049 i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
7050 i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
7051 i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
7052 i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
7053 i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
7054 i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
7055 i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
7056
7057 int32_t* b = buffer;
7058 size_t c = channels;
7059 for (; c != 0; c = doz(c, 8)) {
7060 const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
7061 i0 += 8;
7062 const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
7063 i1 += 8;
7064
7065 __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
7066 const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
7067 i2 += 8;
7068
7069 vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
7070 const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
7071 i3 += 8;
7072 vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
7073 const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
7074 i4 += 8;
7075 vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
7076 const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
7077 i5 += 8;
7078 vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
7079 const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
7080 i6 += 8;
7081
7082 vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
7083
7084 const __m128i vzero = _mm_setzero_si128();
7085 __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
7086 __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
7087
7088 vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b));
7089 vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4)));
7090
7091 _mm_store_si128((__m128i*) b, vacc0123);
7092 _mm_store_si128((__m128i*) (b + 4), vacc4567);
7093 b += 8;
7094 }
7095 }
7096
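// Last pass: up to 7 rows remain; pointers for rows past the end are
// redirected to the zero vector so they contribute nothing to the sums.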
7097 i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
7098 i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
7099 if XNN_UNPREDICTABLE(rows < 2) {
7100 i1 = zero;
7101 }
7102 i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
7103 if XNN_UNPREDICTABLE(rows <= 2) {
7104 i2 = zero;
7105 }
7106 i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
7107 if XNN_UNPREDICTABLE(rows < 4) {
7108 i3 = zero;
7109 }
7110 i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
7111 if XNN_UNPREDICTABLE(rows <= 4) {
7112 i4 = zero;
7113 }
7114 i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
7115 if XNN_UNPREDICTABLE(rows < 6) {
7116 i5 = zero;
7117 }
7118 i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
7119 if XNN_UNPREDICTABLE(rows <= 6) {
7120 i6 = zero;
7121 }
7122
7123 const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
7124 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
7125 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
7126 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
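// Add the buffered partial sums and requantize the averages to uint8.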
7127 for (; channels >= 8; channels -= 8) {
7128 const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
7129 i0 += 8;
7130 const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
7131 i1 += 8;
7132
7133 __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
7134 const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
7135 i2 += 8;
7136
7137 vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
7138 const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
7139 i3 += 8;
7140 vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
7141 const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
7142 i4 += 8;
7143 vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
7144 const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
7145 i5 += 8;
7146 vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
7147 const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
7148 i6 += 8;
7149
7150 vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
7151
7152 const __m128i vzero = _mm_setzero_si128();
7153 __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
7154 __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
7155
7156 vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
7157 vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
7158 buffer += 8;
7159
7160 __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
7161 __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
7162
7163 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
7164 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
7165
7166 vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
7167 vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
7168
7169 vacc0123 = _mm_cvtps_epi32(vfpacc0123);
7170 vacc4567 = _mm_cvtps_epi32(vfpacc4567);
7171
7172 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
7173
7174 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
7175
7176 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
7177
7178 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
7179 output += 8;
7180 }
7181 if XNN_UNLIKELY(channels != 0) {
7182 {
7183 const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
7184 i0 += 8;
7185 const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
7186 i1 += 8;
7187
7188 __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
7189 const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
7190 i2 += 8;
7191
7192 vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
7193 const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
7194 i3 += 8;
7195 vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
7196 const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
7197 i4 += 8;
7198 vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
7199 const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
7200 i5 += 8;
7201 vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
7202 const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
7203 i6 += 8;
7204
7205 vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
7206
7207 __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
7208 __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, _mm_setzero_si128());
7209
7210 vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
7211 vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
7212 buffer += 8;
7213
7214 __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
7215 __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
7216
7217 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
7218 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
7219
7220 vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
7221 vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
7222
7223 vacc0123 = _mm_cvtps_epi32(vfpacc0123);
7224 vacc4567 = _mm_cvtps_epi32(vfpacc4567);
7225
7226 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
7227
7228 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
7229 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
7230
7231 if (channels & 4) {
7232 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
7233 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
7234 output += 4;
7235 }
7236 if (channels & 2) {
7237 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
7238 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
7239 output += 2;
7240 }
7241 if (channels & 1) {
7242 *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
7243 }
7244 }
7245 }
7246 }
7247
7248 void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8(
7249 size_t rows,
7250 size_t channels,
7251 const uint8_t* input,
7252 size_t input_stride,
7253 const uint8_t* zero,
7254 uint8_t* output,
7255 const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7256 {
7257 assert(rows != 0);
7258 assert(rows <= 7);
7259 assert(channels != 0);
7260
7261 const uint8_t* i0 = input;
7262 const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
7263 if XNN_UNPREDICTABLE(rows < 2) {
7264 i1 = zero;
7265 }
7266 const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
7267 if XNN_UNPREDICTABLE(rows <= 2) {
7268 i2 = zero;
7269 }
7270 const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
7271 if XNN_UNPREDICTABLE(rows < 4) {
7272 i3 = zero;
7273 }
7274 const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
7275 if XNN_UNPREDICTABLE(rows <= 4) {
7276 i4 = zero;
7277 }
7278 const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
7279 if XNN_UNPREDICTABLE(rows < 6) {
7280 i5 = zero;
7281 }
7282 const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
7283 if XNN_UNPREDICTABLE(rows <= 6) {
7284 i6 = zero;
7285 }
7286
7287 const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias);
7288 const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
7289 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
7290 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
7291 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
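// Single-pass variant (rows <= 7): sum the rows, add init_bias, then
// requantize with the same fp32 scale/clamp/pack sequence.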
7292 for (; channels >= 8; channels -= 8) {
7293 const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
7294 i0 += 8;
7295 const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
7296 i1 += 8;
7297
7298 __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
7299 const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
7300 i2 += 8;
7301
7302 vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
7303 const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
7304 i3 += 8;
7305 vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
7306 const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
7307 i4 += 8;
7308 vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
7309 const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
7310 i5 += 8;
7311 vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
7312 const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
7313 i6 += 8;
7314
7315 vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
7316
7317 const __m128i vzero = _mm_setzero_si128();
7318 __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
7319 __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
7320
7321 vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
7322 vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
7323
7324 __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
7325 __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
7326
7327 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
7328 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
7329
7330 vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
7331 vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
7332
7333 vacc0123 = _mm_cvtps_epi32(vfpacc0123);
7334 vacc4567 = _mm_cvtps_epi32(vfpacc4567);
7335
7336 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
7337
7338 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
7339
7340 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
7341
7342 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
7343 output += 8;
7344 }
7345 if XNN_UNLIKELY(channels != 0) {
7346 {
7347 const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
7348 i0 += 8;
7349 const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
7350 i1 += 8;
7351
7352 __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
7353 const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
7354 i2 += 8;
7355
7356 vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
7357 const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
7358 i3 += 8;
7359 vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
7360 const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
7361 i4 += 8;
7362 vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
7363 const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
7364 i5 += 8;
7365 vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
7366 const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
7367 i6 += 8;
7368
7369 vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
7370
7371 __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
7372 __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, _mm_setzero_si128());
7373
7374 vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
7375 vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
7376
7377 __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
7378 __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
7379
7380 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
7381 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
7382
7383 vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
7384 vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
7385
7386 vacc0123 = _mm_cvtps_epi32(vfpacc0123);
7387 vacc4567 = _mm_cvtps_epi32(vfpacc4567);
7388
7389 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
7390
7391 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
7392 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
7393
7394 if (channels & 4) {
7395 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
7396 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
7397 output += 4;
7398 }
7399 if (channels & 2) {
7400 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
7401 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
7402 output += 2;
7403 }
7404 if (channels & 1) {
7405 *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
7406 }
7407 }
7408 }
7409 }
7410
7411 void xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64(
7412 size_t mr,
7413 size_t nc,
7414 size_t kc,
7415 const uint8_t* restrict a,
7416 size_t a_stride,
7417 const void* restrict w,
7418 uint8_t* restrict c,
7419 size_t cm_stride,
7420 size_t cn_stride,
7421 const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7422 {
7423 assert(mr != 0);
7424 assert(mr <= 1);
7425 assert(nc != 0);
7426 assert(kc != 0);
7427 assert(kc % sizeof(uint8_t) == 0);
7428 assert(a != NULL);
7429 assert(w != NULL);
7430 assert(c != NULL);
7431
7432 kc = round_up_po2(kc, 8);
7433 const uint8_t* a0 = a;
7434 uint8_t* c0 = c;
7435
7436 do {
7437 __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
7438 __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
7439 __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
7440 __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
7441 w = (const int32_t*) w + 4;
7442
7443 size_t k = 0;
7444 const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point);
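    // K loop in steps of 8 (the "c8 ld64" scheme): one 64-bit load for the A row
    // and for each of the 4 B columns; bytes are zero-extended to 16 bits, the
    // kernel zero point is subtracted from B, and _mm_madd_epi16 accumulates
    // eight products per column into 32-bit lanes.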
7445 while (k < kc) {
7446 const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
7447 const __m128i vxa0 = _mm_cvtepu8_epi16(va0);
7448 a0 += 8;
7449
7450 const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
7451 const __m128i vxb0 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb0), vb_zero_point);
7452
7453 vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
7454 const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 8));
7455 const __m128i vxb1 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb1), vb_zero_point);
7456
7457 vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
7458 const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 16));
7459 const __m128i vxb2 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb2), vb_zero_point);
7460
7461 vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
7462 const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 24));
7463 const __m128i vxb3 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb3), vb_zero_point);
7464
7465 vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
7466
7467 w = (const void*) ((const uint8_t*) w + 32);
7468 k += 8 * sizeof(uint8_t);
7469 }
7470
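    // Two rounds of pairwise horizontal adds collapse the four per-column
    // accumulators into one vector with a single 32-bit sum per output column.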
7471 const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
7472 const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
7473
7474 __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
7475
7476 __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
7477
7478 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
7479 vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
7480
7481 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
7482 vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
7483
7484 vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
7485
7486 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
7487 __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
7488
7489 __m128i vout = _mm_packus_epi16(vacc00x0123, vacc00x0123);
7490
7491 vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
7492
7493 if (nc >= 4) {
7494 unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
7495
7496 c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
7497
7498 a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
7499
7500 nc -= 4;
7501 } else {
7502 if (nc & 2) {
7503 unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
7504 c0 += 2;
7505 vout = _mm_srli_epi32(vout, 16);
7506 }
7507 if (nc & 1) {
7508 *c0 = (uint8_t) _mm_extract_epi8(vout, 0);
7509 }
7510
7511 nc = 0;
7512 }
7513 } while (nc != 0);
7514 }
7515
7516 void xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64(
7517 size_t mr,
7518 size_t nc,
7519 size_t kc,
7520 const uint8_t* restrict a,
7521 size_t a_stride,
7522 const void* restrict w,
7523 uint8_t* restrict c,
7524 size_t cm_stride,
7525 size_t cn_stride,
7526 const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7527 {
7528 assert(mr != 0);
7529 assert(mr <= 3);
7530 assert(nc != 0);
7531 assert(kc != 0);
7532 assert(kc % sizeof(uint8_t) == 0);
7533 assert(a != NULL);
7534 assert(w != NULL);
7535 assert(c != NULL);
7536
7537 kc = round_up_po2(kc, 8);
7538 const uint8_t* a0 = a;
7539 uint8_t* c0 = c;
7540 const uint8_t* a1 = (const uint8_t*) ((uintptr_t) a0 + a_stride);
7541 uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
7542 if XNN_UNPREDICTABLE(mr < 2) {
7543 a1 = a0;
7544 c1 = c0;
7545 }
7546 const uint8_t* a2 = (const uint8_t*) ((uintptr_t) a1 + a_stride);
7547 uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
7548 if XNN_UNPREDICTABLE(mr <= 2) {
7549 a2 = a1;
7550 c2 = c1;
7551 }
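  // When mr < 3 the unused A/C row pointers alias the previous row, so the
  // extra rows are computed redundantly but written to the same destination.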
7552
7553 do {
7554 __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
7555 __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
7556 __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
7557 __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
7558 __m128i vacc1x0 = vacc0x0;
7559 __m128i vacc1x1 = vacc0x1;
7560 __m128i vacc1x2 = vacc0x2;
7561 __m128i vacc1x3 = vacc0x3;
7562 __m128i vacc2x0 = vacc0x0;
7563 __m128i vacc2x1 = vacc0x1;
7564 __m128i vacc2x2 = vacc0x2;
7565 __m128i vacc2x3 = vacc0x3;
7566 w = (const int32_t*) w + 4;
7567
7568 size_t k = 0;
7569 const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point);
7570 while (k < kc) {
7571 const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
7572 const __m128i vxa0 = _mm_cvtepu8_epi16(va0);
7573 a0 += 8;
7574 const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
7575 const __m128i vxa1 = _mm_cvtepu8_epi16(va1);
7576 a1 += 8;
7577 const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
7578 const __m128i vxa2 = _mm_cvtepu8_epi16(va2);
7579 a2 += 8;
7580
7581 const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
7582 const __m128i vxb0 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb0), vb_zero_point);
7583
7584 vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
7585 vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
7586 vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
7587 const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 8));
7588 const __m128i vxb1 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb1), vb_zero_point);
7589
7590 vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
7591 vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
7592 vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
7593 const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 16));
7594 const __m128i vxb2 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb2), vb_zero_point);
7595
7596 vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
7597 vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
7598 vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
7599 const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 24));
7600 const __m128i vxb3 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb3), vb_zero_point);
7601
7602 vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
7603 vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
7604 vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
7605
7606 w = (const void*) ((const uint8_t*) w + 32);
7607 k += 8 * sizeof(uint8_t);
7608 }
7609
7610 const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
7611 const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
7612 const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
7613 const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
7614 const __m128i vacc2x01 = _mm_hadd_epi32(vacc2x0, vacc2x1);
7615 const __m128i vacc2x23 = _mm_hadd_epi32(vacc2x2, vacc2x3);
7616
7617 __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
7618 __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
7619 __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
7620
7621 __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
7622 __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
7623 __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
7624
7625 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
7626 vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
7627 vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
7628 vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale);
7629
7630 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
7631 vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
7632 vscaled1x0123 = _mm_min_ps(vscaled1x0123, voutput_max_less_zero_point);
7633 vscaled2x0123 = _mm_min_ps(vscaled2x0123, voutput_max_less_zero_point);
7634
7635 vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
7636 vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
7637 vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
7638
7639 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
7640 __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
7641 __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
7642
7643 __m128i vout = _mm_packus_epi16(vacc01x0123, vacc22x0123);
7644
7645 vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
7646
7647 if (nc >= 4) {
7648 unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
7649 unaligned_store_u32(c1, (uint32_t) _mm_extract_epi32(vout, 1));
7650 unaligned_store_u32(c2, (uint32_t) _mm_extract_epi32(vout, 2));
7651
7652 c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
7653 c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
7654 c2 = (uint8_t*) ((uintptr_t) c2 + cn_stride);
7655
7656 a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
7657 a1 = (const uint8_t*) ((uintptr_t) a1 - kc);
7658 a2 = (const uint8_t*) ((uintptr_t) a2 - kc);
7659
7660 nc -= 4;
7661 } else {
7662 if (nc & 2) {
7663 unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
7664 c0 += 2;
7665 unaligned_store_u16(c1, (uint16_t) _mm_extract_epi16(vout, 2));
7666 c1 += 2;
7667 unaligned_store_u16(c2, (uint16_t) _mm_extract_epi16(vout, 4));
7668 c2 += 2;
7669 vout = _mm_srli_epi32(vout, 16);
7670 }
7671 if (nc & 1) {
7672 *c0 = (uint8_t) _mm_extract_epi8(vout, 0);
7673 *c1 = (uint8_t) _mm_extract_epi8(vout, 4);
7674 *c2 = (uint8_t) _mm_extract_epi8(vout, 8);
7675 }
7676
7677 nc = 0;
7678 }
7679 } while (nc != 0);
7680 }
7681
7682 void xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64(
7683 size_t mr,
7684 size_t nc,
7685 size_t kc,
7686 size_t ks,
7687 const uint8_t** restrict a,
7688 const void* restrict w,
7689 uint8_t* restrict c,
7690 size_t cm_stride,
7691 size_t cn_stride,
7692 size_t a_offset,
7693 const uint8_t* zero,
7694 const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7695 {
7696 assert(mr != 0);
7697 assert(mr <= 1);
7698 assert(nc != 0);
7699 assert(kc != 0);
7700 assert(ks != 0);
7701 assert(ks % (1 * sizeof(void*)) == 0);
7702 assert(a_offset % sizeof(uint8_t) == 0);
7703 assert(a != NULL);
7704 assert(w != NULL);
7705 assert(c != NULL);
7706
7707 kc = round_up_po2(kc, 8);
7708 uint8_t* c0 = c;
7709
7710 do {
7711 __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
7712 __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
7713 __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
7714 __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
7715 w = (const int32_t*) w + 4;
7716
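    // `a` is an indirection buffer holding ks input pointers per output pixel;
    // entries that equal the shared `zero` buffer skip the a_offset adjustment,
    // so padding taps read zeros instead of real data.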
7717 size_t p = ks;
7718 do {
7719 const uint8_t* restrict a0 = a[0];
7720 if XNN_UNPREDICTABLE(a0 != zero) {
7721 a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
7722 }
7723 a += 1;
7724
7725 size_t k = 0;
7726 const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point);
7727 while (k < kc) {
7728 const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
7729 const __m128i vxa0 = _mm_cvtepu8_epi16(va0);
7730 a0 += 8;
7731
7732 const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
7733 const __m128i vxb0 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb0), vb_zero_point);
7734
7735 vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
7736 const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 8));
7737 const __m128i vxb1 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb1), vb_zero_point);
7738
7739 vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
7740 const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 16));
7741 const __m128i vxb2 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb2), vb_zero_point);
7742
7743 vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
7744 const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 24));
7745 const __m128i vxb3 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb3), vb_zero_point);
7746
7747 vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
7748
7749 w = (const void*) ((const uint8_t*) w + 32);
7750 k += 8 * sizeof(uint8_t);
7751 }
7752 p -= 1 * sizeof(void*);
7753 } while (p != 0);
7754
7755 const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
7756 const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
7757
7758 __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
7759
7760 __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
7761
7762 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
7763 vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
7764
7765 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
7766 vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
7767
7768 vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
7769
7770 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
7771 __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
7772
7773 __m128i vout = _mm_packus_epi16(vacc00x0123, vacc00x0123);
7774
7775 vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
7776
7777 if (nc >= 4) {
7778 unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
7779 c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
7780
7781 a = (const uint8_t**restrict) ((uintptr_t) a - ks);
7782
7783 nc -= 4;
7784 } else {
7785 if (nc & 2) {
7786 unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
7787 c0 += 2;
7788 vout = _mm_srli_epi32(vout, 16);
7789 }
7790 if (nc & 1) {
7791 *c0 = (uint8_t) _mm_extract_epi8(vout, 0);
7792 }
7793
7794 nc = 0;
7795 }
7796 } while (nc != 0);
7797 }
7798
7799 void xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64(
7800 size_t mr,
7801 size_t nc,
7802 size_t kc,
7803 size_t ks,
7804 const uint8_t** restrict a,
7805 const void* restrict w,
7806 uint8_t* restrict c,
7807 size_t cm_stride,
7808 size_t cn_stride,
7809 size_t a_offset,
7810 const uint8_t* zero,
7811 const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7812 {
7813 assert(mr != 0);
7814 assert(mr <= 3);
7815 assert(nc != 0);
7816 assert(kc != 0);
7817 assert(ks != 0);
7818 assert(ks % (3 * sizeof(void*)) == 0);
7819 assert(a_offset % sizeof(uint8_t) == 0);
7820 assert(a != NULL);
7821 assert(w != NULL);
7822 assert(c != NULL);
7823
7824 kc = round_up_po2(kc, 8);
7825 uint8_t* c0 = c;
7826 uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
7827 if XNN_UNPREDICTABLE(mr < 2) {
7828 c1 = c0;
7829 }
7830 uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
7831 if XNN_UNPREDICTABLE(mr <= 2) {
7832 c2 = c1;
7833 }
7834
7835 do {
7836 __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
7837 __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
7838 __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
7839 __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
7840 __m128i vacc1x0 = vacc0x0;
7841 __m128i vacc1x1 = vacc0x1;
7842 __m128i vacc1x2 = vacc0x2;
7843 __m128i vacc1x3 = vacc0x3;
7844 __m128i vacc2x0 = vacc0x0;
7845 __m128i vacc2x1 = vacc0x1;
7846 __m128i vacc2x2 = vacc0x2;
7847 __m128i vacc2x3 = vacc0x3;
7848 w = (const int32_t*) w + 4;
7849
7850 size_t p = ks;
7851 do {
7852 const uint8_t* restrict a0 = a[0];
7853 if XNN_UNPREDICTABLE(a0 != zero) {
7854 a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
7855 }
7856 const uint8_t* restrict a1 = a[1];
7857 if XNN_UNPREDICTABLE(a1 != zero) {
7858 a1 = (const uint8_t*) ((uintptr_t) a1 + a_offset);
7859 }
7860 const uint8_t* restrict a2 = a[2];
7861 if XNN_UNPREDICTABLE(a2 != zero) {
7862 a2 = (const uint8_t*) ((uintptr_t) a2 + a_offset);
7863 }
7864 a += 3;
7865
7866 size_t k = 0;
7867 const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point);
7868 while (k < kc) {
7869 const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
7870 const __m128i vxa0 = _mm_cvtepu8_epi16(va0);
7871 a0 += 8;
7872 const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
7873 const __m128i vxa1 = _mm_cvtepu8_epi16(va1);
7874 a1 += 8;
7875 const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
7876 const __m128i vxa2 = _mm_cvtepu8_epi16(va2);
7877 a2 += 8;
7878
7879 const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
7880 const __m128i vxb0 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb0), vb_zero_point);
7881
7882 vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
7883 vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
7884 vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
7885 const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 8));
7886 const __m128i vxb1 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb1), vb_zero_point);
7887
7888 vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
7889 vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
7890 vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
7891 const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 16));
7892 const __m128i vxb2 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb2), vb_zero_point);
7893
7894 vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
7895 vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
7896 vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
7897 const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 24));
7898 const __m128i vxb3 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb3), vb_zero_point);
7899
7900 vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
7901 vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
7902 vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
7903
7904 w = (const void*) ((const uint8_t*) w + 32);
7905 k += 8 * sizeof(uint8_t);
7906 }
7907 p -= 3 * sizeof(void*);
7908 } while (p != 0);
7909
7910 const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
7911 const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
7912 const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
7913 const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
7914 const __m128i vacc2x01 = _mm_hadd_epi32(vacc2x0, vacc2x1);
7915 const __m128i vacc2x23 = _mm_hadd_epi32(vacc2x2, vacc2x3);
7916
7917 __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
7918 __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
7919 __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
7920
7921 __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
7922 __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
7923 __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
7924
7925 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
7926 vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
7927 vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
7928 vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale);
7929
7930 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
7931 vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
7932 vscaled1x0123 = _mm_min_ps(vscaled1x0123, voutput_max_less_zero_point);
7933 vscaled2x0123 = _mm_min_ps(vscaled2x0123, voutput_max_less_zero_point);
7934
7935 vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
7936 vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
7937 vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
7938
7939 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
7940 __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
7941 __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
7942
7943 __m128i vout = _mm_packus_epi16(vacc01x0123, vacc22x0123);
7944
7945 vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
7946
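    // Rows are stored last-to-first: if mr < 3 the aliased row pointers overlap,
    // and this order leaves the lowest valid row's data in place.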
7947 if (nc >= 4) {
7948 unaligned_store_u32(c2, (uint32_t) _mm_extract_epi32(vout, 2));
7949 c2 = (uint8_t*) ((uintptr_t) c2 + cn_stride);
7950 unaligned_store_u32(c1, (uint32_t) _mm_extract_epi32(vout, 1));
7951 c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
7952 unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
7953 c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
7954
7955 a = (const uint8_t**restrict) ((uintptr_t) a - ks);
7956
7957 nc -= 4;
7958 } else {
7959 if (nc & 2) {
7960 unaligned_store_u16(c2, (uint16_t) _mm_extract_epi16(vout, 4));
7961 c2 += 2;
7962 unaligned_store_u16(c1, (uint16_t) _mm_extract_epi16(vout, 2));
7963 c1 += 2;
7964 unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
7965 c0 += 2;
7966 vout = _mm_srli_epi32(vout, 16);
7967 }
7968 if (nc & 1) {
7969 *c2 = (uint8_t) _mm_extract_epi8(vout, 8);
7970 *c1 = (uint8_t) _mm_extract_epi8(vout, 4);
7971 *c0 = (uint8_t) _mm_extract_epi8(vout, 0);
7972 }
7973
7974 nc = 0;
7975 }
7976 } while (nc != 0);
7977 }
7978
7979 void xnn_qu8_vadd_minmax_ukernel__sse41_mul16_ld64_x8(
7980 size_t n,
7981 const uint8_t* input_a,
7982 const uint8_t* input_b,
7983 uint8_t* output,
7984 const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7985 {
7986 const __m128i vbias = _mm_load_si128((const __m128i*) params->sse2.bias);
7987 const __m128i va_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_lo);
7988 const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_hi);
7989 const __m128i vb_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.b_multiplier_lo);
7990 const __m128i vb_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.b_multiplier_hi);
7991 const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
7992 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
7993 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
7994 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
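  // Fixed-point add: each widened operand is multiplied by a 32-bit multiplier
  // split into lo/hi 16-bit halves (mullo/mulhi recombination), added to a bias
  // that presumably pre-folds the zero-point terms, then shifted right
  // arithmetically and repacked with saturating clamps.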
7995
7996 for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
7997 const __m128i va01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
7998 const __m128i vb01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) input_b));
7999 input_a += 8;
8000 input_b += 8;
8001
8002
8003 __m128i vaprod01234567hi = _mm_mulhi_epu16(va01234567, va_multiplier_lo);
8004 __m128i vbprod01234567hi = _mm_mulhi_epu16(vb01234567, vb_multiplier_lo);
8005 const __m128i vaprod01234567lo = _mm_mullo_epi16(va01234567, va_multiplier_lo);
8006 const __m128i vbprod01234567lo = _mm_mullo_epi16(vb01234567, vb_multiplier_lo);
8007
8008 vaprod01234567hi = _mm_add_epi16(vaprod01234567hi, _mm_mullo_epi16(va01234567, va_multiplier_hi));
8009 vbprod01234567hi = _mm_add_epi16(vbprod01234567hi, _mm_mullo_epi16(vb01234567, vb_multiplier_hi));
8010
8011
8012 __m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
8013 __m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
8014
8015 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
8016 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
8017
8018 vacc0123 = _mm_sra_epi32(vacc0123, vshift);
8019 vacc4567 = _mm_sra_epi32(vacc4567, vshift);
8020
8021 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
8022
8023
8024 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
8025
8026 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
8027
8028 vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
8029
8030 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
8031 output += 8;
8032 }
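  // Up to 7 trailing elements: the 8-byte loads may read past `n`, but only the
  // valid bytes are written out by the partial stores below.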
8033 if XNN_UNLIKELY(n != 0) {
8034 {
8035 const __m128i va01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
8036 const __m128i vb01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) input_b));
8037
8038
8039 __m128i vaprod01234567hi = _mm_mulhi_epu16(va01234567, va_multiplier_lo);
8040 __m128i vbprod01234567hi = _mm_mulhi_epu16(vb01234567, vb_multiplier_lo);
8041 const __m128i vaprod01234567lo = _mm_mullo_epi16(va01234567, va_multiplier_lo);
8042 const __m128i vbprod01234567lo = _mm_mullo_epi16(vb01234567, vb_multiplier_lo);
8043
8044 vaprod01234567hi = _mm_add_epi16(vaprod01234567hi, _mm_mullo_epi16(va01234567, va_multiplier_hi));
8045 vbprod01234567hi = _mm_add_epi16(vbprod01234567hi, _mm_mullo_epi16(vb01234567, vb_multiplier_hi));
8046
8047
8048 __m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
8049 __m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
8050
8051 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
8052 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
8053
8054 vacc0123 = _mm_sra_epi32(vacc0123, vshift);
8055 vacc4567 = _mm_sra_epi32(vacc4567, vshift);
8056
8057 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
8058
8059 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
8060 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
8061 vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
8062
8063 if (n & (4 * sizeof(uint8_t))) {
8064 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
8065 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
8066 output += 4;
8067 }
8068 if (n & (2 * sizeof(uint8_t))) {
8069 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
8070 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
8071 output += 2;
8072 }
8073 if (n & (1 * sizeof(uint8_t))) {
8074 *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
8075 }
8076 }
8077 }
8078 }
8079
8080 void xnn_qu8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8(
8081 size_t n,
8082 const uint8_t* input_a,
8083 const uint8_t* input_b,
8084 uint8_t* output,
8085 const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8086 {
8087 const __m128i vbias = _mm_add_epi32(
8088 _mm_shuffle_epi32(_mm_cvtsi32_si128(params->sse2.b_multiplier * (int32_t) *input_b), _MM_SHUFFLE(0, 0, 0, 0)),
8089 _mm_load_si128((const __m128i*) params->sse2.bias));
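  // The scalar addend is folded into the bias once (b_multiplier * *input_b),
  // so the loop body only has to scale and shift input_a.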
8090 const __m128i va_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_lo);
8091 const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_hi);
8092 const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
8093 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
8094 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
8095 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
8096
8097 for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
8098 const __m128i va01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
8099 input_a += 8;
8100
8101
8102 __m128i vaprod01234567hi = _mm_mulhi_epu16(va01234567, va_multiplier_lo);
8103 const __m128i vaprod01234567lo = _mm_mullo_epi16(va01234567, va_multiplier_lo);
8104
8105 vaprod01234567hi = _mm_add_epi16(vaprod01234567hi, _mm_mullo_epi16(va01234567, va_multiplier_hi));
8106
8107
8108 __m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
8109 __m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
8110
8111 vacc0123 = _mm_sra_epi32(vacc0123, vshift);
8112 vacc4567 = _mm_sra_epi32(vacc4567, vshift);
8113
8114 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
8115
8116
8117 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
8118
8119 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
8120
8121 vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
8122
8123 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
8124 output += 8;
8125 }
8126 if XNN_UNLIKELY(n != 0) {
8127 {
8128 const __m128i va01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
8129
8130
8131 __m128i vaprod01234567hi = _mm_mulhi_epu16(va01234567, va_multiplier_lo);
8132 const __m128i vaprod01234567lo = _mm_mullo_epi16(va01234567, va_multiplier_lo);
8133
8134 vaprod01234567hi = _mm_add_epi16(vaprod01234567hi, _mm_mullo_epi16(va01234567, va_multiplier_hi));
8135
8136
8137 __m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
8138 __m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
8139
8140 vacc0123 = _mm_sra_epi32(vacc0123, vshift);
8141 vacc4567 = _mm_sra_epi32(vacc4567, vshift);
8142
8143 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
8144
8145 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
8146 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
8147 vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
8148
8149 if (n & (4 * sizeof(uint8_t))) {
8150 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
8151 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
8152 output += 4;
8153 }
8154 if (n & (2 * sizeof(uint8_t))) {
8155 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
8156 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
8157 output += 2;
8158 }
8159 if (n & (1 * sizeof(uint8_t))) {
8160 *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
8161 }
8162 }
8163 }
8164 }
8165
8166 void xnn_qu8_vcvt_ukernel__sse41_x32(
8167 size_t n,
8168 const uint8_t* x,
8169 uint8_t* y,
8170 const union xnn_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8171 {
8172 assert(n != 0);
8173 assert(n % sizeof(uint8_t) == 0);
8174 assert(x != NULL);
8175 assert(y != NULL);
8176
8177 const __m128i vinput_zero_point = _mm_load_si128((const __m128i*) params->ssse3.input_zero_point);
8178 const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->ssse3.multiplier);
8179 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->ssse3.output_zero_point);
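  // Requantization via a Q15 rounding multiply: (zero_point - x) is shifted
  // left by 7 into Q15 range and fed to _mm_mulhrs_epi16; the reversed
  // subtraction is presumably compensated by the sign of the precomputed
  // multiplier. The result is rebiased by the output zero point and packed.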
8180 for (; n >= 32 * sizeof(uint8_t); n -= 32 * sizeof(uint8_t)) {
8181 __m128i vacc0 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) x));
8182 __m128i vacc1 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (x + 8)));
8183 __m128i vacc2 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (x + 16)));
8184 __m128i vacc3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (x + 24)));
8185 x += 32;
8186
8187 vacc0 = _mm_sub_epi16(vinput_zero_point, vacc0);
8188 vacc1 = _mm_sub_epi16(vinput_zero_point, vacc1);
8189 vacc2 = _mm_sub_epi16(vinput_zero_point, vacc2);
8190 vacc3 = _mm_sub_epi16(vinput_zero_point, vacc3);
8191
8192 vacc0 = _mm_slli_epi16(vacc0, 7);
8193 vacc1 = _mm_slli_epi16(vacc1, 7);
8194 vacc2 = _mm_slli_epi16(vacc2, 7);
8195 vacc3 = _mm_slli_epi16(vacc3, 7);
8196
8197 vacc0 = _mm_mulhrs_epi16(vacc0, vmultiplier);
8198 vacc1 = _mm_mulhrs_epi16(vacc1, vmultiplier);
8199 vacc2 = _mm_mulhrs_epi16(vacc2, vmultiplier);
8200 vacc3 = _mm_mulhrs_epi16(vacc3, vmultiplier);
8201
8202 vacc0 = _mm_adds_epi16(vacc0, voutput_zero_point);
8203 vacc1 = _mm_adds_epi16(vacc1, voutput_zero_point);
8204 vacc2 = _mm_adds_epi16(vacc2, voutput_zero_point);
8205 vacc3 = _mm_adds_epi16(vacc3, voutput_zero_point);
8206
8207 const __m128i vy0 = _mm_packus_epi16(vacc0, vacc1);
8208 const __m128i vy1 = _mm_packus_epi16(vacc2, vacc3);
8209
8210 _mm_storeu_si128((__m128i*) y, vy0);
8211 _mm_storeu_si128((__m128i*) (y + 16), vy1);
8212 y += 32;
8213 }
8214 for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
8215 __m128i vacc = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) x));
8216 vacc = _mm_sub_epi16(vinput_zero_point, vacc);
8217 vacc = _mm_slli_epi16(vacc, 7);
8218 vacc = _mm_mulhrs_epi16(vacc, vmultiplier);
8219 vacc = _mm_adds_epi16(vacc, voutput_zero_point);
8220 x += 8;
8221
8222 const __m128i vy = _mm_packus_epi16(vacc, vacc);
8223 _mm_storel_epi64((__m128i*) y, vy);
8224 y += 8;
8225 }
8226 if XNN_UNLIKELY(n != 0) {
8227 assert(n >= 1 * sizeof(uint8_t));
8228 assert(n <= 7 * sizeof(uint8_t));
8229
8230 __m128i vacc = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) x));
8231 vacc = _mm_sub_epi16(vinput_zero_point, vacc);
8232 vacc = _mm_slli_epi16(vacc, 7);
8233 vacc = _mm_mulhrs_epi16(vacc, vmultiplier);
8234 vacc = _mm_adds_epi16(vacc, voutput_zero_point);
8235
8236 __m128i vy = _mm_packus_epi16(vacc, vacc);
8237 if (n & (4 * sizeof(uint8_t))) {
8238 _mm_storeu_si32(y, vy);
8239 vy = _mm_srli_epi64(vy, 32);
8240 y += 4;
8241 }
8242 if (n & (2 * sizeof(uint8_t))) {
8243 _mm_storeu_si16(y, vy);
8244 vy = _mm_srli_epi32(vy, 16);
8245 y += 2;
8246 }
8247 if (n & (1 * sizeof(uint8_t))) {
8248 *y = (uint8_t) _mm_extract_epi8(vy, 0);
8249 }
8250 }
8251 }
8252
8253 void xnn_qu8_vlrelu_ukernel__sse41_x32(
8254 size_t n,
8255 const uint8_t* x,
8256 uint8_t* y,
8257 const union xnn_qu8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8258 {
8259 assert(n != 0);
8260 assert(n % sizeof(uint8_t) == 0);
8261 assert(x != NULL);
8262 assert(y != NULL);
8263
8264 const __m128i vinput_zero_point = _mm_load_si128((const __m128i*) params->sse2.input_zero_point);
8265 const __m128i vmultiplier_diff = _mm_load_si128((const __m128i*) params->sse2.multiplier_diff);
8266 const __m128i vmultiplier_base = _mm_load_si128((const __m128i*) params->sse2.multiplier_base);
8267 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
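  // Quantized leaky ReLU: comparing against the input zero point yields a mask,
  // and multiplier = base ^ (mask & diff), selecting one of two Q15 slopes per
  // element; the remaining pipeline matches the plain vcvt kernel above.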
8268 for (; n >= 32 * sizeof(uint8_t); n -= 32 * sizeof(uint8_t)) {
8269 __m128i vacc0 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) x));
8270 __m128i vacc1 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (x + 8)));
8271 __m128i vacc2 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (x + 16)));
8272 __m128i vacc3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (x + 24)));
8273 x += 32;
8274
8275 __m128i vmultiplier0 = _mm_cmpgt_epi16(vacc0, vinput_zero_point);
8276 vacc0 = _mm_sub_epi16(vinput_zero_point, vacc0);
8277 __m128i vmultiplier1 = _mm_cmpgt_epi16(vacc1, vinput_zero_point);
8278 vacc1 = _mm_sub_epi16(vinput_zero_point, vacc1);
8279 __m128i vmultiplier2 = _mm_cmpgt_epi16(vacc2, vinput_zero_point);
8280 vacc2 = _mm_sub_epi16(vinput_zero_point, vacc2);
8281 __m128i vmultiplier3 = _mm_cmpgt_epi16(vacc3, vinput_zero_point);
8282 vacc3 = _mm_sub_epi16(vinput_zero_point, vacc3);
8283
8284 vmultiplier0 = _mm_and_si128(vmultiplier0, vmultiplier_diff);
8285 vacc0 = _mm_slli_epi16(vacc0, 7);
8286 vmultiplier0 = _mm_xor_si128(vmultiplier0, vmultiplier_base);
8287 vmultiplier1 = _mm_and_si128(vmultiplier1, vmultiplier_diff);
8288 vacc1 = _mm_slli_epi16(vacc1, 7);
8289 vmultiplier1 = _mm_xor_si128(vmultiplier1, vmultiplier_base);
8290 vmultiplier2 = _mm_and_si128(vmultiplier2, vmultiplier_diff);
8291 vacc2 = _mm_slli_epi16(vacc2, 7);
8292 vmultiplier2 = _mm_xor_si128(vmultiplier2, vmultiplier_base);
8293 vmultiplier3 = _mm_and_si128(vmultiplier3, vmultiplier_diff);
8294 vacc3 = _mm_slli_epi16(vacc3, 7);
8295 vmultiplier3 = _mm_xor_si128(vmultiplier3, vmultiplier_base);
8296
8297 vacc0 = _mm_mulhrs_epi16(vacc0, vmultiplier0);
8298 vacc1 = _mm_mulhrs_epi16(vacc1, vmultiplier1);
8299 vacc2 = _mm_mulhrs_epi16(vacc2, vmultiplier2);
8300 vacc3 = _mm_mulhrs_epi16(vacc3, vmultiplier3);
8301
8302 vacc0 = _mm_adds_epi16(vacc0, voutput_zero_point);
8303 vacc1 = _mm_adds_epi16(vacc1, voutput_zero_point);
8304 vacc2 = _mm_adds_epi16(vacc2, voutput_zero_point);
8305 vacc3 = _mm_adds_epi16(vacc3, voutput_zero_point);
8306
8307 const __m128i vy0 = _mm_packus_epi16(vacc0, vacc1);
8308 const __m128i vy1 = _mm_packus_epi16(vacc2, vacc3);
8309
8310 _mm_storeu_si128((__m128i*) y, vy0);
8311 _mm_storeu_si128((__m128i*) (y + 16), vy1);
8312 y += 32;
8313 }
8314 for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
8315 __m128i vacc = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) x));
8316 __m128i vmultiplier = _mm_cmpgt_epi16(vacc, vinput_zero_point);
8317 vacc = _mm_sub_epi16(vinput_zero_point, vacc);
8318 vmultiplier = _mm_and_si128(vmultiplier, vmultiplier_diff);
8319 vacc = _mm_slli_epi16(vacc, 7);
8320 vmultiplier = _mm_xor_si128(vmultiplier, vmultiplier_base);
8321 vacc = _mm_mulhrs_epi16(vacc, vmultiplier);
8322 vacc = _mm_adds_epi16(vacc, voutput_zero_point);
8323 x += 8;
8324
8325 const __m128i vy = _mm_packus_epi16(vacc, vacc);
8326 _mm_storel_epi64((__m128i*) y, vy);
8327 y += 8;
8328 }
8329 if XNN_UNLIKELY(n != 0) {
8330 assert(n >= 1 * sizeof(uint8_t));
8331 assert(n <= 7 * sizeof(uint8_t));
8332
8333 __m128i vacc = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) x));
8334 __m128i vmultiplier = _mm_cmpgt_epi16(vacc, vinput_zero_point);
8335 vacc = _mm_sub_epi16(vinput_zero_point, vacc);
8336 vmultiplier = _mm_and_si128(vmultiplier, vmultiplier_diff);
8337 vacc = _mm_slli_epi16(vacc, 7);
8338 vmultiplier = _mm_xor_si128(vmultiplier, vmultiplier_base);
8339 vacc = _mm_mulhrs_epi16(vacc, vmultiplier);
8340 vacc = _mm_adds_epi16(vacc, voutput_zero_point);
8341
8342 __m128i vy = _mm_packus_epi16(vacc, vacc);
8343 if (n & (4 * sizeof(uint8_t))) {
8344 _mm_storeu_si32(y, vy);
8345 vy = _mm_srli_epi64(vy, 32);
8346 y += 4;
8347 }
8348 if (n & (2 * sizeof(uint8_t))) {
8349 _mm_storeu_si16(y, vy);
8350 vy = _mm_srli_epi32(vy, 16);
8351 y += 2;
8352 }
8353 if (n & (1 * sizeof(uint8_t))) {
8354 *y = (uint8_t) _mm_extract_epi8(vy, 0);
8355 }
8356 }
8357 }
8358
8359 void xnn_qu8_vmul_minmax_fp32_ukernel__sse41_mul16_ld64_x16(
8360 size_t n,
8361 const uint8_t* input_a,
8362 const uint8_t* input_b,
8363 uint8_t* output,
8364 const union xnn_qu8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8365
8366 {
8367 const __m128i va_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.a_zero_point);
8368 const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.b_zero_point);
8369 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
8370 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
8371 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
8372 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
8373
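  // fp32 requantization: 32-bit products are rebuilt from mullo/mulhi halves,
  // converted to float, scaled, rounded back with _mm_cvtps_epi32, then packed
  // with the output zero point and clamped to [output_min, output_max].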
8374 for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
8375 const __m128i va01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
8376 const __m128i vb01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) input_b));
8377 const __m128i va89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (input_a + 8)));
8378 const __m128i vb89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (input_b + 8)));
8379 input_a += 16;
8380 input_b += 16;
8381
8382
8383 const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
8384 const __m128i vxb01234567 = _mm_sub_epi16(vb01234567, vb_zero_point);
8385 const __m128i vxa89ABCDEF = _mm_sub_epi16(va89ABCDEF, va_zero_point);
8386 const __m128i vxb89ABCDEF = _mm_sub_epi16(vb89ABCDEF, vb_zero_point);
8387
8388 const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb01234567);
8389 const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb01234567);
8390 const __m128i vprod89ABCDEFlo = _mm_mullo_epi16(vxa89ABCDEF, vxb89ABCDEF);
8391 const __m128i vprod89ABCDEFhi = _mm_mulhi_epi16(vxa89ABCDEF, vxb89ABCDEF);
8392
8393 const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
8394 const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
8395 const __m128i vprod89AB = _mm_unpacklo_epi16(vprod89ABCDEFlo, vprod89ABCDEFhi);
8396 const __m128i vprodCDEF = _mm_unpackhi_epi16(vprod89ABCDEFlo, vprod89ABCDEFhi);
8397
8398 __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
8399 __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
8400 __m128 vfpacc89AB = _mm_cvtepi32_ps(vprod89AB);
8401 __m128 vfpaccCDEF = _mm_cvtepi32_ps(vprodCDEF);
8402
8403 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
8404 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
8405 vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale);
8406 vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale);
8407
8408 const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
8409 const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
8410 const __m128i vacc89AB = _mm_cvtps_epi32(vfpacc89AB);
8411 const __m128i vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF);
8412
8413 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
8414 __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
8415
8416
8417 __m128i vout0123456789ABCDEF = _mm_packus_epi16(vout01234567, vout89ABCDEF);
8418
8419 vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
8420
8421 vout0123456789ABCDEF = _mm_min_epu8(vout0123456789ABCDEF, voutput_max);
8422
8423 _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
8424 output += 16;
8425 }
8426 if XNN_UNLIKELY(n != 0) {
8427 do {
8428 const __m128i va01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
8429 const __m128i vb01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) input_b));
8430 input_a += 8;
8431 input_b += 8;
8432
8433
8434 const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
8435 const __m128i vxb01234567 = _mm_sub_epi16(vb01234567, vb_zero_point);
8436
8437 const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb01234567);
8438 const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb01234567);
8439
8440 const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
8441 const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
8442
8443 __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
8444 __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
8445
8446 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
8447 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
8448
8449 const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
8450 const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
8451
8452 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
8453
8454 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
8455 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
8456 vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
8457
8458 if XNN_LIKELY(n >= (8 * sizeof(uint8_t))) {
8459 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
8460 output += 8;
8461 n -= 8 * sizeof(uint8_t);
8462 } else {
8463 if (n & (4 * sizeof(uint8_t))) {
8464 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
8465 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
8466 output += 4;
8467 }
8468 if (n & (2 * sizeof(uint8_t))) {
8469 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
8470 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
8471 output += 2;
8472 }
8473 if (n & (1 * sizeof(uint8_t))) {
8474 *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
8475 }
8476 n = 0;
8477 }
8478 } while (n != 0);
8479 }
8480 }
8481
8482 void xnn_qu8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16(
8483 size_t n,
8484 const uint8_t* input_a,
8485 const uint8_t* input_b,
8486 uint8_t* output,
8487 const union xnn_qu8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8488
8489 {
8490 const __m128i va_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.a_zero_point);
8491 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
8492 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
8493 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
8494 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
8495
8496 __m128i vxb = _mm_sub_epi16(
8497 _mm_shuffle_epi32(_mm_cvtsi32_si128(UINT32_C(0x00010001) * (uint32_t) (uint16_t) (int16_t) *input_b), 0),
8498 _mm_load_si128((const __m128i*) params->fp32_sse2.b_zero_point));
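  // The scalar operand is prepared once: multiplying the byte by 0x00010001
  // duplicates it across both 16-bit halves of a 32-bit lane, the shuffle
  // broadcasts that lane, and its zero point is subtracted outside the loop.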
8499 for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
8500 const __m128i va01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
8501 const __m128i va89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (input_a + 8)));
8502 input_a += 16;
8503
8504
8505 const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
8506 const __m128i vxa89ABCDEF = _mm_sub_epi16(va89ABCDEF, va_zero_point);
8507
8508 const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb);
8509 const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb);
8510 const __m128i vprod89ABCDEFlo = _mm_mullo_epi16(vxa89ABCDEF, vxb);
8511 const __m128i vprod89ABCDEFhi = _mm_mulhi_epi16(vxa89ABCDEF, vxb);
8512
8513 const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
8514 const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
8515 const __m128i vprod89AB = _mm_unpacklo_epi16(vprod89ABCDEFlo, vprod89ABCDEFhi);
8516 const __m128i vprodCDEF = _mm_unpackhi_epi16(vprod89ABCDEFlo, vprod89ABCDEFhi);
8517
8518 __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
8519 __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
8520 __m128 vfpacc89AB = _mm_cvtepi32_ps(vprod89AB);
8521 __m128 vfpaccCDEF = _mm_cvtepi32_ps(vprodCDEF);
8522
8523 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
8524 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
8525 vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale);
8526 vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale);
8527
8528 const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
8529 const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
8530 const __m128i vacc89AB = _mm_cvtps_epi32(vfpacc89AB);
8531 const __m128i vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF);
8532
8533 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
8534 __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
8535
8536
8537 __m128i vout0123456789ABCDEF = _mm_packus_epi16(vout01234567, vout89ABCDEF);
8538
8539 vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
8540
8541 vout0123456789ABCDEF = _mm_min_epu8(vout0123456789ABCDEF, voutput_max);
8542
8543 _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
8544 output += 16;
8545 }
8546 if XNN_UNLIKELY(n != 0) {
8547 do {
8548 const __m128i va01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
8549 input_a += 8;
8550
8551
8552 const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
8553
8554 const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb);
8555 const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb);
8556
8557 const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
8558 const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
8559
8560 __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
8561 __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
8562
8563 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
8564 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
8565
8566 const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
8567 const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
8568
8569 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
8570
8571 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
8572 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
8573 vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
8574
8575 if XNN_LIKELY(n >= (8 * sizeof(uint8_t))) {
8576 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
8577 output += 8;
8578 n -= 8 * sizeof(uint8_t);
8579 } else {
8580 if (n & (4 * sizeof(uint8_t))) {
8581 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
8582 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
8583 output += 4;
8584 }
8585 if (n & (2 * sizeof(uint8_t))) {
8586 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
8587 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
8588 output += 2;
8589 }
8590 if (n & (1 * sizeof(uint8_t))) {
8591 *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
8592 }
8593 n = 0;
8594 }
8595 } while (n != 0);
8596 }
8597 }
8598
8599 void xnn_s8_ibilinear_ukernel__sse41_c16(
8600 size_t output_pixels,
8601 size_t channels,
8602 const int8_t**restrict input,
8603 size_t input_offset,
8604 const int16_t*restrict weights,
8605 int8_t*restrict output,
8606 size_t output_increment) XNN_OOB_READS
8607 {
8608 assert(output_pixels != 0);
8609 assert(channels != 0);
8610
8611 do {
8612 const int8_t* i0 = (const int8_t*) ((uintptr_t) input[0] + input_offset);
8613 const int8_t* i1 = (const int8_t*) ((uintptr_t) input[1] + input_offset);
8614 const int8_t* i2 = (const int8_t*) ((uintptr_t) input[2] + input_offset);
8615 const int8_t* i3 = (const int8_t*) ((uintptr_t) input[3] + input_offset);
8616 input += 4;
8617
8618 const __m128i valpha = _mm_cvtsi32_si128(*((const int*) weights));
8619 weights += 2;
8620 __m128i valphah = _mm_shufflelo_epi16(valpha, _MM_SHUFFLE(0, 0, 0, 0));
8621 valphah = _mm_unpacklo_epi64(valphah, valphah);
8622 __m128i valphav = _mm_srli_epi32(valpha, 16);
8623 valphav = _mm_shuffle_epi32(valphav, _MM_SHUFFLE(0, 0, 0, 0));
8624
8625 valphah = _mm_blend_epi16(valphah, _mm_sub_epi16(_mm_set1_epi32(0x08000000), valphah), 0xAA);
8626
8627 const __m128i vrounding = _mm_set1_epi32(0x00200000);
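    // Fixed-point bilinear: valphah interleaves (alpha_h, 0x0800 - alpha_h), so
    // one _mm_madd_epi16 blends the left/right samples in Q11; the vertical pass
    // adds the alpha_v-weighted bottom-minus-top delta, yielding a Q22 value
    // that is rounded (0x00200000 = 2^21) and arithmetically shifted by 22.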
8628
8629 size_t c = channels;
8630 for (; c >= 16 * sizeof(int8_t); c -= 16 * sizeof(int8_t)) {
8631 const __m128i vtl01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0));
8632 const __m128i vtr01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1));
8633 const __m128i vbl01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
8634 const __m128i vbr01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3));
8635 const __m128i vtl89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
8636 const __m128i vtr89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
8637 const __m128i vbl89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
8638 const __m128i vbr89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
8639 i0 += 16;
8640 i1 += 16;
8641 i2 += 16;
8642 i3 += 16;
8643
8644
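      // Horizontal pass: madd pairs of (right, left) pixels with (ah, 2048-ah)
      // to get the top-row interpolation vt* and the bottom-minus-top
      // difference vd*, both in Q11. Vertical pass: acc = (vt << 11) + vd * av
      // in Q22, then round and shift back down to a pixel value.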
      const __m128i vdr01234567 = _mm_sub_epi16(vbr01234567, vtr01234567);
      const __m128i vt0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vtr01234567, vtl01234567), valphah);
      const __m128i vdl01234567 = _mm_sub_epi16(vbl01234567, vtl01234567);
      const __m128i vt4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vtr01234567, vtl01234567), valphah);
      const __m128i vdr89ABCDEF = _mm_sub_epi16(vbr89ABCDEF, vtr89ABCDEF);
      const __m128i vt89AB = _mm_madd_epi16(_mm_unpacklo_epi16(vtr89ABCDEF, vtl89ABCDEF), valphah);
      const __m128i vdl89ABCDEF = _mm_sub_epi16(vbl89ABCDEF, vtl89ABCDEF);
      const __m128i vtCDEF = _mm_madd_epi16(_mm_unpackhi_epi16(vtr89ABCDEF, vtl89ABCDEF), valphah);

      const __m128i vd0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vdr01234567, vdl01234567), valphah);
      const __m128i vd4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vdr01234567, vdl01234567), valphah);
      const __m128i vd89AB = _mm_madd_epi16(_mm_unpacklo_epi16(vdr89ABCDEF, vdl89ABCDEF), valphah);
      const __m128i vdCDEF = _mm_madd_epi16(_mm_unpackhi_epi16(vdr89ABCDEF, vdl89ABCDEF), valphah);

      __m128i vacc0123 = _mm_mullo_epi32(vd0123, valphav);
      __m128i vacc4567 = _mm_mullo_epi32(vd4567, valphav);
      __m128i vacc89AB = _mm_mullo_epi32(vd89AB, valphav);
      __m128i vaccCDEF = _mm_mullo_epi32(vdCDEF, valphav);

      vacc0123 = _mm_add_epi32(_mm_slli_epi32(vt0123, 11), vacc0123);
      vacc4567 = _mm_add_epi32(_mm_slli_epi32(vt4567, 11), vacc4567);
      vacc89AB = _mm_add_epi32(_mm_slli_epi32(vt89AB, 11), vacc89AB);
      vaccCDEF = _mm_add_epi32(_mm_slli_epi32(vtCDEF, 11), vaccCDEF);

      vacc0123 = _mm_srai_epi32(_mm_add_epi16(vacc0123, vrounding), 22);
      vacc4567 = _mm_srai_epi32(_mm_add_epi16(vacc4567, vrounding), 22);
      vacc89AB = _mm_srai_epi32(_mm_add_epi16(vacc89AB, vrounding), 22);
      vaccCDEF = _mm_srai_epi32(_mm_add_epi16(vaccCDEF, vrounding), 22);

      const __m128i vacc01234567 = _mm_packs_epi32(vacc0123, vacc4567);
      const __m128i vacc89ABCDEF = _mm_packs_epi32(vacc89AB, vaccCDEF);

      const __m128i vo0123456789ABCDEF = _mm_packs_epi16(vacc01234567, vacc89ABCDEF);

      _mm_storeu_si128((__m128i*) output, vo0123456789ABCDEF);
      output += 16;
    }
    for (; c >= 8 * sizeof(int8_t); c -= 8 * sizeof(int8_t)) {
      const __m128i vtl01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0));
      i0 += 8;
      const __m128i vtr01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1));
      i1 += 8;
      const __m128i vbl01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
      i2 += 8;
      const __m128i vbr01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3));
      i3 += 8;

      const __m128i vdr01234567 = _mm_sub_epi16(vbr01234567, vtr01234567);
      const __m128i vt0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vtr01234567, vtl01234567), valphah);
      const __m128i vdl01234567 = _mm_sub_epi16(vbl01234567, vtl01234567);
      const __m128i vt4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vtr01234567, vtl01234567), valphah);

      const __m128i vd0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vdr01234567, vdl01234567), valphah);
      const __m128i vd4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vdr01234567, vdl01234567), valphah);

      __m128i vacc0123 = _mm_mullo_epi32(vd0123, valphav);
      __m128i vacc4567 = _mm_mullo_epi32(vd4567, valphav);

      vacc0123 = _mm_add_epi32(_mm_slli_epi32(vt0123, 11), vacc0123);
      vacc4567 = _mm_add_epi32(_mm_slli_epi32(vt4567, 11), vacc4567);

      vacc0123 = _mm_srai_epi32(_mm_add_epi16(vacc0123, vrounding), 22);
      vacc4567 = _mm_srai_epi32(_mm_add_epi16(vacc4567, vrounding), 22);

      const __m128i vacc01234567 = _mm_packs_epi32(vacc0123, vacc4567);

      const __m128i vo01234567 = _mm_packs_epi16(vacc01234567, vacc01234567);

      _mm_storel_epi64((__m128i*) output, vo01234567);
      output += 8;
    }
    if XNN_UNLIKELY(c != 0) {
      const __m128i vtl01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0));
      const __m128i vtr01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1));
      const __m128i vbl01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
      const __m128i vbr01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3));

      const __m128i vdr01234567 = _mm_sub_epi16(vbr01234567, vtr01234567);
      const __m128i vt0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vtr01234567, vtl01234567), valphah);
      const __m128i vdl01234567 = _mm_sub_epi16(vbl01234567, vtl01234567);
      const __m128i vt4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vtr01234567, vtl01234567), valphah);

      const __m128i vd0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vdr01234567, vdl01234567), valphah);
      const __m128i vd4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vdr01234567, vdl01234567), valphah);

      __m128i vacc0123 = _mm_mullo_epi32(vd0123, valphav);
      __m128i vacc4567 = _mm_mullo_epi32(vd4567, valphav);

      vacc0123 = _mm_add_epi32(_mm_slli_epi32(vt0123, 11), vacc0123);
      vacc4567 = _mm_add_epi32(_mm_slli_epi32(vt4567, 11), vacc4567);

      vacc0123 = _mm_srai_epi32(_mm_add_epi16(vacc0123, vrounding), 22);
      vacc4567 = _mm_srai_epi32(_mm_add_epi16(vacc4567, vrounding), 22);

      const __m128i vacc01234567 = _mm_packs_epi32(vacc0123, vacc4567);

      __m128i vo01234567 = _mm_packs_epi16(vacc01234567, vacc01234567);

      if (c & (4 * sizeof(int8_t))) {
        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vo01234567));
        output += 4;
        vo01234567 = _mm_srli_epi64(vo01234567, 32);
      }
      if (c & (2 * sizeof(int8_t))) {
        unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vo01234567, 0));
        output += 2;
        vo01234567 = _mm_srli_epi32(vo01234567, 16);
      }
      if (c & (1 * sizeof(int8_t))) {
        *output++ = (int8_t) _mm_extract_epi8(vo01234567, 0);
      }
    }

    output = (int8_t*) ((uintptr_t) output + output_increment);
  } while (--output_pixels != 0);
}

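// Signed 8-bit max pooling with output clamping, 16 channels per vector.
// The first pass reduces up to 9 kernel rows directly into the output; each
// subsequent pass folds up to 8 more rows into the partial results already
// stored there, so arbitrarily large pooling windows are handled 8 rows at
// a time.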
void xnn_s8_maxpool_minmax_ukernel_9p8x__sse41_c16(
    size_t output_pixels,
    size_t kernel_elements,
    size_t channels,
    const int8_t** input,
    size_t input_offset,
    int8_t* output,
    size_t input_increment,
    size_t output_increment,
    const union xnn_s8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(output_pixels != 0);
  assert(kernel_elements != 0);
  assert(channels != 0);

  const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.max);
  const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.min);

  do {
    int8_t* o = output;
    {
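      // First pass: reduce the first (up to) 9 kernel rows.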
      const int8_t* i0 = *input++;
      const int8_t* i1 = *input++;
      const int8_t* i2 = *input++;
      const int8_t* i3 = *input++;
      const int8_t* i4 = *input++;
      const int8_t* i5 = *input++;
      const int8_t* i6 = *input++;
      const int8_t* i7 = *input++;
      const int8_t* i8 = *input++;
      i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
      i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
      i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
      i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
      i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
      i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
      i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
      i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
      i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
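      // For windows shorter than 9 rows, alias the unused pointers to i0 so
      // the extra max operations are no-ops.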
      if (kernel_elements < 2) {
        i1 = i0;
      }
      if (kernel_elements <= 2) {
        i2 = i0;
      }
      if (kernel_elements < 4) {
        i3 = i0;
      }
      if (kernel_elements <= 4) {
        i4 = i0;
      }
      if (kernel_elements < 6) {
        i5 = i0;
      }
      if (kernel_elements <= 6) {
        i6 = i0;
      }
      if (kernel_elements < 8) {
        i7 = i0;
      }
      if (kernel_elements <= 8) {
        i8 = i0;
      }

      size_t c = channels;
      for (; c >= 16; c -= 16) {
        const __m128i vi0 = _mm_loadu_si128((const __m128i*) i0); i0 += 16;
        const __m128i vi1 = _mm_loadu_si128((const __m128i*) i1); i1 += 16;
        const __m128i vi2 = _mm_loadu_si128((const __m128i*) i2); i2 += 16;
        const __m128i vi3 = _mm_loadu_si128((const __m128i*) i3); i3 += 16;
        const __m128i vi4 = _mm_loadu_si128((const __m128i*) i4); i4 += 16;
        const __m128i vi5 = _mm_loadu_si128((const __m128i*) i5); i5 += 16;
        const __m128i vi6 = _mm_loadu_si128((const __m128i*) i6); i6 += 16;
        const __m128i vi7 = _mm_loadu_si128((const __m128i*) i7); i7 += 16;
        const __m128i vi8 = _mm_loadu_si128((const __m128i*) i8); i8 += 16;

        const __m128i vmax018 = _mm_max_epi8(_mm_max_epi8(vi0, vi1), vi8);
        const __m128i vmax23 = _mm_max_epi8(vi2, vi3);
        const __m128i vmax45 = _mm_max_epi8(vi4, vi5);
        const __m128i vmax67 = _mm_max_epi8(vi6, vi7);

        const __m128i vmax2345 = _mm_max_epi8(vmax23, vmax45);
        const __m128i vmax01678 = _mm_max_epi8(vmax018, vmax67);
        __m128i vout = _mm_max_epi8(vmax2345, vmax01678);
        vout = _mm_max_epi8(vout, voutput_min);
        vout = _mm_min_epi8(vout, voutput_max);

        _mm_storeu_si128((__m128i*) o, vout); o += 16;
      }
      if (c != 0) {
        const __m128i vi0 = _mm_loadu_si128((const __m128i*) i0);
        const __m128i vi1 = _mm_loadu_si128((const __m128i*) i1);
        const __m128i vi2 = _mm_loadu_si128((const __m128i*) i2);
        const __m128i vi3 = _mm_loadu_si128((const __m128i*) i3);
        const __m128i vi4 = _mm_loadu_si128((const __m128i*) i4);
        const __m128i vi5 = _mm_loadu_si128((const __m128i*) i5);
        const __m128i vi6 = _mm_loadu_si128((const __m128i*) i6);
        const __m128i vi7 = _mm_loadu_si128((const __m128i*) i7);
        const __m128i vi8 = _mm_loadu_si128((const __m128i*) i8);

        const __m128i vmax018 = _mm_max_epi8(_mm_max_epi8(vi0, vi1), vi8);
        const __m128i vmax23 = _mm_max_epi8(vi2, vi3);
        const __m128i vmax45 = _mm_max_epi8(vi4, vi5);
        const __m128i vmax67 = _mm_max_epi8(vi6, vi7);

        const __m128i vmax2345 = _mm_max_epi8(vmax23, vmax45);
        const __m128i vmax01678 = _mm_max_epi8(vmax018, vmax67);
        __m128i vout = _mm_max_epi8(vmax2345, vmax01678);
        vout = _mm_max_epi8(vout, voutput_min);
        vout = _mm_min_epi8(vout, voutput_max);

        if (c & 8) {
          _mm_storel_epi64((__m128i*) o, vout);
          vout = _mm_unpackhi_epi64(vout, vout);
          o += 8;
        }
        if (c & 4) {
          unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(vout));
          vout = _mm_srli_epi64(vout, 32);
          o += 4;
        }
        if (c & 2) {
          unaligned_store_u16(o, (uint16_t) _mm_extract_epi16(vout, 0));
          vout = _mm_srli_epi32(vout, 16);
          o += 2;
        }
        if (c & 1) {
          *o = (int8_t) _mm_cvtsi128_si32(vout);
          o += 1;
        }
      }
    }

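    // Remaining passes: fold up to 8 more rows into the partial maxima
    // already written to the output.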
    for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) {
      const int8_t* i0 = *input++;
      const int8_t* i1 = *input++;
      const int8_t* i2 = *input++;
      const int8_t* i3 = *input++;
      const int8_t* i4 = *input++;
      const int8_t* i5 = *input++;
      const int8_t* i6 = *input++;
      const int8_t* i7 = *input++;
      i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
      i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
      i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
      i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
      i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
      i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
      i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
      i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
      if (k < 2) {
        i1 = i0;
      }
      if (k <= 2) {
        i2 = i0;
      }
      if (k < 4) {
        i3 = i0;
      }
      if (k <= 4) {
        i4 = i0;
      }
      if (k < 6) {
        i5 = i0;
      }
      if (k <= 6) {
        i6 = i0;
      }
      if (k < 8) {
        i7 = i0;
      }

      o = output;
      size_t c = channels;
      for (; c >= 16; c -= 16) {
        const __m128i vi0 = _mm_loadu_si128((const __m128i*) i0); i0 += 16;
        const __m128i vi1 = _mm_loadu_si128((const __m128i*) i1); i1 += 16;
        const __m128i vi2 = _mm_loadu_si128((const __m128i*) i2); i2 += 16;
        const __m128i vi3 = _mm_loadu_si128((const __m128i*) i3); i3 += 16;
        const __m128i vi4 = _mm_loadu_si128((const __m128i*) i4); i4 += 16;
        const __m128i vi5 = _mm_loadu_si128((const __m128i*) i5); i5 += 16;
        const __m128i vi6 = _mm_loadu_si128((const __m128i*) i6); i6 += 16;
        const __m128i vi7 = _mm_loadu_si128((const __m128i*) i7); i7 += 16;
        const __m128i vo = _mm_loadu_si128((const __m128i*) o);

        const __m128i vmax01 = _mm_max_epi8(_mm_max_epi8(vi0, vi1), vo);
        const __m128i vmax23 = _mm_max_epi8(vi2, vi3);
        const __m128i vmax45 = _mm_max_epi8(vi4, vi5);
        const __m128i vmax67 = _mm_max_epi8(vi6, vi7);

        const __m128i vmax2345 = _mm_max_epi8(vmax23, vmax45);
        const __m128i vmax0167 = _mm_max_epi8(vmax01, vmax67);
        __m128i vout = _mm_max_epi8(vmax2345, vmax0167);
        vout = _mm_max_epi8(vout, voutput_min);
        vout = _mm_min_epi8(vout, voutput_max);

        _mm_storeu_si128((__m128i*) o, vout);
        o += 16;
      }
      if (c != 0) {
        const __m128i vi0 = _mm_loadu_si128((const __m128i*) i0);
        const __m128i vi1 = _mm_loadu_si128((const __m128i*) i1);
        const __m128i vi2 = _mm_loadu_si128((const __m128i*) i2);
        const __m128i vi3 = _mm_loadu_si128((const __m128i*) i3);
        const __m128i vi4 = _mm_loadu_si128((const __m128i*) i4);
        const __m128i vi5 = _mm_loadu_si128((const __m128i*) i5);
        const __m128i vi6 = _mm_loadu_si128((const __m128i*) i6);
        const __m128i vi7 = _mm_loadu_si128((const __m128i*) i7);
        const __m128i vo = _mm_loadu_si128((const __m128i*) o);

        const __m128i vmax01 = _mm_max_epi8(_mm_max_epi8(vi0, vi1), vo);
        const __m128i vmax23 = _mm_max_epi8(vi2, vi3);
        const __m128i vmax45 = _mm_max_epi8(vi4, vi5);
        const __m128i vmax67 = _mm_max_epi8(vi6, vi7);

        const __m128i vmax2345 = _mm_max_epi8(vmax23, vmax45);
        const __m128i vmax0167 = _mm_max_epi8(vmax01, vmax67);
        __m128i vout = _mm_max_epi8(vmax2345, vmax0167);
        vout = _mm_max_epi8(vout, voutput_min);
        vout = _mm_min_epi8(vout, voutput_max);

        if (c & 8) {
          _mm_storel_epi64((__m128i*) o, vout);
          vout = _mm_unpackhi_epi64(vout, vout);
          o += 8;
        }
        if (c & 4) {
          unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(vout));
          vout = _mm_srli_epi64(vout, 32);
          o += 4;
        }
        if (c & 2) {
          unaligned_store_u16(o, (uint16_t) _mm_extract_epi16(vout, 0));
          vout = _mm_srli_epi32(vout, 16);
          o += 2;
        }
        if (c & 1) {
          *o = (int8_t) _mm_cvtsi128_si32(vout);
          o += 1;
        }
      }
    }
    input = (const int8_t**) ((uintptr_t) input + input_increment);
    output = (int8_t*) ((uintptr_t) o + output_increment);
  } while (--output_pixels != 0);
}

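// Clamp signed 8-bit elements to [params->sse4.min, params->sse4.max],
// 64 elements per main-loop iteration.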
void xnn_s8_vclamp_ukernel__sse41_x64(
    size_t n,
    const int8_t* x,
    int8_t* y,
    const union xnn_s8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(n != 0);

  const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.max);
  const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.min);
  for (; n >= 64; n -= 64) {
    __m128i vacc0 = _mm_loadu_si128((const __m128i*) x);
    __m128i vacc1 = _mm_loadu_si128((const __m128i*) x + 1);
    __m128i vacc2 = _mm_loadu_si128((const __m128i*) x + 2);
    __m128i vacc3 = _mm_loadu_si128((const __m128i*) x + 3);
    x += 64;

    vacc0 = _mm_max_epi8(vacc0, voutput_min);
    vacc1 = _mm_max_epi8(vacc1, voutput_min);
    vacc2 = _mm_max_epi8(vacc2, voutput_min);
    vacc3 = _mm_max_epi8(vacc3, voutput_min);

    vacc0 = _mm_min_epi8(vacc0, voutput_max);
    vacc1 = _mm_min_epi8(vacc1, voutput_max);
    vacc2 = _mm_min_epi8(vacc2, voutput_max);
    vacc3 = _mm_min_epi8(vacc3, voutput_max);

    _mm_storeu_si128((__m128i*) y, vacc0);
    _mm_storeu_si128((__m128i*) y + 1, vacc1);
    _mm_storeu_si128((__m128i*) y + 2, vacc2);
    _mm_storeu_si128((__m128i*) y + 3, vacc3);
    y += 64;
  }
  for (; n >= 16; n -= 16) {
    __m128i vacc = _mm_loadu_si128((const __m128i*) x);
    x += 16;

    vacc = _mm_min_epi8(vacc, voutput_max);
    vacc = _mm_max_epi8(vacc, voutput_min);

    _mm_storeu_si128((__m128i*) y, vacc);
    y += 16;
  }
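  // Tail: load a full 16-byte vector (the kernel is declared XNN_OOB_READS,
  // so reading past n is permitted) and store only the valid low bytes.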
  if XNN_UNLIKELY(n != 0) {
    __m128i vacc = _mm_loadu_si128((const __m128i*) x);

    vacc = _mm_min_epi8(vacc, voutput_max);
    vacc = _mm_max_epi8(vacc, voutput_min);

    if (n & 8) {
      _mm_storel_epi64((__m128i*) y, vacc);
      y += 8;
      vacc = _mm_unpackhi_epi64(vacc, vacc);
    }
    if (n & 4) {
      unaligned_store_u32(y, (uint32_t) _mm_cvtsi128_si32(vacc));
      y += 4;
      vacc = _mm_srli_epi64(vacc, 32);
    }
    if (n & 2) {
      unaligned_store_u16(y, (uint16_t) _mm_cvtsi128_si32(vacc));
      y += 2;
      vacc = _mm_srli_epi32(vacc, 16);
    }
    if (n & 1) {
      *y = (int8_t) _mm_cvtsi128_si32(vacc);
    }
  }
}

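// Unsigned variant of the bilinear kernel above: identical Q11/Q22
// fixed-point scheme, but pixels are zero-extended (_mm_cvtepu8_epi16), the
// final shift is logical (results are non-negative), and packing uses
// _mm_packus_epi16.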
void xnn_u8_ibilinear_ukernel__sse41_c16(
    size_t output_pixels,
    size_t channels,
    const uint8_t**restrict input,
    size_t input_offset,
    const int16_t*restrict weights,
    uint8_t*restrict output,
    size_t output_increment) XNN_OOB_READS
{
  assert(output_pixels != 0);
  assert(channels != 0);

  do {
    const uint8_t* i0 = (const uint8_t*) ((uintptr_t) input[0] + input_offset);
    const uint8_t* i1 = (const uint8_t*) ((uintptr_t) input[1] + input_offset);
    const uint8_t* i2 = (const uint8_t*) ((uintptr_t) input[2] + input_offset);
    const uint8_t* i3 = (const uint8_t*) ((uintptr_t) input[3] + input_offset);
    input += 4;

    const __m128i valpha = _mm_cvtsi32_si128(*((const int*) weights));
    weights += 2;
    __m128i valphah = _mm_shufflelo_epi16(valpha, _MM_SHUFFLE(0, 0, 0, 0));
    valphah = _mm_unpacklo_epi64(valphah, valphah);
    __m128i valphav = _mm_srli_epi32(valpha, 16);
    valphav = _mm_shuffle_epi32(valphav, _MM_SHUFFLE(0, 0, 0, 0));

    valphah = _mm_blend_epi16(valphah, _mm_sub_epi16(_mm_set1_epi32(0x08000000), valphah), 0xAA);

    const __m128i vrounding = _mm_set1_epi32(0x00200000);

    size_t c = channels;
    for (; c >= 16 * sizeof(uint8_t); c -= 16 * sizeof(uint8_t)) {
      const __m128i vtl01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
      const __m128i vtr01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
      const __m128i vbl01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
      const __m128i vbr01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
      const __m128i vtl89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
      const __m128i vtr89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
      const __m128i vbl89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
      const __m128i vbr89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
      i0 += 16;
      i1 += 16;
      i2 += 16;
      i3 += 16;

      const __m128i vdr01234567 = _mm_sub_epi16(vbr01234567, vtr01234567);
      const __m128i vt0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vtr01234567, vtl01234567), valphah);
      const __m128i vdl01234567 = _mm_sub_epi16(vbl01234567, vtl01234567);
      const __m128i vt4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vtr01234567, vtl01234567), valphah);
      const __m128i vdr89ABCDEF = _mm_sub_epi16(vbr89ABCDEF, vtr89ABCDEF);
      const __m128i vt89AB = _mm_madd_epi16(_mm_unpacklo_epi16(vtr89ABCDEF, vtl89ABCDEF), valphah);
      const __m128i vdl89ABCDEF = _mm_sub_epi16(vbl89ABCDEF, vtl89ABCDEF);
      const __m128i vtCDEF = _mm_madd_epi16(_mm_unpackhi_epi16(vtr89ABCDEF, vtl89ABCDEF), valphah);

      const __m128i vd0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vdr01234567, vdl01234567), valphah);
      const __m128i vd4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vdr01234567, vdl01234567), valphah);
      const __m128i vd89AB = _mm_madd_epi16(_mm_unpacklo_epi16(vdr89ABCDEF, vdl89ABCDEF), valphah);
      const __m128i vdCDEF = _mm_madd_epi16(_mm_unpackhi_epi16(vdr89ABCDEF, vdl89ABCDEF), valphah);

      __m128i vacc0123 = _mm_mullo_epi32(vd0123, valphav);
      __m128i vacc4567 = _mm_mullo_epi32(vd4567, valphav);
      __m128i vacc89AB = _mm_mullo_epi32(vd89AB, valphav);
      __m128i vaccCDEF = _mm_mullo_epi32(vdCDEF, valphav);

      vacc0123 = _mm_add_epi32(_mm_slli_epi32(vt0123, 11), vacc0123);
      vacc4567 = _mm_add_epi32(_mm_slli_epi32(vt4567, 11), vacc4567);
      vacc89AB = _mm_add_epi32(_mm_slli_epi32(vt89AB, 11), vacc89AB);
      vaccCDEF = _mm_add_epi32(_mm_slli_epi32(vtCDEF, 11), vaccCDEF);

      vacc0123 = _mm_srli_epi32(_mm_add_epi16(vacc0123, vrounding), 22);
      vacc4567 = _mm_srli_epi32(_mm_add_epi16(vacc4567, vrounding), 22);
      vacc89AB = _mm_srli_epi32(_mm_add_epi16(vacc89AB, vrounding), 22);
      vaccCDEF = _mm_srli_epi32(_mm_add_epi16(vaccCDEF, vrounding), 22);

      const __m128i vacc01234567 = _mm_packs_epi32(vacc0123, vacc4567);
      const __m128i vacc89ABCDEF = _mm_packs_epi32(vacc89AB, vaccCDEF);

      const __m128i vo0123456789ABCDEF = _mm_packus_epi16(vacc01234567, vacc89ABCDEF);

      _mm_storeu_si128((__m128i*) output, vo0123456789ABCDEF);
      output += 16;
    }
    for (; c >= 8 * sizeof(uint8_t); c -= 8 * sizeof(uint8_t)) {
      const __m128i vtl01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
      i0 += 8;
      const __m128i vtr01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
      i1 += 8;
      const __m128i vbl01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
      i2 += 8;
      const __m128i vbr01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
      i3 += 8;

      const __m128i vdr01234567 = _mm_sub_epi16(vbr01234567, vtr01234567);
      const __m128i vt0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vtr01234567, vtl01234567), valphah);
      const __m128i vdl01234567 = _mm_sub_epi16(vbl01234567, vtl01234567);
      const __m128i vt4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vtr01234567, vtl01234567), valphah);

      const __m128i vd0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vdr01234567, vdl01234567), valphah);
      const __m128i vd4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vdr01234567, vdl01234567), valphah);

      __m128i vacc0123 = _mm_mullo_epi32(vd0123, valphav);
      __m128i vacc4567 = _mm_mullo_epi32(vd4567, valphav);

      vacc0123 = _mm_add_epi32(_mm_slli_epi32(vt0123, 11), vacc0123);
      vacc4567 = _mm_add_epi32(_mm_slli_epi32(vt4567, 11), vacc4567);

      vacc0123 = _mm_srli_epi32(_mm_add_epi16(vacc0123, vrounding), 22);
      vacc4567 = _mm_srli_epi32(_mm_add_epi16(vacc4567, vrounding), 22);

      const __m128i vacc01234567 = _mm_packs_epi32(vacc0123, vacc4567);

      const __m128i vo01234567 = _mm_packus_epi16(vacc01234567, vacc01234567);

      _mm_storel_epi64((__m128i*) output, vo01234567);
      output += 8;
    }
    if XNN_UNLIKELY(c != 0) {
      const __m128i vtl01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
      const __m128i vtr01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
      const __m128i vbl01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
      const __m128i vbr01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));

      const __m128i vdr01234567 = _mm_sub_epi16(vbr01234567, vtr01234567);
      const __m128i vt0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vtr01234567, vtl01234567), valphah);
      const __m128i vdl01234567 = _mm_sub_epi16(vbl01234567, vtl01234567);
      const __m128i vt4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vtr01234567, vtl01234567), valphah);

      const __m128i vd0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vdr01234567, vdl01234567), valphah);
      const __m128i vd4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vdr01234567, vdl01234567), valphah);

      __m128i vacc0123 = _mm_mullo_epi32(vd0123, valphav);
      __m128i vacc4567 = _mm_mullo_epi32(vd4567, valphav);

      vacc0123 = _mm_add_epi32(_mm_slli_epi32(vt0123, 11), vacc0123);
      vacc4567 = _mm_add_epi32(_mm_slli_epi32(vt4567, 11), vacc4567);

      vacc0123 = _mm_srli_epi32(_mm_add_epi16(vacc0123, vrounding), 22);
      vacc4567 = _mm_srli_epi32(_mm_add_epi16(vacc4567, vrounding), 22);

      const __m128i vacc01234567 = _mm_packs_epi32(vacc0123, vacc4567);

      __m128i vo01234567 = _mm_packus_epi16(vacc01234567, vacc01234567);

      if (c & (4 * sizeof(uint8_t))) {
        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vo01234567));
        output += 4;
        vo01234567 = _mm_srli_epi64(vo01234567, 32);
      }
      if (c & (2 * sizeof(uint8_t))) {
        unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vo01234567, 0));
        output += 2;
        vo01234567 = _mm_srli_epi32(vo01234567, 16);
      }
      if (c & (1 * sizeof(uint8_t))) {
        *output++ = (uint8_t) _mm_extract_epi8(vo01234567, 0);
      }
    }

    output = (uint8_t*) ((uintptr_t) output + output_increment);
  } while (--output_pixels != 0);
}
