// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <immintrin.h>

#include <xnnpack/common.h>
#include <xnnpack/dwconv.h>
#include <xnnpack/gavgpool.h>
#include <xnnpack/gemm.h>
#include <xnnpack/ibilinear.h>
#include <xnnpack/igemm.h>
#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/math.h>
#include <xnnpack/maxpool.h>
#include <xnnpack/prelu.h>
#include <xnnpack/unaligned.h>
#include <xnnpack/vadd.h>
#include <xnnpack/vcvt.h>
#include <xnnpack/vlrelu.h>
#include <xnnpack/vmul.h>
#include <xnnpack/vunary.h>

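// f16 -> f32 conversion via integer bit tricks (no F16C instructions).  Each
// half-precision value is split into sign and magnitude; normalized
// magnitudes are rebiased by adding 224 to the shifted exponent field and
// scaling by 2**-112, while denormals are reconstructed by overlaying the
// mantissa onto the bit pattern of 0.5f and subtracting 0.5f.  A scalar
// sketch of the per-element math, illustrative only (this helper and its
// constants are assumptions inferred from the kernel below):
static inline float xnn_f16_to_f32_ref(uint16_t h) {
  union { uint32_t u; float f; } norm, denorm, out;
  const uint32_t sign = (uint32_t) (h & 0x8000) << 16;
  const uint32_t nonsign = h & 0x7FFF;
  norm.u = ((uint32_t) nonsign << 13) + 0x70000000;  // add 224 to the exponent field
  norm.f *= 0x1.0p-112f;                             // net rebias: +224 - 112 = 127 - 15
  denorm.u = nonsign + 0x3F000000;                   // overlay mantissa onto 0.5f
  denorm.f -= 0.5f;                                  // exact result: mantissa * 2**-24
  out.f = (nonsign >= 0x0400) ? norm.f : denorm.f;   // 0x0400 = smallest normal half
  out.u |= sign;
  return out.f;
}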
void xnn_f16_f32_vcvt_ukernel__sse41_int16_x16(
    size_t n,
    const void* input,
    float* output,
    const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(n != 0);
  assert(n % sizeof(uint16_t) == 0);
  assert(input != NULL);
  assert(output != NULL);

  const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask);
  const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset);
  const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale);
  const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask);
  const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias);
  const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff);

  const uint16_t* i = (const uint16_t*) input;
  for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
    const __m128i vh0 = _mm_loadu_si128((const __m128i*) i);
    const __m128i vh1 = _mm_loadu_si128((const __m128i*) (i + 8));
    i += 16;

    const __m128i vsign0 = _mm_and_si128(vh0, vsign_mask);
    const __m128i vsign1 = _mm_and_si128(vh1, vsign_mask);

    const __m128i vnonsign0 = _mm_xor_si128(vh0, vsign0);
    const __m128i vnonsign1 = _mm_xor_si128(vh1, vsign1);

    const __m128i vprenorm0 = _mm_slli_epi16(vnonsign0, 13);
    const __m128i vprenorm1 = _mm_add_epi16(_mm_srli_epi16(vnonsign0, 3), vexp_offset);
    const __m128i vprenorm2 = _mm_slli_epi16(vnonsign1, 13);
    const __m128i vprenorm3 = _mm_add_epi16(_mm_srli_epi16(vnonsign1, 3), vexp_offset);

    const __m128i vnorm0 = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vprenorm0, vprenorm1)), vexp_scale));
    const __m128i vnorm1 = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vprenorm0, vprenorm1)), vexp_scale));
    const __m128i vnorm2 = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vprenorm2, vprenorm3)), vexp_scale));
    const __m128i vnorm3 = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vprenorm2, vprenorm3)), vexp_scale));

    const __m128i vdenorm0 = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vnonsign0, vmagic_mask)), vmagic_bias));
    const __m128i vdenorm1 = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vnonsign0, vmagic_mask)), vmagic_bias));
    const __m128i vdenorm2 = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vnonsign1, vmagic_mask)), vmagic_bias));
    const __m128i vdenorm3 = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vnonsign1, vmagic_mask)), vmagic_bias));

    const __m128i vmask0 = _mm_cmpgt_epi16(vnonsign0, vdenorm_cutoff);
    const __m128i vmask1 = _mm_cmpgt_epi16(vnonsign1, vdenorm_cutoff);

    const __m128i vf0 = _mm_or_si128(_mm_unpacklo_epi16(_mm_setzero_si128(), vsign0),
      _mm_blendv_epi8(vdenorm0, vnorm0, _mm_cvtepi16_epi32(vmask0)));
    const __m128i vf1 = _mm_or_si128(_mm_unpackhi_epi16(_mm_setzero_si128(), vsign0),
      _mm_blendv_epi8(vdenorm1, vnorm1, _mm_unpackhi_epi16(vmask0, vmask0)));
    const __m128i vf2 = _mm_or_si128(_mm_unpacklo_epi16(_mm_setzero_si128(), vsign1),
      _mm_blendv_epi8(vdenorm2, vnorm2, _mm_cvtepi16_epi32(vmask1)));
    const __m128i vf3 = _mm_or_si128(_mm_unpackhi_epi16(_mm_setzero_si128(), vsign1),
      _mm_blendv_epi8(vdenorm3, vnorm3, _mm_unpackhi_epi16(vmask1, vmask1)));

    _mm_storeu_ps(output, _mm_castsi128_ps(vf0));
    _mm_storeu_ps(output + 4, _mm_castsi128_ps(vf1));
    _mm_storeu_ps(output + 8, _mm_castsi128_ps(vf2));
    _mm_storeu_ps(output + 12, _mm_castsi128_ps(vf3));
    output += 16;
  }
  for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
    const __m128i vh = _mm_loadu_si128((const __m128i*) i);
    i += 8;

    const __m128i vsign = _mm_and_si128(vh, vsign_mask);

    const __m128i vnonsign = _mm_xor_si128(vh, vsign);

    const __m128i vprenorm_lo = _mm_slli_epi16(vnonsign, 13);
    const __m128i vprenorm_hi = _mm_add_epi16(_mm_srli_epi16(vnonsign, 3), vexp_offset);

    const __m128i vnorm_lo = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vprenorm_lo, vprenorm_hi)), vexp_scale));
    const __m128i vnorm_hi = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vprenorm_lo, vprenorm_hi)), vexp_scale));

    const __m128i vdenorm_lo = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vnonsign, vmagic_mask)), vmagic_bias));
    const __m128i vdenorm_hi = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vnonsign, vmagic_mask)), vmagic_bias));

    const __m128i vmask = _mm_cmpgt_epi16(vnonsign, vdenorm_cutoff);

    const __m128i vf_lo = _mm_or_si128(_mm_unpacklo_epi16(_mm_setzero_si128(), vsign),
      _mm_blendv_epi8(vdenorm_lo, vnorm_lo, _mm_cvtepi16_epi32(vmask)));

    const __m128i vf_hi = _mm_or_si128(_mm_unpackhi_epi16(_mm_setzero_si128(), vsign),
      _mm_blendv_epi8(vdenorm_hi, vnorm_hi, _mm_unpackhi_epi16(vmask, vmask)));

    _mm_storeu_ps(output, _mm_castsi128_ps(vf_lo));
    _mm_storeu_ps(output + 4, _mm_castsi128_ps(vf_hi));
    output += 8;
  }
  if XNN_UNPREDICTABLE(n != 0) {
    const __m128i vh = _mm_loadu_si128((const __m128i*) i);

    const __m128i vsign = _mm_and_si128(vh, vsign_mask);

    const __m128i vnonsign = _mm_xor_si128(vh, vsign);

    const __m128i vprenorm_lo = _mm_slli_epi16(vnonsign, 13);
    const __m128i vprenorm_hi = _mm_add_epi16(_mm_srli_epi16(vnonsign, 3), vexp_offset);

    const __m128i vnorm_lo = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vprenorm_lo, vprenorm_hi)), vexp_scale));
    const __m128i vnorm_hi = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vprenorm_lo, vprenorm_hi)), vexp_scale));

    const __m128i vdenorm_lo = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vnonsign, vmagic_mask)), vmagic_bias));
    const __m128i vdenorm_hi = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vnonsign, vmagic_mask)), vmagic_bias));

    const __m128i vmask = _mm_cmpgt_epi16(vnonsign, vdenorm_cutoff);

    __m128i vf = _mm_or_si128(_mm_unpacklo_epi16(_mm_setzero_si128(), vsign),
      _mm_blendv_epi8(vdenorm_lo, vnorm_lo, _mm_cvtepi16_epi32(vmask)));

    if (n & (4 * sizeof(uint16_t))) {
      _mm_storeu_ps(output, _mm_castsi128_ps(vf));
      output += 4;

      vf = _mm_or_si128(_mm_unpackhi_epi16(_mm_setzero_si128(), vsign),
        _mm_blendv_epi8(vdenorm_hi, vnorm_hi, _mm_unpackhi_epi16(vmask, vmask)));
    }
    if (n & (2 * sizeof(uint16_t))) {
      _mm_storel_pi((__m64*) output, _mm_castsi128_ps(vf));
      output += 2;

      vf = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(vf), _mm_castsi128_ps(vf)));
    }
    if (n & (1 * sizeof(uint16_t))) {
      _mm_store_ss(output, _mm_castsi128_ps(vf));
    }
  }
}

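// f32 -> f16 conversion without F16C.  Each input is split into sign and
// magnitude; multiplying by scale_to_inf and then scale_to_zero implements
// round-to-nearest with correct overflow to infinity, a bias addition aligns
// the half-precision mantissa, and the exponent/mantissa halves are packed
// down to 16 bits.  Out-of-range magnitudes are replaced with the canonical
// NaN via a blend, and the sign is re-attached with a saturating pack
// (0x80000000 saturates to 0x8000, exactly the half-precision sign bit).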
void xnn_f32_f16_vcvt_ukernel__sse41_x8(
    size_t n,
    const float* input,
    void* output,
    const union xnn_f32_f16_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(n != 0);
  assert(n % sizeof(float) == 0);
  assert(input != NULL);
  assert(output != NULL);

  const __m128 vnonsign_mask = _mm_load_ps((const float*) params->sse2.nonsign_mask);
  const __m128i vexp_bias = _mm_load_si128((const __m128i*) params->sse2.exp_bias);
  const __m128 vscale_to_inf = _mm_load_ps(params->sse2.scale_to_inf);
  const __m128i vexpw_max = _mm_load_si128((const __m128i*) params->sse2.expw_max);
  const __m128 vscale_to_zero = _mm_load_ps(params->sse2.scale_to_zero);
  const __m128i vbias_min = _mm_load_si128((const __m128i*) params->sse2.bias_min);
  const __m128i vmanth_mask = _mm_load_si128((const __m128i*) params->sse2.manth_mask);
  const __m128i vexph_mask = _mm_load_si128((const __m128i*) params->sse2.exph_mask);
  const __m128i vnanh = _mm_load_si128((const __m128i*) params->sse2.nanh);

  uint16_t* o = (uint16_t*) output;
  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
    const __m128 vx_lo = _mm_loadu_ps(input);
    const __m128 vx_hi = _mm_loadu_ps(input + 4);
    input += 8;

    const __m128 vabsx_lo = _mm_and_ps(vx_lo, vnonsign_mask);
    const __m128 vabsx_hi = _mm_and_ps(vx_hi, vnonsign_mask);

    const __m128 vsignx_lo = _mm_xor_ps(vx_lo, vabsx_lo);
    const __m128 vsignx_hi = _mm_xor_ps(vx_hi, vabsx_hi);
    __m128i vbias_lo = _mm_add_epi32(_mm_castps_si128(vabsx_lo), vexp_bias);
    __m128i vbias_hi = _mm_add_epi32(_mm_castps_si128(vabsx_hi), vexp_bias);
    __m128 vf_lo = _mm_mul_ps(vabsx_lo, vscale_to_inf);
    __m128 vf_hi = _mm_mul_ps(vabsx_hi, vscale_to_inf);
    const __m128i vnanmaskw_lo = _mm_cmpgt_epi32(_mm_castps_si128(vabsx_lo), vexpw_max);
    const __m128i vnanmaskw_hi = _mm_cmpgt_epi32(_mm_castps_si128(vabsx_hi), vexpw_max);

    vbias_lo = _mm_and_si128(vbias_lo, vexpw_max);
    vbias_hi = _mm_and_si128(vbias_hi, vexpw_max);
    vf_lo = _mm_mul_ps(vf_lo, vscale_to_zero);
    vf_hi = _mm_mul_ps(vf_hi, vscale_to_zero);
    const __m128i vnanmaskh = _mm_packs_epi32(vnanmaskw_lo, vnanmaskw_hi);
    const __m128i vsignh = _mm_packs_epi32(_mm_castps_si128(vsignx_lo), _mm_castps_si128(vsignx_hi));

    vbias_lo = _mm_max_epi16(vbias_lo, vbias_min);
    vbias_hi = _mm_max_epi16(vbias_hi, vbias_min);

    vf_lo = _mm_add_ps(vf_lo, _mm_castsi128_ps(vbias_lo));
    vf_hi = _mm_add_ps(vf_hi, _mm_castsi128_ps(vbias_hi));

    __m128i vexpw_lo = _mm_srli_epi32(_mm_castps_si128(vf_lo), 13);
    __m128i vexpw_hi = _mm_srli_epi32(_mm_castps_si128(vf_hi), 13);
    const __m128i vmantw_lo = _mm_and_si128(_mm_castps_si128(vf_lo), vmanth_mask);
    const __m128i vmantw_hi = _mm_and_si128(_mm_castps_si128(vf_hi), vmanth_mask);

    vexpw_lo = _mm_and_si128(vexpw_lo, vexph_mask);
    vexpw_hi = _mm_and_si128(vexpw_hi, vexph_mask);

    const __m128i vnonsignw_lo = _mm_add_epi32(vmantw_lo, vexpw_lo);
    const __m128i vnonsignw_hi = _mm_add_epi32(vmantw_hi, vexpw_hi);

    const __m128i vnonsignh = _mm_packs_epi32(vnonsignw_lo, vnonsignw_hi);

    const __m128i vabsh = _mm_blendv_epi8(vnonsignh, vnanh, vnanmaskh);

    const __m128i vh = _mm_or_si128(vabsh, vsignh);

    _mm_storeu_si128((__m128i*) o, vh);
    o += 8;
  }
  if XNN_UNPREDICTABLE(n != 0) {
    const __m128 vx_lo = _mm_loadu_ps(input);
    const float* input_hi = (const float*) ((uintptr_t) input + (n & (4 * sizeof(float))));
    const __m128 vx_hi = _mm_loadu_ps(input_hi);

    const __m128 vabsx_lo = _mm_and_ps(vx_lo, vnonsign_mask);
    const __m128 vabsx_hi = _mm_and_ps(vx_hi, vnonsign_mask);

    const __m128 vsignx_lo = _mm_xor_ps(vx_lo, vabsx_lo);
    const __m128 vsignx_hi = _mm_xor_ps(vx_hi, vabsx_hi);
    __m128i vbias_lo = _mm_add_epi32(_mm_castps_si128(vabsx_lo), vexp_bias);
    __m128i vbias_hi = _mm_add_epi32(_mm_castps_si128(vabsx_hi), vexp_bias);
    __m128 vf_lo = _mm_mul_ps(vabsx_lo, vscale_to_inf);
    __m128 vf_hi = _mm_mul_ps(vabsx_hi, vscale_to_inf);
    const __m128i vnanmaskw_lo = _mm_cmpgt_epi32(_mm_castps_si128(vabsx_lo), vexpw_max);
    const __m128i vnanmaskw_hi = _mm_cmpgt_epi32(_mm_castps_si128(vabsx_hi), vexpw_max);

    vbias_lo = _mm_and_si128(vbias_lo, vexpw_max);
    vbias_hi = _mm_and_si128(vbias_hi, vexpw_max);
    vf_lo = _mm_mul_ps(vf_lo, vscale_to_zero);
    vf_hi = _mm_mul_ps(vf_hi, vscale_to_zero);
    const __m128i vnanmaskh = _mm_packs_epi32(vnanmaskw_lo, vnanmaskw_hi);
    const __m128i vsignh = _mm_packs_epi32(_mm_castps_si128(vsignx_lo), _mm_castps_si128(vsignx_hi));

    vbias_lo = _mm_max_epi16(vbias_lo, vbias_min);
    vbias_hi = _mm_max_epi16(vbias_hi, vbias_min);

    vf_lo = _mm_add_ps(vf_lo, _mm_castsi128_ps(vbias_lo));
    vf_hi = _mm_add_ps(vf_hi, _mm_castsi128_ps(vbias_hi));

    __m128i vexpw_lo = _mm_srli_epi32(_mm_castps_si128(vf_lo), 13);
    __m128i vexpw_hi = _mm_srli_epi32(_mm_castps_si128(vf_hi), 13);
    const __m128i vmantw_lo = _mm_and_si128(_mm_castps_si128(vf_lo), vmanth_mask);
    const __m128i vmantw_hi = _mm_and_si128(_mm_castps_si128(vf_hi), vmanth_mask);

    vexpw_lo = _mm_and_si128(vexpw_lo, vexph_mask);
    vexpw_hi = _mm_and_si128(vexpw_hi, vexph_mask);

    const __m128i vnonsignw_lo = _mm_add_epi32(vmantw_lo, vexpw_lo);
    const __m128i vnonsignw_hi = _mm_add_epi32(vmantw_hi, vexpw_hi);

    const __m128i vnonsignh = _mm_packs_epi32(vnonsignw_lo, vnonsignw_hi);

    const __m128i vabsh = _mm_blendv_epi8(vnonsignh, vnanh, vnanmaskh);

    __m128i vh = _mm_or_si128(vabsh, vsignh);

    if (n & (4 * sizeof(float))) {
      _mm_storel_epi64((__m128i*) o, vh);
      vh = _mm_unpackhi_epi64(vh, vh);
      o += 4;
    }
    if (n & (2 * sizeof(float))) {
      unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(vh));
      vh = _mm_srli_epi64(vh, 32);
      o += 2;
    }
    if (n & (1 * sizeof(float))) {
      *o = (uint16_t) _mm_extract_epi16(vh, 0);
    }
  }
}

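// PReLU, processing two rows and up to eight channels per iteration.
// _mm_blendv_ps selects by the sign bit of its third operand, so negative
// inputs take the weight-multiplied lane and non-negative inputs pass
// through unchanged.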
void xnn_f32_prelu_ukernel__sse41_2x8(
    size_t rows,
    size_t channels,
    const float*restrict input,
    size_t input_stride,
    const float*restrict weights,
    float*restrict output,
    size_t output_stride) XNN_OOB_READS
{
  assert(rows != 0);
  assert(channels != 0);
  assert(channels % sizeof(float) == 0);

  const float* i0 = input;
  float* o0 = output;
  const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
  float* o1 = (float*) ((uintptr_t) o0 + output_stride);

  const size_t input_increment = input_stride * 2 - channels;
  const size_t output_increment = output_stride * 2 - channels;

  do {
    if XNN_UNPREDICTABLE(rows < 2) {
      i1 = i0;
      o1 = o0;
    }

    const float* w = weights;
    size_t c = channels;
    for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) {
      const __m128 vw0123 = _mm_load_ps(w);
      const __m128 vw4567 = _mm_load_ps(w + 4);
      w += 8;

      const __m128 vi0x0123 = _mm_loadu_ps(i0);
      const __m128 vi0x4567 = _mm_loadu_ps(i0 + 4);
      i0 += 8;
      const __m128 vi1x0123 = _mm_loadu_ps(i1);
      const __m128 vi1x4567 = _mm_loadu_ps(i1 + 4);
      i1 += 8;

      const __m128 vprod0x0123 = _mm_mul_ps(vi0x0123, vw0123);
      const __m128 vprod0x4567 = _mm_mul_ps(vi0x4567, vw4567);
      const __m128 vprod1x0123 = _mm_mul_ps(vi1x0123, vw0123);
      const __m128 vprod1x4567 = _mm_mul_ps(vi1x4567, vw4567);

      const __m128 vacc0x0123 = _mm_blendv_ps(vi0x0123, vprod0x0123, vi0x0123);
      const __m128 vacc0x4567 = _mm_blendv_ps(vi0x4567, vprod0x4567, vi0x4567);
      const __m128 vacc1x0123 = _mm_blendv_ps(vi1x0123, vprod1x0123, vi1x0123);
      const __m128 vacc1x4567 = _mm_blendv_ps(vi1x4567, vprod1x4567, vi1x4567);

      _mm_storeu_ps(o0, vacc0x0123);
      _mm_storeu_ps(o0 + 4, vacc0x4567);
      o0 += 8;
      _mm_storeu_ps(o1, vacc1x0123);
      _mm_storeu_ps(o1 + 4, vacc1x4567);
      o1 += 8;
    }
    for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) {
      const __m128 vw0123 = _mm_load_ps(w);
      w += 4;

      const __m128 vi0x0123 = _mm_loadu_ps(i0);
      i0 += 4;
      const __m128 vi1x0123 = _mm_loadu_ps(i1);
      i1 += 4;

      const __m128 vprod0x0123 = _mm_mul_ps(vi0x0123, vw0123);
      const __m128 vprod1x0123 = _mm_mul_ps(vi1x0123, vw0123);

      __m128 vacc0x0123 = _mm_blendv_ps(vi0x0123, vprod0x0123, vi0x0123);
      __m128 vacc1x0123 = _mm_blendv_ps(vi1x0123, vprod1x0123, vi1x0123);

      _mm_storeu_ps(o0, vacc0x0123);
      o0 += 4;
      _mm_storeu_ps(o1, vacc1x0123);
      o1 += 4;
    }
    if XNN_UNLIKELY(c != 0) {
      const __m128 vw0123 = _mm_load_ps(w);
      w = (const float*) ((uintptr_t) w + c);

      const __m128 vi0x0123 = _mm_loadu_ps(i0);
      i0 = (const float*) ((uintptr_t) i0 + c);
      const __m128 vi1x0123 = _mm_loadu_ps(i1);
      i1 = (const float*) ((uintptr_t) i1 + c);

      const __m128 vprod0x0123 = _mm_mul_ps(vi0x0123, vw0123);
      const __m128 vprod1x0123 = _mm_mul_ps(vi1x0123, vw0123);

      __m128 vacc0x0123 = _mm_blendv_ps(vi0x0123, vprod0x0123, vi0x0123);
      __m128 vacc1x0123 = _mm_blendv_ps(vi1x0123, vprod1x0123, vi1x0123);

      if (c & (2 * sizeof(float))) {
        _mm_storel_pi((__m64*) o0, vacc0x0123);
        _mm_storel_pi((__m64*) o1, vacc1x0123);

        vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
        vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);

        o0 += 2;
        o1 += 2;
      }
      if (c & (1 * sizeof(float))) {
        _mm_store_ss(o0, vacc0x0123);
        _mm_store_ss(o1, vacc1x0123);

        o0 += 1;
        o1 += 1;
      }
    }
    i0 = (const float*) ((uintptr_t) i0 + input_increment);
    o0 = (float*) ((uintptr_t) o0 + output_increment);
    i1 = (const float*) ((uintptr_t) i1 + input_increment);
    o1 = (float*) ((uintptr_t) o1 + output_increment);
    rows = doz(rows, 2);
  } while (rows != 0);
}

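// f32 -> qs8 quantization.  Each lane is scaled, clamped from above against
// (output_max - zero_point), rounded to the nearest integer by CVTPS2DQ,
// packed to 16 bits with saturation, offset by the zero point with a
// saturating add, packed to 8 bits with saturation (which bounds the result
// at 127), and finally clamped from below against output_min.  A scalar
// sketch of the same pipeline follows; the helper and its parameter names
// are assumptions for illustration, not part of the original file:
static inline int8_t xnn_f32_qs8_ref(float x, float scale,
                                     float output_max_less_zero_point,
                                     int16_t zero_point, int8_t output_min)
{
  x *= scale;
  if (x > output_max_less_zero_point) {
    x = output_max_less_zero_point;  // upper clamp before rounding
  }
  int32_t y = _mm_cvtss_si32(_mm_set_ss(x));  // round-to-nearest-even, like CVTPS2DQ
  if (y < -32768) {
    y = -32768;  // PACKSSDW saturation on the low side
  }
  y += (int32_t) zero_point;  // saturating add in the SIMD path
  if (y > 127) {
    y = 127;  // PACKSSWB saturation
  }
  if (y < (int32_t) output_min) {
    y = (int32_t) output_min;  // final _mm_max_epi8 clamp
  }
  return (int8_t) y;
}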
void xnn_f32_qs8_vcvt_ukernel__sse41_x32(
    size_t n,
    const float* x,
    int8_t* y,
    const union xnn_f32_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(n != 0);
  assert(n % sizeof(float) == 0);
  assert(x != NULL);
  assert(y != NULL);

  const __m128 vscale = _mm_load_ps(params->sse4.scale);
  const __m128 voutput_max_less_zero_point = _mm_load_ps(params->sse4.output_max_less_zero_point);
  const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
  const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);

  for (; n >= 32 * sizeof(float); n -= 32 * sizeof(float)) {
    __m128 vx0123 = _mm_loadu_ps(x);
    __m128 vx4567 = _mm_loadu_ps(x + 4);
    __m128 vx89AB = _mm_loadu_ps(x + 8);
    __m128 vxCDEF = _mm_loadu_ps(x + 12);
    __m128 vxGHIJ = _mm_loadu_ps(x + 16);
    __m128 vxKLMN = _mm_loadu_ps(x + 20);
    __m128 vxOPQR = _mm_loadu_ps(x + 24);
    __m128 vxSTUV = _mm_loadu_ps(x + 28);
    x += 32;

    vx0123 = _mm_mul_ps(vx0123, vscale);
    vx4567 = _mm_mul_ps(vx4567, vscale);
    vx89AB = _mm_mul_ps(vx89AB, vscale);
    vxCDEF = _mm_mul_ps(vxCDEF, vscale);
    vxGHIJ = _mm_mul_ps(vxGHIJ, vscale);
    vxKLMN = _mm_mul_ps(vxKLMN, vscale);
    vxOPQR = _mm_mul_ps(vxOPQR, vscale);
    vxSTUV = _mm_mul_ps(vxSTUV, vscale);

    vx0123 = _mm_min_ps(vx0123, voutput_max_less_zero_point);
    vx4567 = _mm_min_ps(vx4567, voutput_max_less_zero_point);
    vx89AB = _mm_min_ps(vx89AB, voutput_max_less_zero_point);
    vxCDEF = _mm_min_ps(vxCDEF, voutput_max_less_zero_point);
    vxGHIJ = _mm_min_ps(vxGHIJ, voutput_max_less_zero_point);
    vxKLMN = _mm_min_ps(vxKLMN, voutput_max_less_zero_point);
    vxOPQR = _mm_min_ps(vxOPQR, voutput_max_less_zero_point);
    vxSTUV = _mm_min_ps(vxSTUV, voutput_max_less_zero_point);

    const __m128i vy0123 = _mm_cvtps_epi32(vx0123);
    const __m128i vy4567 = _mm_cvtps_epi32(vx4567);
    const __m128i vy89AB = _mm_cvtps_epi32(vx89AB);
    const __m128i vyCDEF = _mm_cvtps_epi32(vxCDEF);
    const __m128i vyGHIJ = _mm_cvtps_epi32(vxGHIJ);
    const __m128i vyKLMN = _mm_cvtps_epi32(vxKLMN);
    const __m128i vyOPQR = _mm_cvtps_epi32(vxOPQR);
    const __m128i vySTUV = _mm_cvtps_epi32(vxSTUV);

    __m128i vy01234567 = _mm_packs_epi32(vy0123, vy4567);
    __m128i vy89ABCDEF = _mm_packs_epi32(vy89AB, vyCDEF);
    __m128i vyGHIJKLMN = _mm_packs_epi32(vyGHIJ, vyKLMN);
    __m128i vyOPQRSTUV = _mm_packs_epi32(vyOPQR, vySTUV);

    vy01234567 = _mm_adds_epi16(vy01234567, voutput_zero_point);
    vy89ABCDEF = _mm_adds_epi16(vy89ABCDEF, voutput_zero_point);
    vyGHIJKLMN = _mm_adds_epi16(vyGHIJKLMN, voutput_zero_point);
    vyOPQRSTUV = _mm_adds_epi16(vyOPQRSTUV, voutput_zero_point);


    __m128i vy0123456789ABCDEF = _mm_packs_epi16(vy01234567, vy89ABCDEF);
    __m128i vyGHIJKLMNOPQRSTUV = _mm_packs_epi16(vyGHIJKLMN, vyOPQRSTUV);

    vy0123456789ABCDEF = _mm_max_epi8(vy0123456789ABCDEF, voutput_min);
    vyGHIJKLMNOPQRSTUV = _mm_max_epi8(vyGHIJKLMNOPQRSTUV, voutput_min);

    _mm_storeu_si128((__m128i*) y, vy0123456789ABCDEF);
    _mm_storeu_si128((__m128i*) (y + 16), vyGHIJKLMNOPQRSTUV);
    y += 32;
  }
  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
    __m128 vx_lo = _mm_loadu_ps(x);
    __m128 vx_hi = _mm_loadu_ps(x + 4);
    x += 8;

    vx_lo = _mm_mul_ps(vx_lo, vscale);
    vx_hi = _mm_mul_ps(vx_hi, vscale);

    vx_lo = _mm_min_ps(vx_lo, voutput_max_less_zero_point);
    vx_hi = _mm_min_ps(vx_hi, voutput_max_less_zero_point);

    const __m128i vy_lo = _mm_cvtps_epi32(vx_lo);
    const __m128i vy_hi = _mm_cvtps_epi32(vx_hi);

    __m128i vy = _mm_packs_epi32(vy_lo, vy_hi);
    vy = _mm_adds_epi16(vy, voutput_zero_point);
    vy = _mm_packs_epi16(vy, vy);
    vy = _mm_max_epi8(vy, voutput_min);

    _mm_storel_epi64((__m128i*) y, vy);
    y += 8;
  }
  if XNN_UNLIKELY(n != 0) {
    __m128 vx_lo = _mm_loadu_ps(x);
    const float* x_hi = (const float*) ((uintptr_t) x + (n & (4 * sizeof(float))));
    __m128 vx_hi = _mm_loadu_ps(x_hi);

    vx_lo = _mm_mul_ps(vx_lo, vscale);
    vx_hi = _mm_mul_ps(vx_hi, vscale);

    vx_lo = _mm_min_ps(vx_lo, voutput_max_less_zero_point);
    vx_hi = _mm_min_ps(vx_hi, voutput_max_less_zero_point);

    const __m128i vy_lo = _mm_cvtps_epi32(vx_lo);
    const __m128i vy_hi = _mm_cvtps_epi32(vx_hi);

    __m128i vy = _mm_packs_epi32(vy_lo, vy_hi);
    vy = _mm_adds_epi16(vy, voutput_zero_point);
    vy = _mm_packs_epi16(vy, vy);
    vy = _mm_max_epi8(vy, voutput_min);

    if (n & (4 * sizeof(float))) {
      unaligned_store_u32(y, (uint32_t) _mm_cvtsi128_si32(vy));
      y += 4;
      vy = _mm_srli_epi64(vy, 32);
    }
    if (n & (2 * sizeof(float))) {
      unaligned_store_u16(y, (uint16_t) _mm_extract_epi16(vy, 0));
      y += 2;
      vy = _mm_srli_epi32(vy, 16);
    }
    if (n & (1 * sizeof(float))) {
      *y = (int8_t) _mm_extract_epi8(vy, 0);
    }
  }
}

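// Leaky ReLU: negative lanes (selected by the sign bit via _mm_blendv_ps)
// are replaced with slope * x; non-negative lanes pass through unchanged.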
void xnn_f32_vlrelu_ukernel__sse41_x8(
    size_t n,
    const float* x,
    float* y,
    const union xnn_f32_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(n != 0);
  assert(n % sizeof(float) == 0);

  const __m128 vslope = _mm_load_ps(params->sse.slope);
  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
    const __m128 vx0123 = _mm_loadu_ps(x);
    const __m128 vx4567 = _mm_loadu_ps(x + 4);
    x += 8;

    __m128 vacc0123 = _mm_mul_ps(vx0123, vslope);
    __m128 vacc4567 = _mm_mul_ps(vx4567, vslope);

    vacc0123 = _mm_blendv_ps(vx0123, vacc0123, vx0123);
    vacc4567 = _mm_blendv_ps(vx4567, vacc4567, vx4567);

    _mm_storeu_ps(y, vacc0123);
    _mm_storeu_ps(y + 4, vacc4567);
    y += 8;
  }
  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
    const __m128 vx = _mm_loadu_ps(x);
    x += 4;

    __m128 vacc = _mm_mul_ps(vx, vslope);
    vacc = _mm_blendv_ps(vx, vacc, vx);

    _mm_storeu_ps(y, vacc);
    y += 4;
  }
  if XNN_UNLIKELY(n != 0) {
    const __m128 vx = _mm_loadu_ps(x);

    __m128 vacc = _mm_mul_ps(vx, vslope);
    vacc = _mm_blendv_ps(vx, vacc, vx);

    if (n & (2 * sizeof(float))) {
      _mm_storel_pi((__m64*) y, vacc);
      vacc = _mm_movehl_ps(vacc, vacc);
      y += 2;
    }
    if (n & (1 * sizeof(float))) {
      _mm_store_ss(y, vacc);
    }
  }
}

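// Round toward negative infinity (floor), using the SSE4.1 ROUNDPS instruction.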
void xnn_f32_vrndd_ukernel__sse41_x8(
    size_t n,
    const float* x,
    float* y,
    const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(n != 0);
  assert(n % sizeof(float) == 0);

  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
    const __m128 vx0123 = _mm_loadu_ps(x);
    const __m128 vx4567 = _mm_loadu_ps(x + 4);
    x += 8;

    const __m128 vy0123 = _mm_round_ps(vx0123, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
    const __m128 vy4567 = _mm_round_ps(vx4567, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);

    _mm_storeu_ps(y, vy0123);
    _mm_storeu_ps(y + 4, vy4567);
    y += 8;
  }
  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
    const __m128 vx = _mm_loadu_ps(x);
    x += 4;

    const __m128 vy = _mm_round_ps(vx, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);

    _mm_storeu_ps(y, vy);
    y += 4;
  }
  if XNN_UNLIKELY(n != 0) {
    const __m128 vx = _mm_loadu_ps(x);
    __m128 vy = _mm_round_ps(vx, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
    if (n & (2 * sizeof(float))) {
      _mm_storel_pi((__m64*) y, vy);
      vy = _mm_movehl_ps(vy, vy);
      y += 2;
    }
    if (n & (1 * sizeof(float))) {
      _mm_store_ss(y, vy);
    }
  }
}

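// Round to nearest, ties to even, using the SSE4.1 ROUNDPS instruction.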
void xnn_f32_vrndne_ukernel__sse41_x8(
    size_t n,
    const float* x,
    float* y,
    const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(n != 0);
  assert(n % sizeof(float) == 0);

  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
    const __m128 vx0123 = _mm_loadu_ps(x);
    const __m128 vx4567 = _mm_loadu_ps(x + 4);
    x += 8;

    const __m128 vy0123 = _mm_round_ps(vx0123, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    const __m128 vy4567 = _mm_round_ps(vx4567, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);

    _mm_storeu_ps(y, vy0123);
    _mm_storeu_ps(y + 4, vy4567);
    y += 8;
  }
  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
    const __m128 vx = _mm_loadu_ps(x);
    x += 4;

    const __m128 vy = _mm_round_ps(vx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);

    _mm_storeu_ps(y, vy);
    y += 4;
  }
  if XNN_UNLIKELY(n != 0) {
    const __m128 vx = _mm_loadu_ps(x);
    __m128 vy = _mm_round_ps(vx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    if (n & (2 * sizeof(float))) {
      _mm_storel_pi((__m64*) y, vy);
      vy = _mm_movehl_ps(vy, vy);
      y += 2;
    }
    if (n & (1 * sizeof(float))) {
      _mm_store_ss(y, vy);
    }
  }
}

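// Round toward positive infinity (ceiling), using the SSE4.1 ROUNDPS instruction.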
void xnn_f32_vrndu_ukernel__sse41_x8(
    size_t n,
    const float* x,
    float* y,
    const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(n != 0);
  assert(n % sizeof(float) == 0);

  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
    const __m128 vx0123 = _mm_loadu_ps(x);
    const __m128 vx4567 = _mm_loadu_ps(x + 4);
    x += 8;

    const __m128 vy0123 = _mm_round_ps(vx0123, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
    const __m128 vy4567 = _mm_round_ps(vx4567, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);

    _mm_storeu_ps(y, vy0123);
    _mm_storeu_ps(y + 4, vy4567);
    y += 8;
  }
  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
    const __m128 vx = _mm_loadu_ps(x);
    x += 4;

    const __m128 vy = _mm_round_ps(vx, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);

    _mm_storeu_ps(y, vy);
    y += 4;
  }
  if XNN_UNLIKELY(n != 0) {
    const __m128 vx = _mm_loadu_ps(x);
    __m128 vy = _mm_round_ps(vx, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
    if (n & (2 * sizeof(float))) {
      _mm_storel_pi((__m64*) y, vy);
      vy = _mm_movehl_ps(vy, vy);
      y += 2;
    }
    if (n & (1 * sizeof(float))) {
      _mm_store_ss(y, vy);
    }
  }
}

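// Round toward zero (truncation), using the SSE4.1 ROUNDPS instruction.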
void xnn_f32_vrndz_ukernel__sse41_x8(
    size_t n,
    const float* x,
    float* y,
    const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(n != 0);
  assert(n % sizeof(float) == 0);

  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
    const __m128 vx0123 = _mm_loadu_ps(x);
    const __m128 vx4567 = _mm_loadu_ps(x + 4);
    x += 8;

    const __m128 vy0123 = _mm_round_ps(vx0123, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
    const __m128 vy4567 = _mm_round_ps(vx4567, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);

    _mm_storeu_ps(y, vy0123);
    _mm_storeu_ps(y + 4, vy4567);
    y += 8;
  }
  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
    const __m128 vx = _mm_loadu_ps(x);
    x += 4;

    const __m128 vy = _mm_round_ps(vx, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);

    _mm_storeu_ps(y, vy);
    y += 4;
  }
  if XNN_UNLIKELY(n != 0) {
    const __m128 vx = _mm_loadu_ps(x);
    __m128 vy = _mm_round_ps(vx, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
    if (n & (2 * sizeof(float))) {
      _mm_storel_pi((__m64*) y, vy);
      vy = _mm_movehl_ps(vy, vy);
      y += 2;
    }
    if (n & (1 * sizeof(float))) {
      _mm_store_ss(y, vy);
    }
  }
}

extern XNN_INTERNAL const float xnn_table_exp2minus_k_over_64[64];

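// Sigmoid via the identity sigmoid(x) = e**z / (1 + e**z) with z = -|x|
// (forced negative by OR-ing in the sign bit).  e**z is evaluated with a
// 64-entry table of 2**(-k/64) (the rr2_lut64_p2 scheme): z is reduced as
// z = n*ln(2) + t with a two-term (hi/lo) multiply-add, the integer bits of
// n become the exponent via the <<17 shift while its low six bits index the
// table, and a degree-2 polynomial in t refines the result.  The quotient is
// formed with a true division; inputs below denorm_cutoff flush to zero, and
// the result is reflected as 1 - f for positive x via a sign-bit blend.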
void xnn_f32_vsigmoid_ukernel__sse41_rr2_lut64_p2_div_x8(
    size_t n,
    const float* x,
    float* y,
    const union xnn_f32_sigmoid_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(n % sizeof(float) == 0);

  const __m128 vsign_mask = _mm_load_ps(params->sse2_rr2_lut64_p2.sign_mask);
  const __m128 vmagic_bias = _mm_load_ps(params->sse2_rr2_lut64_p2.magic_bias);
  const __m128 vlog2e = _mm_load_ps(params->sse2_rr2_lut64_p2.log2e);
  const __m128i vindex_mask = _mm_load_si128((const __m128i*) params->sse2_rr2_lut64_p2.index_mask);
  const __m128 vminus_ln2_hi = _mm_load_ps(params->sse2_rr2_lut64_p2.minus_ln2_hi);
  const __m128 vminus_ln2_lo = _mm_load_ps(params->sse2_rr2_lut64_p2.minus_ln2_lo);
  const __m128 vc2 = _mm_load_ps(params->sse2_rr2_lut64_p2.c2);
  const __m128 vone = _mm_load_ps(params->sse2_rr2_lut64_p2.one);
  const __m128 vdenorm_cutoff = _mm_load_ps(params->sse2_rr2_lut64_p2.denorm_cutoff);

  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
    const __m128 vx0123 = _mm_loadu_ps(x);
    const __m128 vx4567 = _mm_loadu_ps(x + 4);
    x += 8;

    const __m128 vz0123 = _mm_or_ps(vx0123, vsign_mask);
    const __m128 vz4567 = _mm_or_ps(vx4567, vsign_mask);

    __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vz0123, vlog2e), vmagic_bias);
    __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vz4567, vlog2e), vmagic_bias);

    const __m128i ve0123 = _mm_slli_epi32(_mm_castps_si128(vn0123), 17);
    const __m128i ve4567 = _mm_slli_epi32(_mm_castps_si128(vn4567), 17);

    const __m128i vidx0123 = _mm_slli_epi32(_mm_and_si128(_mm_castps_si128(vn0123), vindex_mask), 2);
    const __m128i vidx4567 = _mm_slli_epi32(_mm_and_si128(_mm_castps_si128(vn4567), vindex_mask), 2);

    #if XNN_ARCH_X86_64
      const uint64_t vidx01 = (uint64_t) _mm_cvtsi128_si64(vidx0123);
      const uint64_t vidx23 = (uint64_t) _mm_extract_epi64(vidx0123, 1);
      const __m128i vl0   = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx01)));
      const __m128i vl2 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx23)));
      const __m128i vl01 = _mm_insert_epi32(vl0, *((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx01 >> 32))), 1);
      const __m128i vl23 = _mm_insert_epi32(vl2, *((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx23 >> 32))), 1);
      const __m128i vl0123 = _mm_unpacklo_epi64(vl01, vl23);
      const uint64_t vidx45 = (uint64_t) _mm_cvtsi128_si64(vidx4567);
      const uint64_t vidx67 = (uint64_t) _mm_extract_epi64(vidx4567, 1);
      const __m128i vl4   = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx45)));
      const __m128i vl6 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx67)));
      const __m128i vl45 = _mm_insert_epi32(vl4, *((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx45 >> 32))), 1);
      const __m128i vl67 = _mm_insert_epi32(vl6, *((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx67 >> 32))), 1);
      const __m128i vl4567 = _mm_unpacklo_epi64(vl45, vl67);
    #else  // !XNN_ARCH_X86_64
      const uint32_t vidx0 = (uint32_t) _mm_cvtsi128_si32(vidx0123);
      const uint32_t vidx1 = (uint32_t) _mm_extract_epi16(vidx0123, 2);
      const uint32_t vidx2 = (uint32_t) _mm_extract_epi16(vidx0123, 4);
      const uint32_t vidx3 = (uint32_t) _mm_extract_epi16(vidx0123, 6);
      const __m128i vl0   = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + vidx0)));
      const __m128i vl2 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + vidx2)));
      const __m128i vl01 = _mm_insert_epi32(vl0, *((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + vidx1)), 1);
      const __m128i vl23 = _mm_insert_epi32(vl2, *((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + vidx3)), 1);
      const __m128i vl0123 = _mm_unpacklo_epi64(vl01, vl23);
      const uint32_t vidx4 = (uint32_t) _mm_cvtsi128_si32(vidx4567);
      const uint32_t vidx5 = (uint32_t) _mm_extract_epi16(vidx4567, 2);
      const uint32_t vidx6 = (uint32_t) _mm_extract_epi16(vidx4567, 4);
      const uint32_t vidx7 = (uint32_t) _mm_extract_epi16(vidx4567, 6);
      const __m128i vl4   = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + vidx4)));
      const __m128i vl6 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + vidx6)));
      const __m128i vl45 = _mm_insert_epi32(vl4, *((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + vidx5)), 1);
      const __m128i vl67 = _mm_insert_epi32(vl6, *((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + vidx7)), 1);
      const __m128i vl4567 = _mm_unpacklo_epi64(vl45, vl67);
    #endif  // XNN_ARCH_X86_64

    const __m128 vs0123 = _mm_castsi128_ps(_mm_add_epi32(vl0123, ve0123));
    const __m128 vs4567 = _mm_castsi128_ps(_mm_add_epi32(vl4567, ve4567));

    vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
    vn4567 = _mm_sub_ps(vn4567, vmagic_bias);

    __m128 vt0123 = _mm_add_ps(vz0123, _mm_mul_ps(vn0123, vminus_ln2_hi));
    __m128 vt4567 = _mm_add_ps(vz4567, _mm_mul_ps(vn4567, vminus_ln2_hi));

    vt0123 = _mm_add_ps(vt0123, _mm_mul_ps(vn0123, vminus_ln2_lo));
    vt4567 = _mm_add_ps(vt4567, _mm_mul_ps(vn4567, vminus_ln2_lo));

    __m128 vp0123 = _mm_mul_ps(vt0123, vc2);
    __m128 vp4567 = _mm_mul_ps(vt4567, vc2);

    vp0123 = _mm_add_ps(vt0123, _mm_mul_ps(vp0123, vt0123));
    vp4567 = _mm_add_ps(vt4567, _mm_mul_ps(vp4567, vt4567));

    const __m128 vy0123 = _mm_add_ps(vs0123, _mm_mul_ps(vs0123, vp0123));
    const __m128 vy4567 = _mm_add_ps(vs4567, _mm_mul_ps(vs4567, vp4567));

    __m128 vf0123 = _mm_div_ps(vy0123, _mm_add_ps(vy0123, vone));
    __m128 vf4567 = _mm_div_ps(vy4567, _mm_add_ps(vy4567, vone));

    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vz0123, vdenorm_cutoff), vf0123);
    vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vz4567, vdenorm_cutoff), vf4567);

    vf0123 = _mm_blendv_ps(_mm_sub_ps(vone, vf0123), vf0123, vx0123);
    vf4567 = _mm_blendv_ps(_mm_sub_ps(vone, vf4567), vf4567, vx4567);

    _mm_storeu_ps(y, vf0123);
    _mm_storeu_ps(y + 4, vf4567);
    y += 8;
  }
  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
    const __m128 vx = _mm_loadu_ps(x);
    x += 4;

    const __m128 vz = _mm_or_ps(vx, vsign_mask);

    __m128 vn = _mm_add_ps(_mm_mul_ps(vz, vlog2e), vmagic_bias);
    const __m128i ve = _mm_slli_epi32(_mm_castps_si128(vn), 17);

    const __m128i vidx = _mm_slli_epi32(_mm_and_si128(_mm_castps_si128(vn), vindex_mask), 2);
    #if XNN_ARCH_X86_64
      const uint64_t vidx_lo = (uint64_t) _mm_cvtsi128_si64(vidx);
      const uint64_t vidx_hi = (uint64_t) _mm_extract_epi64(vidx, 1);
      const __m128i vl_ll   = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo)));
      const __m128i vl_hl = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi)));
      const __m128i vl_lo = _mm_insert_epi32(vl_ll, *((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32))), 1);
      const __m128i vl_hi = _mm_insert_epi32(vl_hl, *((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32))), 1);
    #else  // !XNN_ARCH_X86_64
      const __m128i vl_ll = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) _mm_cvtsi128_si32(vidx))));
      const __m128i vl_hl = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) _mm_extract_epi16(vidx, 4))));
      const __m128i vl_lo = _mm_insert_epi32(vl_ll, *((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) _mm_extract_epi16(vidx, 2))), 1);
      const __m128i vl_hi = _mm_insert_epi32(vl_hl, *((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) _mm_extract_epi16(vidx, 6))), 1);
    #endif  // XNN_ARCH_X86_64
    const __m128i vl = _mm_unpacklo_epi64(vl_lo, vl_hi);

    const __m128 vs = _mm_castsi128_ps(_mm_add_epi32(vl, ve));
    vn = _mm_sub_ps(vn, vmagic_bias);

    __m128 vt = _mm_add_ps(vz, _mm_mul_ps(vn, vminus_ln2_hi));
    vt = _mm_add_ps(vt, _mm_mul_ps(vn, vminus_ln2_lo));

    __m128 vp = _mm_mul_ps(vt, vc2);
    vp = _mm_add_ps(vt, _mm_mul_ps(vp, vt));

    const __m128 vy = _mm_add_ps(vs, _mm_mul_ps(vs, vp));

    __m128 vf = _mm_div_ps(vy, _mm_add_ps(vy, vone));
    vf = _mm_andnot_ps(_mm_cmplt_ps(vz, vdenorm_cutoff), vf);
    vf = _mm_blendv_ps(_mm_sub_ps(vone, vf), vf, vx);

    _mm_storeu_ps(y, vf);
    y += 4;
  }
  if XNN_UNLIKELY(n != 0) {
    const __m128 vx = _mm_loadu_ps(x);

    const __m128 vz = _mm_or_ps(vx, vsign_mask);

    __m128 vn = _mm_add_ps(_mm_mul_ps(vz, vlog2e), vmagic_bias);
    const __m128i ve = _mm_slli_epi32(_mm_castps_si128(vn), 17);

    const __m128i vidx = _mm_slli_epi32(_mm_and_si128(_mm_castps_si128(vn), vindex_mask), 2);
    #if XNN_ARCH_X86_64
      const uint64_t vidx_lo = (uint64_t) _mm_cvtsi128_si64(vidx);
      const uint64_t vidx_hi = (uint64_t) _mm_extract_epi64(vidx, 1);
      const __m128i vl_ll   = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo)));
      const __m128i vl_hl = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi)));
      const __m128i vl_lo = _mm_insert_epi32(vl_ll, *((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32))), 1);
      const __m128i vl_hi = _mm_insert_epi32(vl_hl, *((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32))), 1);
    #else  // !XNN_ARCH_X86_64
      const __m128i vl_ll = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) _mm_cvtsi128_si32(vidx))));
      const __m128i vl_hl = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) _mm_extract_epi16(vidx, 4))));
      const __m128i vl_lo = _mm_insert_epi32(vl_ll, *((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) _mm_extract_epi16(vidx, 2))), 1);
      const __m128i vl_hi = _mm_insert_epi32(vl_hl, *((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) _mm_extract_epi16(vidx, 6))), 1);
    #endif  // XNN_ARCH_X86_64
    const __m128i vl = _mm_unpacklo_epi64(vl_lo, vl_hi);

    const __m128 vs = _mm_castsi128_ps(_mm_add_epi32(vl, ve));
    vn = _mm_sub_ps(vn, vmagic_bias);

    __m128 vt = _mm_add_ps(vz, _mm_mul_ps(vn, vminus_ln2_hi));
    vt = _mm_add_ps(vt, _mm_mul_ps(vn, vminus_ln2_lo));

    __m128 vp = _mm_mul_ps(vt, vc2);
    vp = _mm_add_ps(vt, _mm_mul_ps(vp, vt));

    const __m128 vy = _mm_add_ps(vs, _mm_mul_ps(vs, vp));

    __m128 vf = _mm_div_ps(vy, _mm_add_ps(vy, vone));
    vf = _mm_andnot_ps(_mm_cmplt_ps(vz, vdenorm_cutoff), vf);
    vf = _mm_blendv_ps(_mm_sub_ps(vone, vf), vf, vx);

    if (n & (2 * sizeof(float))) {
      _mm_storel_pi((__m64*) y, vf);
      vf = _mm_movehl_ps(vf, vf);
      y += 2;
    }
    if (n & (1 * sizeof(float))) {
      _mm_store_ss(y, vf);
    }
  }
}

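// Per-channel quantized (QC8) depthwise convolution, 25 taps, up to eight
// channels per iteration, using 16-bit multiplies: inputs and weights are
// sign-extended from int8 to int16 and multiplied with PMULLW, and the
// products are widened to int32 and accumulated (low half via PMOVSXWD,
// high half via an unpack-and-arithmetic-shift).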
void xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul16(
    size_t channels,
    size_t output_width,
    const int8_t** input,
    const void* weights,
    int8_t* output,
    size_t input_stride,
    size_t output_increment,
    size_t input_offset,
    const int8_t* zero,
    const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(channels != 0);
  assert(output_width != 0);

  do {
    const int8_t* i0 = input[0];
    assert(i0 != NULL);
    if XNN_UNPREDICTABLE(i0 != zero) {
      i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
    }
    const int8_t* i1 = input[1];
    assert(i1 != NULL);
    if XNN_UNPREDICTABLE(i1 != zero) {
      i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
    }
    const int8_t* i2 = input[2];
    assert(i2 != NULL);
    if XNN_UNPREDICTABLE(i2 != zero) {
      i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
    }
    const int8_t* i3 = input[3];
    assert(i3 != NULL);
    if XNN_UNPREDICTABLE(i3 != zero) {
      i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
    }
    const int8_t* i4 = input[4];
    assert(i4 != NULL);
    if XNN_UNPREDICTABLE(i4 != zero) {
      i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
    }
    const int8_t* i5 = input[5];
    assert(i5 != NULL);
    if XNN_UNPREDICTABLE(i5 != zero) {
      i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
    }
    const int8_t* i6 = input[6];
    assert(i6 != NULL);
    if XNN_UNPREDICTABLE(i6 != zero) {
      i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
    }
    const int8_t* i7 = input[7];
    assert(i7 != NULL);
    if XNN_UNPREDICTABLE(i7 != zero) {
      i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
    }
    const int8_t* i8 = input[8];
    assert(i8 != NULL);
    if XNN_UNPREDICTABLE(i8 != zero) {
      i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
    }
    const int8_t* i9 = input[9];
    assert(i9 != NULL);
    if XNN_UNPREDICTABLE(i9 != zero) {
      i9 = (const int8_t*) ((uintptr_t) i9 + input_offset);
    }
    const int8_t* i10 = input[10];
    assert(i10 != NULL);
    if XNN_UNPREDICTABLE(i10 != zero) {
      i10 = (const int8_t*) ((uintptr_t) i10 + input_offset);
    }
    const int8_t* i11 = input[11];
    assert(i11 != NULL);
    if XNN_UNPREDICTABLE(i11 != zero) {
      i11 = (const int8_t*) ((uintptr_t) i11 + input_offset);
    }
    const int8_t* i12 = input[12];
    assert(i12 != NULL);
    if XNN_UNPREDICTABLE(i12 != zero) {
      i12 = (const int8_t*) ((uintptr_t) i12 + input_offset);
    }
    const int8_t* i13 = input[13];
    assert(i13 != NULL);
    if XNN_UNPREDICTABLE(i13 != zero) {
      i13 = (const int8_t*) ((uintptr_t) i13 + input_offset);
    }
    const int8_t* i14 = input[14];
    assert(i14 != NULL);
    if XNN_UNPREDICTABLE(i14 != zero) {
      i14 = (const int8_t*) ((uintptr_t) i14 + input_offset);
    }
    const int8_t* i15 = input[15];
    assert(i15 != NULL);
    if XNN_UNPREDICTABLE(i15 != zero) {
      i15 = (const int8_t*) ((uintptr_t) i15 + input_offset);
    }
    const int8_t* i16 = input[16];
    assert(i16 != NULL);
    if XNN_UNPREDICTABLE(i16 != zero) {
      i16 = (const int8_t*) ((uintptr_t) i16 + input_offset);
    }
    const int8_t* i17 = input[17];
    assert(i17 != NULL);
    if XNN_UNPREDICTABLE(i17 != zero) {
      i17 = (const int8_t*) ((uintptr_t) i17 + input_offset);
    }
    const int8_t* i18 = input[18];
    assert(i18 != NULL);
    if XNN_UNPREDICTABLE(i18 != zero) {
      i18 = (const int8_t*) ((uintptr_t) i18 + input_offset);
    }
    const int8_t* i19 = input[19];
    assert(i19 != NULL);
    if XNN_UNPREDICTABLE(i19 != zero) {
      i19 = (const int8_t*) ((uintptr_t) i19 + input_offset);
    }
    const int8_t* i20 = input[20];
    assert(i20 != NULL);
    if XNN_UNPREDICTABLE(i20 != zero) {
      i20 = (const int8_t*) ((uintptr_t) i20 + input_offset);
    }
    const int8_t* i21 = input[21];
    assert(i21 != NULL);
    if XNN_UNPREDICTABLE(i21 != zero) {
      i21 = (const int8_t*) ((uintptr_t) i21 + input_offset);
    }
    const int8_t* i22 = input[22];
    assert(i22 != NULL);
    if XNN_UNPREDICTABLE(i22 != zero) {
      i22 = (const int8_t*) ((uintptr_t) i22 + input_offset);
    }
    const int8_t* i23 = input[23];
    assert(i23 != NULL);
    if XNN_UNPREDICTABLE(i23 != zero) {
      i23 = (const int8_t*) ((uintptr_t) i23 + input_offset);
    }
    const int8_t* i24 = input[24];
    assert(i24 != NULL);
    if XNN_UNPREDICTABLE(i24 != zero) {
      i24 = (const int8_t*) ((uintptr_t) i24 + input_offset);
    }
    input = (const int8_t**) ((uintptr_t) input + input_stride);

    size_t c = channels;
    const void* w = weights;
    for (; c >= 8; c -= 8) {
      __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
      __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));


      const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
      const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
      const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t)));
      const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
      i0 += 8;


      __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));

      const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
      const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
      const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t)));
      const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
      i1 += 8;


      vprod01234567 = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));

      const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
      const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
      const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t)));
      const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
      i2 += 8;


      vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));

      const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
      const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(vi3x01234567);
      const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t)));
      const __m128i vxk3x01234567 = _mm_cvtepi8_epi16(vk3x01234567);
      i3 += 8;


      vprod01234567 = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));

      const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
      const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(vi4x01234567);
      const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t)));
      const __m128i vxk4x01234567 = _mm_cvtepi8_epi16(vk4x01234567);
      i4 += 8;


      vprod01234567 = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));

      const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
      const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(vi5x01234567);
      const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t)));
      const __m128i vxk5x01234567 = _mm_cvtepi8_epi16(vk5x01234567);
      i5 += 8;


      vprod01234567 = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));

      const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
      const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(vi6x01234567);
      const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t)));
      const __m128i vxk6x01234567 = _mm_cvtepi8_epi16(vk6x01234567);
      i6 += 8;


      vprod01234567 = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));

      const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
      const __m128i vxi7x01234567 = _mm_cvtepi8_epi16(vi7x01234567);
      const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t)));
      const __m128i vxk7x01234567 = _mm_cvtepi8_epi16(vk7x01234567);
      i7 += 8;


      vprod01234567 = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));

      const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
      const __m128i vxi8x01234567 = _mm_cvtepi8_epi16(vi8x01234567);
      const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t)));
      const __m128i vxk8x01234567 = _mm_cvtepi8_epi16(vk8x01234567);
      i8 += 8;


      vprod01234567 = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));

      const __m128i vi9x01234567 = _mm_loadl_epi64((const __m128i*) i9);
      const __m128i vxi9x01234567 = _mm_cvtepi8_epi16(vi9x01234567);
      const __m128i vk9x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t)));
      const __m128i vxk9x01234567 = _mm_cvtepi8_epi16(vk9x01234567);
      i9 += 8;


      vprod01234567 = _mm_mullo_epi16(vxi9x01234567, vxk9x01234567);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));

      const __m128i vi10x01234567 = _mm_loadl_epi64((const __m128i*) i10);
      const __m128i vxi10x01234567 = _mm_cvtepi8_epi16(vi10x01234567);
      const __m128i vk10x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 80 * sizeof(int8_t)));
      const __m128i vxk10x01234567 = _mm_cvtepi8_epi16(vk10x01234567);
      i10 += 8;


      vprod01234567 = _mm_mullo_epi16(vxi10x01234567, vxk10x01234567);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));

      const __m128i vi11x01234567 = _mm_loadl_epi64((const __m128i*) i11);
      const __m128i vxi11x01234567 = _mm_cvtepi8_epi16(vi11x01234567);
      const __m128i vk11x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 88 * sizeof(int8_t)));
      const __m128i vxk11x01234567 = _mm_cvtepi8_epi16(vk11x01234567);
      i11 += 8;


      vprod01234567 = _mm_mullo_epi16(vxi11x01234567, vxk11x01234567);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));

      const __m128i vi12x01234567 = _mm_loadl_epi64((const __m128i*) i12);
      const __m128i vxi12x01234567 = _mm_cvtepi8_epi16(vi12x01234567);
      const __m128i vk12x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 96 * sizeof(int8_t)));
      const __m128i vxk12x01234567 = _mm_cvtepi8_epi16(vk12x01234567);
      i12 += 8;


      vprod01234567 = _mm_mullo_epi16(vxi12x01234567, vxk12x01234567);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));

      const __m128i vi13x01234567 = _mm_loadl_epi64((const __m128i*) i13);
      const __m128i vxi13x01234567 = _mm_cvtepi8_epi16(vi13x01234567);
      const __m128i vk13x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 104 * sizeof(int8_t)));
      const __m128i vxk13x01234567 = _mm_cvtepi8_epi16(vk13x01234567);
      i13 += 8;


      vprod01234567 = _mm_mullo_epi16(vxi13x01234567, vxk13x01234567);
1288 
1289       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1290       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1291 
1292       const __m128i vi14x01234567 = _mm_loadl_epi64((const __m128i*) i14);
1293       const __m128i vxi14x01234567 = _mm_cvtepi8_epi16(vi14x01234567);
1294       const __m128i vk14x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 112 * sizeof(int8_t)));
1295       const __m128i vxk14x01234567 = _mm_cvtepi8_epi16(vk14x01234567);
1296       i14 += 8;
1297 
1298 
1299       vprod01234567 = _mm_mullo_epi16(vxi14x01234567, vxk14x01234567);
1300 
1301       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1302       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1303 
1304       const __m128i vi15x01234567 = _mm_loadl_epi64((const __m128i*) i15);
1305       const __m128i vxi15x01234567 = _mm_cvtepi8_epi16(vi15x01234567);
1306       const __m128i vk15x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 120 * sizeof(int8_t)));
1307       const __m128i vxk15x01234567 = _mm_cvtepi8_epi16(vk15x01234567);
1308       i15 += 8;
1309 
1310 
1311       vprod01234567 = _mm_mullo_epi16(vxi15x01234567, vxk15x01234567);
1312 
1313       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1314       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1315 
1316       const __m128i vi16x01234567 = _mm_loadl_epi64((const __m128i*) i16);
1317       const __m128i vxi16x01234567 = _mm_cvtepi8_epi16(vi16x01234567);
1318       const __m128i vk16x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 128 * sizeof(int8_t)));
1319       const __m128i vxk16x01234567 = _mm_cvtepi8_epi16(vk16x01234567);
1320       i16 += 8;
1321 
1322 
1323       vprod01234567 = _mm_mullo_epi16(vxi16x01234567, vxk16x01234567);
1324 
1325       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1326       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1327 
1328       const __m128i vi17x01234567 = _mm_loadl_epi64((const __m128i*) i17);
1329       const __m128i vxi17x01234567 = _mm_cvtepi8_epi16(vi17x01234567);
1330       const __m128i vk17x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 136 * sizeof(int8_t)));
1331       const __m128i vxk17x01234567 = _mm_cvtepi8_epi16(vk17x01234567);
1332       i17 += 8;
1333 
1334 
1335       vprod01234567 = _mm_mullo_epi16(vxi17x01234567, vxk17x01234567);
1336 
1337       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1338       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1339 
1340       const __m128i vi18x01234567 = _mm_loadl_epi64((const __m128i*) i18);
1341       const __m128i vxi18x01234567 = _mm_cvtepi8_epi16(vi18x01234567);
1342       const __m128i vk18x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 144 * sizeof(int8_t)));
1343       const __m128i vxk18x01234567 = _mm_cvtepi8_epi16(vk18x01234567);
1344       i18 += 8;
1345 
1346 
1347       vprod01234567 = _mm_mullo_epi16(vxi18x01234567, vxk18x01234567);
1348 
1349       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1350       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1351 
1352       const __m128i vi19x01234567 = _mm_loadl_epi64((const __m128i*) i19);
1353       const __m128i vxi19x01234567 = _mm_cvtepi8_epi16(vi19x01234567);
1354       const __m128i vk19x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 152 * sizeof(int8_t)));
1355       const __m128i vxk19x01234567 = _mm_cvtepi8_epi16(vk19x01234567);
1356       i19 += 8;
1357 
1358 
1359       vprod01234567 = _mm_mullo_epi16(vxi19x01234567, vxk19x01234567);
1360 
1361       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1362       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1363 
1364       const __m128i vi20x01234567 = _mm_loadl_epi64((const __m128i*) i20);
1365       const __m128i vxi20x01234567 = _mm_cvtepi8_epi16(vi20x01234567);
1366       const __m128i vk20x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 160 * sizeof(int8_t)));
1367       const __m128i vxk20x01234567 = _mm_cvtepi8_epi16(vk20x01234567);
1368       i20 += 8;
1369 
1370 
1371       vprod01234567 = _mm_mullo_epi16(vxi20x01234567, vxk20x01234567);
1372 
1373       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1374       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1375 
1376       const __m128i vi21x01234567 = _mm_loadl_epi64((const __m128i*) i21);
1377       const __m128i vxi21x01234567 = _mm_cvtepi8_epi16(vi21x01234567);
1378       const __m128i vk21x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 168 * sizeof(int8_t)));
1379       const __m128i vxk21x01234567 = _mm_cvtepi8_epi16(vk21x01234567);
1380       i21 += 8;
1381 
1382 
1383       vprod01234567 = _mm_mullo_epi16(vxi21x01234567, vxk21x01234567);
1384 
1385       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1386       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1387 
1388       const __m128i vi22x01234567 = _mm_loadl_epi64((const __m128i*) i22);
1389       const __m128i vxi22x01234567 = _mm_cvtepi8_epi16(vi22x01234567);
1390       const __m128i vk22x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 176 * sizeof(int8_t)));
1391       const __m128i vxk22x01234567 = _mm_cvtepi8_epi16(vk22x01234567);
1392       i22 += 8;
1393 
1394 
1395       vprod01234567 = _mm_mullo_epi16(vxi22x01234567, vxk22x01234567);
1396 
1397       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1398       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1399 
1400       const __m128i vi23x01234567 = _mm_loadl_epi64((const __m128i*) i23);
1401       const __m128i vxi23x01234567 = _mm_cvtepi8_epi16(vi23x01234567);
1402       const __m128i vk23x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 184 * sizeof(int8_t)));
1403       const __m128i vxk23x01234567 = _mm_cvtepi8_epi16(vk23x01234567);
1404       i23 += 8;
1405 
1406 
1407       vprod01234567 = _mm_mullo_epi16(vxi23x01234567, vxk23x01234567);
1408 
1409       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1410       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1411 
1412       const __m128i vi24x01234567 = _mm_loadl_epi64((const __m128i*) i24);
1413       const __m128i vxi24x01234567 = _mm_cvtepi8_epi16(vi24x01234567);
1414       const __m128i vk24x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 192 * sizeof(int8_t)));
1415       const __m128i vxk24x01234567 = _mm_cvtepi8_epi16(vk24x01234567);
1416       i24 += 8;
1417 
1418 
1419       vprod01234567 = _mm_mullo_epi16(vxi24x01234567, vxk24x01234567);
1420 
1421       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1422       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1423 
1424       w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 200 * sizeof(int8_t));
1425 
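      /* fp32 requantization with per-channel (qc8) scales: convert the int32
         accumulators to float, multiply by the 8 per-channel scales stored
         right after the kernel taps, clamp against output_max - zero_point,
         round back to int32 with _mm_cvtps_epi32, pack to int16 with
         saturation, add the output zero point, pack to int8, and clamp to
         output_min. */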
1426       __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
1427       __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
1428 
1429       const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
1430       const __m128 vscale4567 = _mm_loadu_ps((const float*) w + 4);
1431       w = (const void*) ((const float*) w + 8);
1432       vscaled0123 = _mm_mul_ps(vscaled0123, vscale0123);
1433       vscaled4567 = _mm_mul_ps(vscaled4567, vscale4567);
1434 
1435       const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
1436       vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
1437       vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
1438 
1439       vacc0123 = _mm_cvtps_epi32(vscaled0123);
1440       vacc4567 = _mm_cvtps_epi32(vscaled4567);
1441 
1442       const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
1443       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
1444 
1445 
1446       __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
1447 
1448       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
1449       vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
1450 
1451       _mm_storel_epi64((__m128i*) output, vout0123456701234567);
1452       output += 8;
1453     }
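    /* Remainder path: 1-7 trailing channels. A single 8-channel group is
       computed (the extra brace level appears to be an artifact of the code
       generator) and only the valid low lanes are stored below. */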
1454     if XNN_UNLIKELY(c != 0) {
1455       {
1456         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
1457         __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
1458 
1459 
1460         const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
1461         const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
1462         const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t)));
1463         const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
1464 
1465 
1466         __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
1467 
1468         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1469         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1470 
1471         const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
1472         const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
1473         const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t)));
1474         const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
1475 
1476 
1477         vprod01234567 = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567);
1478 
1479         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1480         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1481 
1482         const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
1483         const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
1484         const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t)));
1485         const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
1486 
1487 
1488         vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
1489 
1490         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1491         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1492 
1493         const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
1494         const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(vi3x01234567);
1495         const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t)));
1496         const __m128i vxk3x01234567 = _mm_cvtepi8_epi16(vk3x01234567);
1497 
1498 
1499         vprod01234567 = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567);
1500 
1501         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1502         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1503 
1504         const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
1505         const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(vi4x01234567);
1506         const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t)));
1507         const __m128i vxk4x01234567 = _mm_cvtepi8_epi16(vk4x01234567);
1508 
1509 
1510         vprod01234567 = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
1511 
1512         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1513         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1514 
1515         const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
1516         const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(vi5x01234567);
1517         const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t)));
1518         const __m128i vxk5x01234567 = _mm_cvtepi8_epi16(vk5x01234567);
1519 
1520 
1521         vprod01234567 = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567);
1522 
1523         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1524         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1525 
1526         const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
1527         const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(vi6x01234567);
1528         const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t)));
1529         const __m128i vxk6x01234567 = _mm_cvtepi8_epi16(vk6x01234567);
1530 
1531 
1532         vprod01234567 = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
1533 
1534         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1535         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1536 
1537         const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
1538         const __m128i vxi7x01234567 = _mm_cvtepi8_epi16(vi7x01234567);
1539         const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t)));
1540         const __m128i vxk7x01234567 = _mm_cvtepi8_epi16(vk7x01234567);
1541 
1542 
1543         vprod01234567 = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567);
1544 
1545         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1546         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1547 
1548         const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
1549         const __m128i vxi8x01234567 = _mm_cvtepi8_epi16(vi8x01234567);
1550         const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t)));
1551         const __m128i vxk8x01234567 = _mm_cvtepi8_epi16(vk8x01234567);
1552 
1553 
1554         vprod01234567 = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
1555 
1556         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1557         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1558 
1559         const __m128i vi9x01234567 = _mm_loadl_epi64((const __m128i*) i9);
1560         const __m128i vxi9x01234567 = _mm_cvtepi8_epi16(vi9x01234567);
1561         const __m128i vk9x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t)));
1562         const __m128i vxk9x01234567 = _mm_cvtepi8_epi16(vk9x01234567);
1563 
1564 
1565         vprod01234567 = _mm_mullo_epi16(vxi9x01234567, vxk9x01234567);
1566 
1567         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1568         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1569 
1570         const __m128i vi10x01234567 = _mm_loadl_epi64((const __m128i*) i10);
1571         const __m128i vxi10x01234567 = _mm_cvtepi8_epi16(vi10x01234567);
1572         const __m128i vk10x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 80 * sizeof(int8_t)));
1573         const __m128i vxk10x01234567 = _mm_cvtepi8_epi16(vk10x01234567);
1574 
1575 
1576         vprod01234567 = _mm_mullo_epi16(vxi10x01234567, vxk10x01234567);
1577 
1578         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1579         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1580 
1581         const __m128i vi11x01234567 = _mm_loadl_epi64((const __m128i*) i11);
1582         const __m128i vxi11x01234567 = _mm_cvtepi8_epi16(vi11x01234567);
1583         const __m128i vk11x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 88 * sizeof(int8_t)));
1584         const __m128i vxk11x01234567 = _mm_cvtepi8_epi16(vk11x01234567);
1585 
1586 
1587         vprod01234567 = _mm_mullo_epi16(vxi11x01234567, vxk11x01234567);
1588 
1589         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1590         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1591 
1592         const __m128i vi12x01234567 = _mm_loadl_epi64((const __m128i*) i12);
1593         const __m128i vxi12x01234567 = _mm_cvtepi8_epi16(vi12x01234567);
1594         const __m128i vk12x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 96 * sizeof(int8_t)));
1595         const __m128i vxk12x01234567 = _mm_cvtepi8_epi16(vk12x01234567);
1596 
1597 
1598         vprod01234567 = _mm_mullo_epi16(vxi12x01234567, vxk12x01234567);
1599 
1600         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1601         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1602 
1603         const __m128i vi13x01234567 = _mm_loadl_epi64((const __m128i*) i13);
1604         const __m128i vxi13x01234567 = _mm_cvtepi8_epi16(vi13x01234567);
1605         const __m128i vk13x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 104 * sizeof(int8_t)));
1606         const __m128i vxk13x01234567 = _mm_cvtepi8_epi16(vk13x01234567);
1607 
1608 
1609         vprod01234567 = _mm_mullo_epi16(vxi13x01234567, vxk13x01234567);
1610 
1611         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1612         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1613 
1614         const __m128i vi14x01234567 = _mm_loadl_epi64((const __m128i*) i14);
1615         const __m128i vxi14x01234567 = _mm_cvtepi8_epi16(vi14x01234567);
1616         const __m128i vk14x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 112 * sizeof(int8_t)));
1617         const __m128i vxk14x01234567 = _mm_cvtepi8_epi16(vk14x01234567);
1618 
1619 
1620         vprod01234567 = _mm_mullo_epi16(vxi14x01234567, vxk14x01234567);
1621 
1622         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1623         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1624 
1625         const __m128i vi15x01234567 = _mm_loadl_epi64((const __m128i*) i15);
1626         const __m128i vxi15x01234567 = _mm_cvtepi8_epi16(vi15x01234567);
1627         const __m128i vk15x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 120 * sizeof(int8_t)));
1628         const __m128i vxk15x01234567 = _mm_cvtepi8_epi16(vk15x01234567);
1629 
1630 
1631         vprod01234567 = _mm_mullo_epi16(vxi15x01234567, vxk15x01234567);
1632 
1633         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1634         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1635 
1636         const __m128i vi16x01234567 = _mm_loadl_epi64((const __m128i*) i16);
1637         const __m128i vxi16x01234567 = _mm_cvtepi8_epi16(vi16x01234567);
1638         const __m128i vk16x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 128 * sizeof(int8_t)));
1639         const __m128i vxk16x01234567 = _mm_cvtepi8_epi16(vk16x01234567);
1640 
1641 
1642         vprod01234567 = _mm_mullo_epi16(vxi16x01234567, vxk16x01234567);
1643 
1644         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1645         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1646 
1647         const __m128i vi17x01234567 = _mm_loadl_epi64((const __m128i*) i17);
1648         const __m128i vxi17x01234567 = _mm_cvtepi8_epi16(vi17x01234567);
1649         const __m128i vk17x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 136 * sizeof(int8_t)));
1650         const __m128i vxk17x01234567 = _mm_cvtepi8_epi16(vk17x01234567);
1651 
1652 
1653         vprod01234567 = _mm_mullo_epi16(vxi17x01234567, vxk17x01234567);
1654 
1655         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1656         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1657 
1658         const __m128i vi18x01234567 = _mm_loadl_epi64((const __m128i*) i18);
1659         const __m128i vxi18x01234567 = _mm_cvtepi8_epi16(vi18x01234567);
1660         const __m128i vk18x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 144 * sizeof(int8_t)));
1661         const __m128i vxk18x01234567 = _mm_cvtepi8_epi16(vk18x01234567);
1662 
1663 
1664         vprod01234567 = _mm_mullo_epi16(vxi18x01234567, vxk18x01234567);
1665 
1666         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1667         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1668 
1669         const __m128i vi19x01234567 = _mm_loadl_epi64((const __m128i*) i19);
1670         const __m128i vxi19x01234567 = _mm_cvtepi8_epi16(vi19x01234567);
1671         const __m128i vk19x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 152 * sizeof(int8_t)));
1672         const __m128i vxk19x01234567 = _mm_cvtepi8_epi16(vk19x01234567);
1673 
1674 
1675         vprod01234567 = _mm_mullo_epi16(vxi19x01234567, vxk19x01234567);
1676 
1677         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1678         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1679 
1680         const __m128i vi20x01234567 = _mm_loadl_epi64((const __m128i*) i20);
1681         const __m128i vxi20x01234567 = _mm_cvtepi8_epi16(vi20x01234567);
1682         const __m128i vk20x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 160 * sizeof(int8_t)));
1683         const __m128i vxk20x01234567 = _mm_cvtepi8_epi16(vk20x01234567);
1684 
1685 
1686         vprod01234567 = _mm_mullo_epi16(vxi20x01234567, vxk20x01234567);
1687 
1688         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1689         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1690 
1691         const __m128i vi21x01234567 = _mm_loadl_epi64((const __m128i*) i21);
1692         const __m128i vxi21x01234567 = _mm_cvtepi8_epi16(vi21x01234567);
1693         const __m128i vk21x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 168 * sizeof(int8_t)));
1694         const __m128i vxk21x01234567 = _mm_cvtepi8_epi16(vk21x01234567);
1695 
1696 
1697         vprod01234567 = _mm_mullo_epi16(vxi21x01234567, vxk21x01234567);
1698 
1699         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1700         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1701 
1702         const __m128i vi22x01234567 = _mm_loadl_epi64((const __m128i*) i22);
1703         const __m128i vxi22x01234567 = _mm_cvtepi8_epi16(vi22x01234567);
1704         const __m128i vk22x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 176 * sizeof(int8_t)));
1705         const __m128i vxk22x01234567 = _mm_cvtepi8_epi16(vk22x01234567);
1706 
1707 
1708         vprod01234567 = _mm_mullo_epi16(vxi22x01234567, vxk22x01234567);
1709 
1710         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1711         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1712 
1713         const __m128i vi23x01234567 = _mm_loadl_epi64((const __m128i*) i23);
1714         const __m128i vxi23x01234567 = _mm_cvtepi8_epi16(vi23x01234567);
1715         const __m128i vk23x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 184 * sizeof(int8_t)));
1716         const __m128i vxk23x01234567 = _mm_cvtepi8_epi16(vk23x01234567);
1717 
1718 
1719         vprod01234567 = _mm_mullo_epi16(vxi23x01234567, vxk23x01234567);
1720 
1721         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1722         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1723 
1724         const __m128i vi24x01234567 = _mm_loadl_epi64((const __m128i*) i24);
1725         const __m128i vxi24x01234567 = _mm_cvtepi8_epi16(vi24x01234567);
1726         const __m128i vk24x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 192 * sizeof(int8_t)));
1727         const __m128i vxk24x01234567 = _mm_cvtepi8_epi16(vk24x01234567);
1728 
1729 
1730         vprod01234567 = _mm_mullo_epi16(vxi24x01234567, vxk24x01234567);
1731 
1732         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1733         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1734 
1735 
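        /* w was not advanced on this path, so the per-channel scales are
           addressed with an explicit offset that skips the 8 int32 biases and
           the 200 (= 25 taps x 8 channels) int8 kernel bytes. */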
1736         __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
1737         __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
1738 
1739         const __m128 vscale0123 = _mm_loadu_ps((const float*) ((uintptr_t) w + 8 * sizeof(int32_t) + 200 * sizeof(int8_t)));
1740         const __m128 vscale4567 = _mm_loadu_ps((const float*) ((uintptr_t) w + 8 * sizeof(int32_t) + 200 * sizeof(int8_t) + 4 * sizeof(float)));
1741         vscaled0123 = _mm_mul_ps(vscaled0123, vscale0123);
1742         vscaled4567 = _mm_mul_ps(vscaled4567, vscale4567);
1743 
1744         const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
1745         vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
1746         vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
1747 
1748         vacc0123 = _mm_cvtps_epi32(vscaled0123);
1749         vacc4567 = _mm_cvtps_epi32(vscaled4567);
1750 
1751 
1752         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
1753         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
1754 
1755 
1756         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
1757 
1758         vout0123456701234567 = _mm_max_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
1759 
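        /* Store only the c valid int8 lanes: write 4, 2, then 1 byte(s)
           according to the bits of c, shifting the consumed lanes out of the
           vector between steps. */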
1760         if (c & 4) {
1761           unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
1762           vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
1763           output += 4;
1764         }
1765         if (c & 2) {
1766           unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
1767           vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
1768           output += 2;
1769         }
1770         if (c & 1) {
1771           *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
1772           output += 1;
1773         }
1774       }
1775     }
1776 
1777     output = (int8_t*) ((uintptr_t) output + output_increment);
1778   } while (--output_width != 0);
1779 }
1780 
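/* For reference, a minimal scalar sketch of what each of these qc8 dwconv
   kernels computes per output element. This is illustrative only; the
   variable names below (bias, kernel, scale, input_rows, ...) are
   assumptions for the sketch, not the actual XNNPACK packed-weight layout
   API:

     int32_t acc = bias[c];
     for (size_t k = 0; k < kernel_size; k++) {
       acc += (int32_t) input_rows[k][c] * (int32_t) kernel[k][c];
     }
     float scaled = (float) acc * scale[c];        // per-channel (qc8) scale
     scaled = fminf(scaled, (float) (output_max - output_zero_point));
     int32_t out = (int32_t) lrintf(scaled) + output_zero_point;
     if (out < output_min) out = output_min;
     output[c] = (int8_t) out;

   The SIMD code realizes the rounding via _mm_cvtps_epi32 (round to nearest
   even under the default MXCSR mode) and the low clamp via _mm_max_epi8. */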
1781 void xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__sse41_mul16(
1782     size_t channels,
1783     size_t output_width,
1784     const int8_t** input,
1785     const void* weights,
1786     int8_t* output,
1787     size_t input_stride,
1788     size_t output_increment,
1789     size_t input_offset,
1790     const int8_t* zero,
1791     const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1792 {
1793   assert(channels != 0);
1794   assert(output_width != 0);
1795 
1796   do {
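    /* Each iteration consumes one row of pointers from the indirection
       buffer: rows equal to the shared `zero` buffer are used as-is (they
       supply padding), all other rows are shifted by input_offset. */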
1797     const int8_t* i0 = input[0];
1798     assert(i0 != NULL);
1799     if XNN_UNPREDICTABLE(i0 != zero) {
1800       i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
1801     }
1802     const int8_t* i1 = input[1];
1803     assert(i1 != NULL);
1804     if XNN_UNPREDICTABLE(i1 != zero) {
1805       i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
1806     }
1807     const int8_t* i2 = input[2];
1808     assert(i2 != NULL);
1809     if XNN_UNPREDICTABLE(i2 != zero) {
1810       i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
1811     }
1812     input = (const int8_t**) ((uintptr_t) input + input_stride);
1813 
1814     size_t c = channels;
1815     const void* w = weights;
1816     for (; c >= 8; c -= 8) {
1817       __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
1818       __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
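      /* Per 8-channel group the packed weights hold 8 int32 biases, then
         kernel_size x 8 int8 taps (24 bytes for this 3-tap kernel), then
         8 fp32 per-channel scales. */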
1819 
1820 
1821       const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
1822       const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
1823       const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t)));
1824       const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
1825       i0 += 8;
1826 
1827 
1828       __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
1829 
1830       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1831       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1832 
1833       const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
1834       const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
1835       const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t)));
1836       const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
1837       i1 += 8;
1838 
1839 
1840       vprod01234567 = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567);
1841 
1842       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1843       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1844 
1845       const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
1846       const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
1847       const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t)));
1848       const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
1849       i2 += 8;
1850 
1851 
1852       vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
1853 
1854       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1855       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1856 
1857       w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t));
1858 
1859       __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
1860       __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
1861 
1862       const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
1863       const __m128 vscale4567 = _mm_loadu_ps((const float*) w + 4);
1864       w = (const void*) ((const float*) w + 8);
1865       vscaled0123 = _mm_mul_ps(vscaled0123, vscale0123);
1866       vscaled4567 = _mm_mul_ps(vscaled4567, vscale4567);
1867 
1868       const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
1869       vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
1870       vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
1871 
1872       vacc0123 = _mm_cvtps_epi32(vscaled0123);
1873       vacc4567 = _mm_cvtps_epi32(vscaled4567);
1874 
1875       const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
1876       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
1877 
1878 
1879       __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
1880 
1881       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
1882       vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
1883 
1884       _mm_storel_epi64((__m128i*) output, vout0123456701234567);
1885       output += 8;
1886     }
1887     if XNN_UNLIKELY(c != 0) {
1888       {
1889         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
1890         __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
1891 
1892 
1893         const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
1894         const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
1895         const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t)));
1896         const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
1897 
1898 
1899         __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
1900 
1901         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1902         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1903 
1904         const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
1905         const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
1906         const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t)));
1907         const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
1908 
1909 
1910         vprod01234567 = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567);
1911 
1912         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1913         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1914 
1915         const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
1916         const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
1917         const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t)));
1918         const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
1919 
1920 
1921         vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
1922 
1923         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
1924         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
1925 
1926 
1927         __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
1928         __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
1929 
1930         const __m128 vscale0123 = _mm_loadu_ps((const float*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t)));
1931         const __m128 vscale4567 = _mm_loadu_ps((const float*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t) + 4 * sizeof(float)));
1932         vscaled0123 = _mm_mul_ps(vscaled0123, vscale0123);
1933         vscaled4567 = _mm_mul_ps(vscaled4567, vscale4567);
1934 
1935         const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
1936         vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
1937         vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
1938 
1939         vacc0123 = _mm_cvtps_epi32(vscaled0123);
1940         vacc4567 = _mm_cvtps_epi32(vscaled4567);
1941 
1942 
1943         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
1944         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
1945 
1946 
1947         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
1948 
1949         vout0123456701234567 = _mm_max_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
1950 
1951         if (c & 4) {
1952           unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
1953           vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
1954           output += 4;
1955         }
1956         if (c & 2) {
1957           unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
1958           vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
1959           output += 2;
1960         }
1961         if (c & 1) {
1962           *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
1963           output += 1;
1964         }
1965       }
1966     }
1967 
1968     output = (int8_t*) ((uintptr_t) output + output_increment);
1969   } while (--output_width != 0);
1970 }
1971 
1972 void xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16(
1973     size_t channels,
1974     size_t output_width,
1975     const int8_t** input,
1976     const void* weights,
1977     int8_t* output,
1978     size_t input_stride,
1979     size_t output_increment,
1980     size_t input_offset,
1981     const int8_t* zero,
1982     const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1983 {
1984   assert(channels != 0);
1985   assert(output_width != 0);
1986 
1987   do {
1988     const int8_t* i0 = input[0];
1989     assert(i0 != NULL);
1990     if XNN_UNPREDICTABLE(i0 != zero) {
1991       i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
1992     }
1993     const int8_t* i1 = input[1];
1994     assert(i1 != NULL);
1995     if XNN_UNPREDICTABLE(i1 != zero) {
1996       i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
1997     }
1998     const int8_t* i2 = input[2];
1999     assert(i2 != NULL);
2000     if XNN_UNPREDICTABLE(i2 != zero) {
2001       i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
2002     }
2003     const int8_t* i3 = input[3];
2004     assert(i3 != NULL);
2005     if XNN_UNPREDICTABLE(i3 != zero) {
2006       i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
2007     }
2008     const int8_t* i4 = input[4];
2009     assert(i4 != NULL);
2010     if XNN_UNPREDICTABLE(i4 != zero) {
2011       i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
2012     }
2013     const int8_t* i5 = input[5];
2014     assert(i5 != NULL);
2015     if XNN_UNPREDICTABLE(i5 != zero) {
2016       i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
2017     }
2018     const int8_t* i6 = input[6];
2019     assert(i6 != NULL);
2020     if XNN_UNPREDICTABLE(i6 != zero) {
2021       i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
2022     }
2023     const int8_t* i7 = input[7];
2024     assert(i7 != NULL);
2025     if XNN_UNPREDICTABLE(i7 != zero) {
2026       i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
2027     }
2028     const int8_t* i8 = input[8];
2029     assert(i8 != NULL);
2030     if XNN_UNPREDICTABLE(i8 != zero) {
2031       i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
2032     }
2033     input = (const int8_t**) ((uintptr_t) input + input_stride);
2034 
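    /* Nine taps per channel (typically a 3x3 depthwise window); apart from
       the tap count this kernel is structurally identical to the 3-tap
       variant above. */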
2035     size_t c = channels;
2036     const void* w = weights;
2037     for (; c >= 8; c -= 8) {
2038       __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
2039       __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
2040 
2041 
2042       const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
2043       const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
2044       const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t)));
2045       const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
2046       i0 += 8;
2047 
2048 
2049       __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
2050 
2051       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
2052       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
2053 
2054       const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
2055       const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
2056       const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t)));
2057       const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
2058       i1 += 8;
2059 
2060 
2061       vprod01234567 = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567);
2062 
2063       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
2064       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
2065 
2066       const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
2067       const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
2068       const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t)));
2069       const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
2070       i2 += 8;
2071 
2072 
2073       vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
2074 
2075       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
2076       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
2077 
2078       const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
2079       const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(vi3x01234567);
2080       const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t)));
2081       const __m128i vxk3x01234567 = _mm_cvtepi8_epi16(vk3x01234567);
2082       i3 += 8;
2083 
2084 
2085       vprod01234567 = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567);
2086 
2087       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
2088       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
2089 
2090       const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
2091       const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(vi4x01234567);
2092       const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t)));
2093       const __m128i vxk4x01234567 = _mm_cvtepi8_epi16(vk4x01234567);
2094       i4 += 8;
2095 
2096 
2097       vprod01234567 = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
2098 
2099       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
2100       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
2101 
2102       const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
2103       const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(vi5x01234567);
2104       const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t)));
2105       const __m128i vxk5x01234567 = _mm_cvtepi8_epi16(vk5x01234567);
2106       i5 += 8;
2107 
2108 
2109       vprod01234567 = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567);
2110 
2111       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
2112       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
2113 
2114       const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
2115       const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(vi6x01234567);
2116       const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t)));
2117       const __m128i vxk6x01234567 = _mm_cvtepi8_epi16(vk6x01234567);
2118       i6 += 8;
2119 
2120 
2121       vprod01234567 = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
2122 
2123       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
2124       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
2125 
2126       const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
2127       const __m128i vxi7x01234567 = _mm_cvtepi8_epi16(vi7x01234567);
2128       const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t)));
2129       const __m128i vxk7x01234567 = _mm_cvtepi8_epi16(vk7x01234567);
2130       i7 += 8;
2131 
2132 
2133       vprod01234567 = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567);
2134 
2135       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
2136       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
2137 
2138       const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
2139       const __m128i vxi8x01234567 = _mm_cvtepi8_epi16(vi8x01234567);
2140       const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t)));
2141       const __m128i vxk8x01234567 = _mm_cvtepi8_epi16(vk8x01234567);
2142       i8 += 8;
2143 
2144 
2145       vprod01234567 = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
2146 
2147       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
2148       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
2149 
2150       w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t));
2151 
2152       __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
2153       __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
2154 
2155       const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
2156       const __m128 vscale4567 = _mm_loadu_ps((const float*) w + 4);
2157       w = (const void*) ((const float*) w + 8);
2158       vscaled0123 = _mm_mul_ps(vscaled0123, vscale0123);
2159       vscaled4567 = _mm_mul_ps(vscaled4567, vscale4567);
2160 
2161       const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
2162       vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
2163       vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
2164 
2165       vacc0123 = _mm_cvtps_epi32(vscaled0123);
2166       vacc4567 = _mm_cvtps_epi32(vscaled4567);
2167 
2168       const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
2169       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
2170 
2171 
2172       __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
2173 
2174       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
2175       vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
2176 
2177       _mm_storel_epi64((__m128i*) output, vout0123456701234567);
2178       output += 8;
2179     }
2180     if XNN_UNLIKELY(c != 0) {
2181       {
2182         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
2183         __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
2184 
2185 
2186         const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
2187         const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
2188         const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t)));
2189         const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
2190 
2191 
2192         __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
2193 
2194         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
2195         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
2196 
2197         const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
2198         const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
2199         const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t)));
2200         const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
2201 
2202 
2203         vprod01234567 = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567);
2204 
2205         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
2206         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
2207 
2208         const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
2209         const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
2210         const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t)));
2211         const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
2212 
2213 
2214         vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
2215 
2216         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
2217         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
2218 
2219         const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
2220         const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(vi3x01234567);
2221         const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t)));
2222         const __m128i vxk3x01234567 = _mm_cvtepi8_epi16(vk3x01234567);
2223 
2224 
2225         vprod01234567 = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567);
2226 
2227         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
2228         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
2229 
2230         const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
2231         const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(vi4x01234567);
2232         const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t)));
2233         const __m128i vxk4x01234567 = _mm_cvtepi8_epi16(vk4x01234567);
2234 
2235 
2236         vprod01234567 = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
2237 
2238         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
2239         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
2240 
2241         const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
2242         const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(vi5x01234567);
2243         const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t)));
2244         const __m128i vxk5x01234567 = _mm_cvtepi8_epi16(vk5x01234567);
2245 
2246 
2247         vprod01234567 = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567);
2248 
2249         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
2250         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
2251 
2252         const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
2253         const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(vi6x01234567);
2254         const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t)));
2255         const __m128i vxk6x01234567 = _mm_cvtepi8_epi16(vk6x01234567);
2256 
2257 
2258         vprod01234567 = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
2259 
2260         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
2261         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
2262 
2263         const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
2264         const __m128i vxi7x01234567 = _mm_cvtepi8_epi16(vi7x01234567);
2265         const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t)));
2266         const __m128i vxk7x01234567 = _mm_cvtepi8_epi16(vk7x01234567);
2267 
2268 
2269         vprod01234567 = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567);
2270 
2271         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
2272         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
2273 
2274         const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
2275         const __m128i vxi8x01234567 = _mm_cvtepi8_epi16(vi8x01234567);
2276         const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t)));
2277         const __m128i vxk8x01234567 = _mm_cvtepi8_epi16(vk8x01234567);
2278 
2279 
2280         vprod01234567 = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
2281 
2282         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
2283         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
2284 
2285 
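        // Requantize: convert the accumulators to float, multiply by the
        // per-channel scales stored after the kernel taps (qc8 keeps one scale
        // per channel, unlike qs8 which loads a single scale from params),
        // clamp against output_max in the float domain, and round back to
        // int32 (_mm_cvtps_epi32 rounds to nearest-even under the default
        // MXCSR rounding mode).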
2286         __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
2287         __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
2288 
2289         const __m128 vscale0123 = _mm_loadu_ps((const float*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t)));
2290         const __m128 vscale4567 = _mm_loadu_ps((const float*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t) + 4 * sizeof(float)));
2291         vscaled0123 = _mm_mul_ps(vscaled0123, vscale0123);
2292         vscaled4567 = _mm_mul_ps(vscaled4567, vscale4567);
2293 
2294         const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
2295         vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
2296         vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
2297 
2298         vacc0123 = _mm_cvtps_epi32(vscaled0123);
2299         vacc4567 = _mm_cvtps_epi32(vscaled4567);
2300 
2301 
2302         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
2303         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
2304 
2305 
2306         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
2307 
2308         vout0123456701234567 = _mm_max_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
2309 
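        // Store the 1-7 remainder channels by testing the bits of c: a 4-byte
        // chunk, then 2 bytes, then 1 byte, shifting the consumed lanes out of
        // the vector after each partial store.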
2310         if (c & 4) {
2311           unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
2312           vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
2313           output += 4;
2314         }
2315         if (c & 2) {
2316           unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
2317           vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
2318           output += 2;
2319         }
2320         if (c & 1) {
2321           *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
2322           output += 1;
2323         }
2324       }
2325     }
2326 
2327     output = (int8_t*) ((uintptr_t) output + output_increment);
2328   } while (--output_width != 0);
2329 }
2330 
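// QC8 GEMM microkernel: 1 row x 4 columns, K unrolled by 8 ("1x4c8").
// Judging by the offsets used below, weights are packed as 4 int32 biases,
// then groups of 4x8 int8 values per K step, then 4 per-channel float scales.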
2331 void xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64(
2332     size_t mr,
2333     size_t nc,
2334     size_t kc,
2335     const int8_t* restrict a,
2336     size_t a_stride,
2337     const void* restrict w,
2338     int8_t* restrict c,
2339     size_t cm_stride,
2340     size_t cn_stride,
2341     const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2342 {
2343   assert(mr != 0);
2344   assert(mr <= 1);
2345   assert(nc != 0);
2346   assert(kc != 0);
2347   assert(kc % sizeof(int8_t) == 0);
2348   assert(a != NULL);
2349   assert(w != NULL);
2350   assert(c != NULL);
2351 
2352   kc = round_up_po2(kc, 8);
2353   const int8_t* a0 = a;
2354   int8_t* c0 = c;
2355 
2356   do {
2357     __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
2358     __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
2359     __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
2360     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
2361     w = (const int32_t*) w + 4;
2362 
2363     size_t k = 0;
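    // Each iteration consumes 8 int8 values per operand: sign-extend to 16
    // bits, then _mm_madd_epi16 multiplies lane-wise and sums adjacent pairs,
    // leaving four 32-bit partial dot products per output column. kc was
    // rounded up to 8, so no scalar remainder loop is needed.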
2364     while (k < kc) {
2365       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
2366       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
2367       a0 += 8;
2368 
2369       const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
2370       const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
2371 
2372       vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
2373       const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 8));
2374       const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
2375 
2376       vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
2377       const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 16));
2378       const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
2379 
2380       vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
2381       const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 24));
2382       const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
2383 
2384       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
2385 
2386       w = (const void*) ((const int8_t*) w + 32);
2387       k += 8 * sizeof(int8_t);
2388     }
2389 
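    // Each vacc0xN holds four partial sums for output column N; two rounds of
    // _mm_hadd_epi32 reduce and transpose them so vacc0x0123 holds the final
    // accumulator for columns 0-3.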
2390     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
2391     const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
2392 
2393     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
2394 
2395     __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
2396 
2397     const __m128 vscale0123 = _mm_load_ps((const float*) w);
2398     w = (const void*) ((const float*) w + 4);
2399     vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
2400 
2401     const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
2402     vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
2403 
2404     vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
2405 
2406     const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
2407     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
2408 
2409 
2410     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
2411 
2412     vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
2413 
2414     if (nc >= 4) {
2415       unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
2416 
2417       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
2418 
2419       a0 = (const int8_t*) ((uintptr_t) a0 - kc);
2420 
2421       nc -= 4;
2422     } else {
2423       if (nc & 2) {
2424         unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
2425         c0 += 2;
2426         vout = _mm_srli_epi32(vout, 16);
2427       }
2428       if (nc & 1) {
2429         *c0 = (int8_t) _mm_extract_epi8(vout, 0);
2430       }
2431 
2432       nc = 0;
2433     }
2434   } while (nc != 0);
2435 }
2436 
2437 void xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64(
2438     size_t mr,
2439     size_t nc,
2440     size_t kc,
2441     const int8_t* restrict a,
2442     size_t a_stride,
2443     const void* restrict w,
2444     int8_t* restrict c,
2445     size_t cm_stride,
2446     size_t cn_stride,
2447     const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2448 {
2449   assert(mr != 0);
2450   assert(mr <= 3);
2451   assert(nc != 0);
2452   assert(kc != 0);
2453   assert(kc % sizeof(int8_t) == 0);
2454   assert(a != NULL);
2455   assert(w != NULL);
2456   assert(c != NULL);
2457 
2458   kc = round_up_po2(kc, 8);
2459   const int8_t* a0 = a;
2460   int8_t* c0 = c;
2461   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
2462   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
2463   if XNN_UNPREDICTABLE(mr < 2) {
2464     a1 = a0;
2465     c1 = c0;
2466   }
2467   const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
2468   int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
2469   if XNN_UNPREDICTABLE(mr <= 2) {
2470     a2 = a1;
2471     c2 = c1;
2472   }
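  // For mr < 3 the unused row pointers alias the previous row: the extra rows
  // are still computed, but their loads and stores hit valid memory, which is
  // cheaper than branching inside the kernel.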
2473 
2474   do {
2475     __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
2476     __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
2477     __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
2478     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
2479     __m128i vacc1x0 = vacc0x0;
2480     __m128i vacc1x1 = vacc0x1;
2481     __m128i vacc1x2 = vacc0x2;
2482     __m128i vacc1x3 = vacc0x3;
2483     __m128i vacc2x0 = vacc0x0;
2484     __m128i vacc2x1 = vacc0x1;
2485     __m128i vacc2x2 = vacc0x2;
2486     __m128i vacc2x3 = vacc0x3;
2487     w = (const int32_t*) w + 4;
2488 
2489     size_t k = 0;
2490     while (k < kc) {
2491       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
2492       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
2493       a0 += 8;
2494       const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
2495       const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
2496       a1 += 8;
2497       const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
2498       const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
2499       a2 += 8;
2500 
2501       const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
2502       const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
2503 
2504       vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
2505       vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
2506       vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
2507       const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 8));
2508       const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
2509 
2510       vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
2511       vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
2512       vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
2513       const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 16));
2514       const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
2515 
2516       vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
2517       vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
2518       vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
2519       const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 24));
2520       const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
2521 
2522       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
2523       vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
2524       vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
2525 
2526       w = (const void*) ((const int8_t*) w + 32);
2527       k += 8 * sizeof(int8_t);
2528     }
2529 
2530     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
2531     const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
2532     const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
2533     const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
2534     const __m128i vacc2x01 = _mm_hadd_epi32(vacc2x0, vacc2x1);
2535     const __m128i vacc2x23 = _mm_hadd_epi32(vacc2x2, vacc2x3);
2536 
2537     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
2538     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
2539     __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
2540 
2541     __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
2542     __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
2543     __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
2544 
2545     const __m128 vscale0123 = _mm_load_ps((const float*) w);
2546     w = (const void*) ((const float*) w + 4);
2547     vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
2548     vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
2549     vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
2550 
2551     const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
2552     vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
2553     vscaled1x0123 = _mm_min_ps(vscaled1x0123, voutput_max_less_zero_point);
2554     vscaled2x0123 = _mm_min_ps(vscaled2x0123, voutput_max_less_zero_point);
2555 
2556     vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
2557     vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
2558     vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
2559 
2560     const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
2561     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
2562     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
2563 
2564 
2565     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
2566 
2567     vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
2568 
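    // After the two pack steps vout holds row 0, row 1, row 2, and a copy of
    // row 2 as four 32-bit lanes, so each row's 4 int8 outputs are stored with
    // a single lane extraction.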
2569     if (nc >= 4) {
2570       unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
2571       unaligned_store_u32(c1, (uint32_t) _mm_extract_epi32(vout, 1));
2572       unaligned_store_u32(c2, (uint32_t) _mm_extract_epi32(vout, 2));
2573 
2574       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
2575       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
2576       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
2577 
2578       a0 = (const int8_t*) ((uintptr_t) a0 - kc);
2579       a1 = (const int8_t*) ((uintptr_t) a1 - kc);
2580       a2 = (const int8_t*) ((uintptr_t) a2 - kc);
2581 
2582       nc -= 4;
2583     } else {
2584       if (nc & 2) {
2585         unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
2586         c0 += 2;
2587         unaligned_store_u16(c1, (uint16_t) _mm_extract_epi16(vout, 2));
2588         c1 += 2;
2589         unaligned_store_u16(c2, (uint16_t) _mm_extract_epi16(vout, 4));
2590         c2 += 2;
2591         vout = _mm_srli_epi32(vout, 16);
2592       }
2593       if (nc & 1) {
2594         *c0 = (int8_t) _mm_extract_epi8(vout, 0);
2595         *c1 = (int8_t) _mm_extract_epi8(vout, 4);
2596         *c2 = (int8_t) _mm_extract_epi8(vout, 8);
2597       }
2598 
2599       nc = 0;
2600     }
2601   } while (nc != 0);
2602 }
2603 
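// The IGEMM variants take an indirection buffer: `a` is an array of row
// pointers, ks entries per output position. Pointers equal to `zero`
// reference a shared zero vector used for padding and must not be shifted
// by a_offset.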
2604 void xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64(
2605     size_t mr,
2606     size_t nc,
2607     size_t kc,
2608     size_t ks,
2609     const int8_t** restrict a,
2610     const void* restrict w,
2611     int8_t* restrict c,
2612     size_t cm_stride,
2613     size_t cn_stride,
2614     size_t a_offset,
2615     const int8_t* zero,
2616     const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2617 {
2618   assert(mr != 0);
2619   assert(mr <= 1);
2620   assert(nc != 0);
2621   assert(kc != 0);
2622   assert(ks != 0);
2623   assert(ks % (1 * sizeof(void*)) == 0);
2624   assert(a_offset % sizeof(int8_t) == 0);
2625   assert(a != NULL);
2626   assert(w != NULL);
2627   assert(c != NULL);
2628 
2629   kc = round_up_po2(kc, 8);
2630   int8_t* c0 = c;
2631 
2632   do {
2633     __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
2634     __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
2635     __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
2636     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
2637     w = (const int32_t*) w + 4;
2638 
2639     size_t p = ks;
2640     do {
2641       const int8_t* restrict a0 = a[0];
2642       if XNN_UNPREDICTABLE(a0 != zero) {
2643         a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
2644       }
2645       a += 1;
2646 
2647       size_t k = 0;
2648       while (k < kc) {
2649         const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
2650         const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
2651         a0 += 8;
2652 
2653         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
2654         const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
2655 
2656         vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
2657         const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 8));
2658         const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
2659 
2660         vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
2661         const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 16));
2662         const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
2663 
2664         vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
2665         const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 24));
2666         const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
2667 
2668         vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
2669 
2670         w = (const void*) ((const int8_t*) w + 32);
2671         k += 8 * sizeof(int8_t);
2672       }
2673       p -= 1 * sizeof(void*);
2674     } while (p != 0);
2675 
2676     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
2677     const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
2678 
2679     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
2680 
2681     __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
2682 
2683     const __m128 vscale0123 = _mm_load_ps((const float*) w);
2684     w = (const void*) ((const float*) w + 4);
2685     vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
2686 
2687     const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
2688     vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
2689 
2690     vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
2691 
2692     const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
2693     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
2694 
2695 
2696     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
2697 
2698     vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
2699 
2700     if (nc >= 4) {
2701       unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
2702       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
2703 
2704       a = (const int8_t**restrict) ((uintptr_t) a - ks);
2705 
2706       nc -= 4;
2707     } else {
2708       if (nc & 2) {
2709         unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
2710         c0 += 2;
2711         vout = _mm_srli_epi32(vout, 16);
2712       }
2713       if (nc & 1) {
2714         *c0 = (int8_t) _mm_extract_epi8(vout, 0);
2715       }
2716 
2717       nc = 0;
2718     }
2719   } while (nc != 0);
2720 }
2721 
2722 void xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64(
2723     size_t mr,
2724     size_t nc,
2725     size_t kc,
2726     size_t ks,
2727     const int8_t** restrict a,
2728     const void* restrict w,
2729     int8_t* restrict c,
2730     size_t cm_stride,
2731     size_t cn_stride,
2732     size_t a_offset,
2733     const int8_t* zero,
2734     const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2735 {
2736   assert(mr != 0);
2737   assert(mr <= 3);
2738   assert(nc != 0);
2739   assert(kc != 0);
2740   assert(ks != 0);
2741   assert(ks % (3 * sizeof(void*)) == 0);
2742   assert(a_offset % sizeof(int8_t) == 0);
2743   assert(a != NULL);
2744   assert(w != NULL);
2745   assert(c != NULL);
2746 
2747   kc = round_up_po2(kc, 8);
2748   int8_t* c0 = c;
2749   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
2750   if XNN_UNPREDICTABLE(mr < 2) {
2751     c1 = c0;
2752   }
2753   int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
2754   if XNN_UNPREDICTABLE(mr <= 2) {
2755     c2 = c1;
2756   }
2757 
2758   do {
2759     __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
2760     __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
2761     __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
2762     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
2763     __m128i vacc1x0 = vacc0x0;
2764     __m128i vacc1x1 = vacc0x1;
2765     __m128i vacc1x2 = vacc0x2;
2766     __m128i vacc1x3 = vacc0x3;
2767     __m128i vacc2x0 = vacc0x0;
2768     __m128i vacc2x1 = vacc0x1;
2769     __m128i vacc2x2 = vacc0x2;
2770     __m128i vacc2x3 = vacc0x3;
2771     w = (const int32_t*) w + 4;
2772 
2773     size_t p = ks;
2774     do {
2775       const int8_t* restrict a0 = a[0];
2776       if XNN_UNPREDICTABLE(a0 != zero) {
2777         a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
2778       }
2779       const int8_t* restrict a1 = a[1];
2780       if XNN_UNPREDICTABLE(a1 != zero) {
2781         a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
2782       }
2783       const int8_t* restrict a2 = a[2];
2784       if XNN_UNPREDICTABLE(a2 != zero) {
2785         a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
2786       }
2787       a += 3;
2788 
2789       size_t k = 0;
2790       while (k < kc) {
2791         const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
2792         const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
2793         a0 += 8;
2794         const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
2795         const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
2796         a1 += 8;
2797         const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
2798         const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
2799         a2 += 8;
2800 
2801         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
2802         const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
2803 
2804         vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
2805         vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
2806         vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
2807         const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 8));
2808         const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
2809 
2810         vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
2811         vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
2812         vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
2813         const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 16));
2814         const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
2815 
2816         vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
2817         vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
2818         vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
2819         const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 24));
2820         const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
2821 
2822         vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
2823         vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
2824         vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
2825 
2826         w = (const void*) ((const int8_t*) w + 32);
2827         k += 8 * sizeof(int8_t);
2828       }
2829       p -= 3 * sizeof(void*);
2830     } while (p != 0);
2831 
2832     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
2833     const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
2834     const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
2835     const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
2836     const __m128i vacc2x01 = _mm_hadd_epi32(vacc2x0, vacc2x1);
2837     const __m128i vacc2x23 = _mm_hadd_epi32(vacc2x2, vacc2x3);
2838 
2839     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
2840     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
2841     __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
2842 
2843     __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
2844     __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
2845     __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
2846 
2847     const __m128 vscale0123 = _mm_load_ps((const float*) w);
2848     w = (const void*) ((const float*) w + 4);
2849     vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
2850     vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
2851     vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
2852 
2853     const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
2854     vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
2855     vscaled1x0123 = _mm_min_ps(vscaled1x0123, voutput_max_less_zero_point);
2856     vscaled2x0123 = _mm_min_ps(vscaled2x0123, voutput_max_less_zero_point);
2857 
2858     vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
2859     vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
2860     vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
2861 
2862     const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
2863     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
2864     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
2865 
2866 
2867     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
2868 
2869     vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
2870 
2871     if (nc >= 4) {
2872       unaligned_store_u32(c2, (uint32_t) _mm_extract_epi32(vout, 2));
2873       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
2874       unaligned_store_u32(c1, (uint32_t) _mm_extract_epi32(vout, 1));
2875       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
2876       unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
2877       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
2878 
2879       a = (const int8_t**restrict) ((uintptr_t) a - ks);
2880 
2881       nc -= 4;
2882     } else {
2883       if (nc & 2) {
2884         unaligned_store_u16(c2, (uint16_t) _mm_extract_epi16(vout, 4));
2885         c2 += 2;
2886         unaligned_store_u16(c1, (uint16_t) _mm_extract_epi16(vout, 2));
2887         c1 += 2;
2888         unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
2889         c0 += 2;
2890         vout = _mm_srli_epi32(vout, 16);
2891       }
2892       if (nc & 1) {
2893         *c2 = (int8_t) _mm_extract_epi8(vout, 8);
2894         *c1 = (int8_t) _mm_extract_epi8(vout, 4);
2895         *c0 = (int8_t) _mm_extract_epi8(vout, 0);
2896       }
2897 
2898       nc = 0;
2899     }
2900   } while (nc != 0);
2901 }
2902 
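// QS8 depthwise convolution with 25 taps (e.g. a 5x5 kernel), 8 channels per
// iteration ("up8x25"). The 25 row pointers per output pixel come from an
// indirection buffer, with `zero` entries pointing at a zero vector for
// padding. Products are formed in 16 bits and pairs of taps are summed before
// widening (the "mul16_add16" scheme).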
2903 void xnn_qs8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul16_add16(
2904     size_t channels,
2905     size_t output_width,
2906     const int8_t** input,
2907     const void* weights,
2908     int8_t* output,
2909     size_t input_stride,
2910     size_t output_increment,
2911     size_t input_offset,
2912     const int8_t* zero,
2913     const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2914 {
2915   assert(channels != 0);
2916   assert(output_width != 0);
2917 
2918   do {
2919     const int8_t* i0 = input[0];
2920     assert(i0 != NULL);
2921     if XNN_UNPREDICTABLE(i0 != zero) {
2922       i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
2923     }
2924     const int8_t* i1 = input[1];
2925     assert(i1 != NULL);
2926     if XNN_UNPREDICTABLE(i1 != zero) {
2927       i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
2928     }
2929     const int8_t* i2 = input[2];
2930     assert(i2 != NULL);
2931     if XNN_UNPREDICTABLE(i2 != zero) {
2932       i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
2933     }
2934     const int8_t* i3 = input[3];
2935     assert(i3 != NULL);
2936     if XNN_UNPREDICTABLE(i3 != zero) {
2937       i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
2938     }
2939     const int8_t* i4 = input[4];
2940     assert(i4 != NULL);
2941     if XNN_UNPREDICTABLE(i4 != zero) {
2942       i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
2943     }
2944     const int8_t* i5 = input[5];
2945     assert(i5 != NULL);
2946     if XNN_UNPREDICTABLE(i5 != zero) {
2947       i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
2948     }
2949     const int8_t* i6 = input[6];
2950     assert(i6 != NULL);
2951     if XNN_UNPREDICTABLE(i6 != zero) {
2952       i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
2953     }
2954     const int8_t* i7 = input[7];
2955     assert(i7 != NULL);
2956     if XNN_UNPREDICTABLE(i7 != zero) {
2957       i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
2958     }
2959     const int8_t* i8 = input[8];
2960     assert(i8 != NULL);
2961     if XNN_UNPREDICTABLE(i8 != zero) {
2962       i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
2963     }
2964     const int8_t* i9 = input[9];
2965     assert(i9 != NULL);
2966     if XNN_UNPREDICTABLE(i9 != zero) {
2967       i9 = (const int8_t*) ((uintptr_t) i9 + input_offset);
2968     }
2969     const int8_t* i10 = input[10];
2970     assert(i10 != NULL);
2971     if XNN_UNPREDICTABLE(i10 != zero) {
2972       i10 = (const int8_t*) ((uintptr_t) i10 + input_offset);
2973     }
2974     const int8_t* i11 = input[11];
2975     assert(i11 != NULL);
2976     if XNN_UNPREDICTABLE(i11 != zero) {
2977       i11 = (const int8_t*) ((uintptr_t) i11 + input_offset);
2978     }
2979     const int8_t* i12 = input[12];
2980     assert(i12 != NULL);
2981     if XNN_UNPREDICTABLE(i12 != zero) {
2982       i12 = (const int8_t*) ((uintptr_t) i12 + input_offset);
2983     }
2984     const int8_t* i13 = input[13];
2985     assert(i13 != NULL);
2986     if XNN_UNPREDICTABLE(i13 != zero) {
2987       i13 = (const int8_t*) ((uintptr_t) i13 + input_offset);
2988     }
2989     const int8_t* i14 = input[14];
2990     assert(i14 != NULL);
2991     if XNN_UNPREDICTABLE(i14 != zero) {
2992       i14 = (const int8_t*) ((uintptr_t) i14 + input_offset);
2993     }
2994     const int8_t* i15 = input[15];
2995     assert(i15 != NULL);
2996     if XNN_UNPREDICTABLE(i15 != zero) {
2997       i15 = (const int8_t*) ((uintptr_t) i15 + input_offset);
2998     }
2999     const int8_t* i16 = input[16];
3000     assert(i16 != NULL);
3001     if XNN_UNPREDICTABLE(i16 != zero) {
3002       i16 = (const int8_t*) ((uintptr_t) i16 + input_offset);
3003     }
3004     const int8_t* i17 = input[17];
3005     assert(i17 != NULL);
3006     if XNN_UNPREDICTABLE(i17 != zero) {
3007       i17 = (const int8_t*) ((uintptr_t) i17 + input_offset);
3008     }
3009     const int8_t* i18 = input[18];
3010     assert(i18 != NULL);
3011     if XNN_UNPREDICTABLE(i18 != zero) {
3012       i18 = (const int8_t*) ((uintptr_t) i18 + input_offset);
3013     }
3014     const int8_t* i19 = input[19];
3015     assert(i19 != NULL);
3016     if XNN_UNPREDICTABLE(i19 != zero) {
3017       i19 = (const int8_t*) ((uintptr_t) i19 + input_offset);
3018     }
3019     const int8_t* i20 = input[20];
3020     assert(i20 != NULL);
3021     if XNN_UNPREDICTABLE(i20 != zero) {
3022       i20 = (const int8_t*) ((uintptr_t) i20 + input_offset);
3023     }
3024     const int8_t* i21 = input[21];
3025     assert(i21 != NULL);
3026     if XNN_UNPREDICTABLE(i21 != zero) {
3027       i21 = (const int8_t*) ((uintptr_t) i21 + input_offset);
3028     }
3029     const int8_t* i22 = input[22];
3030     assert(i22 != NULL);
3031     if XNN_UNPREDICTABLE(i22 != zero) {
3032       i22 = (const int8_t*) ((uintptr_t) i22 + input_offset);
3033     }
3034     const int8_t* i23 = input[23];
3035     assert(i23 != NULL);
3036     if XNN_UNPREDICTABLE(i23 != zero) {
3037       i23 = (const int8_t*) ((uintptr_t) i23 + input_offset);
3038     }
3039     const int8_t* i24 = input[24];
3040     assert(i24 != NULL);
3041     if XNN_UNPREDICTABLE(i24 != zero) {
3042       i24 = (const int8_t*) ((uintptr_t) i24 + input_offset);
3043     }
3044     input = (const int8_t**) ((uintptr_t) input + input_stride);
3045 
3046     size_t c = channels;
3047     const void* w = weights;
3048     for (; c >= 8; c -= 8) {
3049       __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
3050       __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
3051 
3052 
3053       const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
3054       const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
3055       const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t)));
3056       const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
3057       i0 += 8;
3058 
3059 
3060       __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
3061 
3062 
3063       const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
3064       const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
3065       const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t)));
3066       const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
3067       i1 += 8;
3068 
3069 
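      // Sum two taps in the 16-bit domain before widening. Assuming weights
      // avoid -128 (as with narrow-range int8 quantization), each product's
      // magnitude is at most 127 * 128 = 16256, so the sum of two products
      // still fits in int16 and the number of widening steps is halved.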
3070       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi1x01234567, vxk1x01234567));
3071 
3072       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3073       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3074 
3075       const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
3076       const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
3077       const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t)));
3078       const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
3079       i2 += 8;
3080 
3081 
3082       vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
3083 
3084 
3085       const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
3086       const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(vi3x01234567);
3087       const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t)));
3088       const __m128i vxk3x01234567 = _mm_cvtepi8_epi16(vk3x01234567);
3089       i3 += 8;
3090 
3091 
3092       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi3x01234567, vxk3x01234567));
3093 
3094       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3095       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3096 
3097       const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
3098       const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(vi4x01234567);
3099       const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t)));
3100       const __m128i vxk4x01234567 = _mm_cvtepi8_epi16(vk4x01234567);
3101       i4 += 8;
3102 
3103 
3104       vprod01234567 = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
3105 
3106 
3107       const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
3108       const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(vi5x01234567);
3109       const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t)));
3110       const __m128i vxk5x01234567 = _mm_cvtepi8_epi16(vk5x01234567);
3111       i5 += 8;
3112 
3113 
3114       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi5x01234567, vxk5x01234567));
3115 
3116       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3117       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3118 
3119       const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
3120       const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(vi6x01234567);
3121       const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t)));
3122       const __m128i vxk6x01234567 = _mm_cvtepi8_epi16(vk6x01234567);
3123       i6 += 8;
3124 
3125 
3126       vprod01234567 = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
3127 
3128 
3129       const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
3130       const __m128i vxi7x01234567 = _mm_cvtepi8_epi16(vi7x01234567);
3131       const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t)));
3132       const __m128i vxk7x01234567 = _mm_cvtepi8_epi16(vk7x01234567);
3133       i7 += 8;
3134 
3135 
3136       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi7x01234567, vxk7x01234567));
3137 
3138       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3139       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3140 
3141       const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
3142       const __m128i vxi8x01234567 = _mm_cvtepi8_epi16(vi8x01234567);
3143       const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t)));
3144       const __m128i vxk8x01234567 = _mm_cvtepi8_epi16(vk8x01234567);
3145       i8 += 8;
3146 
3147 
3148       vprod01234567 = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
3149 
3150 
3151       const __m128i vi9x01234567 = _mm_loadl_epi64((const __m128i*) i9);
3152       const __m128i vxi9x01234567 = _mm_cvtepi8_epi16(vi9x01234567);
3153       const __m128i vk9x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t)));
3154       const __m128i vxk9x01234567 = _mm_cvtepi8_epi16(vk9x01234567);
3155       i9 += 8;
3156 
3157 
3158       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi9x01234567, vxk9x01234567));
3159 
3160       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3161       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3162 
3163       const __m128i vi10x01234567 = _mm_loadl_epi64((const __m128i*) i10);
3164       const __m128i vxi10x01234567 = _mm_cvtepi8_epi16(vi10x01234567);
3165       const __m128i vk10x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 80 * sizeof(int8_t)));
3166       const __m128i vxk10x01234567 = _mm_cvtepi8_epi16(vk10x01234567);
3167       i10 += 8;
3168 
3169 
3170       vprod01234567 = _mm_mullo_epi16(vxi10x01234567, vxk10x01234567);
3171 
3172 
3173       const __m128i vi11x01234567 = _mm_loadl_epi64((const __m128i*) i11);
3174       const __m128i vxi11x01234567 = _mm_cvtepi8_epi16(vi11x01234567);
3175       const __m128i vk11x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 88 * sizeof(int8_t)));
3176       const __m128i vxk11x01234567 = _mm_cvtepi8_epi16(vk11x01234567);
3177       i11 += 8;
3178 
3179 
3180       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi11x01234567, vxk11x01234567));
3181 
3182       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3183       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3184 
3185       const __m128i vi12x01234567 = _mm_loadl_epi64((const __m128i*) i12);
3186       const __m128i vxi12x01234567 = _mm_cvtepi8_epi16(vi12x01234567);
3187       const __m128i vk12x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 96 * sizeof(int8_t)));
3188       const __m128i vxk12x01234567 = _mm_cvtepi8_epi16(vk12x01234567);
3189       i12 += 8;
3190 
3191 
3192       vprod01234567 = _mm_mullo_epi16(vxi12x01234567, vxk12x01234567);
3193 
3194 
3195       const __m128i vi13x01234567 = _mm_loadl_epi64((const __m128i*) i13);
3196       const __m128i vxi13x01234567 = _mm_cvtepi8_epi16(vi13x01234567);
3197       const __m128i vk13x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 104 * sizeof(int8_t)));
3198       const __m128i vxk13x01234567 = _mm_cvtepi8_epi16(vk13x01234567);
3199       i13 += 8;
3200 
3201 
3202       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi13x01234567, vxk13x01234567));
3203 
3204       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3205       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3206 
3207       const __m128i vi14x01234567 = _mm_loadl_epi64((const __m128i*) i14);
3208       const __m128i vxi14x01234567 = _mm_cvtepi8_epi16(vi14x01234567);
3209       const __m128i vk14x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 112 * sizeof(int8_t)));
3210       const __m128i vxk14x01234567 = _mm_cvtepi8_epi16(vk14x01234567);
3211       i14 += 8;
3212 
3213 
3214       vprod01234567 = _mm_mullo_epi16(vxi14x01234567, vxk14x01234567);
3215 
3216 
3217       const __m128i vi15x01234567 = _mm_loadl_epi64((const __m128i*) i15);
3218       const __m128i vxi15x01234567 = _mm_cvtepi8_epi16(vi15x01234567);
3219       const __m128i vk15x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 120 * sizeof(int8_t)));
3220       const __m128i vxk15x01234567 = _mm_cvtepi8_epi16(vk15x01234567);
3221       i15 += 8;
3222 
3223 
3224       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi15x01234567, vxk15x01234567));
3225 
3226       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3227       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3228 
3229       const __m128i vi16x01234567 = _mm_loadl_epi64((const __m128i*) i16);
3230       const __m128i vxi16x01234567 = _mm_cvtepi8_epi16(vi16x01234567);
3231       const __m128i vk16x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 128 * sizeof(int8_t)));
3232       const __m128i vxk16x01234567 = _mm_cvtepi8_epi16(vk16x01234567);
3233       i16 += 8;
3234 
3235 
3236       vprod01234567 = _mm_mullo_epi16(vxi16x01234567, vxk16x01234567);
3237 
3238 
3239       const __m128i vi17x01234567 = _mm_loadl_epi64((const __m128i*) i17);
3240       const __m128i vxi17x01234567 = _mm_cvtepi8_epi16(vi17x01234567);
3241       const __m128i vk17x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 136 * sizeof(int8_t)));
3242       const __m128i vxk17x01234567 = _mm_cvtepi8_epi16(vk17x01234567);
3243       i17 += 8;
3244 
3245 
3246       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi17x01234567, vxk17x01234567));
3247 
3248       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3249       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3250 
3251       const __m128i vi18x01234567 = _mm_loadl_epi64((const __m128i*) i18);
3252       const __m128i vxi18x01234567 = _mm_cvtepi8_epi16(vi18x01234567);
3253       const __m128i vk18x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 144 * sizeof(int8_t)));
3254       const __m128i vxk18x01234567 = _mm_cvtepi8_epi16(vk18x01234567);
3255       i18 += 8;
3256 
3257 
3258       vprod01234567 = _mm_mullo_epi16(vxi18x01234567, vxk18x01234567);
3259 
3260 
3261       const __m128i vi19x01234567 = _mm_loadl_epi64((const __m128i*) i19);
3262       const __m128i vxi19x01234567 = _mm_cvtepi8_epi16(vi19x01234567);
3263       const __m128i vk19x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 152 * sizeof(int8_t)));
3264       const __m128i vxk19x01234567 = _mm_cvtepi8_epi16(vk19x01234567);
3265       i19 += 8;
3266 
3267 
3268       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi19x01234567, vxk19x01234567));
3269 
3270       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3271       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3272 
3273       const __m128i vi20x01234567 = _mm_loadl_epi64((const __m128i*) i20);
3274       const __m128i vxi20x01234567 = _mm_cvtepi8_epi16(vi20x01234567);
3275       const __m128i vk20x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 160 * sizeof(int8_t)));
3276       const __m128i vxk20x01234567 = _mm_cvtepi8_epi16(vk20x01234567);
3277       i20 += 8;
3278 
3279 
3280       vprod01234567 = _mm_mullo_epi16(vxi20x01234567, vxk20x01234567);
3281 
3282 
3283       const __m128i vi21x01234567 = _mm_loadl_epi64((const __m128i*) i21);
3284       const __m128i vxi21x01234567 = _mm_cvtepi8_epi16(vi21x01234567);
3285       const __m128i vk21x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 168 * sizeof(int8_t)));
3286       const __m128i vxk21x01234567 = _mm_cvtepi8_epi16(vk21x01234567);
3287       i21 += 8;
3288 
3289 
3290       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi21x01234567, vxk21x01234567));
3291 
3292       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3293       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3294 
3295       const __m128i vi22x01234567 = _mm_loadl_epi64((const __m128i*) i22);
3296       const __m128i vxi22x01234567 = _mm_cvtepi8_epi16(vi22x01234567);
3297       const __m128i vk22x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 176 * sizeof(int8_t)));
3298       const __m128i vxk22x01234567 = _mm_cvtepi8_epi16(vk22x01234567);
3299       i22 += 8;
3300 
3301 
3302       vprod01234567 = _mm_mullo_epi16(vxi22x01234567, vxk22x01234567);
3303 
3304 
3305       const __m128i vi23x01234567 = _mm_loadl_epi64((const __m128i*) i23);
3306       const __m128i vxi23x01234567 = _mm_cvtepi8_epi16(vi23x01234567);
3307       const __m128i vk23x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 184 * sizeof(int8_t)));
3308       const __m128i vxk23x01234567 = _mm_cvtepi8_epi16(vk23x01234567);
3309       i23 += 8;
3310 
3311 
3312       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi23x01234567, vxk23x01234567));
3313 
3314       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3315       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3316 
3317       const __m128i vi24x01234567 = _mm_loadl_epi64((const __m128i*) i24);
3318       const __m128i vxi24x01234567 = _mm_cvtepi8_epi16(vi24x01234567);
3319       const __m128i vk24x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 192 * sizeof(int8_t)));
3320       const __m128i vxk24x01234567 = _mm_cvtepi8_epi16(vk24x01234567);
3321       i24 += 8;
3322 
3323 
3324       vprod01234567 = _mm_mullo_epi16(vxi24x01234567, vxk24x01234567);
3325 
3326       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3327       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3328 
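      // Advance to the next 8-channel weight block: 8 int32 biases followed by
      // 25 groups of 8 int8 kernel taps, 232 bytes in total.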
3329       w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 200 * sizeof(int8_t));
3330 
3331       __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
3332       __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
3333 
3334       const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
3335       vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
3336       vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
3337 
3338       const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
3339       vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
3340       vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
3341 
3342       vacc0123 = _mm_cvtps_epi32(vscaled0123);
3343       vacc4567 = _mm_cvtps_epi32(vscaled4567);
3344 
3345       const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
3346       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
3347 
3348 
3349       __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
3350 
3351       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
3352       vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
3353 
3354       _mm_storel_epi64((__m128i*) output, vout0123456701234567);
3355       output += 8;
3356     }
3357     if XNN_UNLIKELY(c != 0) {
3358       {
3359         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
3360         __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
3361 
3362 
3363         const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
3364         const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
3365         const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t)));
3366         const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
3367 
3368 
3369         __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
3370 
3371 
3372         const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
3373         const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
3374         const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t)));
3375         const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
3376 
3377 
3378         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi1x01234567, vxk1x01234567));
3379 
3380         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3381         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3382 
3383         const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
3384         const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
3385         const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t)));
3386         const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
3387 
3388 
3389         vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
3390 
3391 
3392         const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
3393         const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(vi3x01234567);
3394         const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t)));
3395         const __m128i vxk3x01234567 = _mm_cvtepi8_epi16(vk3x01234567);
3396 
3397 
3398         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi3x01234567, vxk3x01234567));
3399 
3400         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3401         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3402 
3403         const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
3404         const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(vi4x01234567);
3405         const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t)));
3406         const __m128i vxk4x01234567 = _mm_cvtepi8_epi16(vk4x01234567);
3407 
3408 
3409         vprod01234567 = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
3410 
3411 
3412         const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
3413         const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(vi5x01234567);
3414         const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t)));
3415         const __m128i vxk5x01234567 = _mm_cvtepi8_epi16(vk5x01234567);
3416 
3417 
3418         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi5x01234567, vxk5x01234567));
3419 
3420         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3421         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3422 
3423         const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
3424         const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(vi6x01234567);
3425         const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t)));
3426         const __m128i vxk6x01234567 = _mm_cvtepi8_epi16(vk6x01234567);
3427 
3428 
3429         vprod01234567 = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
3430 
3431 
3432         const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
3433         const __m128i vxi7x01234567 = _mm_cvtepi8_epi16(vi7x01234567);
3434         const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t)));
3435         const __m128i vxk7x01234567 = _mm_cvtepi8_epi16(vk7x01234567);
3436 
3437 
3438         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi7x01234567, vxk7x01234567));
3439 
3440         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3441         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3442 
3443         const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
3444         const __m128i vxi8x01234567 = _mm_cvtepi8_epi16(vi8x01234567);
3445         const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t)));
3446         const __m128i vxk8x01234567 = _mm_cvtepi8_epi16(vk8x01234567);
3447 
3448 
3449         vprod01234567 = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
3450 
3451 
3452         const __m128i vi9x01234567 = _mm_loadl_epi64((const __m128i*) i9);
3453         const __m128i vxi9x01234567 = _mm_cvtepi8_epi16(vi9x01234567);
3454         const __m128i vk9x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t)));
3455         const __m128i vxk9x01234567 = _mm_cvtepi8_epi16(vk9x01234567);
3456 
3457 
3458         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi9x01234567, vxk9x01234567));
3459 
3460         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3461         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3462 
3463         const __m128i vi10x01234567 = _mm_loadl_epi64((const __m128i*) i10);
3464         const __m128i vxi10x01234567 = _mm_cvtepi8_epi16(vi10x01234567);
3465         const __m128i vk10x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 80 * sizeof(int8_t)));
3466         const __m128i vxk10x01234567 = _mm_cvtepi8_epi16(vk10x01234567);
3467 
3468 
3469         vprod01234567 = _mm_mullo_epi16(vxi10x01234567, vxk10x01234567);
3470 
3471 
3472         const __m128i vi11x01234567 = _mm_loadl_epi64((const __m128i*) i11);
3473         const __m128i vxi11x01234567 = _mm_cvtepi8_epi16(vi11x01234567);
3474         const __m128i vk11x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 88 * sizeof(int8_t)));
3475         const __m128i vxk11x01234567 = _mm_cvtepi8_epi16(vk11x01234567);
3476 
3477 
3478         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi11x01234567, vxk11x01234567));
3479 
3480         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3481         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3482 
3483         const __m128i vi12x01234567 = _mm_loadl_epi64((const __m128i*) i12);
3484         const __m128i vxi12x01234567 = _mm_cvtepi8_epi16(vi12x01234567);
3485         const __m128i vk12x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 96 * sizeof(int8_t)));
3486         const __m128i vxk12x01234567 = _mm_cvtepi8_epi16(vk12x01234567);
3487 
3488 
3489         vprod01234567 = _mm_mullo_epi16(vxi12x01234567, vxk12x01234567);
3490 
3491 
3492         const __m128i vi13x01234567 = _mm_loadl_epi64((const __m128i*) i13);
3493         const __m128i vxi13x01234567 = _mm_cvtepi8_epi16(vi13x01234567);
3494         const __m128i vk13x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 104 * sizeof(int8_t)));
3495         const __m128i vxk13x01234567 = _mm_cvtepi8_epi16(vk13x01234567);
3496 
3497 
3498         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi13x01234567, vxk13x01234567));
3499 
3500         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3501         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3502 
3503         const __m128i vi14x01234567 = _mm_loadl_epi64((const __m128i*) i14);
3504         const __m128i vxi14x01234567 = _mm_cvtepi8_epi16(vi14x01234567);
3505         const __m128i vk14x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 112 * sizeof(int8_t)));
3506         const __m128i vxk14x01234567 = _mm_cvtepi8_epi16(vk14x01234567);
3507 
3508 
3509         vprod01234567 = _mm_mullo_epi16(vxi14x01234567, vxk14x01234567);
3510 
3511 
3512         const __m128i vi15x01234567 = _mm_loadl_epi64((const __m128i*) i15);
3513         const __m128i vxi15x01234567 = _mm_cvtepi8_epi16(vi15x01234567);
3514         const __m128i vk15x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 120 * sizeof(int8_t)));
3515         const __m128i vxk15x01234567 = _mm_cvtepi8_epi16(vk15x01234567);
3516 
3517 
3518         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi15x01234567, vxk15x01234567));
3519 
3520         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3521         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3522 
3523         const __m128i vi16x01234567 = _mm_loadl_epi64((const __m128i*) i16);
3524         const __m128i vxi16x01234567 = _mm_cvtepi8_epi16(vi16x01234567);
3525         const __m128i vk16x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 128 * sizeof(int8_t)));
3526         const __m128i vxk16x01234567 = _mm_cvtepi8_epi16(vk16x01234567);
3527 
3528 
3529         vprod01234567 = _mm_mullo_epi16(vxi16x01234567, vxk16x01234567);
3530 
3531 
3532         const __m128i vi17x01234567 = _mm_loadl_epi64((const __m128i*) i17);
3533         const __m128i vxi17x01234567 = _mm_cvtepi8_epi16(vi17x01234567);
3534         const __m128i vk17x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 136 * sizeof(int8_t)));
3535         const __m128i vxk17x01234567 = _mm_cvtepi8_epi16(vk17x01234567);
3536 
3537 
3538         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi17x01234567, vxk17x01234567));
3539 
3540         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3541         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3542 
3543         const __m128i vi18x01234567 = _mm_loadl_epi64((const __m128i*) i18);
3544         const __m128i vxi18x01234567 = _mm_cvtepi8_epi16(vi18x01234567);
3545         const __m128i vk18x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 144 * sizeof(int8_t)));
3546         const __m128i vxk18x01234567 = _mm_cvtepi8_epi16(vk18x01234567);
3547 
3548 
3549         vprod01234567 = _mm_mullo_epi16(vxi18x01234567, vxk18x01234567);
3550 
3551 
3552         const __m128i vi19x01234567 = _mm_loadl_epi64((const __m128i*) i19);
3553         const __m128i vxi19x01234567 = _mm_cvtepi8_epi16(vi19x01234567);
3554         const __m128i vk19x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 152 * sizeof(int8_t)));
3555         const __m128i vxk19x01234567 = _mm_cvtepi8_epi16(vk19x01234567);
3556 
3557 
3558         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi19x01234567, vxk19x01234567));
3559 
3560         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3561         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3562 
3563         const __m128i vi20x01234567 = _mm_loadl_epi64((const __m128i*) i20);
3564         const __m128i vxi20x01234567 = _mm_cvtepi8_epi16(vi20x01234567);
3565         const __m128i vk20x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 160 * sizeof(int8_t)));
3566         const __m128i vxk20x01234567 = _mm_cvtepi8_epi16(vk20x01234567);
3567 
3568 
3569         vprod01234567 = _mm_mullo_epi16(vxi20x01234567, vxk20x01234567);
3570 
3571 
3572         const __m128i vi21x01234567 = _mm_loadl_epi64((const __m128i*) i21);
3573         const __m128i vxi21x01234567 = _mm_cvtepi8_epi16(vi21x01234567);
3574         const __m128i vk21x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 168 * sizeof(int8_t)));
3575         const __m128i vxk21x01234567 = _mm_cvtepi8_epi16(vk21x01234567);
3576 
3577 
3578         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi21x01234567, vxk21x01234567));
3579 
3580         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3581         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3582 
3583         const __m128i vi22x01234567 = _mm_loadl_epi64((const __m128i*) i22);
3584         const __m128i vxi22x01234567 = _mm_cvtepi8_epi16(vi22x01234567);
3585         const __m128i vk22x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 176 * sizeof(int8_t)));
3586         const __m128i vxk22x01234567 = _mm_cvtepi8_epi16(vk22x01234567);
3587 
3588 
3589         vprod01234567 = _mm_mullo_epi16(vxi22x01234567, vxk22x01234567);
3590 
3591 
3592         const __m128i vi23x01234567 = _mm_loadl_epi64((const __m128i*) i23);
3593         const __m128i vxi23x01234567 = _mm_cvtepi8_epi16(vi23x01234567);
3594         const __m128i vk23x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 184 * sizeof(int8_t)));
3595         const __m128i vxk23x01234567 = _mm_cvtepi8_epi16(vk23x01234567);
3596 
3597 
3598         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi23x01234567, vxk23x01234567));
3599 
3600         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3601         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3602 
3603         const __m128i vi24x01234567 = _mm_loadl_epi64((const __m128i*) i24);
3604         const __m128i vxi24x01234567 = _mm_cvtepi8_epi16(vi24x01234567);
3605         const __m128i vk24x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 192 * sizeof(int8_t)));
3606         const __m128i vxk24x01234567 = _mm_cvtepi8_epi16(vk24x01234567);
3607 
3608 
3609         vprod01234567 = _mm_mullo_epi16(vxi24x01234567, vxk24x01234567);
3610 
3611         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3612         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3613 
3614 
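        // Requantization: a scalar sketch of what the vector code below computes per lane,
        // assuming the fp32_sse4 params layout used throughout this file:
        //   out = max((int8_t) sat(lrintf(min((float) acc * scale, output_max_less_zero_point)) + zero_point), output_min)
        // i.e. scale in fp32, clamp to the max, round to nearest, re-add the zero point
        // with saturation, pack to int8, then clamp to the min.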
3615         __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
3616         __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
3617 
3618         const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
3619         vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
3620         vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
3621 
3622         const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
3623         vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
3624         vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
3625 
3626         vacc0123 = _mm_cvtps_epi32(vscaled0123);
3627         vacc4567 = _mm_cvtps_epi32(vscaled4567);
3628 
3629 
3630         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
3631         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
3632 
3633 
3634         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
3635 
3636         vout0123456701234567 = _mm_max_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
3637 
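        // c holds the 1-7 remaining channels; its low bits select 4-, 2-, and 1-byte tail stores.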
3638         if (c & 4) {
3639           unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
3640           vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
3641           output += 4;
3642         }
3643         if (c & 2) {
3644           unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
3645           vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
3646           output += 2;
3647         }
3648         if (c & 1) {
3649           *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
3650           output += 1;
3651         }
3652       }
3653     }
3654 
3655     output = (int8_t*) ((uintptr_t) output + output_increment);
3656   } while (--output_width != 0);
3657 }
3658 
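// Depthwise convolution, 9 taps (3x3), 8 channels per loop iteration, using the
// mul16_add16 scheme. Judging from the pointer arithmetic below, each 8-channel
// group of packed weights is laid out as 8 int32 biases followed by 9 groups of
// 8 int8 kernel taps (8 * 4 + 72 bytes, matching the per-iteration advance of w).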
3659 void xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16_add16(
3660     size_t channels,
3661     size_t output_width,
3662     const int8_t** input,
3663     const void* weights,
3664     int8_t* output,
3665     size_t input_stride,
3666     size_t output_increment,
3667     size_t input_offset,
3668     const int8_t* zero,
3669     const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
3670 {
3671   assert(channels != 0);
3672   assert(output_width != 0);
3673 
3674   do {
3675     const int8_t* i0 = input[0];
3676     assert(i0 != NULL);
3677     if XNN_UNPREDICTABLE(i0 != zero) {
3678       i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
3679     }
3680     const int8_t* i1 = input[1];
3681     assert(i1 != NULL);
3682     if XNN_UNPREDICTABLE(i1 != zero) {
3683       i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
3684     }
3685     const int8_t* i2 = input[2];
3686     assert(i2 != NULL);
3687     if XNN_UNPREDICTABLE(i2 != zero) {
3688       i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
3689     }
3690     const int8_t* i3 = input[3];
3691     assert(i3 != NULL);
3692     if XNN_UNPREDICTABLE(i3 != zero) {
3693       i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
3694     }
3695     const int8_t* i4 = input[4];
3696     assert(i4 != NULL);
3697     if XNN_UNPREDICTABLE(i4 != zero) {
3698       i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
3699     }
3700     const int8_t* i5 = input[5];
3701     assert(i5 != NULL);
3702     if XNN_UNPREDICTABLE(i5 != zero) {
3703       i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
3704     }
3705     const int8_t* i6 = input[6];
3706     assert(i6 != NULL);
3707     if XNN_UNPREDICTABLE(i6 != zero) {
3708       i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
3709     }
3710     const int8_t* i7 = input[7];
3711     assert(i7 != NULL);
3712     if XNN_UNPREDICTABLE(i7 != zero) {
3713       i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
3714     }
3715     const int8_t* i8 = input[8];
3716     assert(i8 != NULL);
3717     if XNN_UNPREDICTABLE(i8 != zero) {
3718       i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
3719     }
3720     input = (const int8_t**) ((uintptr_t) input + input_stride);
3721 
3722     size_t c = channels;
3723     const void* w = weights;
3724     for (; c >= 8; c -= 8) {
3725       __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
3726       __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
3727 
3728 
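      // Taps are consumed in pairs: two 8x16-bit products are first summed in the
      // int16 vprod01234567, and only then sign-extended (low half via
      // _mm_cvtepi16_epi32, high half via unpack + arithmetic shift) into the two
      // int32 accumulators. This halves the number of widening steps compared to
      // widening every product individually.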
3729       const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
3730       const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
3731       const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t)));
3732       const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
3733       i0 += 8;
3734 
3735 
3736       __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
3737 
3738 
3739       const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
3740       const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
3741       const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t)));
3742       const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
3743       i1 += 8;
3744 
3745 
3746       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi1x01234567, vxk1x01234567));
3747 
3748       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3749       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3750 
3751       const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
3752       const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
3753       const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t)));
3754       const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
3755       i2 += 8;
3756 
3757 
3758       vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
3759 
3760 
3761       const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
3762       const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(vi3x01234567);
3763       const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t)));
3764       const __m128i vxk3x01234567 = _mm_cvtepi8_epi16(vk3x01234567);
3765       i3 += 8;
3766 
3767 
3768       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi3x01234567, vxk3x01234567));
3769 
3770       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3771       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3772 
3773       const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
3774       const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(vi4x01234567);
3775       const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t)));
3776       const __m128i vxk4x01234567 = _mm_cvtepi8_epi16(vk4x01234567);
3777       i4 += 8;
3778 
3779 
3780       vprod01234567 = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
3781 
3782 
3783       const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
3784       const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(vi5x01234567);
3785       const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t)));
3786       const __m128i vxk5x01234567 = _mm_cvtepi8_epi16(vk5x01234567);
3787       i5 += 8;
3788 
3789 
3790       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi5x01234567, vxk5x01234567));
3791 
3792       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3793       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3794 
3795       const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
3796       const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(vi6x01234567);
3797       const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t)));
3798       const __m128i vxk6x01234567 = _mm_cvtepi8_epi16(vk6x01234567);
3799       i6 += 8;
3800 
3801 
3802       vprod01234567 = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
3803 
3804 
3805       const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
3806       const __m128i vxi7x01234567 = _mm_cvtepi8_epi16(vi7x01234567);
3807       const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t)));
3808       const __m128i vxk7x01234567 = _mm_cvtepi8_epi16(vk7x01234567);
3809       i7 += 8;
3810 
3811 
3812       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi7x01234567, vxk7x01234567));
3813 
3814       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3815       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3816 
3817       const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
3818       const __m128i vxi8x01234567 = _mm_cvtepi8_epi16(vi8x01234567);
3819       const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t)));
3820       const __m128i vxk8x01234567 = _mm_cvtepi8_epi16(vk8x01234567);
3821       i8 += 8;
3822 
3823 
3824       vprod01234567 = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
3825 
3826       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3827       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3828 
3829       w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t));
3830 
3831       __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
3832       __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
3833 
3834       const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
3835       vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
3836       vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
3837 
3838       const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
3839       vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
3840       vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
3841 
3842       vacc0123 = _mm_cvtps_epi32(vscaled0123);
3843       vacc4567 = _mm_cvtps_epi32(vscaled4567);
3844 
3845       const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
3846       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
3847 
3848 
3849       __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
3850 
3851       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
3852       vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
3853 
3854       _mm_storel_epi64((__m128i*) output, vout0123456701234567);
3855       output += 8;
3856     }
3857     if XNN_UNLIKELY(c != 0) {
3858       {
3859         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
3860         __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
3861 
3862 
3863         const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
3864         const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
3865         const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t)));
3866         const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
3867 
3868 
3869         __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
3870 
3871 
3872         const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
3873         const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
3874         const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t)));
3875         const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
3876 
3877 
3878         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi1x01234567, vxk1x01234567));
3879 
3880         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3881         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3882 
3883         const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
3884         const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
3885         const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t)));
3886         const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
3887 
3888 
3889         vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
3890 
3891 
3892         const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
3893         const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(vi3x01234567);
3894         const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t)));
3895         const __m128i vxk3x01234567 = _mm_cvtepi8_epi16(vk3x01234567);
3896 
3897 
3898         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi3x01234567, vxk3x01234567));
3899 
3900         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3901         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3902 
3903         const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
3904         const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(vi4x01234567);
3905         const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t)));
3906         const __m128i vxk4x01234567 = _mm_cvtepi8_epi16(vk4x01234567);
3907 
3908 
3909         vprod01234567 = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
3910 
3911 
3912         const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
3913         const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(vi5x01234567);
3914         const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t)));
3915         const __m128i vxk5x01234567 = _mm_cvtepi8_epi16(vk5x01234567);
3916 
3917 
3918         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi5x01234567, vxk5x01234567));
3919 
3920         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3921         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3922 
3923         const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
3924         const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(vi6x01234567);
3925         const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t)));
3926         const __m128i vxk6x01234567 = _mm_cvtepi8_epi16(vk6x01234567);
3927 
3928 
3929         vprod01234567 = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
3930 
3931 
3932         const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
3933         const __m128i vxi7x01234567 = _mm_cvtepi8_epi16(vi7x01234567);
3934         const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t)));
3935         const __m128i vxk7x01234567 = _mm_cvtepi8_epi16(vk7x01234567);
3936 
3937 
3938         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi7x01234567, vxk7x01234567));
3939 
3940         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3941         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3942 
3943         const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
3944         const __m128i vxi8x01234567 = _mm_cvtepi8_epi16(vi8x01234567);
3945         const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t)));
3946         const __m128i vxk8x01234567 = _mm_cvtepi8_epi16(vk8x01234567);
3947 
3948 
3949         vprod01234567 = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
3950 
3951         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
3952         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
3953 
3954 
3955         __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
3956         __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
3957 
3958         const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
3959         vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
3960         vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
3961 
3962         const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
3963         vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
3964         vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
3965 
3966         vacc0123 = _mm_cvtps_epi32(vscaled0123);
3967         vacc4567 = _mm_cvtps_epi32(vscaled4567);
3968 
3969 
3970         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
3971         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
3972 
3973 
3974         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
3975 
3976         vout0123456701234567 = _mm_max_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
3977 
3978         if (c & 4) {
3979           unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
3980           vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
3981           output += 4;
3982         }
3983         if (c & 2) {
3984           unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
3985           vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
3986           output += 2;
3987         }
3988         if (c & 1) {
3989           *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
3990           output += 1;
3991         }
3992       }
3993     }
3994 
3995     output = (int8_t*) ((uintptr_t) output + output_increment);
3996   } while (--output_width != 0);
3997 }
3998 
3999 void xnn_qs8_f32_vcvt_ukernel__sse41_x16(
4000     size_t n,
4001     const int8_t* x,
4002     float* y,
4003     const union xnn_qs8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4004 {
4005   assert(n != 0);
4006   assert(n % sizeof(int8_t) == 0);
4007   assert(x != NULL);
4008   assert(y != NULL);
4009 
4010   const __m128i vminus_zero_point = _mm_load_si128((const __m128i*) params->sse4.minus_zero_point);
4011   const __m128 vscale = _mm_load_ps(params->sse4.scale);
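  // Main loop, 16 elements per iteration. Per element this computes
  //   y = scale * (float) (x - zero_point);
  // the negated zero point is pre-baked into vminus_zero_point, so it is added.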
4012   for (; n >= 16 * sizeof(int8_t); n -= 16 * sizeof(int8_t)) {
4013     __m128i vx0123 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x)));
4014     __m128i vx4567 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 4)));
4015     __m128i vx89AB = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 8)));
4016     __m128i vxCDEF = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 12)));
4017     x += 16;
4018 
4019     vx0123 = _mm_add_epi32(vx0123, vminus_zero_point);
4020     vx4567 = _mm_add_epi32(vx4567, vminus_zero_point);
4021     vx89AB = _mm_add_epi32(vx89AB, vminus_zero_point);
4022     vxCDEF = _mm_add_epi32(vxCDEF, vminus_zero_point);
4023 
4024     __m128 vy0123 = _mm_cvtepi32_ps(vx0123);
4025     __m128 vy4567 = _mm_cvtepi32_ps(vx4567);
4026     __m128 vy89AB = _mm_cvtepi32_ps(vx89AB);
4027     __m128 vyCDEF = _mm_cvtepi32_ps(vxCDEF);
4028 
4029     vy0123 = _mm_mul_ps(vy0123, vscale);
4030     vy4567 = _mm_mul_ps(vy4567, vscale);
4031     vy89AB = _mm_mul_ps(vy89AB, vscale);
4032     vyCDEF = _mm_mul_ps(vyCDEF, vscale);
4033 
4034     _mm_storeu_ps(y, vy0123);
4035     _mm_storeu_ps(y + 4, vy4567);
4036     _mm_storeu_ps(y + 8, vy89AB);
4037     _mm_storeu_ps(y + 12, vyCDEF);
4038     y += 16;
4039   }
4040   for (; n >= 4 * sizeof(int8_t); n -= 4 * sizeof(int8_t)) {
4041     __m128i vx = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x)));
4042     vx = _mm_add_epi32(vx, vminus_zero_point);
4043     x += 4;
4044 
4045     __m128 vy = _mm_cvtepi32_ps(vx);
4046     vy = _mm_mul_ps(vy, vscale);
4047 
4048     _mm_storeu_ps(y, vy);
4049     y += 4;
4050   }
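  // Tail of 1-3 elements: a full 4-byte load is still used (the XNN_OOB_READS
  // annotation permits the over-read), and only the valid lanes are stored.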
4051   if XNN_UNLIKELY(n != 0) {
4052     assert(n >= 1 * sizeof(int8_t));
4053     assert(n <= 3 * sizeof(int8_t));
4054 
4055     __m128i vx = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x)));
4056     vx = _mm_add_epi32(vx, vminus_zero_point);
4057 
4058     __m128 vy = _mm_cvtepi32_ps(vx);
4059     vy = _mm_mul_ps(vy, vscale);
4060 
4061     if (n & (2 * sizeof(int8_t))) {
4062       _mm_storel_pi((__m64*) y, vy);
4063       vy = _mm_movehl_ps(vy, vy);
4064       y += 2;
4065     }
4066     if (n & (1 * sizeof(int8_t))) {
4067       _mm_store_ss(y, vy);
4068     }
4069   }
4070 }
4071 
4072 void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8(
4073     size_t rows,
4074     size_t channels,
4075     const int8_t* input,
4076     size_t input_stride,
4077     const int8_t* zero,
4078     int32_t* buffer,
4079     int8_t* output,
4080     const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4081 {
4082   assert(rows > 7);
4083   assert(channels != 0);
4084 
4085   const int8_t* i0 = input;
4086   const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
4087   const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
4088   const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
4089   const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
4090   const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
4091   const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
4092   const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
4093 
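  // First pass: sum rows 0-6 for each channel, seed with init_bias, and spill the
  // int32 partial sums to the caller-provided buffer.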
4094   const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias);
4095   int32_t* b = buffer;
4096   size_t c = channels;
4097   for (; c != 0; c = doz(c, 8)) {
4098     const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0));
4099     i0 += 8;
4100     const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1));
4101     i1 += 8;
4102 
4103     __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
4104     const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
4105     i2 += 8;
4106 
4107     vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
4108     const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3));
4109     i3 += 8;
4110     vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
4111     const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4));
4112     i4 += 8;
4113     vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
4114     const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5));
4115     i5 += 8;
4116     vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
4117     const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6));
4118     i6 += 8;
4119 
4120     vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
4121 
4122     __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
4123     __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
4124 
4125     vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
4126     vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
4127 
4128     _mm_store_si128((__m128i*) b, vacc0123);
4129     _mm_store_si128((__m128i*) (b + 4), vacc4567);
4130     b += 8;
4131   }
4132 
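  // Middle passes: while more than 7 rows remain, accumulate 7 more rows into the
  // spilled sums.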
4133   for (rows -= 7; rows > 7; rows -= 7) {
4134     i0 = (const int8_t*) ((uintptr_t) i0 + input_increment);
4135     i1 = (const int8_t*) ((uintptr_t) i1 + input_increment);
4136     i2 = (const int8_t*) ((uintptr_t) i2 + input_increment);
4137     i3 = (const int8_t*) ((uintptr_t) i3 + input_increment);
4138     i4 = (const int8_t*) ((uintptr_t) i4 + input_increment);
4139     i5 = (const int8_t*) ((uintptr_t) i5 + input_increment);
4140     i6 = (const int8_t*) ((uintptr_t) i6 + input_increment);
4141 
4142     int32_t* b = buffer;
4143     size_t c = channels;
4144     for (; c != 0; c = doz(c, 8)) {
4145       const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0));
4146       i0 += 8;
4147       const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1));
4148       i1 += 8;
4149 
4150       __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
4151       const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
4152       i2 += 8;
4153 
4154       vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
4155       const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3));
4156       i3 += 8;
4157       vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
4158       const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4));
4159       i4 += 8;
4160       vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
4161       const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5));
4162       i5 += 8;
4163       vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
4164       const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6));
4165       i6 += 8;
4166 
4167       vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
4168 
4169       __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
4170       __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
4171 
4172       vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b));
4173       vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4)));
4174 
4175       _mm_store_si128((__m128i*) b, vacc0123);
4176       _mm_store_si128((__m128i*) (b + 4), vacc4567);
4177       b += 8;
4178     }
4179   }
4180 
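  // Final pass: at most 7 rows remain; pointers for rows past the end are
  // redirected to the zero buffer so they contribute nothing to the sums.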
4181   i0 = (const int8_t*) ((uintptr_t) i0 + input_increment);
4182   i1 = (const int8_t*) ((uintptr_t) i1 + input_increment);
4183   if XNN_UNPREDICTABLE(rows < 2) {
4184     i1 = zero;
4185   }
4186   i2 = (const int8_t*) ((uintptr_t) i2 + input_increment);
4187   if XNN_UNPREDICTABLE(rows <= 2) {
4188     i2 = zero;
4189   }
4190   i3 = (const int8_t*) ((uintptr_t) i3 + input_increment);
4191   if XNN_UNPREDICTABLE(rows < 4) {
4192     i3 = zero;
4193   }
4194   i4 = (const int8_t*) ((uintptr_t) i4 + input_increment);
4195   if XNN_UNPREDICTABLE(rows <= 4) {
4196     i4 = zero;
4197   }
4198   i5 = (const int8_t*) ((uintptr_t) i5 + input_increment);
4199   if XNN_UNPREDICTABLE(rows < 6) {
4200     i5 = zero;
4201   }
4202   i6 = (const int8_t*) ((uintptr_t) i6 + input_increment);
4203   if XNN_UNPREDICTABLE(rows <= 6) {
4204     i6 = zero;
4205   }
4206 
4207   const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
4208   const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
4209   const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
4210   const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
4211   for (; channels >= 8; channels -= 8) {
4212     const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0));
4213     i0 += 8;
4214     const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1));
4215     i1 += 8;
4216 
4217     __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
4218     const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
4219     i2 += 8;
4220 
4221     vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
4222     const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3));
4223     i3 += 8;
4224     vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
4225     const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4));
4226     i4 += 8;
4227     vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
4228     const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5));
4229     i5 += 8;
4230     vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
4231     const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6));
4232     i6 += 8;
4233 
4234     vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
4235 
4236     __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
4237     __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
4238 
4239     vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
4240     vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
4241     buffer += 8;
4242 
4243     __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
4244     __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
4245 
4246     vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
4247     vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
4248 
4249     vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
4250     vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
4251 
4252     vacc0123 = _mm_cvtps_epi32(vfpacc0123);
4253     vacc4567 = _mm_cvtps_epi32(vfpacc4567);
4254 
4255     __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
4256 
4257     __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
4258 
4259     vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
4260 
4261     _mm_storel_epi64((__m128i*) output, vout0123456701234567);
4262     output += 8;
4263   }
4264   if XNN_UNLIKELY(channels != 0) {
4265     {
4266       const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0));
4267       i0 += 8;
4268       const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1));
4269       i1 += 8;
4270 
4271       __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
4272       const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
4273       i2 += 8;
4274 
4275       vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
4276       const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3));
4277       i3 += 8;
4278       vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
4279       const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4));
4280       i4 += 8;
4281       vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
4282       const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5));
4283       i5 += 8;
4284       vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
4285       const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6));
4286       i6 += 8;
4287 
4288       vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
4289 
4290       __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
4291       __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
4292 
4293       vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
4294       vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
4295       buffer += 8;
4296 
4297       __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
4298       __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
4299 
4300       vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
4301       vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
4302 
4303       vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
4304       vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
4305 
4306       vacc0123 = _mm_cvtps_epi32(vfpacc0123);
4307       vacc4567 = _mm_cvtps_epi32(vfpacc4567);
4308 
4309       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
4310 
4311       __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
4312       vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
4313 
4314       if (channels & 4) {
4315         unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
4316         vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
4317         output += 4;
4318       }
4319       if (channels & 2) {
4320         unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
4321         vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
4322         output += 2;
4323       }
4324       if (channels & 1) {
4325         *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
4326       }
4327     }
4328   }
4329 }
4330 
4331 void xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8(
4332     size_t rows,
4333     size_t channels,
4334     const int8_t* input,
4335     size_t input_stride,
4336     const int8_t* zero,
4337     int8_t* output,
4338     const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4339 {
4340   assert(rows != 0);
4341   assert(rows <= 7);
4342   assert(channels != 0);
4343 
4344   const int8_t* i0 = input;
4345   const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
4346   if XNN_UNPREDICTABLE(rows < 2) {
4347     i1 = zero;
4348   }
4349   const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
4350   if XNN_UNPREDICTABLE(rows <= 2) {
4351     i2 = zero;
4352   }
4353   const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
4354   if XNN_UNPREDICTABLE(rows < 4) {
4355     i3 = zero;
4356   }
4357   const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
4358   if XNN_UNPREDICTABLE(rows <= 4) {
4359     i4 = zero;
4360   }
4361   const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
4362   if XNN_UNPREDICTABLE(rows < 6) {
4363     i5 = zero;
4364   }
4365   const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
4366   if XNN_UNPREDICTABLE(rows <= 6) {
4367     i6 = zero;
4368   }
4369 
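  // Single-pass variant: with at most 7 rows there is no spill buffer, so the row
  // sums go straight into requantization.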
4370   const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias);
4371   const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
4372   const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
4373   const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
4374   const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
4375   for (; channels >= 8; channels -= 8) {
4376     const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0));
4377     i0 += 8;
4378     const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1));
4379     i1 += 8;
4380 
4381     __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
4382     const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
4383     i2 += 8;
4384 
4385     vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
4386     const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3));
4387     i3 += 8;
4388     vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
4389     const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4));
4390     i4 += 8;
4391     vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
4392     const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5));
4393     i5 += 8;
4394     vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
4395     const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6));
4396     i6 += 8;
4397 
4398     vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
4399 
4400     __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
4401     __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
4402 
4403     vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
4404     vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
4405 
4406     __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
4407     __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
4408 
4409     vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
4410     vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
4411 
4412     vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
4413     vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
4414 
4415     vacc0123 = _mm_cvtps_epi32(vfpacc0123);
4416     vacc4567 = _mm_cvtps_epi32(vfpacc4567);
4417 
4418     __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
4419 
4420     __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
4421 
4422     vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
4423 
4424     _mm_storel_epi64((__m128i*) output, vout0123456701234567);
4425     output += 8;
4426   }
4427   if XNN_UNLIKELY(channels != 0) {
4428     {
4429       const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0));
4430       i0 += 8;
4431       const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1));
4432       i1 += 8;
4433 
4434       __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
4435       const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
4436       i2 += 8;
4437 
4438       vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
4439       const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3));
4440       i3 += 8;
4441       vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
4442       const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4));
4443       i4 += 8;
4444       vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
4445       const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5));
4446       i5 += 8;
4447       vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
4448       const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6));
4449       i6 += 8;
4450 
4451       vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
4452 
4453       __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
4454       __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
4455 
4456       vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
4457       vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
4458 
4459       __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
4460       __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
4461 
4462       vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
4463       vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
4464 
4465       vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
4466       vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
4467 
4468       vacc0123 = _mm_cvtps_epi32(vfpacc0123);
4469       vacc4567 = _mm_cvtps_epi32(vfpacc4567);
4470 
4471       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
4472 
4473       __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
4474       vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
4475 
4476       if (channels & 4) {
4477         unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
4478         vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
4479         output += 4;
4480       }
4481       if (channels & 2) {
4482         unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
4483         vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
4484         output += 2;
4485       }
4486       if (channels & 1) {
4487         *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
4488       }
4489     }
4490   }
4491 }
4492 
4493 void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64(
4494     size_t mr,
4495     size_t nc,
4496     size_t kc,
4497     const int8_t* restrict a,
4498     size_t a_stride,
4499     const void* restrict w,
4500     int8_t* restrict c,
4501     size_t cm_stride,
4502     size_t cn_stride,
4503     const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4504 {
4505   assert(mr != 0);
4506   assert(mr <= 1);
4507   assert(nc != 0);
4508   assert(kc != 0);
4509   assert(kc % sizeof(int8_t) == 0);
4510   assert(a != NULL);
4511   assert(w != NULL);
4512   assert(c != NULL);
4513 
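  // Each loop iteration consumes 8 bytes of A, so kc is rounded up to a multiple
  // of 8; the c8-packed weights are expected to be padded to match.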
4514   kc = round_up_po2(kc, 8);
4515   const int8_t* a0 = a;
4516   int8_t* c0 = c;
4517 
4518   do {
4519     __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
4520     __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
4521     __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
4522     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
4523     w = (const int32_t*) w + 4;
4524 
4525     size_t k = 0;
4526     while (k < kc) {
4527       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
4528       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
4529       a0 += 8;
4530 
4531       const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
4532       const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
4533 
4534       vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
4535       const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 8));
4536       const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
4537 
4538       vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
4539       const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 16));
4540       const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
4541 
4542       vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
4543       const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 24));
4544       const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
4545 
4546       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
4547 
4548       w = (const void*) ((const int8_t*) w + 32);
4549       k += 8 * sizeof(int8_t);
4550     }
4551 
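    // Each vacc0xN holds four partial sums for output column N (from
    // _mm_madd_epi16, which adds adjacent 16-bit products into 32-bit lanes);
    // two rounds of _mm_hadd_epi32 reduce them to one int32 per column.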
4552     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
4553     const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
4554 
4555     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
4556 
4557     __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
4558 
4559     const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
4560     vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
4561 
4562     const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
4563     vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
4564 
4565     vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
4566 
4567     const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
4568     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
4569 
4570 
4571     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
4572 
4573     vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
4574 
4575     if (nc >= 4) {
4576       unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
4577 
4578       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
4579 
4580       a0 = (const int8_t*) ((uintptr_t) a0 - kc);
4581 
4582       nc -= 4;
4583     } else {
4584       if (nc & 2) {
4585         unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
4586         c0 += 2;
4587         vout = _mm_srli_epi32(vout, 16);
4588       }
4589       if (nc & 1) {
4590         *c0 = (int8_t) _mm_extract_epi8(vout, 0);
4591       }
4592 
4593       nc = 0;
4594     }
4595   } while (nc != 0);
4596 }
4597 
4598 void xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64(
4599     size_t mr,
4600     size_t nc,
4601     size_t kc,
4602     const int8_t* restrict a,
4603     size_t a_stride,
4604     const void* restrict w,
4605     int8_t* restrict c,
4606     size_t cm_stride,
4607     size_t cn_stride,
4608     const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4609 {
4610   assert(mr != 0);
4611   assert(mr <= 3);
4612   assert(nc != 0);
4613   assert(kc != 0);
4614   assert(kc % sizeof(int8_t) == 0);
4615   assert(a != NULL);
4616   assert(w != NULL);
4617   assert(c != NULL);
4618 
4619   kc = round_up_po2(kc, 8);
4620   const int8_t* a0 = a;
4621   int8_t* c0 = c;
4622   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
4623   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
4624   if XNN_UNPREDICTABLE(mr < 2) {
4625     a1 = a0;
4626     c1 = c0;
4627   }
4628   const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
4629   int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
4630   if XNN_UNPREDICTABLE(mr <= 2) {
4631     a2 = a1;
4632     c2 = c1;
4633   }
4634 
4635   do {
4636     __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
4637     __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
4638     __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
4639     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
4640     __m128i vacc1x0 = vacc0x0;
4641     __m128i vacc1x1 = vacc0x1;
4642     __m128i vacc1x2 = vacc0x2;
4643     __m128i vacc1x3 = vacc0x3;
4644     __m128i vacc2x0 = vacc0x0;
4645     __m128i vacc2x1 = vacc0x1;
4646     __m128i vacc2x2 = vacc0x2;
4647     __m128i vacc2x3 = vacc0x3;
4648     w = (const int32_t*) w + 4;
4649 
4650     size_t k = 0;
4651     while (k < kc) {
4652       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
4653       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
4654       a0 += 8;
4655       const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
4656       const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
4657       a1 += 8;
4658       const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
4659       const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
4660       a2 += 8;
4661 
4662       const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
4663       const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
4664 
4665       vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
4666       vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
4667       vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
4668       const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 8));
4669       const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
4670 
4671       vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
4672       vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
4673       vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
4674       const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 16));
4675       const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
4676 
4677       vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
4678       vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
4679       vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
4680       const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 24));
4681       const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
4682 
4683       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
4684       vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
4685       vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
4686 
4687       w = (const void*) ((const int8_t*) w + 32);
4688       k += 8 * sizeof(int8_t);
4689     }
4690 
4691     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
4692     const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
4693     const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
4694     const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
4695     const __m128i vacc2x01 = _mm_hadd_epi32(vacc2x0, vacc2x1);
4696     const __m128i vacc2x23 = _mm_hadd_epi32(vacc2x2, vacc2x3);
4697 
4698     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
4699     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
4700     __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
4701 
4702     __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
4703     __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
4704     __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
4705 
4706     const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
4707     vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
4708     vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
4709     vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale);
4710 
4711     const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
4712     vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
4713     vscaled1x0123 = _mm_min_ps(vscaled1x0123, voutput_max_less_zero_point);
4714     vscaled2x0123 = _mm_min_ps(vscaled2x0123, voutput_max_less_zero_point);
4715 
4716     vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
4717     vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
4718     vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
4719 
4720     const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
4721     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
4722     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
4723 
4724 
4725     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
4726 
4727     vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
4728 
4729     if (nc >= 4) {
4730       unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
4731       unaligned_store_u32(c1, (uint32_t) _mm_extract_epi32(vout, 1));
4732       unaligned_store_u32(c2, (uint32_t) _mm_extract_epi32(vout, 2));
4733 
4734       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
4735       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
4736       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
4737 
4738       a0 = (const int8_t*) ((uintptr_t) a0 - kc);
4739       a1 = (const int8_t*) ((uintptr_t) a1 - kc);
4740       a2 = (const int8_t*) ((uintptr_t) a2 - kc);
4741 
4742       nc -= 4;
4743     } else {
4744       if (nc & 2) {
4745         unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
4746         c0 += 2;
4747         unaligned_store_u16(c1, (uint16_t) _mm_extract_epi16(vout, 2));
4748         c1 += 2;
4749         unaligned_store_u16(c2, (uint16_t) _mm_extract_epi16(vout, 4));
4750         c2 += 2;
4751         vout = _mm_srli_epi32(vout, 16);
4752       }
4753       if (nc & 1) {
4754         *c0 = (int8_t) _mm_extract_epi8(vout, 0);
4755         *c1 = (int8_t) _mm_extract_epi8(vout, 4);
4756         *c2 = (int8_t) _mm_extract_epi8(vout, 8);
4757       }
4758 
4759       nc = 0;
4760     }
4761   } while (nc != 0);
4762 }
4763 
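// QS8 IGEMM (indirect GEMM), 1x4 tile: same c8/ld64 inner loop as the GEMM
// kernels, but activation rows are fetched through the indirection buffer `a`;
// rows equal to `zero` skip the `a_offset` adjustment.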
4764 void xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64(
4765     size_t mr,
4766     size_t nc,
4767     size_t kc,
4768     size_t ks,
4769     const int8_t** restrict a,
4770     const void* restrict w,
4771     int8_t* restrict c,
4772     size_t cm_stride,
4773     size_t cn_stride,
4774     size_t a_offset,
4775     const int8_t* zero,
4776     const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4777 {
4778   assert(mr != 0);
4779   assert(mr <= 1);
4780   assert(nc != 0);
4781   assert(kc != 0);
4782   assert(ks != 0);
4783   assert(ks % (1 * sizeof(void*)) == 0);
4784   assert(a_offset % sizeof(int8_t) == 0);
4785   assert(a != NULL);
4786   assert(w != NULL);
4787   assert(c != NULL);
4788 
4789   kc = round_up_po2(kc, 8);
4790   int8_t* c0 = c;
4791 
4792   do {
4793     __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
4794     __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
4795     __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
4796     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
4797     w = (const int32_t*) w + 4;
4798 
4799     size_t p = ks;
4800     do {
4801       const int8_t* restrict a0 = a[0];
4802       if XNN_UNPREDICTABLE(a0 != zero) {
4803         a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
4804       }
4805       a += 1;
4806 
4807       size_t k = 0;
4808       while (k < kc) {
4809         const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
4810         const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
4811         a0 += 8;
4812 
4813         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
4814         const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
4815 
4816         vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
4817         const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 8));
4818         const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
4819 
4820         vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
4821         const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 16));
4822         const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
4823 
4824         vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
4825         const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 24));
4826         const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
4827 
4828         vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
4829 
4830         w = (const void*) ((const int8_t*) w + 32);
4831         k += 8 * sizeof(int8_t);
4832       }
4833       p -= 1 * sizeof(void*);
4834     } while (p != 0);
4835 
4836     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
4837     const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
4838 
4839     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
4840 
4841     __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
4842 
4843     const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
4844     vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
4845 
4846     const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
4847     vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
4848 
4849     vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
4850 
4851     const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
4852     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
4853 
4854 
4855     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
4856 
4857     vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
4858 
4859     if (nc >= 4) {
4860       unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
4861       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
4862 
4863       a = (const int8_t**restrict) ((uintptr_t) a - ks);
4864 
4865       nc -= 4;
4866     } else {
4867       if (nc & 2) {
4868         unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
4869         c0 += 2;
4870         vout = _mm_srli_epi32(vout, 16);
4871       }
4872       if (nc & 1) {
4873         *c0 = (int8_t) _mm_extract_epi8(vout, 0);
4874       }
4875 
4876       nc = 0;
4877     }
4878   } while (nc != 0);
4879 }
4880 
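// QS8 IGEMM, 3x4 tile: three-row variant of the indirect kernel above.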
4881 void xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64(
4882     size_t mr,
4883     size_t nc,
4884     size_t kc,
4885     size_t ks,
4886     const int8_t** restrict a,
4887     const void* restrict w,
4888     int8_t* restrict c,
4889     size_t cm_stride,
4890     size_t cn_stride,
4891     size_t a_offset,
4892     const int8_t* zero,
4893     const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4894 {
4895   assert(mr != 0);
4896   assert(mr <= 3);
4897   assert(nc != 0);
4898   assert(kc != 0);
4899   assert(ks != 0);
4900   assert(ks % (3 * sizeof(void*)) == 0);
4901   assert(a_offset % sizeof(int8_t) == 0);
4902   assert(a != NULL);
4903   assert(w != NULL);
4904   assert(c != NULL);
4905 
4906   kc = round_up_po2(kc, 8);
4907   int8_t* c0 = c;
4908   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
4909   if XNN_UNPREDICTABLE(mr < 2) {
4910     c1 = c0;
4911   }
4912   int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
4913   if XNN_UNPREDICTABLE(mr <= 2) {
4914     c2 = c1;
4915   }
4916 
4917   do {
4918     __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
4919     __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
4920     __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
4921     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
4922     __m128i vacc1x0 = vacc0x0;
4923     __m128i vacc1x1 = vacc0x1;
4924     __m128i vacc1x2 = vacc0x2;
4925     __m128i vacc1x3 = vacc0x3;
4926     __m128i vacc2x0 = vacc0x0;
4927     __m128i vacc2x1 = vacc0x1;
4928     __m128i vacc2x2 = vacc0x2;
4929     __m128i vacc2x3 = vacc0x3;
4930     w = (const int32_t*) w + 4;
4931 
4932     size_t p = ks;
4933     do {
4934       const int8_t* restrict a0 = a[0];
4935       if XNN_UNPREDICTABLE(a0 != zero) {
4936         a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
4937       }
4938       const int8_t* restrict a1 = a[1];
4939       if XNN_UNPREDICTABLE(a1 != zero) {
4940         a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
4941       }
4942       const int8_t* restrict a2 = a[2];
4943       if XNN_UNPREDICTABLE(a2 != zero) {
4944         a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
4945       }
4946       a += 3;
4947 
4948       size_t k = 0;
4949       while (k < kc) {
4950         const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
4951         const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
4952         a0 += 8;
4953         const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
4954         const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
4955         a1 += 8;
4956         const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
4957         const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
4958         a2 += 8;
4959 
4960         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
4961         const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
4962 
4963         vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
4964         vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
4965         vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
4966         const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 8));
4967         const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
4968 
4969         vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
4970         vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
4971         vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
4972         const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 16));
4973         const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
4974 
4975         vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
4976         vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
4977         vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
4978         const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 24));
4979         const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
4980 
4981         vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
4982         vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
4983         vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
4984 
4985         w = (const void*) ((const int8_t*) w + 32);
4986         k += 8 * sizeof(int8_t);
4987       }
4988       p -= 3 * sizeof(void*);
4989     } while (p != 0);
4990 
4991     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
4992     const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
4993     const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
4994     const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
4995     const __m128i vacc2x01 = _mm_hadd_epi32(vacc2x0, vacc2x1);
4996     const __m128i vacc2x23 = _mm_hadd_epi32(vacc2x2, vacc2x3);
4997 
4998     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
4999     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
5000     __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
5001 
5002     __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
5003     __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
5004     __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
5005 
5006     const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
5007     vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
5008     vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
5009     vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale);
5010 
5011     const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
5012     vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
5013     vscaled1x0123 = _mm_min_ps(vscaled1x0123, voutput_max_less_zero_point);
5014     vscaled2x0123 = _mm_min_ps(vscaled2x0123, voutput_max_less_zero_point);
5015 
5016     vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
5017     vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
5018     vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
5019 
5020     const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
5021     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
5022     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
5023 
5024 
5025     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
5026 
5027     vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
5028 
5029     if (nc >= 4) {
5030       unaligned_store_u32(c2, (uint32_t) _mm_extract_epi32(vout, 2));
5031       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
5032       unaligned_store_u32(c1, (uint32_t) _mm_extract_epi32(vout, 1));
5033       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
5034       unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
5035       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
5036 
5037       a = (const int8_t**restrict) ((uintptr_t) a - ks);
5038 
5039       nc -= 4;
5040     } else {
5041       if (nc & 2) {
5042         unaligned_store_u16(c2, (uint16_t) _mm_extract_epi16(vout, 4));
5043         c2 += 2;
5044         unaligned_store_u16(c1, (uint16_t) _mm_extract_epi16(vout, 2));
5045         c1 += 2;
5046         unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
5047         c0 += 2;
5048         vout = _mm_srli_epi32(vout, 16);
5049       }
5050       if (nc & 1) {
5051         *c2 = (int8_t) _mm_extract_epi8(vout, 8);
5052         *c1 = (int8_t) _mm_extract_epi8(vout, 4);
5053         *c0 = (int8_t) _mm_extract_epi8(vout, 0);
5054       }
5055 
5056       nc = 0;
5057     }
5058   } while (nc != 0);
5059 }
5060 
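// QS8 elementwise addition, 8 elements per step: both operands are rescaled in
// 16-bit arithmetic (mullo/mulhi pairs reassembled into 32-bit products, with a
// correction term for negative inputs because _mm_mulhi_epu16 is unsigned),
// shifted right, re-biased to the output zero point, and clamped to int8.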
5061 void xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8(
5062     size_t n,
5063     const int8_t* input_a,
5064     const int8_t* input_b,
5065     int8_t* output,
5066     const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5067 {
5068   const __m128i vbias = _mm_load_si128((const __m128i*) params->sse4_mul16.bias);
5069   const __m128i va_multiplier_lo = _mm_load_si128((const __m128i*) params->sse4_mul16.a_multiplier_lo);
5070   const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse4_mul16.a_multiplier_hi);
5071   const __m128i vb_multiplier_lo = _mm_load_si128((const __m128i*) params->sse4_mul16.b_multiplier_lo);
5072   const __m128i vb_multiplier_hi = _mm_load_si128((const __m128i*) params->sse4_mul16.b_multiplier_hi);
5073   const __m128i vshift = _mm_cvtsi32_si128((int) params->sse4_mul16.shift);
5074   const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul16.output_zero_point);
5075   const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul16.output_min);
5076   const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4_mul16.output_max);
5077 
5078   for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
5079     const __m128i va01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
5080     const __m128i vb01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_b));
5081     input_a += 8;
5082     input_b += 8;
5083 
5084 
5085     __m128i vaprod01234567hi = _mm_mulhi_epu16(va01234567, va_multiplier_lo);
5086     __m128i vbprod01234567hi = _mm_mulhi_epu16(vb01234567, vb_multiplier_lo);
5087     const __m128i vaprod01234567lo = _mm_mullo_epi16(va01234567, va_multiplier_lo);
5088     const __m128i vbprod01234567lo = _mm_mullo_epi16(vb01234567, vb_multiplier_lo);
5089 
5090     vaprod01234567hi = _mm_add_epi16(vaprod01234567hi, _mm_mullo_epi16(va01234567, va_multiplier_hi));
5091     vbprod01234567hi = _mm_add_epi16(vbprod01234567hi, _mm_mullo_epi16(vb01234567, vb_multiplier_hi));
5092 
5093     vaprod01234567hi = _mm_sub_epi16(vaprod01234567hi, _mm_and_si128(_mm_srai_epi16(va01234567, 15), va_multiplier_lo));
5094     vbprod01234567hi = _mm_sub_epi16(vbprod01234567hi, _mm_and_si128(_mm_srai_epi16(vb01234567, 15), vb_multiplier_lo));
5095 
5096     __m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
5097     __m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
5098 
5099     vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
5100     vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
5101 
5102     vacc0123 = _mm_sra_epi32(vacc0123, vshift);
5103     vacc4567 = _mm_sra_epi32(vacc4567, vshift);
5104 
5105     __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
5106 
5107 
5108     __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
5109 
5110     vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
5111 
5112     vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
5113 
5114     _mm_storel_epi64((__m128i*) output, vout0123456701234567);
5115     output += 8;
5116   }
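  // Remainder of 1-7 elements: compute a full 8-lane result (reading past the
  // end is declared safe by XNN_OOB_READS) and store it piecewise (4/2/1 bytes).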
5117   if XNN_UNLIKELY(n != 0) {
5118     {
5119       const __m128i va01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
5120       const __m128i vb01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_b));
5121 
5122 
5123       __m128i vaprod01234567hi = _mm_mulhi_epu16(va01234567, va_multiplier_lo);
5124       __m128i vbprod01234567hi = _mm_mulhi_epu16(vb01234567, vb_multiplier_lo);
5125       const __m128i vaprod01234567lo = _mm_mullo_epi16(va01234567, va_multiplier_lo);
5126       const __m128i vbprod01234567lo = _mm_mullo_epi16(vb01234567, vb_multiplier_lo);
5127 
5128       vaprod01234567hi = _mm_add_epi16(vaprod01234567hi, _mm_mullo_epi16(va01234567, va_multiplier_hi));
5129       vbprod01234567hi = _mm_add_epi16(vbprod01234567hi, _mm_mullo_epi16(vb01234567, vb_multiplier_hi));
5130 
5131       vaprod01234567hi = _mm_sub_epi16(vaprod01234567hi, _mm_and_si128(_mm_srai_epi16(va01234567, 15), va_multiplier_lo));
5132       vbprod01234567hi = _mm_sub_epi16(vbprod01234567hi, _mm_and_si128(_mm_srai_epi16(vb01234567, 15), vb_multiplier_lo));
5133 
5134       __m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
5135       __m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
5136 
5137       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
5138       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
5139 
5140       vacc0123 = _mm_sra_epi32(vacc0123, vshift);
5141       vacc4567 = _mm_sra_epi32(vacc4567, vshift);
5142 
5143       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
5144 
5145       __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
5146       vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
5147       vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
5148 
5149       if (n & (4 * sizeof(int8_t))) {
5150         unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
5151         vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
5152         output += 4;
5153       }
5154       if (n & (2 * sizeof(int8_t))) {
5155         unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
5156         vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
5157         output += 2;
5158       }
5159       if (n & (1 * sizeof(int8_t))) {
5160         *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
5161       }
5162     }
5163   }
5164 }
5165 
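// Broadcast-addend variant of the addition kernel above: the contribution of
// the scalar *input_b is folded into the bias vector once, before the loop, so
// only the `a` operand is rescaled per iteration.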
5166 void xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8(
5167     size_t n,
5168     const int8_t* input_a,
5169     const int8_t* input_b,
5170     int8_t* output,
5171     const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5172 {
5173   const __m128i vbias = _mm_add_epi32(
5174     _mm_shuffle_epi32(_mm_cvtsi32_si128(params->sse4_mul16.b_multiplier * (int32_t) *input_b), _MM_SHUFFLE(0, 0, 0, 0)),
5175     _mm_load_si128((const __m128i*) params->sse4_mul16.bias));
5176   const __m128i va_multiplier_lo = _mm_load_si128((const __m128i*) params->sse4_mul16.a_multiplier_lo);
5177   const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse4_mul16.a_multiplier_hi);
5178   const __m128i vshift = _mm_cvtsi32_si128((int) params->sse4_mul16.shift);
5179   const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul16.output_zero_point);
5180   const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul16.output_min);
5181   const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4_mul16.output_max);
5182 
5183   for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
5184     const __m128i va01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
5185     input_a += 8;
5186 
5187 
5188     __m128i vaprod01234567hi = _mm_mulhi_epu16(va01234567, va_multiplier_lo);
5189     const __m128i vaprod01234567lo = _mm_mullo_epi16(va01234567, va_multiplier_lo);
5190 
5191     vaprod01234567hi = _mm_add_epi16(vaprod01234567hi, _mm_mullo_epi16(va01234567, va_multiplier_hi));
5192 
5193     vaprod01234567hi = _mm_sub_epi16(vaprod01234567hi, _mm_and_si128(_mm_srai_epi16(va01234567, 15), va_multiplier_lo));
5194 
5195     __m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
5196     __m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
5197 
5198     vacc0123 = _mm_sra_epi32(vacc0123, vshift);
5199     vacc4567 = _mm_sra_epi32(vacc4567, vshift);
5200 
5201     __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
5202 
5203 
5204     __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
5205 
5206     vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
5207 
5208     vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
5209 
5210     _mm_storel_epi64((__m128i*) output, vout0123456701234567);
5211     output += 8;
5212   }
5213   if XNN_UNLIKELY(n != 0) {
5214     {
5215       const __m128i va01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
5216 
5217 
5218       __m128i vaprod01234567hi = _mm_mulhi_epu16(va01234567, va_multiplier_lo);
5219       const __m128i vaprod01234567lo = _mm_mullo_epi16(va01234567, va_multiplier_lo);
5220 
5221       vaprod01234567hi = _mm_add_epi16(vaprod01234567hi, _mm_mullo_epi16(va01234567, va_multiplier_hi));
5222 
5223       vaprod01234567hi = _mm_sub_epi16(vaprod01234567hi, _mm_and_si128(_mm_srai_epi16(va01234567, 15), va_multiplier_lo));
5224 
5225       __m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
5226       __m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
5227 
5228       vacc0123 = _mm_sra_epi32(vacc0123, vshift);
5229       vacc4567 = _mm_sra_epi32(vacc4567, vshift);
5230 
5231       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
5232 
5233       __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
5234       vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
5235       vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
5236 
5237       if (n & (4 * sizeof(int8_t))) {
5238         unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
5239         vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
5240         output += 4;
5241       }
5242       if (n & (2 * sizeof(int8_t))) {
5243         unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
5244         vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
5245         output += 2;
5246       }
5247       if (n & (1 * sizeof(int8_t))) {
5248         *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
5249       }
5250     }
5251   }
5252 }
5253 
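// QS8 requantizing copy, 32 elements per step: each element is re-centered on
// the input zero point, scaled with the rounding Q15 multiply _mm_mulhrs_epi16
// (after a left shift by 7 for precision), and re-biased to the output zero
// point.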
5254 void xnn_qs8_vcvt_ukernel__sse41_x32(
5255     size_t n,
5256     const int8_t* x,
5257     int8_t* y,
5258     const union xnn_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5259 {
5260   assert(n != 0);
5261   assert(n % sizeof(int8_t) == 0);
5262   assert(x != NULL);
5263   assert(y != NULL);
5264 
5265   const __m128i vinput_zero_point = _mm_load_si128((const __m128i*) params->ssse3.input_zero_point);
5266   const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->ssse3.multiplier);
5267   const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->ssse3.output_zero_point);
5268   for (; n >= 32 * sizeof(int8_t); n -= 32 * sizeof(int8_t)) {
5269     __m128i vacc0 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) x));
5270     __m128i vacc1 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (x + 8)));
5271     __m128i vacc2 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (x + 16)));
5272     __m128i vacc3 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (x + 24)));
5273     x += 32;
5274 
5275     vacc0 = _mm_sub_epi16(vinput_zero_point, vacc0);
5276     vacc1 = _mm_sub_epi16(vinput_zero_point, vacc1);
5277     vacc2 = _mm_sub_epi16(vinput_zero_point, vacc2);
5278     vacc3 = _mm_sub_epi16(vinput_zero_point, vacc3);
5279 
5280     vacc0 = _mm_slli_epi16(vacc0, 7);
5281     vacc1 = _mm_slli_epi16(vacc1, 7);
5282     vacc2 = _mm_slli_epi16(vacc2, 7);
5283     vacc3 = _mm_slli_epi16(vacc3, 7);
5284 
5285     vacc0 = _mm_mulhrs_epi16(vacc0, vmultiplier);
5286     vacc1 = _mm_mulhrs_epi16(vacc1, vmultiplier);
5287     vacc2 = _mm_mulhrs_epi16(vacc2, vmultiplier);
5288     vacc3 = _mm_mulhrs_epi16(vacc3, vmultiplier);
5289 
5290     vacc0 = _mm_adds_epi16(vacc0, voutput_zero_point);
5291     vacc1 = _mm_adds_epi16(vacc1, voutput_zero_point);
5292     vacc2 = _mm_adds_epi16(vacc2, voutput_zero_point);
5293     vacc3 = _mm_adds_epi16(vacc3, voutput_zero_point);
5294 
5295     const __m128i vy0 = _mm_packs_epi16(vacc0, vacc1);
5296     const __m128i vy1 = _mm_packs_epi16(vacc2, vacc3);
5297 
5298     _mm_storeu_si128((__m128i*) y, vy0);
5299     _mm_storeu_si128((__m128i*) (y + 16), vy1);
5300     y += 32;
5301   }
5302   for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
5303     __m128i vacc = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) x));
5304     vacc = _mm_sub_epi16(vinput_zero_point, vacc);
5305     vacc = _mm_slli_epi16(vacc, 7);
5306     vacc = _mm_mulhrs_epi16(vacc, vmultiplier);
5307     vacc = _mm_adds_epi16(vacc, voutput_zero_point);
5308     x += 8;
5309 
5310     const __m128i vy = _mm_packs_epi16(vacc, vacc);
5311     _mm_storel_epi64((__m128i*) y, vy);
5312     y += 8;
5313   }
5314   if XNN_UNLIKELY(n != 0) {
5315     assert(n >= 1 * sizeof(int8_t));
5316     assert(n <= 7 * sizeof(int8_t));
5317 
5318     __m128i vacc = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) x));
5319     vacc = _mm_sub_epi16(vinput_zero_point, vacc);
5320     vacc = _mm_slli_epi16(vacc, 7);
5321     vacc = _mm_mulhrs_epi16(vacc, vmultiplier);
5322     vacc = _mm_adds_epi16(vacc, voutput_zero_point);
5323 
5324     __m128i vy = _mm_packs_epi16(vacc, vacc);
5325     if (n & (4 * sizeof(int8_t))) {
5326       _mm_storeu_si32(y, vy);
5327       vy = _mm_srli_epi64(vy, 32);
5328       y += 4;
5329     }
5330     if (n & (2 * sizeof(int8_t))) {
5331       _mm_storeu_si16(y, vy);
5332       vy = _mm_srli_epi32(vy, 16);
5333       y += 2;
5334     }
5335     if (n & (1 * sizeof(int8_t))) {
5336       *y = (int8_t) _mm_extract_epi8(vy, 0);
5337     }
5338   }
5339 }
5340 
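// QS8 leaky ReLU, 32 elements per step: a comparison against the input zero
// point selects one of two Q15 multipliers (multiplier_base, or base XOR diff),
// applying different slopes to the positive and negative halves of the input.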
5341 void xnn_qs8_vlrelu_ukernel__sse41_x32(
5342     size_t n,
5343     const int8_t* x,
5344     int8_t* y,
5345     const union xnn_qs8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5346 {
5347   assert(n != 0);
5348   assert(n % sizeof(int8_t) == 0);
5349   assert(x != NULL);
5350   assert(y != NULL);
5351 
5352   const __m128i vinput_zero_point = _mm_load_si128((const __m128i*) params->sse2.input_zero_point);
5353   const __m128i vmultiplier_diff = _mm_load_si128((const __m128i*) params->sse2.multiplier_diff);
5354   const __m128i vmultiplier_base = _mm_load_si128((const __m128i*) params->sse2.multiplier_base);
5355   const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
5356   for (; n >= 32 * sizeof(int8_t); n -= 32 * sizeof(int8_t)) {
5357     __m128i vacc0 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) x));
5358     __m128i vacc1 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (x + 8)));
5359     __m128i vacc2 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (x + 16)));
5360     __m128i vacc3 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (x + 24)));
5361     x += 32;
5362 
5363     __m128i vmultiplier0 = _mm_cmpgt_epi16(vacc0, vinput_zero_point);
5364     vacc0 = _mm_sub_epi16(vinput_zero_point, vacc0);
5365     __m128i vmultiplier1 = _mm_cmpgt_epi16(vacc1, vinput_zero_point);
5366     vacc1 = _mm_sub_epi16(vinput_zero_point, vacc1);
5367     __m128i vmultiplier2 = _mm_cmpgt_epi16(vacc2, vinput_zero_point);
5368     vacc2 = _mm_sub_epi16(vinput_zero_point, vacc2);
5369     __m128i vmultiplier3 = _mm_cmpgt_epi16(vacc3, vinput_zero_point);
5370     vacc3 = _mm_sub_epi16(vinput_zero_point, vacc3);
5371 
5372     vmultiplier0 = _mm_and_si128(vmultiplier0, vmultiplier_diff);
5373     vacc0 = _mm_slli_epi16(vacc0, 7);
5374     vmultiplier0 = _mm_xor_si128(vmultiplier0, vmultiplier_base);
5375     vmultiplier1 = _mm_and_si128(vmultiplier1, vmultiplier_diff);
5376     vacc1 = _mm_slli_epi16(vacc1, 7);
5377     vmultiplier1 = _mm_xor_si128(vmultiplier1, vmultiplier_base);
5378     vmultiplier2 = _mm_and_si128(vmultiplier2, vmultiplier_diff);
5379     vacc2 = _mm_slli_epi16(vacc2, 7);
5380     vmultiplier2 = _mm_xor_si128(vmultiplier2, vmultiplier_base);
5381     vmultiplier3 = _mm_and_si128(vmultiplier3, vmultiplier_diff);
5382     vacc3 = _mm_slli_epi16(vacc3, 7);
5383     vmultiplier3 = _mm_xor_si128(vmultiplier3, vmultiplier_base);
5384 
5385     vacc0 = _mm_mulhrs_epi16(vacc0, vmultiplier0);
5386     vacc1 = _mm_mulhrs_epi16(vacc1, vmultiplier1);
5387     vacc2 = _mm_mulhrs_epi16(vacc2, vmultiplier2);
5388     vacc3 = _mm_mulhrs_epi16(vacc3, vmultiplier3);
5389 
5390     vacc0 = _mm_adds_epi16(vacc0, voutput_zero_point);
5391     vacc1 = _mm_adds_epi16(vacc1, voutput_zero_point);
5392     vacc2 = _mm_adds_epi16(vacc2, voutput_zero_point);
5393     vacc3 = _mm_adds_epi16(vacc3, voutput_zero_point);
5394 
5395     const __m128i vy0 = _mm_packs_epi16(vacc0, vacc1);
5396     const __m128i vy1 = _mm_packs_epi16(vacc2, vacc3);
5397 
5398     _mm_storeu_si128((__m128i*) y, vy0);
5399     _mm_storeu_si128((__m128i*) (y + 16), vy1);
5400     y += 32;
5401   }
5402   for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
5403     __m128i vacc = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) x));
5404     __m128i vmultiplier = _mm_cmpgt_epi16(vacc, vinput_zero_point);
5405     vacc = _mm_sub_epi16(vinput_zero_point, vacc);
5406     vmultiplier = _mm_and_si128(vmultiplier, vmultiplier_diff);
5407     vacc = _mm_slli_epi16(vacc, 7);
5408     vmultiplier = _mm_xor_si128(vmultiplier, vmultiplier_base);
5409     vacc = _mm_mulhrs_epi16(vacc, vmultiplier);
5410     vacc = _mm_adds_epi16(vacc, voutput_zero_point);
5411     x += 8;
5412 
5413     const __m128i vy = _mm_packs_epi16(vacc, vacc);
5414     _mm_storel_epi64((__m128i*) y, vy);
5415     y += 8;
5416   }
5417   if XNN_UNLIKELY(n != 0) {
5418     assert(n >= 1 * sizeof(int8_t));
5419     assert(n <= 7 * sizeof(int8_t));
5420 
5421     __m128i vacc = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) x));
5422     __m128i vmultiplier = _mm_cmpgt_epi16(vacc, vinput_zero_point);
5423     vacc = _mm_sub_epi16(vinput_zero_point, vacc);
5424     vmultiplier = _mm_and_si128(vmultiplier, vmultiplier_diff);
5425     vacc = _mm_slli_epi16(vacc, 7);
5426     vmultiplier = _mm_xor_si128(vmultiplier, vmultiplier_base);
5427     vacc = _mm_mulhrs_epi16(vacc, vmultiplier);
5428     vacc = _mm_adds_epi16(vacc, voutput_zero_point);
5429 
5430     __m128i vy = _mm_packs_epi16(vacc, vacc);
5431     if (n & (4 * sizeof(int8_t))) {
5432       _mm_storeu_si32(y, vy);
5433       vy = _mm_srli_epi64(vy, 32);
5434       y += 4;
5435     }
5436     if (n & (2 * sizeof(int8_t))) {
5437       _mm_storeu_si16(y, vy);
5438       vy = _mm_srli_epi32(vy, 16);
5439       y += 2;
5440     }
5441     if (n & (1 * sizeof(int8_t))) {
5442       *y = (int8_t) _mm_extract_epi8(vy, 0);
5443     }
5444   }
5445 }
5446 
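// QS8 elementwise multiplication with fp32 requantization, 16 elements per
// step: zero-point-adjusted 16-bit operands are multiplied into 32-bit products
// (mullo/mulhi + unpack), converted to float, scaled, rounded back with
// _mm_cvtps_epi32, then packed and clamped to int8.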
5447 void xnn_qs8_vmul_minmax_fp32_ukernel__sse41_mul16_ld64_x16(
5448     size_t n,
5449     const int8_t* input_a,
5450     const int8_t* input_b,
5451     int8_t* output,
5452     const union xnn_qs8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5453 
5454 {
5455   const __m128i va_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.a_zero_point);
5456   const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.b_zero_point);
5457   const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
5458   const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
5459   const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
5460   const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse4.output_max);
5461 
5462   for (; n >= 16 * sizeof(int8_t); n -= 16 * sizeof(int8_t)) {
5463     const __m128i va01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
5464     const __m128i vb01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_b));
5465     const __m128i va89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (input_a + 8)));
5466     const __m128i vb89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (input_b + 8)));
5467     input_a += 16;
5468     input_b += 16;
5469 
5470 
5471     const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
5472     const __m128i vxb01234567 = _mm_sub_epi16(vb01234567, vb_zero_point);
5473     const __m128i vxa89ABCDEF = _mm_sub_epi16(va89ABCDEF, va_zero_point);
5474     const __m128i vxb89ABCDEF = _mm_sub_epi16(vb89ABCDEF, vb_zero_point);
5475 
5476     const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb01234567);
5477     const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb01234567);
5478     const __m128i vprod89ABCDEFlo = _mm_mullo_epi16(vxa89ABCDEF, vxb89ABCDEF);
5479     const __m128i vprod89ABCDEFhi = _mm_mulhi_epi16(vxa89ABCDEF, vxb89ABCDEF);
5480 
5481     const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
5482     const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
5483     const __m128i vprod89AB = _mm_unpacklo_epi16(vprod89ABCDEFlo, vprod89ABCDEFhi);
5484     const __m128i vprodCDEF = _mm_unpackhi_epi16(vprod89ABCDEFlo, vprod89ABCDEFhi);
5485 
5486     __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
5487     __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
5488     __m128 vfpacc89AB = _mm_cvtepi32_ps(vprod89AB);
5489     __m128 vfpaccCDEF = _mm_cvtepi32_ps(vprodCDEF);
5490 
5491     vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
5492     vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
5493     vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale);
5494     vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale);
5495 
5496     const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
5497     const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
5498     const __m128i vacc89AB = _mm_cvtps_epi32(vfpacc89AB);
5499     const __m128i vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF);
5500 
5501     __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
5502     __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
5503 
5504 
5505     __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
5506 
5507     vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
5508 
5509     vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
5510 
5511     _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
5512     output += 16;
5513   }
5514   if XNN_UNLIKELY(n != 0) {
5515     do {
5516       const __m128i va01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
5517       const __m128i vb01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_b));
5518       input_a += 8;
5519       input_b += 8;
5520 
5521 
5522       const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
5523       const __m128i vxb01234567 = _mm_sub_epi16(vb01234567, vb_zero_point);
5524 
5525       const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb01234567);
5526       const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb01234567);
5527 
5528       const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
5529       const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
5530 
5531       __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
5532       __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
5533 
5534       vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
5535       vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
5536 
5537       const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
5538       const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
5539 
5540       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
5541 
5542       __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
5543       vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
5544       vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
5545 
5546       if XNN_LIKELY(n >= (8 * sizeof(int8_t))) {
5547         _mm_storel_epi64((__m128i*) output, vout0123456701234567);
5548         output += 8;
5549         n -= 8 * sizeof(int8_t);
5550       } else {
5551         if (n & (4 * sizeof(int8_t))) {
5552           unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
5553           vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
5554           output += 4;
5555         }
5556         if (n & (2 * sizeof(int8_t))) {
5557           unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
5558           vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
5559           output += 2;
5560         }
5561         if (n & (1 * sizeof(int8_t))) {
5562           *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
5563         }
5564         n = 0;
5565       }
5566     } while (n != 0);
5567   }
5568 }
5569 
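// Broadcast-multiplicand variant of the multiplication kernel above: the
// scalar *input_b is splatted and zero-point-adjusted once (vxb) before the
// main loop.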
5570 void xnn_qs8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16(
5571     size_t n,
5572     const int8_t* input_a,
5573     const int8_t* input_b,
5574     int8_t* output,
5575     const union xnn_qs8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5576 
5577 {
5578   const __m128i va_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.a_zero_point);
5579   const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
5580   const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
5581   const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
5582   const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse4.output_max);
5583 
5584   __m128i vxb = _mm_sub_epi16(
5585     _mm_shuffle_epi32(_mm_cvtsi32_si128(UINT32_C(0x00010001) * (uint32_t) (uint16_t) (int16_t) *input_b), 0),
5586     _mm_load_si128((const __m128i*) params->fp32_sse4.b_zero_point));
5587   for (; n >= 16 * sizeof(int8_t); n -= 16 * sizeof(int8_t)) {
5588     const __m128i va01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
5589     const __m128i va89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (input_a + 8)));
5590     input_a += 16;
5591 
5592 
5593     const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
5594     const __m128i vxa89ABCDEF = _mm_sub_epi16(va89ABCDEF, va_zero_point);
5595 
5596     const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb);
5597     const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb);
5598     const __m128i vprod89ABCDEFlo = _mm_mullo_epi16(vxa89ABCDEF, vxb);
5599     const __m128i vprod89ABCDEFhi = _mm_mulhi_epi16(vxa89ABCDEF, vxb);
5600 
5601     const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
5602     const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
5603     const __m128i vprod89AB = _mm_unpacklo_epi16(vprod89ABCDEFlo, vprod89ABCDEFhi);
5604     const __m128i vprodCDEF = _mm_unpackhi_epi16(vprod89ABCDEFlo, vprod89ABCDEFhi);
5605 
5606     __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
5607     __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
5608     __m128 vfpacc89AB = _mm_cvtepi32_ps(vprod89AB);
5609     __m128 vfpaccCDEF = _mm_cvtepi32_ps(vprodCDEF);
5610 
5611     vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
5612     vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
5613     vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale);
5614     vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale);
5615 
5616     const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
5617     const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
5618     const __m128i vacc89AB = _mm_cvtps_epi32(vfpacc89AB);
5619     const __m128i vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF);
5620 
5621     __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
5622     __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
5623 
5624 
5625     __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
5626 
5627     vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
5628 
5629     vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
5630 
5631     _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
5632     output += 16;
5633   }
5634   if XNN_UNLIKELY(n != 0) {
5635     do {
5636       const __m128i va01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
5637       input_a += 8;
5638 
5639 
5640       const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
5641 
5642       const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb);
5643       const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb);
5644 
5645       const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
5646       const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
5647 
5648       __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
5649       __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
5650 
5651       vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
5652       vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
5653 
5654       const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
5655       const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
5656 
5657       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
5658 
5659       __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
5660       vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
5661       vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
5662 
5663       if XNN_LIKELY(n >= (8 * sizeof(int8_t))) {
5664         _mm_storel_epi64((__m128i*) output, vout0123456701234567);
5665         output += 8;
5666         n -= 8 * sizeof(int8_t);
5667       } else {
5668         if (n & (4 * sizeof(int8_t))) {
5669           unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
5670           vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
5671           output += 4;
5672         }
5673         if (n & (2 * sizeof(int8_t))) {
5674           unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
5675           vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
5676           output += 2;
5677         }
5678         if (n & (1 * sizeof(int8_t))) {
5679           *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
5680         }
5681         n = 0;
5682       }
5683     } while (n != 0);
5684   }
5685 }
5686 
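// QU8 depthwise convolution, 25 taps (5x5 kernel), up to 8 channels per loop
// iteration: the 25 input rows are fetched through the indirection buffer,
// zero-extended to 16 bits, multiplied by zero-point-adjusted weights, and
// accumulated in 32-bit lanes ahead of fp32 requantization.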
5687 void xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul16(
5688     size_t channels,
5689     size_t output_width,
5690     const uint8_t** input,
5691     const void* weights,
5692     uint8_t* output,
5693     size_t input_stride,
5694     size_t output_increment,
5695     size_t input_offset,
5696     const uint8_t* zero,
5697     const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5698 {
5699   assert(channels != 0);
5700   assert(output_width != 0);
5701 
5702   do {
5703     const uint8_t* i0 = input[0];
5704     assert(i0 != NULL);
5705     if XNN_UNPREDICTABLE(i0 != zero) {
5706       i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
5707     }
5708     const uint8_t* i1 = input[1];
5709     assert(i1 != NULL);
5710     if XNN_UNPREDICTABLE(i1 != zero) {
5711       i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
5712     }
5713     const uint8_t* i2 = input[2];
5714     assert(i2 != NULL);
5715     if XNN_UNPREDICTABLE(i2 != zero) {
5716       i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
5717     }
5718     const uint8_t* i3 = input[3];
5719     assert(i3 != NULL);
5720     if XNN_UNPREDICTABLE(i3 != zero) {
5721       i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
5722     }
5723     const uint8_t* i4 = input[4];
5724     assert(i4 != NULL);
5725     if XNN_UNPREDICTABLE(i4 != zero) {
5726       i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
5727     }
5728     const uint8_t* i5 = input[5];
5729     assert(i5 != NULL);
5730     if XNN_UNPREDICTABLE(i5 != zero) {
5731       i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
5732     }
5733     const uint8_t* i6 = input[6];
5734     assert(i6 != NULL);
5735     if XNN_UNPREDICTABLE(i6 != zero) {
5736       i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
5737     }
5738     const uint8_t* i7 = input[7];
5739     assert(i7 != NULL);
5740     if XNN_UNPREDICTABLE(i7 != zero) {
5741       i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
5742     }
5743     const uint8_t* i8 = input[8];
5744     assert(i8 != NULL);
5745     if XNN_UNPREDICTABLE(i8 != zero) {
5746       i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
5747     }
5748     const uint8_t* i9 = input[9];
5749     assert(i9 != NULL);
5750     if XNN_UNPREDICTABLE(i9 != zero) {
5751       i9 = (const uint8_t*) ((uintptr_t) i9 + input_offset);
5752     }
5753     const uint8_t* i10 = input[10];
5754     assert(i10 != NULL);
5755     if XNN_UNPREDICTABLE(i10 != zero) {
5756       i10 = (const uint8_t*) ((uintptr_t) i10 + input_offset);
5757     }
5758     const uint8_t* i11 = input[11];
5759     assert(i11 != NULL);
5760     if XNN_UNPREDICTABLE(i11 != zero) {
5761       i11 = (const uint8_t*) ((uintptr_t) i11 + input_offset);
5762     }
5763     const uint8_t* i12 = input[12];
5764     assert(i12 != NULL);
5765     if XNN_UNPREDICTABLE(i12 != zero) {
5766       i12 = (const uint8_t*) ((uintptr_t) i12 + input_offset);
5767     }
5768     const uint8_t* i13 = input[13];
5769     assert(i13 != NULL);
5770     if XNN_UNPREDICTABLE(i13 != zero) {
5771       i13 = (const uint8_t*) ((uintptr_t) i13 + input_offset);
5772     }
5773     const uint8_t* i14 = input[14];
5774     assert(i14 != NULL);
5775     if XNN_UNPREDICTABLE(i14 != zero) {
5776       i14 = (const uint8_t*) ((uintptr_t) i14 + input_offset);
5777     }
5778     const uint8_t* i15 = input[15];
5779     assert(i15 != NULL);
5780     if XNN_UNPREDICTABLE(i15 != zero) {
5781       i15 = (const uint8_t*) ((uintptr_t) i15 + input_offset);
5782     }
5783     const uint8_t* i16 = input[16];
5784     assert(i16 != NULL);
5785     if XNN_UNPREDICTABLE(i16 != zero) {
5786       i16 = (const uint8_t*) ((uintptr_t) i16 + input_offset);
5787     }
5788     const uint8_t* i17 = input[17];
5789     assert(i17 != NULL);
5790     if XNN_UNPREDICTABLE(i17 != zero) {
5791       i17 = (const uint8_t*) ((uintptr_t) i17 + input_offset);
5792     }
5793     const uint8_t* i18 = input[18];
5794     assert(i18 != NULL);
5795     if XNN_UNPREDICTABLE(i18 != zero) {
5796       i18 = (const uint8_t*) ((uintptr_t) i18 + input_offset);
5797     }
5798     const uint8_t* i19 = input[19];
5799     assert(i19 != NULL);
5800     if XNN_UNPREDICTABLE(i19 != zero) {
5801       i19 = (const uint8_t*) ((uintptr_t) i19 + input_offset);
5802     }
5803     const uint8_t* i20 = input[20];
5804     assert(i20 != NULL);
5805     if XNN_UNPREDICTABLE(i20 != zero) {
5806       i20 = (const uint8_t*) ((uintptr_t) i20 + input_offset);
5807     }
5808     const uint8_t* i21 = input[21];
5809     assert(i21 != NULL);
5810     if XNN_UNPREDICTABLE(i21 != zero) {
5811       i21 = (const uint8_t*) ((uintptr_t) i21 + input_offset);
5812     }
5813     const uint8_t* i22 = input[22];
5814     assert(i22 != NULL);
5815     if XNN_UNPREDICTABLE(i22 != zero) {
5816       i22 = (const uint8_t*) ((uintptr_t) i22 + input_offset);
5817     }
5818     const uint8_t* i23 = input[23];
5819     assert(i23 != NULL);
5820     if XNN_UNPREDICTABLE(i23 != zero) {
5821       i23 = (const uint8_t*) ((uintptr_t) i23 + input_offset);
5822     }
5823     const uint8_t* i24 = input[24];
5824     assert(i24 != NULL);
5825     if XNN_UNPREDICTABLE(i24 != zero) {
5826       i24 = (const uint8_t*) ((uintptr_t) i24 + input_offset);
5827     }
5828     input = (const uint8_t**) ((uintptr_t) input + input_stride);
5829 
5830     size_t c = channels;
5831     const void* w = weights;
5832     const __m128i vk_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point);
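    // Main loop: 8 channels per iteration across all 25 kernel taps. Inputs
    // and weights are uint8; weights are widened to 16 bits and have the
    // kernel zero point subtracted so the per-tap multiplies are signed.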
5833     for (; c >= 8; c -= 8) {
5834       __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
5835       __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
5836 
5837 
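      // Per-tap pattern: widen 8 input bytes and 8 weight bytes to 16 bits,
      // multiply with _mm_mullo_epi16/_mm_mulhi_epi16, then interleave the
      // low/high halves to recover exact 32-bit products for accumulation.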
5838       const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
5839       const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(vi0x01234567);
5840       const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(uint8_t)));
5841       const __m128i vxk0x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk0x01234567), vk_zero_point);
5842       i0 += 8;
5843 
5844 
5845       const __m128i vprod0x01234567lo = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
5846       const __m128i vprod0x01234567hi = _mm_mulhi_epi16(vxi0x01234567, vxk0x01234567);
5847 
5848       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod0x01234567lo, vprod0x01234567hi));
5849       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod0x01234567lo, vprod0x01234567hi));
5850 
5851       const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
5852       const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(vi1x01234567);
5853       const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(uint8_t)));
5854       const __m128i vxk1x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk1x01234567), vk_zero_point);
5855       i1 += 8;
5856 
5857 
5858       const __m128i vprod1x01234567lo = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567);
5859       const __m128i vprod1x01234567hi = _mm_mulhi_epi16(vxi1x01234567, vxk1x01234567);
5860 
5861       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod1x01234567lo, vprod1x01234567hi));
5862       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod1x01234567lo, vprod1x01234567hi));
5863 
5864       const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
5865       const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(vi2x01234567);
5866       const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(uint8_t)));
5867       const __m128i vxk2x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk2x01234567), vk_zero_point);
5868       i2 += 8;
5869 
5870 
5871       const __m128i vprod2x01234567lo = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
5872       const __m128i vprod2x01234567hi = _mm_mulhi_epi16(vxi2x01234567, vxk2x01234567);
5873 
5874       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod2x01234567lo, vprod2x01234567hi));
5875       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod2x01234567lo, vprod2x01234567hi));
5876 
5877       const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
5878       const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(vi3x01234567);
5879       const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(uint8_t)));
5880       const __m128i vxk3x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk3x01234567), vk_zero_point);
5881       i3 += 8;
5882 
5883 
5884       const __m128i vprod3x01234567lo = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567);
5885       const __m128i vprod3x01234567hi = _mm_mulhi_epi16(vxi3x01234567, vxk3x01234567);
5886 
5887       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod3x01234567lo, vprod3x01234567hi));
5888       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod3x01234567lo, vprod3x01234567hi));
5889 
5890       const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
5891       const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(vi4x01234567);
5892       const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(uint8_t)));
5893       const __m128i vxk4x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk4x01234567), vk_zero_point);
5894       i4 += 8;
5895 
5896 
5897       const __m128i vprod4x01234567lo = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
5898       const __m128i vprod4x01234567hi = _mm_mulhi_epi16(vxi4x01234567, vxk4x01234567);
5899 
5900       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod4x01234567lo, vprod4x01234567hi));
5901       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod4x01234567lo, vprod4x01234567hi));
5902 
5903       const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
5904       const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(vi5x01234567);
5905       const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(uint8_t)));
5906       const __m128i vxk5x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk5x01234567), vk_zero_point);
5907       i5 += 8;
5908 
5909 
5910       const __m128i vprod5x01234567lo = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567);
5911       const __m128i vprod5x01234567hi = _mm_mulhi_epi16(vxi5x01234567, vxk5x01234567);
5912 
5913       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod5x01234567lo, vprod5x01234567hi));
5914       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod5x01234567lo, vprod5x01234567hi));
5915 
5916       const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
5917       const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(vi6x01234567);
5918       const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(uint8_t)));
5919       const __m128i vxk6x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk6x01234567), vk_zero_point);
5920       i6 += 8;
5921 
5922 
5923       const __m128i vprod6x01234567lo = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
5924       const __m128i vprod6x01234567hi = _mm_mulhi_epi16(vxi6x01234567, vxk6x01234567);
5925 
5926       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod6x01234567lo, vprod6x01234567hi));
5927       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod6x01234567lo, vprod6x01234567hi));
5928 
5929       const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
5930       const __m128i vxi7x01234567 = _mm_cvtepu8_epi16(vi7x01234567);
5931       const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(uint8_t)));
5932       const __m128i vxk7x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk7x01234567), vk_zero_point);
5933       i7 += 8;
5934 
5935 
5936       const __m128i vprod7x01234567lo = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567);
5937       const __m128i vprod7x01234567hi = _mm_mulhi_epi16(vxi7x01234567, vxk7x01234567);
5938 
5939       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod7x01234567lo, vprod7x01234567hi));
5940       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod7x01234567lo, vprod7x01234567hi));
5941 
5942       const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
5943       const __m128i vxi8x01234567 = _mm_cvtepu8_epi16(vi8x01234567);
5944       const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(uint8_t)));
5945       const __m128i vxk8x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk8x01234567), vk_zero_point);
5946       i8 += 8;
5947 
5948 
5949       const __m128i vprod8x01234567lo = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
5950       const __m128i vprod8x01234567hi = _mm_mulhi_epi16(vxi8x01234567, vxk8x01234567);
5951 
5952       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod8x01234567lo, vprod8x01234567hi));
5953       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod8x01234567lo, vprod8x01234567hi));
5954 
5955       const __m128i vi9x01234567 = _mm_loadl_epi64((const __m128i*) i9);
5956       const __m128i vxi9x01234567 = _mm_cvtepu8_epi16(vi9x01234567);
5957       const __m128i vk9x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(uint8_t)));
5958       const __m128i vxk9x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk9x01234567), vk_zero_point);
5959       i9 += 8;
5960 
5961 
5962       const __m128i vprod9x01234567lo = _mm_mullo_epi16(vxi9x01234567, vxk9x01234567);
5963       const __m128i vprod9x01234567hi = _mm_mulhi_epi16(vxi9x01234567, vxk9x01234567);
5964 
5965       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod9x01234567lo, vprod9x01234567hi));
5966       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod9x01234567lo, vprod9x01234567hi));
5967 
5968       const __m128i vi10x01234567 = _mm_loadl_epi64((const __m128i*) i10);
5969       const __m128i vxi10x01234567 = _mm_cvtepu8_epi16(vi10x01234567);
5970       const __m128i vk10x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 80 * sizeof(uint8_t)));
5971       const __m128i vxk10x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk10x01234567), vk_zero_point);
5972       i10 += 8;
5973 
5974 
5975       const __m128i vprod10x01234567lo = _mm_mullo_epi16(vxi10x01234567, vxk10x01234567);
5976       const __m128i vprod10x01234567hi = _mm_mulhi_epi16(vxi10x01234567, vxk10x01234567);
5977 
5978       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod10x01234567lo, vprod10x01234567hi));
5979       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod10x01234567lo, vprod10x01234567hi));
5980 
5981       const __m128i vi11x01234567 = _mm_loadl_epi64((const __m128i*) i11);
5982       const __m128i vxi11x01234567 = _mm_cvtepu8_epi16(vi11x01234567);
5983       const __m128i vk11x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 88 * sizeof(uint8_t)));
5984       const __m128i vxk11x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk11x01234567), vk_zero_point);
5985       i11 += 8;
5986 
5987 
5988       const __m128i vprod11x01234567lo = _mm_mullo_epi16(vxi11x01234567, vxk11x01234567);
5989       const __m128i vprod11x01234567hi = _mm_mulhi_epi16(vxi11x01234567, vxk11x01234567);
5990 
5991       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod11x01234567lo, vprod11x01234567hi));
5992       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod11x01234567lo, vprod11x01234567hi));
5993 
5994       const __m128i vi12x01234567 = _mm_loadl_epi64((const __m128i*) i12);
5995       const __m128i vxi12x01234567 = _mm_cvtepu8_epi16(vi12x01234567);
5996       const __m128i vk12x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 96 * sizeof(uint8_t)));
5997       const __m128i vxk12x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk12x01234567), vk_zero_point);
5998       i12 += 8;
5999 
6000 
6001       const __m128i vprod12x01234567lo = _mm_mullo_epi16(vxi12x01234567, vxk12x01234567);
6002       const __m128i vprod12x01234567hi = _mm_mulhi_epi16(vxi12x01234567, vxk12x01234567);
6003 
6004       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod12x01234567lo, vprod12x01234567hi));
6005       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod12x01234567lo, vprod12x01234567hi));
6006 
6007       const __m128i vi13x01234567 = _mm_loadl_epi64((const __m128i*) i13);
6008       const __m128i vxi13x01234567 = _mm_cvtepu8_epi16(vi13x01234567);
6009       const __m128i vk13x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 104 * sizeof(uint8_t)));
6010       const __m128i vxk13x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk13x01234567), vk_zero_point);
6011       i13 += 8;
6012 
6013 
6014       const __m128i vprod13x01234567lo = _mm_mullo_epi16(vxi13x01234567, vxk13x01234567);
6015       const __m128i vprod13x01234567hi = _mm_mulhi_epi16(vxi13x01234567, vxk13x01234567);
6016 
6017       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod13x01234567lo, vprod13x01234567hi));
6018       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod13x01234567lo, vprod13x01234567hi));
6019 
6020       const __m128i vi14x01234567 = _mm_loadl_epi64((const __m128i*) i14);
6021       const __m128i vxi14x01234567 = _mm_cvtepu8_epi16(vi14x01234567);
6022       const __m128i vk14x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 112 * sizeof(uint8_t)));
6023       const __m128i vxk14x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk14x01234567), vk_zero_point);
6024       i14 += 8;
6025 
6026 
6027       const __m128i vprod14x01234567lo = _mm_mullo_epi16(vxi14x01234567, vxk14x01234567);
6028       const __m128i vprod14x01234567hi = _mm_mulhi_epi16(vxi14x01234567, vxk14x01234567);
6029 
6030       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod14x01234567lo, vprod14x01234567hi));
6031       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod14x01234567lo, vprod14x01234567hi));
6032 
6033       const __m128i vi15x01234567 = _mm_loadl_epi64((const __m128i*) i15);
6034       const __m128i vxi15x01234567 = _mm_cvtepu8_epi16(vi15x01234567);
6035       const __m128i vk15x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 120 * sizeof(uint8_t)));
6036       const __m128i vxk15x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk15x01234567), vk_zero_point);
6037       i15 += 8;
6038 
6039 
6040       const __m128i vprod15x01234567lo = _mm_mullo_epi16(vxi15x01234567, vxk15x01234567);
6041       const __m128i vprod15x01234567hi = _mm_mulhi_epi16(vxi15x01234567, vxk15x01234567);
6042 
6043       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod15x01234567lo, vprod15x01234567hi));
6044       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod15x01234567lo, vprod15x01234567hi));
6045 
6046       const __m128i vi16x01234567 = _mm_loadl_epi64((const __m128i*) i16);
6047       const __m128i vxi16x01234567 = _mm_cvtepu8_epi16(vi16x01234567);
6048       const __m128i vk16x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 128 * sizeof(uint8_t)));
6049       const __m128i vxk16x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk16x01234567), vk_zero_point);
6050       i16 += 8;
6051 
6052 
6053       const __m128i vprod16x01234567lo = _mm_mullo_epi16(vxi16x01234567, vxk16x01234567);
6054       const __m128i vprod16x01234567hi = _mm_mulhi_epi16(vxi16x01234567, vxk16x01234567);
6055 
6056       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod16x01234567lo, vprod16x01234567hi));
6057       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod16x01234567lo, vprod16x01234567hi));
6058 
6059       const __m128i vi17x01234567 = _mm_loadl_epi64((const __m128i*) i17);
6060       const __m128i vxi17x01234567 = _mm_cvtepu8_epi16(vi17x01234567);
6061       const __m128i vk17x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 136 * sizeof(uint8_t)));
6062       const __m128i vxk17x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk17x01234567), vk_zero_point);
6063       i17 += 8;
6064 
6065 
6066       const __m128i vprod17x01234567lo = _mm_mullo_epi16(vxi17x01234567, vxk17x01234567);
6067       const __m128i vprod17x01234567hi = _mm_mulhi_epi16(vxi17x01234567, vxk17x01234567);
6068 
6069       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod17x01234567lo, vprod17x01234567hi));
6070       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod17x01234567lo, vprod17x01234567hi));
6071 
6072       const __m128i vi18x01234567 = _mm_loadl_epi64((const __m128i*) i18);
6073       const __m128i vxi18x01234567 = _mm_cvtepu8_epi16(vi18x01234567);
6074       const __m128i vk18x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 144 * sizeof(uint8_t)));
6075       const __m128i vxk18x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk18x01234567), vk_zero_point);
6076       i18 += 8;
6077 
6078 
6079       const __m128i vprod18x01234567lo = _mm_mullo_epi16(vxi18x01234567, vxk18x01234567);
6080       const __m128i vprod18x01234567hi = _mm_mulhi_epi16(vxi18x01234567, vxk18x01234567);
6081 
6082       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod18x01234567lo, vprod18x01234567hi));
6083       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod18x01234567lo, vprod18x01234567hi));
6084 
6085       const __m128i vi19x01234567 = _mm_loadl_epi64((const __m128i*) i19);
6086       const __m128i vxi19x01234567 = _mm_cvtepu8_epi16(vi19x01234567);
6087       const __m128i vk19x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 152 * sizeof(uint8_t)));
6088       const __m128i vxk19x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk19x01234567), vk_zero_point);
6089       i19 += 8;
6090 
6091 
6092       const __m128i vprod19x01234567lo = _mm_mullo_epi16(vxi19x01234567, vxk19x01234567);
6093       const __m128i vprod19x01234567hi = _mm_mulhi_epi16(vxi19x01234567, vxk19x01234567);
6094 
6095       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod19x01234567lo, vprod19x01234567hi));
6096       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod19x01234567lo, vprod19x01234567hi));
6097 
6098       const __m128i vi20x01234567 = _mm_loadl_epi64((const __m128i*) i20);
6099       const __m128i vxi20x01234567 = _mm_cvtepu8_epi16(vi20x01234567);
6100       const __m128i vk20x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 160 * sizeof(uint8_t)));
6101       const __m128i vxk20x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk20x01234567), vk_zero_point);
6102       i20 += 8;
6103 
6104 
6105       const __m128i vprod20x01234567lo = _mm_mullo_epi16(vxi20x01234567, vxk20x01234567);
6106       const __m128i vprod20x01234567hi = _mm_mulhi_epi16(vxi20x01234567, vxk20x01234567);
6107 
6108       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod20x01234567lo, vprod20x01234567hi));
6109       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod20x01234567lo, vprod20x01234567hi));
6110 
6111       const __m128i vi21x01234567 = _mm_loadl_epi64((const __m128i*) i21);
6112       const __m128i vxi21x01234567 = _mm_cvtepu8_epi16(vi21x01234567);
6113       const __m128i vk21x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 168 * sizeof(uint8_t)));
6114       const __m128i vxk21x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk21x01234567), vk_zero_point);
6115       i21 += 8;
6116 
6117 
6118       const __m128i vprod21x01234567lo = _mm_mullo_epi16(vxi21x01234567, vxk21x01234567);
6119       const __m128i vprod21x01234567hi = _mm_mulhi_epi16(vxi21x01234567, vxk21x01234567);
6120 
6121       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod21x01234567lo, vprod21x01234567hi));
6122       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod21x01234567lo, vprod21x01234567hi));
6123 
6124       const __m128i vi22x01234567 = _mm_loadl_epi64((const __m128i*) i22);
6125       const __m128i vxi22x01234567 = _mm_cvtepu8_epi16(vi22x01234567);
6126       const __m128i vk22x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 176 * sizeof(uint8_t)));
6127       const __m128i vxk22x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk22x01234567), vk_zero_point);
6128       i22 += 8;
6129 
6130 
6131       const __m128i vprod22x01234567lo = _mm_mullo_epi16(vxi22x01234567, vxk22x01234567);
6132       const __m128i vprod22x01234567hi = _mm_mulhi_epi16(vxi22x01234567, vxk22x01234567);
6133 
6134       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod22x01234567lo, vprod22x01234567hi));
6135       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod22x01234567lo, vprod22x01234567hi));
6136 
6137       const __m128i vi23x01234567 = _mm_loadl_epi64((const __m128i*) i23);
6138       const __m128i vxi23x01234567 = _mm_cvtepu8_epi16(vi23x01234567);
6139       const __m128i vk23x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 184 * sizeof(uint8_t)));
6140       const __m128i vxk23x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk23x01234567), vk_zero_point);
6141       i23 += 8;
6142 
6143 
6144       const __m128i vprod23x01234567lo = _mm_mullo_epi16(vxi23x01234567, vxk23x01234567);
6145       const __m128i vprod23x01234567hi = _mm_mulhi_epi16(vxi23x01234567, vxk23x01234567);
6146 
6147       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod23x01234567lo, vprod23x01234567hi));
6148       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod23x01234567lo, vprod23x01234567hi));
6149 
6150       const __m128i vi24x01234567 = _mm_loadl_epi64((const __m128i*) i24);
6151       const __m128i vxi24x01234567 = _mm_cvtepu8_epi16(vi24x01234567);
6152       const __m128i vk24x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 192 * sizeof(uint8_t)));
6153       const __m128i vxk24x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk24x01234567), vk_zero_point);
6154       i24 += 8;
6155 
6156 
6157       const __m128i vprod24x01234567lo = _mm_mullo_epi16(vxi24x01234567, vxk24x01234567);
6158       const __m128i vprod24x01234567hi = _mm_mulhi_epi16(vxi24x01234567, vxk24x01234567);
6159 
6160       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod24x01234567lo, vprod24x01234567hi));
6161       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod24x01234567lo, vprod24x01234567hi));
6162 
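      // Advance w past the 8 int32 bias values and 25 taps x 8 uint8 weights.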
6163       w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 200 * sizeof(uint8_t));
6164 
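      // Requantize: scale the int32 accumulators in fp32, clamp against the
      // output max, round back to int32, pack to int16 with saturation, add
      // the output zero point, pack to uint8, and clamp against the output min.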
6165       __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
6166       __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
6167 
6168       const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
6169       vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
6170       vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
6171 
6172       const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
6173       vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
6174       vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
6175 
6176       vacc0123 = _mm_cvtps_epi32(vscaled0123);
6177       vacc4567 = _mm_cvtps_epi32(vscaled4567);
6178 
6179       const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
6180       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
6181 
6182       __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
6183 
6184       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
6185       vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
6186 
6187       _mm_storel_epi64((__m128i*) output, vout0123456701234567);
6188       output += 8;
6189     }
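    // Remainder: up to 7 trailing channels take one pass of the same 25-tap
    // body (without input-pointer increments), then partial stores.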
6190     if XNN_UNLIKELY(c != 0) {
6191       {
6192         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
6193         __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
6194 
6195 
6196         const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
6197         const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(vi0x01234567);
6198         const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(uint8_t)));
6199         const __m128i vxk0x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk0x01234567), vk_zero_point);
6200 
6201 
6202         const __m128i vprod0x01234567lo = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
6203         const __m128i vprod0x01234567hi = _mm_mulhi_epi16(vxi0x01234567, vxk0x01234567);
6204 
6205         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod0x01234567lo, vprod0x01234567hi));
6206         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod0x01234567lo, vprod0x01234567hi));
6207 
6208         const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
6209         const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(vi1x01234567);
6210         const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(uint8_t)));
6211         const __m128i vxk1x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk1x01234567), vk_zero_point);
6212 
6213 
6214         const __m128i vprod1x01234567lo = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567);
6215         const __m128i vprod1x01234567hi = _mm_mulhi_epi16(vxi1x01234567, vxk1x01234567);
6216 
6217         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod1x01234567lo, vprod1x01234567hi));
6218         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod1x01234567lo, vprod1x01234567hi));
6219 
6220         const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
6221         const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(vi2x01234567);
6222         const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(uint8_t)));
6223         const __m128i vxk2x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk2x01234567), vk_zero_point);
6224 
6225 
6226         const __m128i vprod2x01234567lo = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
6227         const __m128i vprod2x01234567hi = _mm_mulhi_epi16(vxi2x01234567, vxk2x01234567);
6228 
6229         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod2x01234567lo, vprod2x01234567hi));
6230         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod2x01234567lo, vprod2x01234567hi));
6231 
6232         const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
6233         const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(vi3x01234567);
6234         const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(uint8_t)));
6235         const __m128i vxk3x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk3x01234567), vk_zero_point);
6236 
6237 
6238         const __m128i vprod3x01234567lo = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567);
6239         const __m128i vprod3x01234567hi = _mm_mulhi_epi16(vxi3x01234567, vxk3x01234567);
6240 
6241         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod3x01234567lo, vprod3x01234567hi));
6242         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod3x01234567lo, vprod3x01234567hi));
6243 
6244         const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
6245         const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(vi4x01234567);
6246         const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(uint8_t)));
6247         const __m128i vxk4x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk4x01234567), vk_zero_point);
6248 
6249 
6250         const __m128i vprod4x01234567lo = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
6251         const __m128i vprod4x01234567hi = _mm_mulhi_epi16(vxi4x01234567, vxk4x01234567);
6252 
6253         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod4x01234567lo, vprod4x01234567hi));
6254         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod4x01234567lo, vprod4x01234567hi));
6255 
6256         const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
6257         const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(vi5x01234567);
6258         const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(uint8_t)));
6259         const __m128i vxk5x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk5x01234567), vk_zero_point);
6260 
6261 
6262         const __m128i vprod5x01234567lo = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567);
6263         const __m128i vprod5x01234567hi = _mm_mulhi_epi16(vxi5x01234567, vxk5x01234567);
6264 
6265         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod5x01234567lo, vprod5x01234567hi));
6266         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod5x01234567lo, vprod5x01234567hi));
6267 
6268         const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
6269         const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(vi6x01234567);
6270         const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(uint8_t)));
6271         const __m128i vxk6x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk6x01234567), vk_zero_point);
6272 
6273 
6274         const __m128i vprod6x01234567lo = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
6275         const __m128i vprod6x01234567hi = _mm_mulhi_epi16(vxi6x01234567, vxk6x01234567);
6276 
6277         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod6x01234567lo, vprod6x01234567hi));
6278         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod6x01234567lo, vprod6x01234567hi));
6279 
6280         const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
6281         const __m128i vxi7x01234567 = _mm_cvtepu8_epi16(vi7x01234567);
6282         const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(uint8_t)));
6283         const __m128i vxk7x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk7x01234567), vk_zero_point);
6284 
6285 
6286         const __m128i vprod7x01234567lo = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567);
6287         const __m128i vprod7x01234567hi = _mm_mulhi_epi16(vxi7x01234567, vxk7x01234567);
6288 
6289         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod7x01234567lo, vprod7x01234567hi));
6290         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod7x01234567lo, vprod7x01234567hi));
6291 
6292         const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
6293         const __m128i vxi8x01234567 = _mm_cvtepu8_epi16(vi8x01234567);
6294         const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(uint8_t)));
6295         const __m128i vxk8x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk8x01234567), vk_zero_point);
6296 
6297 
6298         const __m128i vprod8x01234567lo = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
6299         const __m128i vprod8x01234567hi = _mm_mulhi_epi16(vxi8x01234567, vxk8x01234567);
6300 
6301         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod8x01234567lo, vprod8x01234567hi));
6302         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod8x01234567lo, vprod8x01234567hi));
6303 
6304         const __m128i vi9x01234567 = _mm_loadl_epi64((const __m128i*) i9);
6305         const __m128i vxi9x01234567 = _mm_cvtepu8_epi16(vi9x01234567);
6306         const __m128i vk9x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(uint8_t)));
6307         const __m128i vxk9x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk9x01234567), vk_zero_point);
6308 
6309 
6310         const __m128i vprod9x01234567lo = _mm_mullo_epi16(vxi9x01234567, vxk9x01234567);
6311         const __m128i vprod9x01234567hi = _mm_mulhi_epi16(vxi9x01234567, vxk9x01234567);
6312 
6313         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod9x01234567lo, vprod9x01234567hi));
6314         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod9x01234567lo, vprod9x01234567hi));
6315 
6316         const __m128i vi10x01234567 = _mm_loadl_epi64((const __m128i*) i10);
6317         const __m128i vxi10x01234567 = _mm_cvtepu8_epi16(vi10x01234567);
6318         const __m128i vk10x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 80 * sizeof(uint8_t)));
6319         const __m128i vxk10x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk10x01234567), vk_zero_point);
6320 
6321 
6322         const __m128i vprod10x01234567lo = _mm_mullo_epi16(vxi10x01234567, vxk10x01234567);
6323         const __m128i vprod10x01234567hi = _mm_mulhi_epi16(vxi10x01234567, vxk10x01234567);
6324 
6325         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod10x01234567lo, vprod10x01234567hi));
6326         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod10x01234567lo, vprod10x01234567hi));
6327 
6328         const __m128i vi11x01234567 = _mm_loadl_epi64((const __m128i*) i11);
6329         const __m128i vxi11x01234567 = _mm_cvtepu8_epi16(vi11x01234567);
6330         const __m128i vk11x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 88 * sizeof(uint8_t)));
6331         const __m128i vxk11x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk11x01234567), vk_zero_point);
6332 
6333 
6334         const __m128i vprod11x01234567lo = _mm_mullo_epi16(vxi11x01234567, vxk11x01234567);
6335         const __m128i vprod11x01234567hi = _mm_mulhi_epi16(vxi11x01234567, vxk11x01234567);
6336 
6337         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod11x01234567lo, vprod11x01234567hi));
6338         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod11x01234567lo, vprod11x01234567hi));
6339 
6340         const __m128i vi12x01234567 = _mm_loadl_epi64((const __m128i*) i12);
6341         const __m128i vxi12x01234567 = _mm_cvtepu8_epi16(vi12x01234567);
6342         const __m128i vk12x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 96 * sizeof(uint8_t)));
6343         const __m128i vxk12x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk12x01234567), vk_zero_point);
6344 
6345 
6346         const __m128i vprod12x01234567lo = _mm_mullo_epi16(vxi12x01234567, vxk12x01234567);
6347         const __m128i vprod12x01234567hi = _mm_mulhi_epi16(vxi12x01234567, vxk12x01234567);
6348 
6349         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod12x01234567lo, vprod12x01234567hi));
6350         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod12x01234567lo, vprod12x01234567hi));
6351 
6352         const __m128i vi13x01234567 = _mm_loadl_epi64((const __m128i*) i13);
6353         const __m128i vxi13x01234567 = _mm_cvtepu8_epi16(vi13x01234567);
6354         const __m128i vk13x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 104 * sizeof(uint8_t)));
6355         const __m128i vxk13x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk13x01234567), vk_zero_point);
6356 
6357 
6358         const __m128i vprod13x01234567lo = _mm_mullo_epi16(vxi13x01234567, vxk13x01234567);
6359         const __m128i vprod13x01234567hi = _mm_mulhi_epi16(vxi13x01234567, vxk13x01234567);
6360 
6361         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod13x01234567lo, vprod13x01234567hi));
6362         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod13x01234567lo, vprod13x01234567hi));
6363 
6364         const __m128i vi14x01234567 = _mm_loadl_epi64((const __m128i*) i14);
6365         const __m128i vxi14x01234567 = _mm_cvtepu8_epi16(vi14x01234567);
6366         const __m128i vk14x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 112 * sizeof(uint8_t)));
6367         const __m128i vxk14x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk14x01234567), vk_zero_point);
6368 
6369 
6370         const __m128i vprod14x01234567lo = _mm_mullo_epi16(vxi14x01234567, vxk14x01234567);
6371         const __m128i vprod14x01234567hi = _mm_mulhi_epi16(vxi14x01234567, vxk14x01234567);
6372 
6373         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod14x01234567lo, vprod14x01234567hi));
6374         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod14x01234567lo, vprod14x01234567hi));
6375 
6376         const __m128i vi15x01234567 = _mm_loadl_epi64((const __m128i*) i15);
6377         const __m128i vxi15x01234567 = _mm_cvtepu8_epi16(vi15x01234567);
6378         const __m128i vk15x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 120 * sizeof(uint8_t)));
6379         const __m128i vxk15x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk15x01234567), vk_zero_point);
6380 
6381 
6382         const __m128i vprod15x01234567lo = _mm_mullo_epi16(vxi15x01234567, vxk15x01234567);
6383         const __m128i vprod15x01234567hi = _mm_mulhi_epi16(vxi15x01234567, vxk15x01234567);
6384 
6385         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod15x01234567lo, vprod15x01234567hi));
6386         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod15x01234567lo, vprod15x01234567hi));
6387 
6388         const __m128i vi16x01234567 = _mm_loadl_epi64((const __m128i*) i16);
6389         const __m128i vxi16x01234567 = _mm_cvtepu8_epi16(vi16x01234567);
6390         const __m128i vk16x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 128 * sizeof(uint8_t)));
6391         const __m128i vxk16x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk16x01234567), vk_zero_point);
6392 
6393 
6394         const __m128i vprod16x01234567lo = _mm_mullo_epi16(vxi16x01234567, vxk16x01234567);
6395         const __m128i vprod16x01234567hi = _mm_mulhi_epi16(vxi16x01234567, vxk16x01234567);
6396 
6397         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod16x01234567lo, vprod16x01234567hi));
6398         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod16x01234567lo, vprod16x01234567hi));
6399 
6400         const __m128i vi17x01234567 = _mm_loadl_epi64((const __m128i*) i17);
6401         const __m128i vxi17x01234567 = _mm_cvtepu8_epi16(vi17x01234567);
6402         const __m128i vk17x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 136 * sizeof(uint8_t)));
6403         const __m128i vxk17x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk17x01234567), vk_zero_point);
6404 
6405 
6406         const __m128i vprod17x01234567lo = _mm_mullo_epi16(vxi17x01234567, vxk17x01234567);
6407         const __m128i vprod17x01234567hi = _mm_mulhi_epi16(vxi17x01234567, vxk17x01234567);
6408 
6409         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod17x01234567lo, vprod17x01234567hi));
6410         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod17x01234567lo, vprod17x01234567hi));
6411 
6412         const __m128i vi18x01234567 = _mm_loadl_epi64((const __m128i*) i18);
6413         const __m128i vxi18x01234567 = _mm_cvtepu8_epi16(vi18x01234567);
6414         const __m128i vk18x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 144 * sizeof(uint8_t)));
6415         const __m128i vxk18x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk18x01234567), vk_zero_point);
6416 
6417 
6418         const __m128i vprod18x01234567lo = _mm_mullo_epi16(vxi18x01234567, vxk18x01234567);
6419         const __m128i vprod18x01234567hi = _mm_mulhi_epi16(vxi18x01234567, vxk18x01234567);
6420 
6421         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod18x01234567lo, vprod18x01234567hi));
6422         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod18x01234567lo, vprod18x01234567hi));
6423 
6424         const __m128i vi19x01234567 = _mm_loadl_epi64((const __m128i*) i19);
6425         const __m128i vxi19x01234567 = _mm_cvtepu8_epi16(vi19x01234567);
6426         const __m128i vk19x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 152 * sizeof(uint8_t)));
6427         const __m128i vxk19x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk19x01234567), vk_zero_point);
6428 
6429 
6430         const __m128i vprod19x01234567lo = _mm_mullo_epi16(vxi19x01234567, vxk19x01234567);
6431         const __m128i vprod19x01234567hi = _mm_mulhi_epi16(vxi19x01234567, vxk19x01234567);
6432 
6433         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod19x01234567lo, vprod19x01234567hi));
6434         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod19x01234567lo, vprod19x01234567hi));
6435 
6436         const __m128i vi20x01234567 = _mm_loadl_epi64((const __m128i*) i20);
6437         const __m128i vxi20x01234567 = _mm_cvtepu8_epi16(vi20x01234567);
6438         const __m128i vk20x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 160 * sizeof(uint8_t)));
6439         const __m128i vxk20x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk20x01234567), vk_zero_point);
6440 
6441 
6442         const __m128i vprod20x01234567lo = _mm_mullo_epi16(vxi20x01234567, vxk20x01234567);
6443         const __m128i vprod20x01234567hi = _mm_mulhi_epi16(vxi20x01234567, vxk20x01234567);
6444 
6445         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod20x01234567lo, vprod20x01234567hi));
6446         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod20x01234567lo, vprod20x01234567hi));
6447 
6448         const __m128i vi21x01234567 = _mm_loadl_epi64((const __m128i*) i21);
6449         const __m128i vxi21x01234567 = _mm_cvtepu8_epi16(vi21x01234567);
6450         const __m128i vk21x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 168 * sizeof(uint8_t)));
6451         const __m128i vxk21x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk21x01234567), vk_zero_point);
6452 
6453 
6454         const __m128i vprod21x01234567lo = _mm_mullo_epi16(vxi21x01234567, vxk21x01234567);
6455         const __m128i vprod21x01234567hi = _mm_mulhi_epi16(vxi21x01234567, vxk21x01234567);
6456 
6457         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod21x01234567lo, vprod21x01234567hi));
6458         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod21x01234567lo, vprod21x01234567hi));
6459 
6460         const __m128i vi22x01234567 = _mm_loadl_epi64((const __m128i*) i22);
6461         const __m128i vxi22x01234567 = _mm_cvtepu8_epi16(vi22x01234567);
6462         const __m128i vk22x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 176 * sizeof(uint8_t)));
6463         const __m128i vxk22x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk22x01234567), vk_zero_point);
6464 
6465 
6466         const __m128i vprod22x01234567lo = _mm_mullo_epi16(vxi22x01234567, vxk22x01234567);
6467         const __m128i vprod22x01234567hi = _mm_mulhi_epi16(vxi22x01234567, vxk22x01234567);
6468 
6469         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod22x01234567lo, vprod22x01234567hi));
6470         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod22x01234567lo, vprod22x01234567hi));
6471 
6472         const __m128i vi23x01234567 = _mm_loadl_epi64((const __m128i*) i23);
6473         const __m128i vxi23x01234567 = _mm_cvtepu8_epi16(vi23x01234567);
6474         const __m128i vk23x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 184 * sizeof(uint8_t)));
6475         const __m128i vxk23x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk23x01234567), vk_zero_point);
6476 
6477 
6478         const __m128i vprod23x01234567lo = _mm_mullo_epi16(vxi23x01234567, vxk23x01234567);
6479         const __m128i vprod23x01234567hi = _mm_mulhi_epi16(vxi23x01234567, vxk23x01234567);
6480 
6481         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod23x01234567lo, vprod23x01234567hi));
6482         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod23x01234567lo, vprod23x01234567hi));
6483 
6484         const __m128i vi24x01234567 = _mm_loadl_epi64((const __m128i*) i24);
6485         const __m128i vxi24x01234567 = _mm_cvtepu8_epi16(vi24x01234567);
6486         const __m128i vk24x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 192 * sizeof(uint8_t)));
6487         const __m128i vxk24x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk24x01234567), vk_zero_point);
6488 
6489 
6490         const __m128i vprod24x01234567lo = _mm_mullo_epi16(vxi24x01234567, vxk24x01234567);
6491         const __m128i vprod24x01234567hi = _mm_mulhi_epi16(vxi24x01234567, vxk24x01234567);
6492 
6493         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod24x01234567lo, vprod24x01234567hi));
6494         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod24x01234567lo, vprod24x01234567hi));
6495 
6496 
6497         __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
6498         __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
6499 
6500         const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
6501         vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
6502         vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
6503 
6504         const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
6505         vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
6506         vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
6507 
6508         vacc0123 = _mm_cvtps_epi32(vscaled0123);
6509         vacc4567 = _mm_cvtps_epi32(vscaled4567);
6510 
6511 
6512         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
6513         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
6514 
6515         __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
6516 
6517         vout0123456701234567 = _mm_max_epu8(vout0123456701234567, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
6518 
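        // Store the low c bytes of the packed result: 4, then 2, then 1.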
6519         if (c & 4) {
6520           unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
6521           vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
6522           output += 4;
6523         }
6524         if (c & 2) {
6525           unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
6526           vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
6527           output += 2;
6528         }
6529         if (c & 1) {
6530           *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
6531           output += 1;
6532         }
6533       }
6534     }
6535 
6536     output = (uint8_t*) ((uintptr_t) output + output_increment);
6537   } while (--output_width != 0);
6538 }
6539 
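// 9-tap variant of the QU8 depthwise convolution above: identical structure,
// with pointer setup and weight layout sized for 9 taps instead of 25.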
6540 void xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16(
6541     size_t channels,
6542     size_t output_width,
6543     const uint8_t** input,
6544     const void* weights,
6545     uint8_t* output,
6546     size_t input_stride,
6547     size_t output_increment,
6548     size_t input_offset,
6549     const uint8_t* zero,
6550     const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6551 {
6552   assert(channels != 0);
6553   assert(output_width != 0);
6554 
6555   do {
6556     const uint8_t* i0 = input[0];
6557     assert(i0 != NULL);
6558     if XNN_UNPREDICTABLE(i0 != zero) {
6559       i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
6560     }
6561     const uint8_t* i1 = input[1];
6562     assert(i1 != NULL);
6563     if XNN_UNPREDICTABLE(i1 != zero) {
6564       i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
6565     }
6566     const uint8_t* i2 = input[2];
6567     assert(i2 != NULL);
6568     if XNN_UNPREDICTABLE(i2 != zero) {
6569       i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
6570     }
6571     const uint8_t* i3 = input[3];
6572     assert(i3 != NULL);
6573     if XNN_UNPREDICTABLE(i3 != zero) {
6574       i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
6575     }
6576     const uint8_t* i4 = input[4];
6577     assert(i4 != NULL);
6578     if XNN_UNPREDICTABLE(i4 != zero) {
6579       i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
6580     }
6581     const uint8_t* i5 = input[5];
6582     assert(i5 != NULL);
6583     if XNN_UNPREDICTABLE(i5 != zero) {
6584       i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
6585     }
6586     const uint8_t* i6 = input[6];
6587     assert(i6 != NULL);
6588     if XNN_UNPREDICTABLE(i6 != zero) {
6589       i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
6590     }
6591     const uint8_t* i7 = input[7];
6592     assert(i7 != NULL);
6593     if XNN_UNPREDICTABLE(i7 != zero) {
6594       i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
6595     }
6596     const uint8_t* i8 = input[8];
6597     assert(i8 != NULL);
6598     if XNN_UNPREDICTABLE(i8 != zero) {
6599       i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
6600     }
6601     input = (const uint8_t**) ((uintptr_t) input + input_stride);
6602 
6603     size_t c = channels;
6604     const void* w = weights;
6605     const __m128i vk_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point);
6606     for (; c >= 8; c -= 8) {
6607       __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
6608       __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
6609 
6610 
6611       const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
6612       const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(vi0x01234567);
6613       const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(uint8_t)));
6614       const __m128i vxk0x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk0x01234567), vk_zero_point);
6615       i0 += 8;
6616 
6617 
6618       const __m128i vprod0x01234567lo = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
6619       const __m128i vprod0x01234567hi = _mm_mulhi_epi16(vxi0x01234567, vxk0x01234567);
6620 
6621       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod0x01234567lo, vprod0x01234567hi));
6622       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod0x01234567lo, vprod0x01234567hi));
6623 
6624       const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
6625       const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(vi1x01234567);
6626       const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(uint8_t)));
6627       const __m128i vxk1x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk1x01234567), vk_zero_point);
6628       i1 += 8;
6629 
6630 
6631       const __m128i vprod1x01234567lo = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567);
6632       const __m128i vprod1x01234567hi = _mm_mulhi_epi16(vxi1x01234567, vxk1x01234567);
6633 
6634       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod1x01234567lo, vprod1x01234567hi));
6635       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod1x01234567lo, vprod1x01234567hi));
6636 
6637       const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
6638       const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(vi2x01234567);
6639       const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(uint8_t)));
6640       const __m128i vxk2x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk2x01234567), vk_zero_point);
6641       i2 += 8;
6642 
6643 
6644       const __m128i vprod2x01234567lo = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
6645       const __m128i vprod2x01234567hi = _mm_mulhi_epi16(vxi2x01234567, vxk2x01234567);
6646 
6647       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod2x01234567lo, vprod2x01234567hi));
6648       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod2x01234567lo, vprod2x01234567hi));
6649 
6650       const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
6651       const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(vi3x01234567);
6652       const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(uint8_t)));
6653       const __m128i vxk3x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk3x01234567), vk_zero_point);
6654       i3 += 8;
6655 
6656 
6657       const __m128i vprod3x01234567lo = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567);
6658       const __m128i vprod3x01234567hi = _mm_mulhi_epi16(vxi3x01234567, vxk3x01234567);
6659 
6660       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod3x01234567lo, vprod3x01234567hi));
6661       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod3x01234567lo, vprod3x01234567hi));
6662 
6663       const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
6664       const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(vi4x01234567);
6665       const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(uint8_t)));
6666       const __m128i vxk4x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk4x01234567), vk_zero_point);
6667       i4 += 8;
6668 
6669 
6670       const __m128i vprod4x01234567lo = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
6671       const __m128i vprod4x01234567hi = _mm_mulhi_epi16(vxi4x01234567, vxk4x01234567);
6672 
6673       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod4x01234567lo, vprod4x01234567hi));
6674       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod4x01234567lo, vprod4x01234567hi));
6675 
6676       const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
6677       const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(vi5x01234567);
6678       const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(uint8_t)));
6679       const __m128i vxk5x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk5x01234567), vk_zero_point);
6680       i5 += 8;
6681 
6682 
6683       const __m128i vprod5x01234567lo = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567);
6684       const __m128i vprod5x01234567hi = _mm_mulhi_epi16(vxi5x01234567, vxk5x01234567);
6685 
6686       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod5x01234567lo, vprod5x01234567hi));
6687       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod5x01234567lo, vprod5x01234567hi));
6688 
6689       const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
6690       const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(vi6x01234567);
6691       const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(uint8_t)));
6692       const __m128i vxk6x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk6x01234567), vk_zero_point);
6693       i6 += 8;
6694 
6695 
6696       const __m128i vprod6x01234567lo = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
6697       const __m128i vprod6x01234567hi = _mm_mulhi_epi16(vxi6x01234567, vxk6x01234567);
6698 
6699       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod6x01234567lo, vprod6x01234567hi));
6700       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod6x01234567lo, vprod6x01234567hi));
6701 
6702       const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
6703       const __m128i vxi7x01234567 = _mm_cvtepu8_epi16(vi7x01234567);
6704       const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(uint8_t)));
6705       const __m128i vxk7x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk7x01234567), vk_zero_point);
6706       i7 += 8;
6707 
6708 
6709       const __m128i vprod7x01234567lo = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567);
6710       const __m128i vprod7x01234567hi = _mm_mulhi_epi16(vxi7x01234567, vxk7x01234567);
6711 
6712       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod7x01234567lo, vprod7x01234567hi));
6713       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod7x01234567lo, vprod7x01234567hi));
6714 
6715       const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
6716       const __m128i vxi8x01234567 = _mm_cvtepu8_epi16(vi8x01234567);
6717       const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(uint8_t)));
6718       const __m128i vxk8x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk8x01234567), vk_zero_point);
6719       i8 += 8;
6720 
6721 
6722       const __m128i vprod8x01234567lo = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
6723       const __m128i vprod8x01234567hi = _mm_mulhi_epi16(vxi8x01234567, vxk8x01234567);
6724 
6725       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod8x01234567lo, vprod8x01234567hi));
6726       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod8x01234567lo, vprod8x01234567hi));
6727 
6728       w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(uint8_t));
6729 
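      // Requantize: convert the int32 accumulators to float, apply the
      // per-tensor scale, clamp against (output_max - zero_point), convert
      // back with CVTPS2DQ (round-to-nearest-even), add the output zero
      // point, and saturate down to uint8.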
6730       __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
6731       __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
6732 
6733       const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
6734       vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
6735       vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
6736 
6737       const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
6738       vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
6739       vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
6740 
6741       vacc0123 = _mm_cvtps_epi32(vscaled0123);
6742       vacc4567 = _mm_cvtps_epi32(vscaled4567);
6743 
6744       const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
6745       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
6746 
6747       __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
6748 
6749       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
6750       vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
6751 
6752       _mm_storel_epi64((__m128i*) output, vout0123456701234567);
6753       output += 8;
6754     }
6755     if XNN_UNLIKELY(c != 0) {
6756       {
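        // Channel remainder: 1-7 channels are left. The 8-lane loads below
        // still read a full 8 bytes, while the tail stores at the end write
        // only the low c bytes of the result.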
6757         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
6758         __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
6759 
6760 
6761         const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
6762         const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(vi0x01234567);
6763         const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(uint8_t)));
6764         const __m128i vxk0x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk0x01234567), vk_zero_point);
6765 
6766 
6767         const __m128i vprod0x01234567lo = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
6768         const __m128i vprod0x01234567hi = _mm_mulhi_epi16(vxi0x01234567, vxk0x01234567);
6769 
6770         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod0x01234567lo, vprod0x01234567hi));
6771         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod0x01234567lo, vprod0x01234567hi));
6772 
6773         const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
6774         const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(vi1x01234567);
6775         const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(uint8_t)));
6776         const __m128i vxk1x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk1x01234567), vk_zero_point);
6777 
6778 
6779         const __m128i vprod1x01234567lo = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567);
6780         const __m128i vprod1x01234567hi = _mm_mulhi_epi16(vxi1x01234567, vxk1x01234567);
6781 
6782         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod1x01234567lo, vprod1x01234567hi));
6783         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod1x01234567lo, vprod1x01234567hi));
6784 
6785         const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
6786         const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(vi2x01234567);
6787         const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(uint8_t)));
6788         const __m128i vxk2x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk2x01234567), vk_zero_point);
6789 
6790 
6791         const __m128i vprod2x01234567lo = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
6792         const __m128i vprod2x01234567hi = _mm_mulhi_epi16(vxi2x01234567, vxk2x01234567);
6793 
6794         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod2x01234567lo, vprod2x01234567hi));
6795         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod2x01234567lo, vprod2x01234567hi));
6796 
6797         const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
6798         const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(vi3x01234567);
6799         const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(uint8_t)));
6800         const __m128i vxk3x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk3x01234567), vk_zero_point);
6801 
6802 
6803         const __m128i vprod3x01234567lo = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567);
6804         const __m128i vprod3x01234567hi = _mm_mulhi_epi16(vxi3x01234567, vxk3x01234567);
6805 
6806         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod3x01234567lo, vprod3x01234567hi));
6807         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod3x01234567lo, vprod3x01234567hi));
6808 
6809         const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
6810         const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(vi4x01234567);
6811         const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(uint8_t)));
6812         const __m128i vxk4x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk4x01234567), vk_zero_point);
6813 
6814 
6815         const __m128i vprod4x01234567lo = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
6816         const __m128i vprod4x01234567hi = _mm_mulhi_epi16(vxi4x01234567, vxk4x01234567);
6817 
6818         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod4x01234567lo, vprod4x01234567hi));
6819         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod4x01234567lo, vprod4x01234567hi));
6820 
6821         const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
6822         const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(vi5x01234567);
6823         const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(uint8_t)));
6824         const __m128i vxk5x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk5x01234567), vk_zero_point);
6825 
6826 
6827         const __m128i vprod5x01234567lo = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567);
6828         const __m128i vprod5x01234567hi = _mm_mulhi_epi16(vxi5x01234567, vxk5x01234567);
6829 
6830         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod5x01234567lo, vprod5x01234567hi));
6831         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod5x01234567lo, vprod5x01234567hi));
6832 
6833         const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
6834         const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(vi6x01234567);
6835         const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(uint8_t)));
6836         const __m128i vxk6x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk6x01234567), vk_zero_point);
6837 
6838 
6839         const __m128i vprod6x01234567lo = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
6840         const __m128i vprod6x01234567hi = _mm_mulhi_epi16(vxi6x01234567, vxk6x01234567);
6841 
6842         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod6x01234567lo, vprod6x01234567hi));
6843         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod6x01234567lo, vprod6x01234567hi));
6844 
6845         const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
6846         const __m128i vxi7x01234567 = _mm_cvtepu8_epi16(vi7x01234567);
6847         const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(uint8_t)));
6848         const __m128i vxk7x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk7x01234567), vk_zero_point);
6849 
6850 
6851         const __m128i vprod7x01234567lo = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567);
6852         const __m128i vprod7x01234567hi = _mm_mulhi_epi16(vxi7x01234567, vxk7x01234567);
6853 
6854         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod7x01234567lo, vprod7x01234567hi));
6855         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod7x01234567lo, vprod7x01234567hi));
6856 
6857         const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
6858         const __m128i vxi8x01234567 = _mm_cvtepu8_epi16(vi8x01234567);
6859         const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(uint8_t)));
6860         const __m128i vxk8x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk8x01234567), vk_zero_point);
6861 
6862 
6863         const __m128i vprod8x01234567lo = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
6864         const __m128i vprod8x01234567hi = _mm_mulhi_epi16(vxi8x01234567, vxk8x01234567);
6865 
6866         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod8x01234567lo, vprod8x01234567hi));
6867         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod8x01234567lo, vprod8x01234567hi));
6868 
6869 
6870         __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
6871         __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
6872 
6873         const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
6874         vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
6875         vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
6876 
6877         const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
6878         vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
6879         vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
6880 
6881         vacc0123 = _mm_cvtps_epi32(vscaled0123);
6882         vacc4567 = _mm_cvtps_epi32(vscaled4567);
6883 
6884 
6885         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
6886         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
6887 
6888         __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
6889 
6890         vout0123456701234567 = _mm_max_epu8(vout0123456701234567, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
6891 
6892         if (c & 4) {
6893           unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
6894           vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
6895           output += 4;
6896         }
6897         if (c & 2) {
6898           unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
6899           vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
6900           output += 2;
6901         }
6902         if (c & 1) {
6903           *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
6904           output += 1;
6905         }
6906       }
6907     }
6908 
6909     output = (uint8_t*) ((uintptr_t) output + output_increment);
6910   } while (--output_width != 0);
6911 }
6912 
6913 void xnn_qu8_f32_vcvt_ukernel__sse41_x16(
6914     size_t n,
6915     const uint8_t* x,
6916     float* y,
6917     const union xnn_qu8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6918 {
6919   assert(n != 0);
6920   assert(n % sizeof(uint8_t) == 0);
6921   assert(x != NULL);
6922   assert(y != NULL);
6923 
6924   const __m128i vminus_zero_point = _mm_load_si128((const __m128i*) params->sse4.minus_zero_point);
6925   const __m128 vscale = _mm_load_ps(params->sse4.scale);
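  // Dequantize y = scale * (x - zero_point); the negated zero point is
  // pre-baked into vminus_zero_point so it can simply be added after
  // widening each byte to int32.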
6926   for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
6927     __m128i vx0123 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x)));
6928     __m128i vx4567 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 4)));
6929     __m128i vx89AB = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 8)));
6930     __m128i vxCDEF = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 12)));
6931     x += 16;
6932 
6933     vx0123 = _mm_add_epi32(vx0123, vminus_zero_point);
6934     vx4567 = _mm_add_epi32(vx4567, vminus_zero_point);
6935     vx89AB = _mm_add_epi32(vx89AB, vminus_zero_point);
6936     vxCDEF = _mm_add_epi32(vxCDEF, vminus_zero_point);
6937 
6938     __m128 vy0123 = _mm_cvtepi32_ps(vx0123);
6939     __m128 vy4567 = _mm_cvtepi32_ps(vx4567);
6940     __m128 vy89AB = _mm_cvtepi32_ps(vx89AB);
6941     __m128 vyCDEF = _mm_cvtepi32_ps(vxCDEF);
6942 
6943     vy0123 = _mm_mul_ps(vy0123, vscale);
6944     vy4567 = _mm_mul_ps(vy4567, vscale);
6945     vy89AB = _mm_mul_ps(vy89AB, vscale);
6946     vyCDEF = _mm_mul_ps(vyCDEF, vscale);
6947 
6948     _mm_storeu_ps(y, vy0123);
6949     _mm_storeu_ps(y + 4, vy4567);
6950     _mm_storeu_ps(y + 8, vy89AB);
6951     _mm_storeu_ps(y + 12, vyCDEF);
6952     y += 16;
6953   }
6954   for (; n >= 4 * sizeof(uint8_t); n -= 4 * sizeof(uint8_t)) {
6955     __m128i vx = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x)));
6956     vx = _mm_add_epi32(vx, vminus_zero_point);
6957     x += 4;
6958 
6959     __m128 vy = _mm_cvtepi32_ps(vx);
6960     vy = _mm_mul_ps(vy, vscale);
6961 
6962     _mm_storeu_ps(y, vy);
6963     y += 4;
6964   }
6965   if XNN_UNLIKELY(n != 0) {
6966     assert(n >= 1 * sizeof(uint8_t));
6967     assert(n <= 3 * sizeof(uint8_t));
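    // 1-3 elements remain: one (possibly over-reading) 4-byte load, then
    // partial 2- and 1-element stores.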
6968 
6969     __m128i vx = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x)));
6970     vx = _mm_add_epi32(vx, vminus_zero_point);
6971 
6972     __m128 vy = _mm_cvtepi32_ps(vx);
6973     vy = _mm_mul_ps(vy, vscale);
6974 
6975     if (n & (2 * sizeof(uint8_t))) {
6976       _mm_storel_pi((__m64*) y, vy);
6977       vy = _mm_movehl_ps(vy, vy);
6978       y += 2;
6979     }
6980     if (n & (1 * sizeof(uint8_t))) {
6981       _mm_store_ss(y, vy);
6982     }
6983   }
6984 }
6985 
6986 void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8(
6987     size_t rows,
6988     size_t channels,
6989     const uint8_t* input,
6990     size_t input_stride,
6991     const uint8_t* zero,
6992     int32_t* buffer,
6993     uint8_t* output,
6994     const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6995 {
6996   assert(rows > 7);
6997   assert(channels != 0);
6998 
6999   const uint8_t* i0 = input;
7000   const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
7001   const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
7002   const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
7003   const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
7004   const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
7005   const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
7006   const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
7007 
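  // Multipass global average pooling: the first pass sums 7 rows per channel
  // into the int32 buffer (seeded with the init bias), each middle pass adds
  // 7 more rows, and the final pass adds the remaining rows and requantizes.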
7008   const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias);
7009   int32_t* b = buffer;
7010   size_t c = channels;
7011   for (; c != 0; c = doz(c, 8)) {
7012     const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
7013     i0 += 8;
7014     const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
7015     i1 += 8;
7016 
7017     __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
7018     const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
7019     i2 += 8;
7020 
7021     vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
7022     const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
7023     i3 += 8;
7024     vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
7025     const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
7026     i4 += 8;
7027     vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
7028     const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
7029     i5 += 8;
7030     vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
7031     const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
7032     i6 += 8;
7033 
7034     vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
7035 
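    // The 7-row sum of uint8 values fits in 16 bits (7 * 255 = 1785); widen
    // to 32 bits: low half via _mm_cvtepu16_epi32, high half by unpacking
    // against zero.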
7036     const __m128i vzero = _mm_setzero_si128();
7037     __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
7038     __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
7039 
7040     vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
7041     vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
7042 
7043     _mm_store_si128((__m128i*) b, vacc0123);
7044     _mm_store_si128((__m128i*) (b + 4), vacc4567);
7045     b += 8;
7046   }
7047 
7048   for (rows -= 7; rows > 7; rows -= 7) {
7049     i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
7050     i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
7051     i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
7052     i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
7053     i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
7054     i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
7055     i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
7056 
7057     int32_t* b = buffer;
7058     size_t c = channels;
7059     for (; c != 0; c = doz(c, 8)) {
7060       const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
7061       i0 += 8;
7062       const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
7063       i1 += 8;
7064 
7065       __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
7066       const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
7067       i2 += 8;
7068 
7069       vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
7070       const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
7071       i3 += 8;
7072       vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
7073       const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
7074       i4 += 8;
7075       vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
7076       const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
7077       i5 += 8;
7078       vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
7079       const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
7080       i6 += 8;
7081 
7082       vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
7083 
7084       const __m128i vzero = _mm_setzero_si128();
7085       __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
7086       __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
7087 
7088       vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b));
7089       vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4)));
7090 
7091       _mm_store_si128((__m128i*) b, vacc0123);
7092       _mm_store_si128((__m128i*) (b + 4), vacc4567);
7093       b += 8;
7094     }
7095   }
7096 
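  // Final pass: 1-7 rows remain; row pointers past the remaining count are
  // redirected to the zero buffer so they contribute nothing to the sum.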
7097   i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
7098   i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
7099   if XNN_UNPREDICTABLE(rows < 2) {
7100     i1 = zero;
7101   }
7102   i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
7103   if XNN_UNPREDICTABLE(rows <= 2) {
7104     i2 = zero;
7105   }
7106   i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
7107   if XNN_UNPREDICTABLE(rows < 4) {
7108     i3 = zero;
7109   }
7110   i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
7111   if XNN_UNPREDICTABLE(rows <= 4) {
7112     i4 = zero;
7113   }
7114   i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
7115   if XNN_UNPREDICTABLE(rows < 6) {
7116     i5 = zero;
7117   }
7118   i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
7119   if XNN_UNPREDICTABLE(rows <= 6) {
7120     i6 = zero;
7121   }
7122 
7123   const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
7124   const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
7125   const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
7126   const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
7127   for (; channels >= 8; channels -= 8) {
7128     const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
7129     i0 += 8;
7130     const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
7131     i1 += 8;
7132 
7133     __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
7134     const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
7135     i2 += 8;
7136 
7137     vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
7138     const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
7139     i3 += 8;
7140     vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
7141     const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
7142     i4 += 8;
7143     vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
7144     const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
7145     i5 += 8;
7146     vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
7147     const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
7148     i6 += 8;
7149 
7150     vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
7151 
7152     const __m128i vzero = _mm_setzero_si128();
7153     __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
7154     __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
7155 
7156     vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
7157     vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
7158     buffer += 8;
7159 
7160     __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
7161     __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
7162 
7163     vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
7164     vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
7165 
7166     vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
7167     vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
7168 
7169     vacc0123 = _mm_cvtps_epi32(vfpacc0123);
7170     vacc4567 = _mm_cvtps_epi32(vfpacc4567);
7171 
7172     __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
7173 
7174     __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
7175 
7176     vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
7177 
7178     _mm_storel_epi64((__m128i*) output, vout0123456701234567);
7179     output += 8;
7180   }
7181   if XNN_UNLIKELY(channels != 0) {
7182     {
7183       const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
7184       i0 += 8;
7185       const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
7186       i1 += 8;
7187 
7188       __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
7189       const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
7190       i2 += 8;
7191 
7192       vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
7193       const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
7194       i3 += 8;
7195       vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
7196       const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
7197       i4 += 8;
7198       vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
7199       const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
7200       i5 += 8;
7201       vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
7202       const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
7203       i6 += 8;
7204 
7205       vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
7206 
7207       __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
7208       __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, _mm_setzero_si128());
7209 
7210       vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
7211       vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
7212       buffer += 8;
7213 
7214       __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
7215       __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
7216 
7217       vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
7218       vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
7219 
7220       vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
7221       vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
7222 
7223       vacc0123 = _mm_cvtps_epi32(vfpacc0123);
7224       vacc4567 = _mm_cvtps_epi32(vfpacc4567);
7225 
7226       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
7227 
7228       __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
7229       vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
7230 
7231       if (channels & 4) {
7232         unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
7233         vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
7234         output += 4;
7235       }
7236       if (channels & 2) {
7237         unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
7238         vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
7239         output += 2;
7240       }
7241       if (channels & 1) {
7242         *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
7243       }
7244     }
7245   }
7246 }
7247 
7248 void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8(
7249     size_t rows,
7250     size_t channels,
7251     const uint8_t* input,
7252     size_t input_stride,
7253     const uint8_t* zero,
7254     uint8_t* output,
7255     const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7256 {
7257   assert(rows != 0);
7258   assert(rows <= 7);
7259   assert(channels != 0);
7260 
7261   const uint8_t* i0 = input;
7262   const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
7263   if XNN_UNPREDICTABLE(rows < 2) {
7264     i1 = zero;
7265   }
7266   const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
7267   if XNN_UNPREDICTABLE(rows <= 2) {
7268     i2 = zero;
7269   }
7270   const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
7271   if XNN_UNPREDICTABLE(rows < 4) {
7272     i3 = zero;
7273   }
7274   const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
7275   if XNN_UNPREDICTABLE(rows <= 4) {
7276     i4 = zero;
7277   }
7278   const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
7279   if XNN_UNPREDICTABLE(rows < 6) {
7280     i5 = zero;
7281   }
7282   const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
7283   if XNN_UNPREDICTABLE(rows <= 6) {
7284     i6 = zero;
7285   }
7286 
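  // Single-pass variant for rows <= 7: unused row pointers already point at
  // the zero buffer, so the sum and the requantization happen in one sweep.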
7287   const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias);
7288   const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
7289   const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
7290   const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
7291   const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
7292   for (; channels >= 8; channels -= 8) {
7293     const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
7294     i0 += 8;
7295     const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
7296     i1 += 8;
7297 
7298     __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
7299     const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
7300     i2 += 8;
7301 
7302     vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
7303     const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
7304     i3 += 8;
7305     vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
7306     const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
7307     i4 += 8;
7308     vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
7309     const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
7310     i5 += 8;
7311     vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
7312     const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
7313     i6 += 8;
7314 
7315     vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
7316 
7317     const __m128i vzero = _mm_setzero_si128();
7318     __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
7319     __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
7320 
7321     vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
7322     vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
7323 
7324     __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
7325     __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
7326 
7327     vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
7328     vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
7329 
7330     vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
7331     vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
7332 
7333     vacc0123 = _mm_cvtps_epi32(vfpacc0123);
7334     vacc4567 = _mm_cvtps_epi32(vfpacc4567);
7335 
7336     __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
7337 
7338     __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
7339 
7340     vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
7341 
7342     _mm_storel_epi64((__m128i*) output, vout0123456701234567);
7343     output += 8;
7344   }
7345   if XNN_UNLIKELY(channels != 0) {
7346     {
7347       const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
7348       i0 += 8;
7349       const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
7350       i1 += 8;
7351 
7352       __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
7353       const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
7354       i2 += 8;
7355 
7356       vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
7357       const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
7358       i3 += 8;
7359       vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
7360       const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
7361       i4 += 8;
7362       vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
7363       const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
7364       i5 += 8;
7365       vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
7366       const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
7367       i6 += 8;
7368 
7369       vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
7370 
7371       __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
7372       __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, _mm_setzero_si128());
7373 
7374       vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
7375       vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
7376 
7377       __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
7378       __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
7379 
7380       vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
7381       vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
7382 
7383       vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
7384       vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
7385 
7386       vacc0123 = _mm_cvtps_epi32(vfpacc0123);
7387       vacc4567 = _mm_cvtps_epi32(vfpacc4567);
7388 
7389       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
7390 
7391       __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
7392       vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
7393 
7394       if (channels & 4) {
7395         unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
7396         vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
7397         output += 4;
7398       }
7399       if (channels & 2) {
7400         unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
7401         vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
7402         output += 2;
7403       }
7404       if (channels & 1) {
7405         *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
7406       }
7407     }
7408   }
7409 }
7410 
7411 void xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64(
7412     size_t mr,
7413     size_t nc,
7414     size_t kc,
7415     const uint8_t* restrict a,
7416     size_t a_stride,
7417     const void* restrict w,
7418     uint8_t* restrict c,
7419     size_t cm_stride,
7420     size_t cn_stride,
7421     const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7422 {
7423   assert(mr != 0);
7424   assert(mr <= 1);
7425   assert(nc != 0);
7426   assert(kc != 0);
7427   assert(kc % sizeof(uint8_t) == 0);
7428   assert(a != NULL);
7429   assert(w != NULL);
7430   assert(c != NULL);
7431 
7432   kc = round_up_po2(kc, 8);
7433   const uint8_t* a0 = a;
7434   uint8_t* c0 = c;
7435 
7436   do {
7437     __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
7438     __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
7439     __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
7440     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
7441     w = (const int32_t*) w + 4;
7442 
7443     size_t k = 0;
7444     const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point);
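    // c8 layout: every k-step consumes 8 input bytes and, per output channel,
    // 8 weight bytes; _mm_madd_epi16 produces four int32 pair-sums per
    // channel, which are reduced with horizontal adds after the loop.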
7445     while (k < kc) {
7446       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
7447       const __m128i vxa0 = _mm_cvtepu8_epi16(va0);
7448       a0 += 8;
7449 
7450       const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
7451       const __m128i vxb0 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb0), vb_zero_point);
7452 
7453       vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
7454       const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 8));
7455       const __m128i vxb1 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb1), vb_zero_point);
7456 
7457       vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
7458       const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 16));
7459       const __m128i vxb2 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb2), vb_zero_point);
7460 
7461       vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
7462       const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 24));
7463       const __m128i vxb3 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb3), vb_zero_point);
7464 
7465       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
7466 
7467       w = (const void*) ((const uint8_t*) w + 32);
7468       k += 8 * sizeof(uint8_t);
7469     }
7470 
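    // Two rounds of _mm_hadd_epi32 collapse the four per-channel partial
    // sums into a single vector [c0 c1 c2 c3].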
7471     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
7472     const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
7473 
7474     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
7475 
7476     __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
7477 
7478     const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
7479     vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
7480 
7481     const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
7482     vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
7483 
7484     vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
7485 
7486     const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
7487     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
7488 
7489     __m128i vout = _mm_packus_epi16(vacc00x0123, vacc00x0123);
7490 
7491     vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
7492 
7493     if (nc >= 4) {
7494       unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
7495 
7496       c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
7497 
7498       a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
7499 
7500       nc -= 4;
7501     } else {
7502       if (nc & 2) {
7503         unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
7504         c0 += 2;
7505         vout = _mm_srli_epi32(vout, 16);
7506       }
7507       if (nc & 1) {
7508         *c0 = (uint8_t) _mm_extract_epi8(vout, 0);
7509       }
7510 
7511       nc = 0;
7512     }
7513   } while (nc != 0);
7514 }
7515 
7516 void xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64(
7517     size_t mr,
7518     size_t nc,
7519     size_t kc,
7520     const uint8_t* restrict a,
7521     size_t a_stride,
7522     const void* restrict w,
7523     uint8_t* restrict c,
7524     size_t cm_stride,
7525     size_t cn_stride,
7526     const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7527 {
7528   assert(mr != 0);
7529   assert(mr <= 3);
7530   assert(nc != 0);
7531   assert(kc != 0);
7532   assert(kc % sizeof(uint8_t) == 0);
7533   assert(a != NULL);
7534   assert(w != NULL);
7535   assert(c != NULL);
7536 
7537   kc = round_up_po2(kc, 8);
7538   const uint8_t* a0 = a;
7539   uint8_t* c0 = c;
7540   const uint8_t* a1 = (const uint8_t*) ((uintptr_t) a0 + a_stride);
7541   uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
7542   if XNN_UNPREDICTABLE(mr < 2) {
7543     a1 = a0;
7544     c1 = c0;
7545   }
7546   const uint8_t* a2 = (const uint8_t*) ((uintptr_t) a1 + a_stride);
7547   uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
7548   if XNN_UNPREDICTABLE(mr <= 2) {
7549     a2 = a1;
7550     c2 = c1;
7551   }
7552 
7553   do {
7554     __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
7555     __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
7556     __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
7557     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
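    // Rows 1 and 2 start from the same per-channel bias as row 0.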
7558     __m128i vacc1x0 = vacc0x0;
7559     __m128i vacc1x1 = vacc0x1;
7560     __m128i vacc1x2 = vacc0x2;
7561     __m128i vacc1x3 = vacc0x3;
7562     __m128i vacc2x0 = vacc0x0;
7563     __m128i vacc2x1 = vacc0x1;
7564     __m128i vacc2x2 = vacc0x2;
7565     __m128i vacc2x3 = vacc0x3;
7566     w = (const int32_t*) w + 4;
7567 
7568     size_t k = 0;
7569     const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point);
7570     while (k < kc) {
7571       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
7572       const __m128i vxa0 = _mm_cvtepu8_epi16(va0);
7573       a0 += 8;
7574       const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
7575       const __m128i vxa1 = _mm_cvtepu8_epi16(va1);
7576       a1 += 8;
7577       const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
7578       const __m128i vxa2 = _mm_cvtepu8_epi16(va2);
7579       a2 += 8;
7580 
7581       const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
7582       const __m128i vxb0 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb0), vb_zero_point);
7583 
7584       vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
7585       vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
7586       vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
7587       const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 8));
7588       const __m128i vxb1 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb1), vb_zero_point);
7589 
7590       vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
7591       vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
7592       vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
7593       const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 16));
7594       const __m128i vxb2 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb2), vb_zero_point);
7595 
7596       vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
7597       vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
7598       vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
7599       const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 24));
7600       const __m128i vxb3 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb3), vb_zero_point);
7601 
7602       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
7603       vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
7604       vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
7605 
7606       w = (const void*) ((const uint8_t*) w + 32);
7607       k += 8 * sizeof(uint8_t);
7608     }
7609 
7610     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
7611     const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
7612     const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
7613     const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
7614     const __m128i vacc2x01 = _mm_hadd_epi32(vacc2x0, vacc2x1);
7615     const __m128i vacc2x23 = _mm_hadd_epi32(vacc2x2, vacc2x3);
7616 
7617     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
7618     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
7619     __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
7620 
7621     __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
7622     __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
7623     __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
7624 
7625     const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
7626     vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
7627     vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
7628     vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale);
7629 
7630     const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
7631     vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
7632     vscaled1x0123 = _mm_min_ps(vscaled1x0123, voutput_max_less_zero_point);
7633     vscaled2x0123 = _mm_min_ps(vscaled2x0123, voutput_max_less_zero_point);
7634 
7635     vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
7636     vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
7637     vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
7638 
7639     const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
7640     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
7641     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
7642 
7643     __m128i vout = _mm_packus_epi16(vacc01x0123, vacc22x0123);
7644 
7645     vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
7646 
7647     if (nc >= 4) {
7648       unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
7649       unaligned_store_u32(c1, (uint32_t) _mm_extract_epi32(vout, 1));
7650       unaligned_store_u32(c2, (uint32_t) _mm_extract_epi32(vout, 2));
7651 
7652       c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
7653       c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
7654       c2 = (uint8_t*) ((uintptr_t) c2 + cn_stride);
7655 
7656       a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
7657       a1 = (const uint8_t*) ((uintptr_t) a1 - kc);
7658       a2 = (const uint8_t*) ((uintptr_t) a2 - kc);
7659 
7660       nc -= 4;
7661     } else {
7662       if (nc & 2) {
7663         unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
7664         c0 += 2;
7665         unaligned_store_u16(c1, (uint16_t) _mm_extract_epi16(vout, 2));
7666         c1 += 2;
7667         unaligned_store_u16(c2, (uint16_t) _mm_extract_epi16(vout, 4));
7668         c2 += 2;
7669         vout = _mm_srli_epi32(vout, 16);
7670       }
7671       if (nc & 1) {
7672         *c0 = (uint8_t) _mm_extract_epi8(vout, 0);
7673         *c1 = (uint8_t) _mm_extract_epi8(vout, 4);
7674         *c2 = (uint8_t) _mm_extract_epi8(vout, 8);
7675       }
7676 
7677       nc = 0;
7678     }
7679   } while (nc != 0);
7680 }
7681 
7682 void xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64(
7683     size_t mr,
7684     size_t nc,
7685     size_t kc,
7686     size_t ks,
7687     const uint8_t** restrict a,
7688     const void* restrict w,
7689     uint8_t* restrict c,
7690     size_t cm_stride,
7691     size_t cn_stride,
7692     size_t a_offset,
7693     const uint8_t* zero,
7694     const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7695 {
7696   assert(mr != 0);
7697   assert(mr <= 1);
7698   assert(nc != 0);
7699   assert(kc != 0);
7700   assert(ks != 0);
7701   assert(ks % (1 * sizeof(void*)) == 0);
7702   assert(a_offset % sizeof(uint8_t) == 0);
7703   assert(a != NULL);
7704   assert(w != NULL);
7705   assert(c != NULL);
7706 
7707   kc = round_up_po2(kc, 8);
7708   uint8_t* c0 = c;
7709 
7710   do {
7711     __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
7712     __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
7713     __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
7714     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
7715     w = (const int32_t*) w + 4;
7716 
7717     size_t p = ks;
7718     do {
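      // Indirection: a[] supplies one input-row pointer per kernel tap.
      // Pointers equal to `zero` reference a zero-filled padding row and
      // must not be displaced by a_offset.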
7719       const uint8_t* restrict a0 = a[0];
7720       if XNN_UNPREDICTABLE(a0 != zero) {
7721         a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
7722       }
7723       a += 1;
7724 
7725       size_t k = 0;
7726       const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point);
7727       while (k < kc) {
7728         const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
7729         const __m128i vxa0 = _mm_cvtepu8_epi16(va0);
7730         a0 += 8;
7731 
7732         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
7733         const __m128i vxb0 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb0), vb_zero_point);
7734 
7735         vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
7736         const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 8));
7737         const __m128i vxb1 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb1), vb_zero_point);
7738 
7739         vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
7740         const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 16));
7741         const __m128i vxb2 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb2), vb_zero_point);
7742 
7743         vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
7744         const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 24));
7745         const __m128i vxb3 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb3), vb_zero_point);
7746 
7747         vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
7748 
7749         w = (const void*) ((const uint8_t*) w + 32);
7750         k += 8 * sizeof(uint8_t);
7751       }
7752       p -= 1 * sizeof(void*);
7753     } while (p != 0);
7754 
7755     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
7756     const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
7757 
7758     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
7759 
7760     __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
7761 
7762     const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
7763     vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
7764 
7765     const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
7766     vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
7767 
7768     vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
7769 
7770     const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
7771     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
7772 
7773     __m128i vout = _mm_packus_epi16(vacc00x0123, vacc00x0123);
7774 
7775     vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
7776 
7777     if (nc >= 4) {
7778       unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
7779       c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
7780 
7781       a = (const uint8_t**restrict) ((uintptr_t) a - ks);
7782 
7783       nc -= 4;
7784     } else {
7785       if (nc & 2) {
7786         unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
7787         c0 += 2;
7788         vout = _mm_srli_epi32(vout, 16);
7789       }
7790       if (nc & 1) {
7791         *c0 = (uint8_t) _mm_extract_epi8(vout, 0);
7792       }
7793 
7794       nc = 0;
7795     }
7796   } while (nc != 0);
7797 }
7798 
7799 void xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64(
7800     size_t mr,
7801     size_t nc,
7802     size_t kc,
7803     size_t ks,
7804     const uint8_t** restrict a,
7805     const void* restrict w,
7806     uint8_t* restrict c,
7807     size_t cm_stride,
7808     size_t cn_stride,
7809     size_t a_offset,
7810     const uint8_t* zero,
7811     const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7812 {
7813   assert(mr != 0);
7814   assert(mr <= 3);
7815   assert(nc != 0);
7816   assert(kc != 0);
7817   assert(ks != 0);
7818   assert(ks % (3 * sizeof(void*)) == 0);
7819   assert(a_offset % sizeof(uint8_t) == 0);
7820   assert(a != NULL);
7821   assert(w != NULL);
7822   assert(c != NULL);
7823 
7824   kc = round_up_po2(kc, 8);
7825   uint8_t* c0 = c;
7826   uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
7827   if XNN_UNPREDICTABLE(mr < 2) {
7828     c1 = c0;
7829   }
7830   uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
7831   if XNN_UNPREDICTABLE(mr <= 2) {
7832     c2 = c1;
7833   }
7834 
7835   do {
7836     __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
7837     __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
7838     __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
7839     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
7840     __m128i vacc1x0 = vacc0x0;
7841     __m128i vacc1x1 = vacc0x1;
7842     __m128i vacc1x2 = vacc0x2;
7843     __m128i vacc1x3 = vacc0x3;
7844     __m128i vacc2x0 = vacc0x0;
7845     __m128i vacc2x1 = vacc0x1;
7846     __m128i vacc2x2 = vacc0x2;
7847     __m128i vacc2x3 = vacc0x3;
7848     w = (const int32_t*) w + 4;
7849 
7850     size_t p = ks;
7851     do {
7852       const uint8_t* restrict a0 = a[0];
7853       if XNN_UNPREDICTABLE(a0 != zero) {
7854         a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
7855       }
7856       const uint8_t* restrict a1 = a[1];
7857       if XNN_UNPREDICTABLE(a1 != zero) {
7858         a1 = (const uint8_t*) ((uintptr_t) a1 + a_offset);
7859       }
7860       const uint8_t* restrict a2 = a[2];
7861       if XNN_UNPREDICTABLE(a2 != zero) {
7862         a2 = (const uint8_t*) ((uintptr_t) a2 + a_offset);
7863       }
7864       a += 3;
7865 
7866       size_t k = 0;
7867       const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point);
7868       while (k < kc) {
7869         const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
7870         const __m128i vxa0 = _mm_cvtepu8_epi16(va0);
7871         a0 += 8;
7872         const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
7873         const __m128i vxa1 = _mm_cvtepu8_epi16(va1);
7874         a1 += 8;
7875         const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
7876         const __m128i vxa2 = _mm_cvtepu8_epi16(va2);
7877         a2 += 8;
7878 
7879         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
7880         const __m128i vxb0 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb0), vb_zero_point);
7881 
7882         vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
7883         vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
7884         vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
7885         const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 8));
7886         const __m128i vxb1 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb1), vb_zero_point);
7887 
7888         vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
7889         vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
7890         vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
7891         const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 16));
7892         const __m128i vxb2 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb2), vb_zero_point);
7893 
7894         vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
7895         vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
7896         vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
7897         const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 24));
7898         const __m128i vxb3 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb3), vb_zero_point);
7899 
7900         vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
7901         vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
7902         vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
7903 
7904         w = (const void*) ((const uint8_t*) w + 32);
7905         k += 8 * sizeof(uint8_t);
7906       }
7907       p -= 3 * sizeof(void*);
7908     } while (p != 0);
7909 
7910     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
7911     const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
7912     const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
7913     const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
7914     const __m128i vacc2x01 = _mm_hadd_epi32(vacc2x0, vacc2x1);
7915     const __m128i vacc2x23 = _mm_hadd_epi32(vacc2x2, vacc2x3);
7916 
7917     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
7918     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
7919     __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
7920 
7921     __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
7922     __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
7923     __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
7924 
7925     const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
7926     vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
7927     vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
7928     vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale);
7929 
7930     const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
7931     vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
7932     vscaled1x0123 = _mm_min_ps(vscaled1x0123, voutput_max_less_zero_point);
7933     vscaled2x0123 = _mm_min_ps(vscaled2x0123, voutput_max_less_zero_point);
7934 
7935     vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
7936     vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
7937     vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
7938 
7939     const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
7940     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
7941     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
7942 
7943     __m128i vout = _mm_packus_epi16(vacc01x0123, vacc22x0123);
7944 
7945     vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
7946 
7947     if (nc >= 4) {
7948       unaligned_store_u32(c2, (uint32_t) _mm_extract_epi32(vout, 2));
7949       c2 = (uint8_t*) ((uintptr_t) c2 + cn_stride);
7950       unaligned_store_u32(c1, (uint32_t) _mm_extract_epi32(vout, 1));
7951       c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
7952       unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
7953       c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
7954 
7955       a = (const uint8_t**restrict) ((uintptr_t) a - ks);
7956 
7957       nc -= 4;
7958     } else {
7959       if (nc & 2) {
7960         unaligned_store_u16(c2, (uint16_t) _mm_extract_epi16(vout, 4));
7961         c2 += 2;
7962         unaligned_store_u16(c1, (uint16_t) _mm_extract_epi16(vout, 2));
7963         c1 += 2;
7964         unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
7965         c0 += 2;
7966         vout = _mm_srli_epi32(vout, 16);
7967       }
7968       if (nc & 1) {
7969         *c2 = (uint8_t) _mm_extract_epi8(vout, 8);
7970         *c1 = (uint8_t) _mm_extract_epi8(vout, 4);
7971         *c0 = (uint8_t) _mm_extract_epi8(vout, 0);
7972       }
7973 
7974       nc = 0;
7975     }
7976   } while (nc != 0);
7977 }
7978 
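// Annotation (added commentary, not in the generated source): the two vadd
// ukernels below implement quantized addition with a shared fixed-point
// scheme. Per element, a scalar sketch of the arithmetic (the helper names
// here are pseudocode, not XNNPACK APIs) is:
//
//   int32_t acc = bias + (int32_t) a * a_multiplier + (int32_t) b * b_multiplier;
//   acc >>= shift;                                   // arithmetic shift right
//   int16_t out16 = sat_add_i16(sat_narrow_i16(acc), output_zero_point);
//   uint8_t out = min(max(sat_narrow_u8(out16), output_min), output_max);
//
// Since SSE4.1 has no 16x32 multiply, each 32-bit product is assembled from
// halves: _mm_mullo_epi16/_mm_mulhi_epu16 give the low and high 16 bits of
// a * multiplier_lo, another _mm_mullo_epi16 folds in multiplier_hi, and the
// halves are re-interleaved into 32-bit lanes with _mm_unpacklo/hi_epi16.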
7979 void xnn_qu8_vadd_minmax_ukernel__sse41_mul16_ld64_x8(
7980     size_t n,
7981     const uint8_t* input_a,
7982     const uint8_t* input_b,
7983     uint8_t* output,
7984     const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7985 {
7986   const __m128i vbias = _mm_load_si128((const __m128i*) params->sse2.bias);
7987   const __m128i va_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_lo);
7988   const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_hi);
7989   const __m128i vb_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.b_multiplier_lo);
7990   const __m128i vb_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.b_multiplier_hi);
7991   const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
7992   const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
7993   const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
7994   const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
7995 
7996   for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
7997     const __m128i va01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
7998     const __m128i vb01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) input_b));
7999     input_a += 8;
8000     input_b += 8;
8001 
8002 
8003     __m128i vaprod01234567hi = _mm_mulhi_epu16(va01234567, va_multiplier_lo);
8004     __m128i vbprod01234567hi = _mm_mulhi_epu16(vb01234567, vb_multiplier_lo);
8005     const __m128i vaprod01234567lo = _mm_mullo_epi16(va01234567, va_multiplier_lo);
8006     const __m128i vbprod01234567lo = _mm_mullo_epi16(vb01234567, vb_multiplier_lo);
8007 
8008     vaprod01234567hi = _mm_add_epi16(vaprod01234567hi, _mm_mullo_epi16(va01234567, va_multiplier_hi));
8009     vbprod01234567hi = _mm_add_epi16(vbprod01234567hi, _mm_mullo_epi16(vb01234567, vb_multiplier_hi));
8010 
8011 
8012     __m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
8013     __m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
8014 
8015     vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
8016     vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
8017 
8018     vacc0123 = _mm_sra_epi32(vacc0123, vshift);
8019     vacc4567 = _mm_sra_epi32(vacc4567, vshift);
8020 
8021     __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
8022 
8023 
8024     __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
8025 
8026     vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
8027 
8028     vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
8029 
8030     _mm_storel_epi64((__m128i*) output, vout0123456701234567);
8031     output += 8;
8032   }
8033   if XNN_UNLIKELY(n != 0) {
8034     {
8035       const __m128i va01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
8036       const __m128i vb01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) input_b));
8037 
8038 
8039       __m128i vaprod01234567hi = _mm_mulhi_epu16(va01234567, va_multiplier_lo);
8040       __m128i vbprod01234567hi = _mm_mulhi_epu16(vb01234567, vb_multiplier_lo);
8041       const __m128i vaprod01234567lo = _mm_mullo_epi16(va01234567, va_multiplier_lo);
8042       const __m128i vbprod01234567lo = _mm_mullo_epi16(vb01234567, vb_multiplier_lo);
8043 
8044       vaprod01234567hi = _mm_add_epi16(vaprod01234567hi, _mm_mullo_epi16(va01234567, va_multiplier_hi));
8045       vbprod01234567hi = _mm_add_epi16(vbprod01234567hi, _mm_mullo_epi16(vb01234567, vb_multiplier_hi));
8046 
8047 
8048       __m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
8049       __m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
8050 
8051       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
8052       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
8053 
8054       vacc0123 = _mm_sra_epi32(vacc0123, vshift);
8055       vacc4567 = _mm_sra_epi32(vacc4567, vshift);
8056 
8057       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
8058 
8059       __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
8060       vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
8061       vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
8062 
8063       if (n & (4 * sizeof(uint8_t))) {
8064         unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
8065         vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
8066         output += 4;
8067       }
8068       if (n & (2 * sizeof(uint8_t))) {
8069         unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
8070         vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
8071         output += 2;
8072       }
8073       if (n & (1 * sizeof(uint8_t))) {
8074         *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
8075       }
8076     }
8077   }
8078 }
8079 
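// Annotation (added commentary): same arithmetic as above, but the second
// operand is a single broadcast byte, so b * b_multiplier is folded into the
// bias once, outside the loop (see the vbias computation below), and only the
// per-element a * a_multiplier work remains in the loop body.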
8080 void xnn_qu8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8(
8081     size_t n,
8082     const uint8_t* input_a,
8083     const uint8_t* input_b,
8084     uint8_t* output,
8085     const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8086 {
8087   const __m128i vbias = _mm_add_epi32(
8088     _mm_shuffle_epi32(_mm_cvtsi32_si128(params->sse2.b_multiplier * (int32_t) *input_b), _MM_SHUFFLE(0, 0, 0, 0)),
8089     _mm_load_si128((const __m128i*) params->sse2.bias));
8090   const __m128i va_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_lo);
8091   const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_hi);
8092   const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
8093   const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
8094   const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
8095   const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
8096 
8097   for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
8098     const __m128i va01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
8099     input_a += 8;
8100 
8101 
8102     __m128i vaprod01234567hi = _mm_mulhi_epu16(va01234567, va_multiplier_lo);
8103     const __m128i vaprod01234567lo = _mm_mullo_epi16(va01234567, va_multiplier_lo);
8104 
8105     vaprod01234567hi = _mm_add_epi16(vaprod01234567hi, _mm_mullo_epi16(va01234567, va_multiplier_hi));
8106 
8107 
8108     __m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
8109     __m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
8110 
8111     vacc0123 = _mm_sra_epi32(vacc0123, vshift);
8112     vacc4567 = _mm_sra_epi32(vacc4567, vshift);
8113 
8114     __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
8115 
8116 
8117     __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
8118 
8119     vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
8120 
8121     vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
8122 
8123     _mm_storel_epi64((__m128i*) output, vout0123456701234567);
8124     output += 8;
8125   }
8126   if XNN_UNLIKELY(n != 0) {
8127     {
8128       const __m128i va01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
8129 
8130 
8131       __m128i vaprod01234567hi = _mm_mulhi_epu16(va01234567, va_multiplier_lo);
8132       const __m128i vaprod01234567lo = _mm_mullo_epi16(va01234567, va_multiplier_lo);
8133 
8134       vaprod01234567hi = _mm_add_epi16(vaprod01234567hi, _mm_mullo_epi16(va01234567, va_multiplier_hi));
8135 
8136 
8137       __m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
8138       __m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
8139 
8140       vacc0123 = _mm_sra_epi32(vacc0123, vshift);
8141       vacc4567 = _mm_sra_epi32(vacc4567, vshift);
8142 
8143       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
8144 
8145       __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
8146       vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
8147       vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
8148 
8149       if (n & (4 * sizeof(uint8_t))) {
8150         unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
8151         vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
8152         output += 4;
8153       }
8154       if (n & (2 * sizeof(uint8_t))) {
8155         unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
8156         vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
8157         output += 2;
8158       }
8159       if (n & (1 * sizeof(uint8_t))) {
8160         *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
8161       }
8162     }
8163   }
8164 }
8165 
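// Annotation (added commentary): requantizing convert. The kernel widens each
// byte to 16 bits, forms (input_zero_point - x), shifts left by 7, and uses
// _mm_mulhrs_epi16 -- a rounding Q15 multiply, (a*b + 0x4000) >> 15 -- so the
// net effect is round((input_zero_point - x) * multiplier / 256) before the
// saturating add of the output zero point; the sign convention of the scale
// is pre-baked into the multiplier stored in params.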
8166 void xnn_qu8_vcvt_ukernel__sse41_x32(
8167     size_t n,
8168     const uint8_t* x,
8169     uint8_t* y,
8170     const union xnn_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8171 {
8172   assert(n != 0);
8173   assert(n % sizeof(uint8_t) == 0);
8174   assert(x != NULL);
8175   assert(y != NULL);
8176 
8177   const __m128i vinput_zero_point = _mm_load_si128((const __m128i*) params->ssse3.input_zero_point);
8178   const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->ssse3.multiplier);
8179   const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->ssse3.output_zero_point);
8180   for (; n >= 32 * sizeof(uint8_t); n -= 32 * sizeof(uint8_t)) {
8181     __m128i vacc0 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) x));
8182     __m128i vacc1 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (x + 8)));
8183     __m128i vacc2 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (x + 16)));
8184     __m128i vacc3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (x + 24)));
8185     x += 32;
8186 
8187     vacc0 = _mm_sub_epi16(vinput_zero_point, vacc0);
8188     vacc1 = _mm_sub_epi16(vinput_zero_point, vacc1);
8189     vacc2 = _mm_sub_epi16(vinput_zero_point, vacc2);
8190     vacc3 = _mm_sub_epi16(vinput_zero_point, vacc3);
8191 
8192     vacc0 = _mm_slli_epi16(vacc0, 7);
8193     vacc1 = _mm_slli_epi16(vacc1, 7);
8194     vacc2 = _mm_slli_epi16(vacc2, 7);
8195     vacc3 = _mm_slli_epi16(vacc3, 7);
8196 
8197     vacc0 = _mm_mulhrs_epi16(vacc0, vmultiplier);
8198     vacc1 = _mm_mulhrs_epi16(vacc1, vmultiplier);
8199     vacc2 = _mm_mulhrs_epi16(vacc2, vmultiplier);
8200     vacc3 = _mm_mulhrs_epi16(vacc3, vmultiplier);
8201 
8202     vacc0 = _mm_adds_epi16(vacc0, voutput_zero_point);
8203     vacc1 = _mm_adds_epi16(vacc1, voutput_zero_point);
8204     vacc2 = _mm_adds_epi16(vacc2, voutput_zero_point);
8205     vacc3 = _mm_adds_epi16(vacc3, voutput_zero_point);
8206 
8207     const __m128i vy0 = _mm_packus_epi16(vacc0, vacc1);
8208     const __m128i vy1 = _mm_packus_epi16(vacc2, vacc3);
8209 
8210     _mm_storeu_si128((__m128i*) y, vy0);
8211     _mm_storeu_si128((__m128i*) (y + 16), vy1);
8212     y += 32;
8213   }
8214   for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
8215     __m128i vacc = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) x));
8216     vacc = _mm_sub_epi16(vinput_zero_point, vacc);
8217     vacc = _mm_slli_epi16(vacc, 7);
8218     vacc = _mm_mulhrs_epi16(vacc, vmultiplier);
8219     vacc = _mm_adds_epi16(vacc, voutput_zero_point);
8220     x += 8;
8221 
8222     const __m128i vy = _mm_packus_epi16(vacc, vacc);
8223     _mm_storel_epi64((__m128i*) y, vy);
8224     y += 8;
8225   }
8226   if XNN_UNLIKELY(n != 0) {
8227     assert(n >= 1 * sizeof(uint8_t));
8228     assert(n <= 7 * sizeof(uint8_t));
8229 
8230     __m128i vacc = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) x));
8231     vacc = _mm_sub_epi16(vinput_zero_point, vacc);
8232     vacc = _mm_slli_epi16(vacc, 7);
8233     vacc = _mm_mulhrs_epi16(vacc, vmultiplier);
8234     vacc = _mm_adds_epi16(vacc, voutput_zero_point);
8235 
8236     __m128i vy = _mm_packus_epi16(vacc, vacc);
8237     if (n & (4 * sizeof(uint8_t))) {
8238       _mm_storeu_si32(y, vy);
8239       vy = _mm_srli_epi64(vy, 32);
8240       y += 4;
8241     }
8242     if (n & (2 * sizeof(uint8_t))) {
8243       _mm_storeu_si16(y, vy);
8244       vy = _mm_srli_epi32(vy, 16);
8245       y += 2;
8246     }
8247     if (n & (1 * sizeof(uint8_t))) {
8248       *y = (uint8_t) _mm_extract_epi8(vy, 0);
8249     }
8250   }
8251 }
8252 
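// Annotation (added commentary): branch-free leaky ReLU. _mm_cmpgt_epi16
// against the input zero point yields an all-ones mask on one side of the
// zero point; (mask & multiplier_diff) ^ multiplier_base then selects between
// the two precomputed Q15 slopes (multiplier_base vs. multiplier_base ^
// multiplier_diff) without a branch, and the rest of the pipeline is the same
// shift-by-7 / _mm_mulhrs_epi16 requantization used by the vcvt kernel above.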
8253 void xnn_qu8_vlrelu_ukernel__sse41_x32(
8254     size_t n,
8255     const uint8_t* x,
8256     uint8_t* y,
8257     const union xnn_qu8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8258 {
8259   assert(n != 0);
8260   assert(n % sizeof(uint8_t) == 0);
8261   assert(x != NULL);
8262   assert(y != NULL);
8263 
8264   const __m128i vinput_zero_point = _mm_load_si128((const __m128i*) params->sse2.input_zero_point);
8265   const __m128i vmultiplier_diff = _mm_load_si128((const __m128i*) params->sse2.multiplier_diff);
8266   const __m128i vmultiplier_base = _mm_load_si128((const __m128i*) params->sse2.multiplier_base);
8267   const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
8268   for (; n >= 32 * sizeof(uint8_t); n -= 32 * sizeof(uint8_t)) {
8269     __m128i vacc0 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) x));
8270     __m128i vacc1 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (x + 8)));
8271     __m128i vacc2 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (x + 16)));
8272     __m128i vacc3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (x + 24)));
8273     x += 32;
8274 
8275     __m128i vmultiplier0 = _mm_cmpgt_epi16(vacc0, vinput_zero_point);
8276     vacc0 = _mm_sub_epi16(vinput_zero_point, vacc0);
8277     __m128i vmultiplier1 = _mm_cmpgt_epi16(vacc1, vinput_zero_point);
8278     vacc1 = _mm_sub_epi16(vinput_zero_point, vacc1);
8279     __m128i vmultiplier2 = _mm_cmpgt_epi16(vacc2, vinput_zero_point);
8280     vacc2 = _mm_sub_epi16(vinput_zero_point, vacc2);
8281     __m128i vmultiplier3 = _mm_cmpgt_epi16(vacc3, vinput_zero_point);
8282     vacc3 = _mm_sub_epi16(vinput_zero_point, vacc3);
8283 
8284     vmultiplier0 = _mm_and_si128(vmultiplier0, vmultiplier_diff);
8285     vacc0 = _mm_slli_epi16(vacc0, 7);
8286     vmultiplier0 = _mm_xor_si128(vmultiplier0, vmultiplier_base);
8287     vmultiplier1 = _mm_and_si128(vmultiplier1, vmultiplier_diff);
8288     vacc1 = _mm_slli_epi16(vacc1, 7);
8289     vmultiplier1 = _mm_xor_si128(vmultiplier1, vmultiplier_base);
8290     vmultiplier2 = _mm_and_si128(vmultiplier2, vmultiplier_diff);
8291     vacc2 = _mm_slli_epi16(vacc2, 7);
8292     vmultiplier2 = _mm_xor_si128(vmultiplier2, vmultiplier_base);
8293     vmultiplier3 = _mm_and_si128(vmultiplier3, vmultiplier_diff);
8294     vacc3 = _mm_slli_epi16(vacc3, 7);
8295     vmultiplier3 = _mm_xor_si128(vmultiplier3, vmultiplier_base);
8296 
8297     vacc0 = _mm_mulhrs_epi16(vacc0, vmultiplier0);
8298     vacc1 = _mm_mulhrs_epi16(vacc1, vmultiplier1);
8299     vacc2 = _mm_mulhrs_epi16(vacc2, vmultiplier2);
8300     vacc3 = _mm_mulhrs_epi16(vacc3, vmultiplier3);
8301 
8302     vacc0 = _mm_adds_epi16(vacc0, voutput_zero_point);
8303     vacc1 = _mm_adds_epi16(vacc1, voutput_zero_point);
8304     vacc2 = _mm_adds_epi16(vacc2, voutput_zero_point);
8305     vacc3 = _mm_adds_epi16(vacc3, voutput_zero_point);
8306 
8307     const __m128i vy0 = _mm_packus_epi16(vacc0, vacc1);
8308     const __m128i vy1 = _mm_packus_epi16(vacc2, vacc3);
8309 
8310     _mm_storeu_si128((__m128i*) y, vy0);
8311     _mm_storeu_si128((__m128i*) (y + 16), vy1);
8312     y += 32;
8313   }
8314   for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
8315     __m128i vacc = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) x));
8316     __m128i vmultiplier = _mm_cmpgt_epi16(vacc, vinput_zero_point);
8317     vacc = _mm_sub_epi16(vinput_zero_point, vacc);
8318     vmultiplier = _mm_and_si128(vmultiplier, vmultiplier_diff);
8319     vacc = _mm_slli_epi16(vacc, 7);
8320     vmultiplier = _mm_xor_si128(vmultiplier, vmultiplier_base);
8321     vacc = _mm_mulhrs_epi16(vacc, vmultiplier);
8322     vacc = _mm_adds_epi16(vacc, voutput_zero_point);
8323     x += 8;
8324 
8325     const __m128i vy = _mm_packus_epi16(vacc, vacc);
8326     _mm_storel_epi64((__m128i*) y, vy);
8327     y += 8;
8328   }
8329   if XNN_UNLIKELY(n != 0) {
8330     assert(n >= 1 * sizeof(uint8_t));
8331     assert(n <= 7 * sizeof(uint8_t));
8332 
8333     __m128i vacc = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) x));
8334     __m128i vmultiplier = _mm_cmpgt_epi16(vacc, vinput_zero_point);
8335     vacc = _mm_sub_epi16(vinput_zero_point, vacc);
8336     vmultiplier = _mm_and_si128(vmultiplier, vmultiplier_diff);
8337     vacc = _mm_slli_epi16(vacc, 7);
8338     vmultiplier = _mm_xor_si128(vmultiplier, vmultiplier_base);
8339     vacc = _mm_mulhrs_epi16(vacc, vmultiplier);
8340     vacc = _mm_adds_epi16(vacc, voutput_zero_point);
8341 
8342     __m128i vy = _mm_packus_epi16(vacc, vacc);
8343     if (n & (4 * sizeof(uint8_t))) {
8344       _mm_storeu_si32(y, vy);
8345       vy = _mm_srli_epi64(vy, 32);
8346       y += 4;
8347     }
8348     if (n & (2 * sizeof(uint8_t))) {
8349       _mm_storeu_si16(y, vy);
8350       vy = _mm_srli_epi32(vy, 16);
8351       y += 2;
8352     }
8353     if (n & (1 * sizeof(uint8_t))) {
8354       *y = (uint8_t) _mm_extract_epi8(vy, 0);
8355     }
8356   }
8357 }
8358 
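// Annotation (added commentary): quantized multiply with fp32 requantization.
// After the zero points are subtracted, the full 32-bit product of the two
// 16-bit differences is assembled from _mm_mullo_epi16/_mm_mulhi_epi16
// halves, converted to float, scaled by the combined scale factor from
// params, and _mm_cvtps_epi32 rounds it back to int32 (nearest-even under the
// default MXCSR rounding mode) before the usual saturating pack and clamp.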
8359 void xnn_qu8_vmul_minmax_fp32_ukernel__sse41_mul16_ld64_x16(
8360     size_t n,
8361     const uint8_t* input_a,
8362     const uint8_t* input_b,
8363     uint8_t* output,
8364     const union xnn_qu8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8365 
8366 {
8367   const __m128i va_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.a_zero_point);
8368   const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.b_zero_point);
8369   const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
8370   const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
8371   const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
8372   const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
8373 
8374   for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
8375     const __m128i va01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
8376     const __m128i vb01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) input_b));
8377     const __m128i va89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (input_a + 8)));
8378     const __m128i vb89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (input_b + 8)));
8379     input_a += 16;
8380     input_b += 16;
8381 
8382 
8383     const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
8384     const __m128i vxb01234567 = _mm_sub_epi16(vb01234567, vb_zero_point);
8385     const __m128i vxa89ABCDEF = _mm_sub_epi16(va89ABCDEF, va_zero_point);
8386     const __m128i vxb89ABCDEF = _mm_sub_epi16(vb89ABCDEF, vb_zero_point);
8387 
8388     const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb01234567);
8389     const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb01234567);
8390     const __m128i vprod89ABCDEFlo = _mm_mullo_epi16(vxa89ABCDEF, vxb89ABCDEF);
8391     const __m128i vprod89ABCDEFhi = _mm_mulhi_epi16(vxa89ABCDEF, vxb89ABCDEF);
8392 
8393     const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
8394     const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
8395     const __m128i vprod89AB = _mm_unpacklo_epi16(vprod89ABCDEFlo, vprod89ABCDEFhi);
8396     const __m128i vprodCDEF = _mm_unpackhi_epi16(vprod89ABCDEFlo, vprod89ABCDEFhi);
8397 
8398     __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
8399     __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
8400     __m128 vfpacc89AB = _mm_cvtepi32_ps(vprod89AB);
8401     __m128 vfpaccCDEF = _mm_cvtepi32_ps(vprodCDEF);
8402 
8403     vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
8404     vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
8405     vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale);
8406     vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale);
8407 
8408     const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
8409     const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
8410     const __m128i vacc89AB = _mm_cvtps_epi32(vfpacc89AB);
8411     const __m128i vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF);
8412 
8413     __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
8414     __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
8415 
8416 
8417     __m128i vout0123456789ABCDEF = _mm_packus_epi16(vout01234567, vout89ABCDEF);
8418 
8419     vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
8420 
8421     vout0123456789ABCDEF = _mm_min_epu8(vout0123456789ABCDEF, voutput_max);
8422 
8423     _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
8424     output += 16;
8425   }
8426   if XNN_UNLIKELY(n != 0) {
8427     do {
8428       const __m128i va01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
8429       const __m128i vb01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) input_b));
8430       input_a += 8;
8431       input_b += 8;
8432 
8433 
8434       const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
8435       const __m128i vxb01234567 = _mm_sub_epi16(vb01234567, vb_zero_point);
8436 
8437       const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb01234567);
8438       const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb01234567);
8439 
8440       const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
8441       const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
8442 
8443       __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
8444       __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
8445 
8446       vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
8447       vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
8448 
8449       const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
8450       const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
8451 
8452       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
8453 
8454       __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
8455       vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
8456       vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
8457 
8458       if XNN_LIKELY(n >= (8 * sizeof(uint8_t))) {
8459         _mm_storel_epi64((__m128i*) output, vout0123456701234567);
8460         output += 8;
8461         n -= 8 * sizeof(uint8_t);
8462       } else {
8463         if (n & (4 * sizeof(uint8_t))) {
8464           unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
8465           vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
8466           output += 4;
8467         }
8468         if (n & (2 * sizeof(uint8_t))) {
8469           unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
8470           vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
8471           output += 2;
8472         }
8473         if (n & (1 * sizeof(uint8_t))) {
8474           *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
8475         }
8476         n = 0;
8477       }
8478     } while (n != 0);
8479   }
8480 }
8481 
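// Annotation (added commentary): scalar-broadcast variant of the multiply
// above. The UINT32_C(0x00010001) multiply splats the 16-bit value of
// *input_b into both halves of a 32-bit lane, _mm_shuffle_epi32 with an
// all-zero selector replicates that lane across the register, and subtracting
// b_zero_point then yields vxb once, before the loop.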
8482 void xnn_qu8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16(
8483     size_t n,
8484     const uint8_t* input_a,
8485     const uint8_t* input_b,
8486     uint8_t* output,
8487     const union xnn_qu8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8488 
8489 {
8490   const __m128i va_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.a_zero_point);
8491   const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
8492   const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
8493   const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
8494   const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
8495 
8496   __m128i vxb = _mm_sub_epi16(
8497     _mm_shuffle_epi32(_mm_cvtsi32_si128(UINT32_C(0x00010001) * (uint32_t) (uint16_t) (int16_t) *input_b), 0),
8498     _mm_load_si128((const __m128i*) params->fp32_sse2.b_zero_point));
8499   for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
8500     const __m128i va01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
8501     const __m128i va89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (input_a + 8)));
8502     input_a += 16;
8503 
8504 
8505     const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
8506     const __m128i vxa89ABCDEF = _mm_sub_epi16(va89ABCDEF, va_zero_point);
8507 
8508     const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb);
8509     const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb);
8510     const __m128i vprod89ABCDEFlo = _mm_mullo_epi16(vxa89ABCDEF, vxb);
8511     const __m128i vprod89ABCDEFhi = _mm_mulhi_epi16(vxa89ABCDEF, vxb);
8512 
8513     const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
8514     const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
8515     const __m128i vprod89AB = _mm_unpacklo_epi16(vprod89ABCDEFlo, vprod89ABCDEFhi);
8516     const __m128i vprodCDEF = _mm_unpackhi_epi16(vprod89ABCDEFlo, vprod89ABCDEFhi);
8517 
8518     __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
8519     __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
8520     __m128 vfpacc89AB = _mm_cvtepi32_ps(vprod89AB);
8521     __m128 vfpaccCDEF = _mm_cvtepi32_ps(vprodCDEF);
8522 
8523     vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
8524     vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
8525     vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale);
8526     vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale);
8527 
8528     const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
8529     const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
8530     const __m128i vacc89AB = _mm_cvtps_epi32(vfpacc89AB);
8531     const __m128i vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF);
8532 
8533     __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
8534     __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
8535 
8536 
8537     __m128i vout0123456789ABCDEF = _mm_packus_epi16(vout01234567, vout89ABCDEF);
8538 
8539     vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
8540 
8541     vout0123456789ABCDEF = _mm_min_epu8(vout0123456789ABCDEF, voutput_max);
8542 
8543     _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
8544     output += 16;
8545   }
8546   if XNN_UNLIKELY(n != 0) {
8547     do {
8548       const __m128i va01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
8549       input_a += 8;
8550 
8551 
8552       const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
8553 
8554       const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb);
8555       const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb);
8556 
8557       const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
8558       const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
8559 
8560       __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
8561       __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
8562 
8563       vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
8564       vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
8565 
8566       const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
8567       const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
8568 
8569       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
8570 
8571       __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
8572       vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
8573       vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
8574 
8575       if XNN_LIKELY(n >= (8 * sizeof(uint8_t))) {
8576         _mm_storel_epi64((__m128i*) output, vout0123456701234567);
8577         output += 8;
8578         n -= 8 * sizeof(uint8_t);
8579       } else {
8580         if (n & (4 * sizeof(uint8_t))) {
8581           unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
8582           vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
8583           output += 4;
8584         }
8585         if (n & (2 * sizeof(uint8_t))) {
8586           unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
8587           vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
8588           output += 2;
8589         }
8590         if (n & (1 * sizeof(uint8_t))) {
8591           *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
8592         }
8593         n = 0;
8594       }
8595     } while (n != 0);
8596   }
8597 }
8598 
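// Annotation (added commentary): signed bilinear interpolation with 11
// fractional bits per axis. The _mm_blend_epi16 below rewrites valphah as
// interleaved pairs [alpha_h, 2048 - alpha_h], so _mm_madd_epi16 over
// (right, left) pairs computes the horizontal lerp
// right*alpha_h + left*(2048 - alpha_h) in one step. The vertical pass adds
// (bottom - top) * alpha_v to the top value scaled by 2^11, giving 22
// fractional bits in total, removed by adding 2^21 and arithmetic-shifting
// right by 22. Note the rounding add uses _mm_add_epi16 on 32-bit lanes;
// this is equivalent to a 32-bit add here because the low 16 bits of the
// 0x00200000 rounding constant are zero, so no carry can cross a lane half.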
8599 void xnn_s8_ibilinear_ukernel__sse41_c16(
8600     size_t output_pixels,
8601     size_t channels,
8602     const int8_t**restrict input,
8603     size_t input_offset,
8604     const int16_t*restrict weights,
8605     int8_t*restrict output,
8606     size_t output_increment) XNN_OOB_READS
8607 {
8608   assert(output_pixels != 0);
8609   assert(channels != 0);
8610 
8611   do {
8612     const int8_t* i0 = (const int8_t*) ((uintptr_t) input[0] + input_offset);
8613     const int8_t* i1 = (const int8_t*) ((uintptr_t) input[1] + input_offset);
8614     const int8_t* i2 = (const int8_t*) ((uintptr_t) input[2] + input_offset);
8615     const int8_t* i3 = (const int8_t*) ((uintptr_t) input[3] + input_offset);
8616     input += 4;
8617 
8618     const __m128i valpha = _mm_cvtsi32_si128(*((const int*) weights));
8619     weights += 2;
8620     __m128i valphah = _mm_shufflelo_epi16(valpha, _MM_SHUFFLE(0, 0, 0, 0));
8621     valphah = _mm_unpacklo_epi64(valphah, valphah);
8622     __m128i valphav = _mm_srli_epi32(valpha, 16);
8623     valphav = _mm_shuffle_epi32(valphav, _MM_SHUFFLE(0, 0, 0, 0));
8624 
8625     valphah = _mm_blend_epi16(valphah, _mm_sub_epi16(_mm_set1_epi32(0x08000000), valphah), 0xAA);
8626 
8627     const __m128i vrounding = _mm_set1_epi32(0x00200000);
8628 
8629     size_t c = channels;
8630     for (; c >= 16 * sizeof(int8_t); c -= 16 * sizeof(int8_t)) {
8631       const __m128i vtl01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0));
8632       const __m128i vtr01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1));
8633       const __m128i vbl01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
8634       const __m128i vbr01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3));
8635       const __m128i vtl89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
8636       const __m128i vtr89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
8637       const __m128i vbl89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
8638       const __m128i vbr89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
8639       i0 += 16;
8640       i1 += 16;
8641       i2 += 16;
8642       i3 += 16;
8643 
8644 
8645       const __m128i vdr01234567 = _mm_sub_epi16(vbr01234567, vtr01234567);
8646       const __m128i vt0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vtr01234567, vtl01234567), valphah);
8647       const __m128i vdl01234567 = _mm_sub_epi16(vbl01234567, vtl01234567);
8648       const __m128i vt4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vtr01234567, vtl01234567), valphah);
8649       const __m128i vdr89ABCDEF = _mm_sub_epi16(vbr89ABCDEF, vtr89ABCDEF);
8650       const __m128i vt89AB = _mm_madd_epi16(_mm_unpacklo_epi16(vtr89ABCDEF, vtl89ABCDEF), valphah);
8651       const __m128i vdl89ABCDEF = _mm_sub_epi16(vbl89ABCDEF, vtl89ABCDEF);
8652       const __m128i vtCDEF = _mm_madd_epi16(_mm_unpackhi_epi16(vtr89ABCDEF, vtl89ABCDEF), valphah);
8653 
8654       const __m128i vd0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vdr01234567, vdl01234567), valphah);
8655       const __m128i vd4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vdr01234567, vdl01234567), valphah);
8656       const __m128i vd89AB = _mm_madd_epi16(_mm_unpacklo_epi16(vdr89ABCDEF, vdl89ABCDEF), valphah);
8657       const __m128i vdCDEF = _mm_madd_epi16(_mm_unpackhi_epi16(vdr89ABCDEF, vdl89ABCDEF), valphah);
8658 
8659       __m128i vacc0123 = _mm_mullo_epi32(vd0123, valphav);
8660       __m128i vacc4567 = _mm_mullo_epi32(vd4567, valphav);
8661       __m128i vacc89AB = _mm_mullo_epi32(vd89AB, valphav);
8662       __m128i vaccCDEF = _mm_mullo_epi32(vdCDEF, valphav);
8663 
8664       vacc0123 = _mm_add_epi32(_mm_slli_epi32(vt0123, 11), vacc0123);
8665       vacc4567 = _mm_add_epi32(_mm_slli_epi32(vt4567, 11), vacc4567);
8666       vacc89AB = _mm_add_epi32(_mm_slli_epi32(vt89AB, 11), vacc89AB);
8667       vaccCDEF = _mm_add_epi32(_mm_slli_epi32(vtCDEF, 11), vaccCDEF);
8668 
8669       vacc0123 = _mm_srai_epi32(_mm_add_epi16(vacc0123, vrounding), 22);
8670       vacc4567 = _mm_srai_epi32(_mm_add_epi16(vacc4567, vrounding), 22);
8671       vacc89AB = _mm_srai_epi32(_mm_add_epi16(vacc89AB, vrounding), 22);
8672       vaccCDEF = _mm_srai_epi32(_mm_add_epi16(vaccCDEF, vrounding), 22);
8673 
8674       const __m128i vacc01234567 = _mm_packs_epi32(vacc0123, vacc4567);
8675       const __m128i vacc89ABCDEF = _mm_packs_epi32(vacc89AB, vaccCDEF);
8676 
8677       const __m128i vo0123456789ABCDEF = _mm_packs_epi16(vacc01234567, vacc89ABCDEF);
8678 
8679       _mm_storeu_si128((__m128i*) output, vo0123456789ABCDEF);
8680       output += 16;
8681     }
8682     for (; c >= 8 * sizeof(int8_t); c -= 8 * sizeof(int8_t)) {
8683       const __m128i vtl01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0));
8684       i0 += 8;
8685       const __m128i vtr01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1));
8686       i1 += 8;
8687       const __m128i vbl01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
8688       i2 += 8;
8689       const __m128i vbr01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3));
8690       i3 += 8;
8691 
8692 
8693       const __m128i vdr01234567 = _mm_sub_epi16(vbr01234567, vtr01234567);
8694       const __m128i vt0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vtr01234567, vtl01234567), valphah);
8695       const __m128i vdl01234567 = _mm_sub_epi16(vbl01234567, vtl01234567);
8696       const __m128i vt4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vtr01234567, vtl01234567), valphah);
8697 
8698       const __m128i vd0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vdr01234567, vdl01234567), valphah);
8699       const __m128i vd4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vdr01234567, vdl01234567), valphah);
8700 
8701       __m128i vacc0123 = _mm_mullo_epi32(vd0123, valphav);
8702       __m128i vacc4567 = _mm_mullo_epi32(vd4567, valphav);
8703 
8704       vacc0123 = _mm_add_epi32(_mm_slli_epi32(vt0123, 11), vacc0123);
8705       vacc4567 = _mm_add_epi32(_mm_slli_epi32(vt4567, 11), vacc4567);
8706 
8707       vacc0123 = _mm_srai_epi32(_mm_add_epi16(vacc0123, vrounding), 22);
8708       vacc4567 = _mm_srai_epi32(_mm_add_epi16(vacc4567, vrounding), 22);
8709 
8710       const __m128i vacc01234567 = _mm_packs_epi32(vacc0123, vacc4567);
8711 
8712       const __m128i vo01234567 = _mm_packs_epi16(vacc01234567, vacc01234567);
8713 
8714       _mm_storel_epi64((__m128i*) output, vo01234567);
8715       output += 8;
8716     }
8717     if XNN_UNLIKELY(c != 0) {
8718       const __m128i vtl01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0));
8719       const __m128i vtr01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1));
8720       const __m128i vbl01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
8721       const __m128i vbr01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3));
8722 
8723 
8724       const __m128i vdr01234567 = _mm_sub_epi16(vbr01234567, vtr01234567);
8725       const __m128i vt0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vtr01234567, vtl01234567), valphah);
8726       const __m128i vdl01234567 = _mm_sub_epi16(vbl01234567, vtl01234567);
8727       const __m128i vt4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vtr01234567, vtl01234567), valphah);
8728 
8729       const __m128i vd0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vdr01234567, vdl01234567), valphah);
8730       const __m128i vd4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vdr01234567, vdl01234567), valphah);
8731 
8732       __m128i vacc0123 = _mm_mullo_epi32(vd0123, valphav);
8733       __m128i vacc4567 = _mm_mullo_epi32(vd4567, valphav);
8734 
8735       vacc0123 = _mm_add_epi32(_mm_slli_epi32(vt0123, 11), vacc0123);
8736       vacc4567 = _mm_add_epi32(_mm_slli_epi32(vt4567, 11), vacc4567);
8737 
8738       vacc0123 = _mm_srai_epi32(_mm_add_epi16(vacc0123, vrounding), 22);
8739       vacc4567 = _mm_srai_epi32(_mm_add_epi16(vacc4567, vrounding), 22);
8740 
8741       const __m128i vacc01234567 = _mm_packs_epi32(vacc0123, vacc4567);
8742 
8743       __m128i vo01234567 = _mm_packs_epi16(vacc01234567, vacc01234567);
8744 
8745       if (c & (4 * sizeof(int8_t))) {
8746         unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vo01234567));
8747         output += 4;
8748         vo01234567 = _mm_srli_epi64(vo01234567, 32);
8749       }
8750       if (c & (2 * sizeof(int8_t))) {
8751         unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vo01234567, 0));
8752         output += 2;
8753         vo01234567 = _mm_srli_epi32(vo01234567, 16);
8754       }
8755       if (c & (1 * sizeof(int8_t))) {
8756         *output++ = (int8_t) _mm_extract_epi8(vo01234567, 0);
8757       }
8758     }
8759 
8760     output = (int8_t*) ((uintptr_t) output + output_increment);
8761   } while (--output_pixels != 0);
8762 }
8763 
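// Annotation (added commentary): multi-pass max pooling. The first pass
// reduces up to 9 input rows directly into the output buffer; each later pass
// handles up to 8 more rows and re-reads the partially reduced output (vo) as
// an extra max operand. Kernels smaller than a full pass alias the unused row
// pointers to i0, which is harmless since max(x, x) == x.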
8764 void xnn_s8_maxpool_minmax_ukernel_9p8x__sse41_c16(
8765     size_t output_pixels,
8766     size_t kernel_elements,
8767     size_t channels,
8768     const int8_t** input,
8769     size_t input_offset,
8770     int8_t* output,
8771     size_t input_increment,
8772     size_t output_increment,
8773     const union xnn_s8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8774 {
8775   assert(output_pixels != 0);
8776   assert(kernel_elements != 0);
8777   assert(channels != 0);
8778 
8779   const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.max);
8780   const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.min);
8781 
8782   do {
8783     int8_t* o = output;
8784     {
8785       const int8_t* i0 = *input++;
8786       const int8_t* i1 = *input++;
8787       const int8_t* i2 = *input++;
8788       const int8_t* i3 = *input++;
8789       const int8_t* i4 = *input++;
8790       const int8_t* i5 = *input++;
8791       const int8_t* i6 = *input++;
8792       const int8_t* i7 = *input++;
8793       const int8_t* i8 = *input++;
8794       i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
8795       i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
8796       i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
8797       i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
8798       i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
8799       i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
8800       i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
8801       i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
8802       i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
8803       if (kernel_elements < 2) {
8804         i1 = i0;
8805       }
8806       if (kernel_elements <= 2) {
8807         i2 = i0;
8808       }
8809       if (kernel_elements < 4) {
8810         i3 = i0;
8811       }
8812       if (kernel_elements <= 4) {
8813         i4 = i0;
8814       }
8815       if (kernel_elements < 6) {
8816         i5 = i0;
8817       }
8818       if (kernel_elements <= 6) {
8819         i6 = i0;
8820       }
8821       if (kernel_elements < 8) {
8822         i7 = i0;
8823       }
8824       if (kernel_elements <= 8) {
8825         i8 = i0;
8826       }
8827 
8828       size_t c = channels;
8829       for (; c >= 16; c -= 16) {
8830         const __m128i vi0 = _mm_loadu_si128((const __m128i*) i0); i0 += 16;
8831         const __m128i vi1 = _mm_loadu_si128((const __m128i*) i1); i1 += 16;
8832         const __m128i vi2 = _mm_loadu_si128((const __m128i*) i2); i2 += 16;
8833         const __m128i vi3 = _mm_loadu_si128((const __m128i*) i3); i3 += 16;
8834         const __m128i vi4 = _mm_loadu_si128((const __m128i*) i4); i4 += 16;
8835         const __m128i vi5 = _mm_loadu_si128((const __m128i*) i5); i5 += 16;
8836         const __m128i vi6 = _mm_loadu_si128((const __m128i*) i6); i6 += 16;
8837         const __m128i vi7 = _mm_loadu_si128((const __m128i*) i7); i7 += 16;
8838         const __m128i vi8 = _mm_loadu_si128((const __m128i*) i8); i8 += 16;
8839 
8840         const __m128i vmax018 = _mm_max_epi8(_mm_max_epi8(vi0, vi1), vi8);
8841         const __m128i vmax23 = _mm_max_epi8(vi2, vi3);
8842         const __m128i vmax45 = _mm_max_epi8(vi4, vi5);
8843         const __m128i vmax67 = _mm_max_epi8(vi6, vi7);
8844 
8845         const __m128i vmax2345 = _mm_max_epi8(vmax23, vmax45);
8846         const __m128i vmax01678 = _mm_max_epi8(vmax018, vmax67);
8847         __m128i vout = _mm_max_epi8(vmax2345, vmax01678);
8848         vout = _mm_max_epi8(vout, voutput_min);
8849         vout = _mm_min_epi8(vout, voutput_max);
8850 
8851         _mm_storeu_si128((__m128i*) o, vout); o += 16;
8852       }
8853       if (c != 0) {
8854         const __m128i vi0 = _mm_loadu_si128((const __m128i*) i0);
8855         const __m128i vi1 = _mm_loadu_si128((const __m128i*) i1);
8856         const __m128i vi2 = _mm_loadu_si128((const __m128i*) i2);
8857         const __m128i vi3 = _mm_loadu_si128((const __m128i*) i3);
8858         const __m128i vi4 = _mm_loadu_si128((const __m128i*) i4);
8859         const __m128i vi5 = _mm_loadu_si128((const __m128i*) i5);
8860         const __m128i vi6 = _mm_loadu_si128((const __m128i*) i6);
8861         const __m128i vi7 = _mm_loadu_si128((const __m128i*) i7);
8862         const __m128i vi8 = _mm_loadu_si128((const __m128i*) i8);
8863 
8864         const __m128i vmax018 = _mm_max_epi8(_mm_max_epi8(vi0, vi1), vi8);
8865         const __m128i vmax23 = _mm_max_epi8(vi2, vi3);
8866         const __m128i vmax45 = _mm_max_epi8(vi4, vi5);
8867         const __m128i vmax67 = _mm_max_epi8(vi6, vi7);
8868 
8869         const __m128i vmax2345 = _mm_max_epi8(vmax23, vmax45);
8870         const __m128i vmax01678 = _mm_max_epi8(vmax018, vmax67);
8871         __m128i vout = _mm_max_epi8(vmax2345, vmax01678);
8872         vout = _mm_max_epi8(vout, voutput_min);
8873         vout = _mm_min_epi8(vout, voutput_max);
8874 
8875         if (c & 8) {
8876           _mm_storel_epi64((__m128i*) o, vout);
8877           vout = _mm_unpackhi_epi64(vout, vout);
8878           o += 8;
8879         }
8880         if (c & 4) {
8881           unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(vout));
8882           vout = _mm_srli_epi64(vout, 32);
8883           o += 4;
8884         }
8885         if (c & 2) {
8886           unaligned_store_u16(o, (uint16_t) _mm_extract_epi16(vout, 0));
8887           vout = _mm_srli_epi32(vout, 16);
8888           o += 2;
8889         }
8890         if (c & 1) {
8891           *o = (int8_t) _mm_cvtsi128_si32(vout);
8892           o += 1;
8893         }
8894       }
8895     }
8896 
8897     for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) {
8898       const int8_t* i0 = *input++;
8899       const int8_t* i1 = *input++;
8900       const int8_t* i2 = *input++;
8901       const int8_t* i3 = *input++;
8902       const int8_t* i4 = *input++;
8903       const int8_t* i5 = *input++;
8904       const int8_t* i6 = *input++;
8905       const int8_t* i7 = *input++;
8906       i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
8907       i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
8908       i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
8909       i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
8910       i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
8911       i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
8912       i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
8913       i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
8914       if (k < 2) {
8915         i1 = i0;
8916       }
8917       if (k <= 2) {
8918         i2 = i0;
8919       }
8920       if (k < 4) {
8921         i3 = i0;
8922       }
8923       if (k <= 4) {
8924         i4 = i0;
8925       }
8926       if (k < 6) {
8927         i5 = i0;
8928       }
8929       if (k <= 6) {
8930         i6 = i0;
8931       }
8932       if (k < 8) {
8933         i7 = i0;
8934       }
8935 
8936       o = output;
8937       size_t c = channels;
8938       for (; c >= 16; c -= 16) {
8939         const __m128i vi0 = _mm_loadu_si128((const __m128i*) i0); i0 += 16;
8940         const __m128i vi1 = _mm_loadu_si128((const __m128i*) i1); i1 += 16;
8941         const __m128i vi2 = _mm_loadu_si128((const __m128i*) i2); i2 += 16;
8942         const __m128i vi3 = _mm_loadu_si128((const __m128i*) i3); i3 += 16;
8943         const __m128i vi4 = _mm_loadu_si128((const __m128i*) i4); i4 += 16;
8944         const __m128i vi5 = _mm_loadu_si128((const __m128i*) i5); i5 += 16;
8945         const __m128i vi6 = _mm_loadu_si128((const __m128i*) i6); i6 += 16;
8946         const __m128i vi7 = _mm_loadu_si128((const __m128i*) i7); i7 += 16;
8947         const __m128i vo = _mm_loadu_si128((const __m128i*) o);
8948 
8949         const __m128i vmax01 = _mm_max_epi8(_mm_max_epi8(vi0, vi1), vo);
8950         const __m128i vmax23 = _mm_max_epi8(vi2, vi3);
8951         const __m128i vmax45 = _mm_max_epi8(vi4, vi5);
8952         const __m128i vmax67 = _mm_max_epi8(vi6, vi7);
8953 
8954         const __m128i vmax2345 = _mm_max_epi8(vmax23, vmax45);
8955         const __m128i vmax0167 = _mm_max_epi8(vmax01, vmax67);
8956         __m128i vout = _mm_max_epi8(vmax2345, vmax0167);
8957         vout = _mm_max_epi8(vout, voutput_min);
8958         vout = _mm_min_epi8(vout, voutput_max);
8959 
8960         _mm_storeu_si128((__m128i*) o, vout);
8961         o += 16;
8962       }
8963       if (c != 0) {
8964         const __m128i vi0 = _mm_loadu_si128((const __m128i*) i0);
8965         const __m128i vi1 = _mm_loadu_si128((const __m128i*) i1);
8966         const __m128i vi2 = _mm_loadu_si128((const __m128i*) i2);
8967         const __m128i vi3 = _mm_loadu_si128((const __m128i*) i3);
8968         const __m128i vi4 = _mm_loadu_si128((const __m128i*) i4);
8969         const __m128i vi5 = _mm_loadu_si128((const __m128i*) i5);
8970         const __m128i vi6 = _mm_loadu_si128((const __m128i*) i6);
8971         const __m128i vi7 = _mm_loadu_si128((const __m128i*) i7);
8972         const __m128i vo = _mm_loadu_si128((const __m128i*) o);
8973 
8974         const __m128i vmax01 = _mm_max_epi8(_mm_max_epi8(vi0, vi1), vo);
8975         const __m128i vmax23 = _mm_max_epi8(vi2, vi3);
8976         const __m128i vmax45 = _mm_max_epi8(vi4, vi5);
8977         const __m128i vmax67 = _mm_max_epi8(vi6, vi7);
8978 
8979         const __m128i vmax2345 = _mm_max_epi8(vmax23, vmax45);
8980         const __m128i vmax0167 = _mm_max_epi8(vmax01, vmax67);
8981         __m128i vout = _mm_max_epi8(vmax2345, vmax0167);
8982         vout = _mm_max_epi8(vout, voutput_min);
8983         vout = _mm_min_epi8(vout, voutput_max);
8984 
8985         if (c & 8) {
8986           _mm_storel_epi64((__m128i*) o, vout);
8987           vout = _mm_unpackhi_epi64(vout, vout);
8988           o += 8;
8989         }
8990         if (c & 4) {
8991           unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(vout));
8992           vout = _mm_srli_epi64(vout, 32);
8993           o += 4;
8994         }
8995         if (c & 2) {
8996           unaligned_store_u16(o, (uint16_t) _mm_extract_epi16(vout, 0));
8997           vout = _mm_srli_epi32(vout, 16);
8998           o += 2;
8999         }
9000         if (c & 1) {
9001           *o = (int8_t) _mm_cvtsi128_si32(vout);
9002           o += 1;
9003         }
9004       }
9005     }
9006     input = (const int8_t**) ((uintptr_t) input + input_increment);
9007     output = (int8_t*) ((uintptr_t) o + output_increment);
9008   } while (--output_pixels != 0);
9009 }
9010 
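// Annotation (added commentary): plain min/max clamp. The main loop processes
// 64 bytes per iteration, a secondary loop handles 16-byte chunks, and the
// tail stores progressively narrower pieces (8/4/2/1 bytes) of one final
// over-read vector, shifting the consumed lanes out between stores.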
9011 void xnn_s8_vclamp_ukernel__sse41_x64(
9012     size_t n,
9013     const int8_t* x,
9014     int8_t* y,
9015     const union xnn_s8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
9016 {
9017   assert(n != 0);
9018 
9019   const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.max);
9020   const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.min);
9021   for (; n >= 64; n -= 64) {
9022     __m128i vacc0 = _mm_loadu_si128((const __m128i*) x);
9023     __m128i vacc1 = _mm_loadu_si128((const __m128i*) x + 1);
9024     __m128i vacc2 = _mm_loadu_si128((const __m128i*) x + 2);
9025     __m128i vacc3 = _mm_loadu_si128((const __m128i*) x + 3);
9026     x += 64;
9027 
9028     vacc0 = _mm_max_epi8(vacc0, voutput_min);
9029     vacc1 = _mm_max_epi8(vacc1, voutput_min);
9030     vacc2 = _mm_max_epi8(vacc2, voutput_min);
9031     vacc3 = _mm_max_epi8(vacc3, voutput_min);
9032 
9033     vacc0 = _mm_min_epi8(vacc0, voutput_max);
9034     vacc1 = _mm_min_epi8(vacc1, voutput_max);
9035     vacc2 = _mm_min_epi8(vacc2, voutput_max);
9036     vacc3 = _mm_min_epi8(vacc3, voutput_max);
9037 
9038     _mm_storeu_si128((__m128i*) y, vacc0);
9039     _mm_storeu_si128((__m128i*) y + 1, vacc1);
9040     _mm_storeu_si128((__m128i*) y + 2, vacc2);
9041     _mm_storeu_si128((__m128i*) y + 3, vacc3);
9042     y += 64;
9043   }
9044   for (; n >= 16; n -= 16) {
9045     __m128i vacc = _mm_loadu_si128((const __m128i*) x);
9046     x += 16;
9047 
9048     vacc = _mm_min_epi8(vacc, voutput_max);
9049     vacc = _mm_max_epi8(vacc, voutput_min);
9050 
9051     _mm_storeu_si128((__m128i*) y, vacc);
9052     y += 16;
9053   }
9054   if XNN_UNLIKELY(n != 0) {
9055     __m128i vacc = _mm_loadu_si128((const __m128i*) x);
9056 
9057     vacc = _mm_min_epi8(vacc, voutput_max);
9058     vacc = _mm_max_epi8(vacc, voutput_min);
9059 
9060     if (n & 8) {
9061       _mm_storel_epi64((__m128i*) y, vacc);
9062       y += 8;
9063       vacc = _mm_unpackhi_epi64(vacc, vacc);
9064     }
9065     if (n & 4) {
9066       unaligned_store_u32(y, (uint32_t) _mm_cvtsi128_si32(vacc));
9067       y += 4;
9068       vacc = _mm_srli_epi64(vacc, 32);
9069     }
9070     if (n & 2) {
9071       unaligned_store_u16(y, (uint16_t) _mm_cvtsi128_si32(vacc));
9072       y += 2;
9073       vacc = _mm_srli_epi32(vacc, 16);
9074     }
9075     if (n & 1) {
9076       *y = (int8_t) _mm_cvtsi128_si32(vacc);
9077     }
9078   }
9079 }
9080 
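// Annotation (added commentary): unsigned twin of
// xnn_s8_ibilinear_ukernel__sse41_c16 above; the fixed-point scheme is
// identical, with _mm_cvtepu8_epi16 widening in place of _mm_cvtepi8_epi16.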
9081 void xnn_u8_ibilinear_ukernel__sse41_c16(
9082     size_t output_pixels,
9083     size_t channels,
9084     const uint8_t**restrict input,
9085     size_t input_offset,
9086     const int16_t*restrict weights,
9087     uint8_t*restrict output,
9088     size_t output_increment) XNN_OOB_READS
9089 {
9090   assert(output_pixels != 0);
9091   assert(channels != 0);
9092 
9093   do {
9094     const uint8_t* i0 = (const uint8_t*) ((uintptr_t) input[0] + input_offset);
9095     const uint8_t* i1 = (const uint8_t*) ((uintptr_t) input[1] + input_offset);
9096     const uint8_t* i2 = (const uint8_t*) ((uintptr_t) input[2] + input_offset);
9097     const uint8_t* i3 = (const uint8_t*) ((uintptr_t) input[3] + input_offset);
9098     input += 4;
9099 
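    // Each output pixel consumes one (alpha_h, alpha_v) int16 weight pair:
    // broadcast alpha_h (the low half of the 32-bit load) across all eight
    // 16-bit lanes, and alpha_v (the high half) across all four 32-bit lanes.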
    const __m128i valpha = _mm_cvtsi32_si128(*((const int*) weights));
    weights += 2;
    __m128i valphah = _mm_shufflelo_epi16(valpha, _MM_SHUFFLE(0, 0, 0, 0));
    valphah = _mm_unpacklo_epi64(valphah, valphah);
    __m128i valphav = _mm_srli_epi32(valpha, 16);
    valphav = _mm_shuffle_epi32(valphav, _MM_SHUFFLE(0, 0, 0, 0));

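    // Turn valphah into interleaved (alpha_h, 2048 - alpha_h) pairs:
    // 0x08000000 is {0, 2048} viewed as two int16 lanes, and the 0xAA blend
    // replaces the odd lanes. _mm_madd_epi16 on (right, left)-interleaved
    // pixels then yields right * alpha_h + left * (2048 - alpha_h) directly.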
    valphah = _mm_blend_epi16(valphah, _mm_sub_epi16(_mm_set1_epi32(0x08000000), valphah), 0xAA);

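    // 0x00200000 = 2^21, half of the final 2^22 divisor, added for
    // round-to-nearest. Its low 16 bits are zero, so the 16-bit adds applied
    // to the Q22 accumulators below cannot lose a carry.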
    const __m128i vrounding = _mm_set1_epi32(0x00200000);

    size_t c = channels;
    for (; c >= 16 * sizeof(uint8_t); c -= 16 * sizeof(uint8_t)) {
      const __m128i vtl01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
      const __m128i vtr01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
      const __m128i vbl01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
      const __m128i vbr01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
      const __m128i vtl89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
      const __m128i vtr89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
      const __m128i vbl89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
      const __m128i vbr89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
      i0 += 16;
      i1 += 16;
      i2 += 16;
      i3 += 16;

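      // Horizontal pass: vt* interpolate the top row in Q11, and vd*
      // interpolate the bottom-minus-top deltas with the same weights. The
      // vertical pass then forms (t << 11) + d * alpha_v in Q22 before
      // rounding and shifting right by 22.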
      const __m128i vdr01234567 = _mm_sub_epi16(vbr01234567, vtr01234567);
      const __m128i vt0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vtr01234567, vtl01234567), valphah);
      const __m128i vdl01234567 = _mm_sub_epi16(vbl01234567, vtl01234567);
      const __m128i vt4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vtr01234567, vtl01234567), valphah);
      const __m128i vdr89ABCDEF = _mm_sub_epi16(vbr89ABCDEF, vtr89ABCDEF);
      const __m128i vt89AB = _mm_madd_epi16(_mm_unpacklo_epi16(vtr89ABCDEF, vtl89ABCDEF), valphah);
      const __m128i vdl89ABCDEF = _mm_sub_epi16(vbl89ABCDEF, vtl89ABCDEF);
      const __m128i vtCDEF = _mm_madd_epi16(_mm_unpackhi_epi16(vtr89ABCDEF, vtl89ABCDEF), valphah);

      const __m128i vd0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vdr01234567, vdl01234567), valphah);
      const __m128i vd4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vdr01234567, vdl01234567), valphah);
      const __m128i vd89AB = _mm_madd_epi16(_mm_unpacklo_epi16(vdr89ABCDEF, vdl89ABCDEF), valphah);
      const __m128i vdCDEF = _mm_madd_epi16(_mm_unpackhi_epi16(vdr89ABCDEF, vdl89ABCDEF), valphah);

      __m128i vacc0123 = _mm_mullo_epi32(vd0123, valphav);
      __m128i vacc4567 = _mm_mullo_epi32(vd4567, valphav);
      __m128i vacc89AB = _mm_mullo_epi32(vd89AB, valphav);
      __m128i vaccCDEF = _mm_mullo_epi32(vdCDEF, valphav);

      vacc0123 = _mm_add_epi32(_mm_slli_epi32(vt0123, 11), vacc0123);
      vacc4567 = _mm_add_epi32(_mm_slli_epi32(vt4567, 11), vacc4567);
      vacc89AB = _mm_add_epi32(_mm_slli_epi32(vt89AB, 11), vacc89AB);
      vaccCDEF = _mm_add_epi32(_mm_slli_epi32(vtCDEF, 11), vaccCDEF);

      vacc0123 = _mm_srli_epi32(_mm_add_epi16(vacc0123, vrounding), 22);
      vacc4567 = _mm_srli_epi32(_mm_add_epi16(vacc4567, vrounding), 22);
      vacc89AB = _mm_srli_epi32(_mm_add_epi16(vacc89AB, vrounding), 22);
      vaccCDEF = _mm_srli_epi32(_mm_add_epi16(vaccCDEF, vrounding), 22);

      const __m128i vacc01234567 = _mm_packs_epi32(vacc0123, vacc4567);
      const __m128i vacc89ABCDEF = _mm_packs_epi32(vacc89AB, vaccCDEF);

      const __m128i vo0123456789ABCDEF = _mm_packus_epi16(vacc01234567, vacc89ABCDEF);

      _mm_storeu_si128((__m128i*) output, vo0123456789ABCDEF);
      output += 16;
    }
    for (; c >= 8 * sizeof(uint8_t); c -= 8 * sizeof(uint8_t)) {
      const __m128i vtl01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
      i0 += 8;
      const __m128i vtr01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
      i1 += 8;
      const __m128i vbl01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
      i2 += 8;
      const __m128i vbr01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
      i3 += 8;

      const __m128i vdr01234567 = _mm_sub_epi16(vbr01234567, vtr01234567);
      const __m128i vt0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vtr01234567, vtl01234567), valphah);
      const __m128i vdl01234567 = _mm_sub_epi16(vbl01234567, vtl01234567);
      const __m128i vt4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vtr01234567, vtl01234567), valphah);

      const __m128i vd0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vdr01234567, vdl01234567), valphah);
      const __m128i vd4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vdr01234567, vdl01234567), valphah);

      __m128i vacc0123 = _mm_mullo_epi32(vd0123, valphav);
      __m128i vacc4567 = _mm_mullo_epi32(vd4567, valphav);

      vacc0123 = _mm_add_epi32(_mm_slli_epi32(vt0123, 11), vacc0123);
      vacc4567 = _mm_add_epi32(_mm_slli_epi32(vt4567, 11), vacc4567);

      vacc0123 = _mm_srli_epi32(_mm_add_epi16(vacc0123, vrounding), 22);
      vacc4567 = _mm_srli_epi32(_mm_add_epi16(vacc4567, vrounding), 22);

      const __m128i vacc01234567 = _mm_packs_epi32(vacc0123, vacc4567);

      const __m128i vo01234567 = _mm_packus_epi16(vacc01234567, vacc01234567);

      _mm_storel_epi64((__m128i*) output, vo01234567);
      output += 8;
    }
    if XNN_UNLIKELY(c != 0) {
      const __m128i vtl01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
      const __m128i vtr01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
      const __m128i vbl01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
      const __m128i vbr01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));

      const __m128i vdr01234567 = _mm_sub_epi16(vbr01234567, vtr01234567);
      const __m128i vt0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vtr01234567, vtl01234567), valphah);
      const __m128i vdl01234567 = _mm_sub_epi16(vbl01234567, vtl01234567);
      const __m128i vt4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vtr01234567, vtl01234567), valphah);

      const __m128i vd0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vdr01234567, vdl01234567), valphah);
      const __m128i vd4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vdr01234567, vdl01234567), valphah);

      __m128i vacc0123 = _mm_mullo_epi32(vd0123, valphav);
      __m128i vacc4567 = _mm_mullo_epi32(vd4567, valphav);

      vacc0123 = _mm_add_epi32(_mm_slli_epi32(vt0123, 11), vacc0123);
      vacc4567 = _mm_add_epi32(_mm_slli_epi32(vt4567, 11), vacc4567);

      vacc0123 = _mm_srli_epi32(_mm_add_epi16(vacc0123, vrounding), 22);
      vacc4567 = _mm_srli_epi32(_mm_add_epi16(vacc4567, vrounding), 22);

      const __m128i vacc01234567 = _mm_packs_epi32(vacc0123, vacc4567);

      __m128i vo01234567 = _mm_packus_epi16(vacc01234567, vacc01234567);

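      // Tail of 1..7 channels: store four, two, then one byte, shifting the
      // packed result down after each partial store.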
      if (c & (4 * sizeof(uint8_t))) {
        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vo01234567));
        output += 4;
        vo01234567 = _mm_srli_epi64(vo01234567, 32);
      }
      if (c & (2 * sizeof(uint8_t))) {
        unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vo01234567, 0));
        output += 2;
        vo01234567 = _mm_srli_epi32(vo01234567, 16);
      }
      if (c & (1 * sizeof(uint8_t))) {
        *output++ = (uint8_t) _mm_extract_epi8(vo01234567, 0);
      }
    }

    output = (uint8_t*) ((uintptr_t) output + output_increment);
  } while (--output_pixels != 0);
}