Lines Matching refs:C
91 $for C in range(4, CHANNEL_TILE, 4):
92 float32x4_t vo0c${ABC[C:C+4]} = vld1q_f32(w + ${C});
94 $for C in range(0, CHANNEL_TILE, 4):
95 float32x4_t vo${Y}c${ABC[C:C+4]} = vo0c${ABC[C:C+4]};
97 $for C in range(0, CHANNEL_TILE, 4):
98 const float32x4_t vk00c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE});
100 $for C in range(0, CHANNEL_TILE, 4):
102 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk00c0x${ABC[C:C+4]}, vget_low_f…
104 $for C in range(0, CHANNEL_TILE, 4):
105 const float32x4_t vk10c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 2});
107 $for C in range(0, CHANNEL_TILE, 4):
109 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk10c0x${ABC[C:C+4]}, vget_low_f…
111 $for C in range(0, CHANNEL_TILE, 4):
112 const float32x4_t vk20c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 3});
114 $for C in range(0, CHANNEL_TILE, 4):
116 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk20c0x${ABC[C:C+4]}, vget_low_f…
118 $for C in range(0, CHANNEL_TILE, 4):
119 const float32x4_t vk00c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 4});
121 $for C in range(0, CHANNEL_TILE, 4):
123 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk00c1x${ABC[C:C+4]}, vget_high_…
125 $for C in range(0, CHANNEL_TILE, 4):
126 const float32x4_t vk10c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 5});
128 $for C in range(0, CHANNEL_TILE, 4):
130 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk10c1x${ABC[C:C+4]}, vget_high_…
132 $for C in range(0, CHANNEL_TILE, 4):
133 const float32x4_t vk20c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 6});
135 $for C in range(0, CHANNEL_TILE, 4):
137 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk20c1x${ABC[C:C+4]}, vget_high_…
139 $for C in range(0, CHANNEL_TILE, 4):
140 const float32x4_t vk00c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 7});
142 $for C in range(0, CHANNEL_TILE, 4):
144 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk00c2x${ABC[C:C+4]}, vget_high_…
146 $for C in range(0, CHANNEL_TILE, 4):
147 const float32x4_t vk10c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 8});
149 $for C in range(0, CHANNEL_TILE, 4):
151 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk10c2x${ABC[C:C+4]}, vget_high_…
153 $for C in range(0, CHANNEL_TILE, 4):
154 const float32x4_t vk20c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 9});
156 $for C in range(0, CHANNEL_TILE, 4):
158 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk20c2x${ABC[C:C+4]}, vget_high_…
164 $for C in range(0, CHANNEL_TILE, 4):
165 const float32x4_t vk01c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 10});
167 $for C in range(0, CHANNEL_TILE, 4):
169 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk01c0x${ABC[C:C+4]}, vget_low_f…
171 $for C in range(0, CHANNEL_TILE, 4):
172 const float32x4_t vk11c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 11});
174 $for C in range(0, CHANNEL_TILE, 4):
176 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk11c0x${ABC[C:C+4]}, vget_low_f…
178 $for C in range(0, CHANNEL_TILE, 4):
179 const float32x4_t vk21c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 12});
181 $for C in range(0, CHANNEL_TILE, 4):
183 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk21c0x${ABC[C:C+4]}, vget_low_f…
185 $for C in range(0, CHANNEL_TILE, 4):
186 const float32x4_t vk01c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 13});
188 $for C in range(0, CHANNEL_TILE, 4):
190 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk01c1x${ABC[C:C+4]}, vget_low_f…
192 $for C in range(0, CHANNEL_TILE, 4):
193 const float32x4_t vk11c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 14});
195 $for C in range(0, CHANNEL_TILE, 4):
197 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk11c1x${ABC[C:C+4]}, vget_low_f…
199 $for C in range(0, CHANNEL_TILE, 4):
200 const float32x4_t vk21c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 15});
202 $for C in range(0, CHANNEL_TILE, 4):
204 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk21c1x${ABC[C:C+4]}, vget_low_f…
206 $for C in range(0, CHANNEL_TILE, 4):
207 const float32x4_t vk01c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 16});
209 $for C in range(0, CHANNEL_TILE, 4):
211 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk01c2x${ABC[C:C+4]}, vget_high_…
213 $for C in range(0, CHANNEL_TILE, 4):
214 const float32x4_t vk11c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 17});
216 $for C in range(0, CHANNEL_TILE, 4):
218 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk11c2x${ABC[C:C+4]}, vget_high_…
220 $for C in range(0, CHANNEL_TILE, 4):
221 const float32x4_t vk21c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 18});
223 $for C in range(0, CHANNEL_TILE, 4):
225 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk21c2x${ABC[C:C+4]}, vget_high_…
227 $for C in range(0, CHANNEL_TILE, 4):
228 const float32x4_t vk02c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 19});
230 $for C in range(0, CHANNEL_TILE, 4):
232 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk02c0x${ABC[C:C+4]}, vget_high_…
234 $for C in range(0, CHANNEL_TILE, 4):
235 const float32x4_t vk12c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 20});
237 $for C in range(0, CHANNEL_TILE, 4):
239 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk12c0x${ABC[C:C+4]}, vget_high_…
241 $for C in range(0, CHANNEL_TILE, 4):
242 const float32x4_t vk22c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 21});
244 $for C in range(0, CHANNEL_TILE, 4):
246 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk22c0x${ABC[C:C+4]}, vget_high_…
252 $for C in range(0, CHANNEL_TILE, 4):
253 const float32x4_t vk02c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 22});
255 $for C in range(0, CHANNEL_TILE, 4):
257 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk02c1x${ABC[C:C+4]}, vi${Y*2}x2…
259 $for C in range(0, CHANNEL_TILE, 4):
260 const float32x4_t vk12c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 23});
262 $for C in range(0, CHANNEL_TILE, 4):
264 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk12c1x${ABC[C:C+4]}, vi${Y*2+1}…
266 $for C in range(0, CHANNEL_TILE, 4):
267 const float32x4_t vk22c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 24});
269 $for C in range(0, CHANNEL_TILE, 4):
271 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk22c1x${ABC[C:C+4]}, vi${Y*2+2}…
273 $for C in range(0, CHANNEL_TILE, 4):
274 const float32x4_t vk02c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 25});
276 $for C in range(0, CHANNEL_TILE, 4):
278 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk02c2x${ABC[C:C+4]}, vi${Y*2}x2…
280 $for C in range(0, CHANNEL_TILE, 4):
281 const float32x4_t vk12c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 26});
283 $for C in range(0, CHANNEL_TILE, 4):
285 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk12c2x${ABC[C:C+4]}, vi${Y*2+1}…
287 $for C in range(0, CHANNEL_TILE, 4):
288 const float32x4_t vk22c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 27});
290 $for C in range(0, CHANNEL_TILE, 4):
292 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk22c2x${ABC[C:C+4]}, vi${Y*2+2}…
301 $for C in range(0, CHANNEL_TILE, 4):
303 vo${Y}c${ABC[C:C+4]} = vmaxq_f32(vo${Y}c${ABC[C:C+4]}, vmin);
305 $for C in range(0, CHANNEL_TILE, 4):
307 vo${Y}c${ABC[C:C+4]} = vminq_f32(vo${Y}c${ABC[C:C+4]}, vmax);
312 $for C in range(4, CHANNEL_TILE, 4):
313 vst1q_f32(o${Y} + 4, vo${Y}c${ABC[C:C+4]});
325 $for C in range(0, 1 << (LOG2_CHANNEL_TILE - 1), 4):
327 vst1q_f32(o${Y}_tmp, vo${Y}c${ABC[C:C+4]}); o${Y}_tmp += 4;
328 … vo${Y}c${ABC[C:C+4]} = vo${Y}c${ABC[C+(1<<LOG2_CHANNEL_TILE):C+(1<<LOG2_CHANNEL_TILE)+4]};
345 $for C in range(4, CHANNEL_TILE, 4):
346 float32x4_t vo0c${ABC[C:C+4]} = vld1q_f32(w + ${C});
348 $for C in range(0, CHANNEL_TILE, 4):
349 float32x4_t vo${Y}c${ABC[C:C+4]} = vo0c${ABC[C:C+4]};
351 $for C in range(0, CHANNEL_TILE, 4):
352 const float32x4_t vk00c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE});
354 $for C in range(0, CHANNEL_TILE, 4):
356 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk00c0x${ABC[C:C+4]}, vget_low_f…
358 $for C in range(0, CHANNEL_TILE, 4):
359 const float32x4_t vk10c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 2});
361 $for C in range(0, CHANNEL_TILE, 4):
363 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk10c0x${ABC[C:C+4]}, vget_low_f…
365 $for C in range(0, CHANNEL_TILE, 4):
366 const float32x4_t vk20c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 3});
368 $for C in range(0, CHANNEL_TILE, 4):
370 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk20c0x${ABC[C:C+4]}, vget_low_f…
372 $for C in range(0, CHANNEL_TILE, 4):
373 const float32x4_t vk00c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 4});
375 $for C in range(0, CHANNEL_TILE, 4):
377 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk00c1x${ABC[C:C+4]}, vget_high_…
379 $for C in range(0, CHANNEL_TILE, 4):
380 const float32x4_t vk10c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 5});
382 $for C in range(0, CHANNEL_TILE, 4):
384 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk10c1x${ABC[C:C+4]}, vget_high_…
386 $for C in range(0, CHANNEL_TILE, 4):
387 const float32x4_t vk20c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 6});
389 $for C in range(0, CHANNEL_TILE, 4):
391 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk20c1x${ABC[C:C+4]}, vget_high_…
393 $for C in range(0, CHANNEL_TILE, 4):
394 const float32x4_t vk00c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 7});
396 $for C in range(0, CHANNEL_TILE, 4):
398 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk00c2x${ABC[C:C+4]}, vget_high_…
400 $for C in range(0, CHANNEL_TILE, 4):
401 const float32x4_t vk10c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 8});
403 $for C in range(0, CHANNEL_TILE, 4):
405 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk10c2x${ABC[C:C+4]}, vget_high_…
407 $for C in range(0, CHANNEL_TILE, 4):
408 const float32x4_t vk20c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 9});
410 $for C in range(0, CHANNEL_TILE, 4):
412 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk20c2x${ABC[C:C+4]}, vget_high_…
418 $for C in range(0, CHANNEL_TILE, 4):
419 const float32x4_t vk01c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 10});
421 $for C in range(0, CHANNEL_TILE, 4):
423 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk01c0x${ABC[C:C+4]}, vget_low_f…
425 $for C in range(0, CHANNEL_TILE, 4):
426 const float32x4_t vk11c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 11});
428 $for C in range(0, CHANNEL_TILE, 4):
430 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk11c0x${ABC[C:C+4]}, vget_low_f…
432 $for C in range(0, CHANNEL_TILE, 4):
433 const float32x4_t vk21c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 12});
435 $for C in range(0, CHANNEL_TILE, 4):
437 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk21c0x${ABC[C:C+4]}, vget_low_f…
439 $for C in range(0, CHANNEL_TILE, 4):
440 const float32x4_t vk01c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 13});
442 $for C in range(0, CHANNEL_TILE, 4):
444 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk01c1x${ABC[C:C+4]}, vget_low_f…
446 $for C in range(0, CHANNEL_TILE, 4):
447 const float32x4_t vk11c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 14});
449 $for C in range(0, CHANNEL_TILE, 4):
451 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk11c1x${ABC[C:C+4]}, vget_low_f…
453 $for C in range(0, CHANNEL_TILE, 4):
454 const float32x4_t vk21c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 15});
456 $for C in range(0, CHANNEL_TILE, 4):
458 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk21c1x${ABC[C:C+4]}, vget_low_f…
460 $for C in range(0, CHANNEL_TILE, 4):
461 const float32x4_t vk01c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 16});
463 $for C in range(0, CHANNEL_TILE, 4):
465 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk01c2x${ABC[C:C+4]}, vget_high_…
467 $for C in range(0, CHANNEL_TILE, 4):
468 const float32x4_t vk11c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 17});
470 $for C in range(0, CHANNEL_TILE, 4):
472 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk11c2x${ABC[C:C+4]}, vget_high_…
474 $for C in range(0, CHANNEL_TILE, 4):
475 const float32x4_t vk21c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 18});
477 $for C in range(0, CHANNEL_TILE, 4):
479 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk21c2x${ABC[C:C+4]}, vget_high_…
485 $for C in range(0, CHANNEL_TILE, 4):
487 vo${Y}c${ABC[C:C+4]} = vmaxq_f32(vo${Y}c${ABC[C:C+4]}, vmin);
489 $for C in range(0, CHANNEL_TILE, 4):
491 vo${Y}c${ABC[C:C+4]} = vminq_f32(vo${Y}c${ABC[C:C+4]}, vmax);
496 $for C in range(4, CHANNEL_TILE, 4):
497 vst1q_f32(o${Y} + 4, vo${Y}c${ABC[C:C+4]});
509 $for C in range(0, 1 << (LOG2_CHANNEL_TILE - 1), 4):
511 vst1q_f32(o${Y}_tmp, vo${Y}c${ABC[C:C+4]}); o${Y}_tmp += 4;
512 … vo${Y}c${ABC[C:C+4]} = vo${Y}c${ABC[C+(1<<LOG2_CHANNEL_TILE):C+(1<<LOG2_CHANNEL_TILE)+4]};