Lines Matching refs:h
42 dup v0.8h, w8
46 urshr v0.8h, v0.8h, #1
51 st1 {v0.4h}, [x0], x1
52 st1 {v0.4h}, [x6], x1
54 st1 {v0.4h}, [x0], x1
55 st1 {v0.4h}, [x6], x1
61 st1 {v0.8h}, [x0], x1
62 st1 {v0.8h}, [x6], x1
64 st1 {v0.8h}, [x0], x1
65 st1 {v0.8h}, [x6], x1
72 st1 {v0.8h, v1.8h}, [x0], x1
73 st1 {v0.8h, v1.8h}, [x6], x1
75 st1 {v0.8h, v1.8h}, [x0], x1
76 st1 {v0.8h, v1.8h}, [x6], x1
85 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
86 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
88 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
89 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
99 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
100 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
101 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
102 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
104 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
105 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
106 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
107 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
136 ld1 {v0.4h}, [x2]
138 st1 {v0.4h}, [x0], x1
139 st1 {v0.4h}, [x6], x1
141 st1 {v0.4h}, [x0], x1
142 st1 {v0.4h}, [x6], x1
147 ld1 {v0.8h}, [x2]
149 st1 {v0.8h}, [x0], x1
150 st1 {v0.8h}, [x6], x1
152 st1 {v0.8h}, [x0], x1
153 st1 {v0.8h}, [x6], x1
158 ld1 {v0.8h, v1.8h}, [x2]
160 st1 {v0.8h, v1.8h}, [x0], x1
161 st1 {v0.8h, v1.8h}, [x6], x1
163 st1 {v0.8h, v1.8h}, [x0], x1
164 st1 {v0.8h, v1.8h}, [x6], x1
169 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
171 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
172 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
174 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
175 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
180 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
182 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
184 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
185 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
186 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
187 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
189 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
190 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
191 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
192 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
223 ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
224 st1 {v3.4h}, [x0], x1
225 st1 {v2.4h}, [x6], x1
227 st1 {v1.4h}, [x0], x1
228 st1 {v0.4h}, [x6], x1
234 ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
235 st1 {v3.8h}, [x0], x1
236 st1 {v2.8h}, [x6], x1
238 st1 {v1.8h}, [x0], x1
239 st1 {v0.8h}, [x6], x1
245 ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
248 st1 {v3.8h}, [x0], x1
249 st1 {v2.8h}, [x6], x1
253 st1 {v1.8h}, [x0], x1
254 st1 {v0.8h}, [x6], x1
260 ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
265 st1 {v3.8h}, [x0], x1
266 st1 {v2.8h}, [x6], x1
272 st1 {v1.8h}, [x0], x1
273 st1 {v0.8h}, [x6], x1
279 ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
288 st1 {v3.8h}, [x0], x1
289 st1 {v2.8h}, [x6], x1
299 st1 {v1.8h}, [x0], x1
300 st1 {v0.8h}, [x6], x1
329 ld1 {v0.4h}, [x2]
330 addv h0, v0.4h
331 urshr v0.4h, v0.4h, #2
332 dup v0.4h, v0.h[0]
334 st1 {v0.4h}, [x0], x1
335 st1 {v0.4h}, [x6], x1
337 st1 {v0.4h}, [x0], x1
338 st1 {v0.4h}, [x6], x1
343 ld1 {v0.8h}, [x2]
344 addv h0, v0.8h
345 urshr v0.4h, v0.4h, #3
346 dup v0.8h, v0.h[0]
348 st1 {v0.8h}, [x0], x1
349 st1 {v0.8h}, [x6], x1
351 st1 {v0.8h}, [x0], x1
352 st1 {v0.8h}, [x6], x1
357 ld1 {v0.8h, v1.8h}, [x2]
358 addp v0.8h, v0.8h, v1.8h
359 addv h0, v0.8h
360 urshr v2.4h, v0.4h, #4
361 dup v0.8h, v2.h[0]
362 dup v1.8h, v2.h[0]
364 st1 {v0.8h, v1.8h}, [x0], x1
365 st1 {v0.8h, v1.8h}, [x6], x1
367 st1 {v0.8h, v1.8h}, [x0], x1
368 st1 {v0.8h, v1.8h}, [x6], x1
373 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
374 addp v0.8h, v0.8h, v1.8h
375 addp v2.8h, v2.8h, v3.8h
376 addp v0.8h, v0.8h, v2.8h
377 uaddlv s0, v0.8h
378 rshrn v4.4h, v0.4s, #5
379 dup v0.8h, v4.h[0]
380 dup v1.8h, v4.h[0]
381 dup v2.8h, v4.h[0]
382 dup v3.8h, v4.h[0]
384 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
385 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
387 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
388 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
393 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
394 addp v0.8h, v0.8h, v1.8h
395 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
396 addp v2.8h, v2.8h, v3.8h
397 addp v4.8h, v4.8h, v5.8h
398 addp v6.8h, v6.8h, v7.8h
399 addp v0.8h, v0.8h, v2.8h
400 addp v4.8h, v4.8h, v6.8h
401 addp v0.8h, v0.8h, v4.8h
402 uaddlv s0, v0.8h
403 rshrn v4.4h, v0.4s, #6
405 dup v0.8h, v4.h[0]
406 dup v1.8h, v4.h[0]
407 dup v2.8h, v4.h[0]
408 dup v3.8h, v4.h[0]
410 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
411 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
412 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
413 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
415 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
416 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
417 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
418 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
452 ld1 {v0.4h}, [x2]
453 addv h0, v0.4h
454 urshr v0.4h, v0.4h, #2
455 dup v0.8h, v0.h[0]
460 st1 {v0.4h}, [x0], x1
461 st1 {v0.4h}, [x6], x1
463 st1 {v0.4h}, [x0], x1
464 st1 {v0.4h}, [x6], x1
470 ld1 {v0.8h}, [x2]
471 addv h0, v0.8h
472 urshr v0.4h, v0.4h, #3
473 dup v0.8h, v0.h[0]
478 st1 {v0.8h}, [x0], x1
479 st1 {v0.8h}, [x6], x1
481 st1 {v0.8h}, [x0], x1
482 st1 {v0.8h}, [x6], x1
488 ld1 {v0.8h, v1.8h}, [x2]
489 addp v0.8h, v0.8h, v1.8h
490 addv h0, v0.8h
491 urshr v2.4h, v0.4h, #4
492 dup v0.8h, v2.h[0]
493 dup v1.8h, v2.h[0]
499 st1 {v0.8h, v1.8h}, [x0], x1
500 st1 {v0.8h, v1.8h}, [x6], x1
502 st1 {v0.8h, v1.8h}, [x0], x1
503 st1 {v0.8h, v1.8h}, [x6], x1
509 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
510 addp v0.8h, v0.8h, v1.8h
511 addp v2.8h, v2.8h, v3.8h
512 addp v0.8h, v0.8h, v2.8h
513 uaddlp v0.4s, v0.8h
515 rshrn v4.4h, v0.4s, #5
516 dup v0.8h, v4.h[0]
524 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
525 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
527 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
528 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
534 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
535 addp v0.8h, v0.8h, v1.8h
536 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
537 addp v2.8h, v2.8h, v3.8h
538 addp v4.8h, v4.8h, v5.8h
539 addp v6.8h, v6.8h, v7.8h
540 addp v0.8h, v0.8h, v2.8h
541 addp v4.8h, v4.8h, v6.8h
542 addp v0.8h, v0.8h, v4.8h
543 uaddlv s0, v0.8h
544 rshrn v4.4h, v0.4s, #6
545 dup v0.8h, v4.h[0]
554 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
555 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
556 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
557 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
559 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
560 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
561 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
562 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
608 ld1 {v0.4h}, [x2], #8
609 uaddlv s0, v0.4h
614 ld1 {v1.4h}, [x2]
616 uaddlv s1, v1.4h
630 dup v0.4h, v0.h[0]
632 st1 {v0.4h}, [x0], x1
633 st1 {v0.4h}, [x6], x1
635 st1 {v0.4h}, [x0], x1
636 st1 {v0.4h}, [x6], x1
642 ld1 {v0.8h}, [x2], #16
643 uaddlv s0, v0.8h
648 ld1 {v1.8h}, [x2]
650 uaddlv s1, v1.8h
664 dup v0.8h, v0.h[0]
666 st1 {v0.8h}, [x0], x1
667 st1 {v0.8h}, [x6], x1
669 st1 {v0.8h}, [x0], x1
670 st1 {v0.8h}, [x6], x1
676 ld1 {v0.8h, v1.8h}, [x2], #32
677 addp v0.8h, v0.8h, v1.8h
679 uaddlv s0, v0.8h
683 ld1 {v1.8h, v2.8h}, [x2]
685 addp v1.8h, v1.8h, v2.8h
686 uaddlv s1, v1.8h
700 dup v0.8h, v4.h[0]
701 dup v1.8h, v4.h[0]
703 st1 {v0.8h, v1.8h}, [x0], x1
704 st1 {v0.8h, v1.8h}, [x6], x1
706 st1 {v0.8h, v1.8h}, [x0], x1
707 st1 {v0.8h, v1.8h}, [x6], x1
713 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
714 addp v0.8h, v0.8h, v1.8h
715 addp v2.8h, v2.8h, v3.8h
716 addp v0.8h, v0.8h, v2.8h
718 uaddlv s0, v0.8h
722 ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2]
724 addp v1.8h, v1.8h, v2.8h
725 addp v3.8h, v3.8h, v4.8h
726 addp v1.8h, v1.8h, v3.8h
727 uaddlv s1, v1.8h
741 dup v0.8h, v4.h[0]
742 dup v1.8h, v4.h[0]
743 dup v2.8h, v4.h[0]
744 dup v3.8h, v4.h[0]
746 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
747 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
749 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
750 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
756 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
757 addp v0.8h, v0.8h, v1.8h
758 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
759 addp v2.8h, v2.8h, v3.8h
760 addp v4.8h, v4.8h, v5.8h
761 addp v6.8h, v6.8h, v7.8h
762 addp v0.8h, v0.8h, v2.8h
763 addp v4.8h, v4.8h, v6.8h
764 addp v0.8h, v0.8h, v4.8h
766 uaddlv s0, v0.8h
770 ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2], #64
772 addp v1.8h, v1.8h, v2.8h
773 ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2]
774 addp v3.8h, v3.8h, v4.8h
775 addp v20.8h, v20.8h, v21.8h
776 addp v22.8h, v22.8h, v23.8h
777 addp v1.8h, v1.8h, v3.8h
778 addp v20.8h, v20.8h, v22.8h
779 addp v1.8h, v1.8h, v20.8h
780 uaddlv s1, v1.8h
795 dup v0.8h, v4.h[0]
796 dup v1.8h, v4.h[0]
797 dup v2.8h, v4.h[0]
798 dup v3.8h, v4.h[0]
800 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
801 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
802 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
803 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
805 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
806 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
807 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
808 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
835 ld1r {v4.8h}, [x2]
846 sub v6.8h, v5.8h, v4.8h // top - topleft
848 ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7
851 add v16.8h, v6.8h, v0.8h // base
852 add v17.8h, v6.8h, v2.8h
853 sabd v20.8h, v5.8h, v16.8h // tdiff
854 sabd v21.8h, v5.8h, v17.8h
855 sabd v22.8h, v4.8h, v16.8h // tldiff
856 sabd v23.8h, v4.8h, v17.8h
857 sabd v16.8h, v0.8h, v16.8h // ldiff
858 sabd v17.8h, v2.8h, v17.8h
859 umin v18.8h, v20.8h, v22.8h // min(tdiff, tldiff)
860 umin v19.8h, v21.8h, v23.8h
861 cmge v20.8h, v22.8h, v20.8h // tldiff >= tdiff
862 cmge v21.8h, v23.8h, v21.8h
863 cmge v16.8h, v18.8h, v16.8h // min(tdiff, tldiff) >= ldiff
864 cmge v17.8h, v19.8h, v17.8h
881 ld1 {v5.8h}, [x8], #16
889 ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
891 sub v6.8h, v5.8h, v4.8h // top - topleft
892 add v16.8h, v6.8h, v0.8h // base
893 add v17.8h, v6.8h, v1.8h
894 add v18.8h, v6.8h, v2.8h
895 add v19.8h, v6.8h, v3.8h
896 sabd v20.8h, v5.8h, v16.8h // tdiff
897 sabd v21.8h, v5.8h, v17.8h
898 sabd v22.8h, v5.8h, v18.8h
899 sabd v23.8h, v5.8h, v19.8h
900 sabd v24.8h, v4.8h, v16.8h // tldiff
901 sabd v25.8h, v4.8h, v17.8h
902 sabd v26.8h, v4.8h, v18.8h
903 sabd v27.8h, v4.8h, v19.8h
904 sabd v16.8h, v0.8h, v16.8h // ldiff
905 sabd v17.8h, v1.8h, v17.8h
906 sabd v18.8h, v2.8h, v18.8h
907 sabd v19.8h, v3.8h, v19.8h
908 umin v28.8h, v20.8h, v24.8h // min(tdiff, tldiff)
909 umin v29.8h, v21.8h, v25.8h
910 umin v30.8h, v22.8h, v26.8h
911 umin v31.8h, v23.8h, v27.8h
912 cmge v20.8h, v24.8h, v20.8h // tldiff >= tdiff
913 cmge v21.8h, v25.8h, v21.8h
914 cmge v22.8h, v26.8h, v22.8h
915 cmge v23.8h, v27.8h, v23.8h
916 cmge v16.8h, v28.8h, v16.8h // min(tdiff, tldiff) >= ldiff
917 cmge v17.8h, v29.8h, v17.8h
918 cmge v18.8h, v30.8h, v18.8h
919 cmge v19.8h, v31.8h, v19.8h
928 st1 {v23.8h}, [x0], #16
929 st1 {v22.8h}, [x6], #16
931 st1 {v21.8h}, [x5], #16
932 st1 {v20.8h}, [x10], #16
934 ld1 {v5.8h}, [x8], #16
944 ld1 {v5.8h}, [x8], #16
974 ld1r {v4.8h}, [x12] // bottom
986 dup v5.8h, v6.h[3] // right
987 sub v6.8h, v6.8h, v4.8h // top-bottom
988 uxtl v7.8h, v7.8b // weights_hor
989 add v31.4h, v4.4h, v5.4h // bottom+right
991 ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 // left
993 ushll v20.4s, v31.4h, #8 // (bottom+right)*256
994 ushll v21.4s, v31.4h, #8
995 ushll v22.4s, v31.4h, #8
996 ushll v23.4s, v31.4h, #8
1001 sub v0.8h, v0.8h, v5.8h // left-right
1002 sub v1.8h, v1.8h, v5.8h
1003 uxtl v16.8h, v16.8b // weights_ver
1004 uxtl v18.8h, v18.8b
1005 smlal v20.4s, v0.4h, v7.4h // += (left-right)*weights_hor
1006 smlal2 v21.4s, v0.8h, v7.8h
1007 smlal v22.4s, v1.4h, v7.4h
1008 smlal2 v23.4s, v1.8h, v7.8h
1009 smlal v20.4s, v6.4h, v16.4h // += (top-bottom)*weights_ver
1010 smlal2 v21.4s, v6.8h, v16.8h
1011 smlal v22.4s, v6.4h, v18.4h
1012 smlal2 v23.4s, v6.8h, v18.8h
1013 rshrn v20.4h, v20.4s, #9
1014 rshrn v21.4h, v21.4s, #9
1015 rshrn v22.4h, v22.4s, #9
1016 rshrn v23.4h, v23.4s, #9
1017 st1 {v20.4h}, [x0], x1
1018 st1 {v21.4h}, [x6], x1
1020 st1 {v22.4h}, [x0], x1
1021 st1 {v23.4h}, [x6], x1
1026 ld1 {v6.8h}, [x8] // top
1030 dup v5.8h, v6.h[7] // right
1031 sub v6.8h, v6.8h, v4.8h // top-bottom
1032 uxtl v7.8h, v7.8b // weights_hor
1033 add v31.4h, v4.4h, v5.4h // bottom+right
1035 ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left
1037 ushll v20.4s, v31.4h, #8 // (bottom+right)*256
1038 ushll v21.4s, v31.4h, #8
1039 ushll v22.4s, v31.4h, #8
1040 ushll v23.4s, v31.4h, #8
1041 ushll v24.4s, v31.4h, #8
1042 ushll v25.4s, v31.4h, #8
1043 ushll v26.4s, v31.4h, #8
1044 ushll v27.4s, v31.4h, #8
1045 sub v0.8h, v0.8h, v5.8h // left-right
1046 sub v1.8h, v1.8h, v5.8h
1047 sub v2.8h, v2.8h, v5.8h
1048 sub v3.8h, v3.8h, v5.8h
1049 uxtl v16.8h, v16.8b // weights_ver
1050 uxtl v17.8h, v17.8b
1051 uxtl v18.8h, v18.8b
1052 uxtl v19.8h, v19.8b
1053 smlal v20.4s, v3.4h, v7.4h // += (left-right)*weights_hor
1054 smlal2 v21.4s, v3.8h, v7.8h // (left flipped)
1055 smlal v22.4s, v2.4h, v7.4h
1056 smlal2 v23.4s, v2.8h, v7.8h
1057 smlal v24.4s, v1.4h, v7.4h
1058 smlal2 v25.4s, v1.8h, v7.8h
1059 smlal v26.4s, v0.4h, v7.4h
1060 smlal2 v27.4s, v0.8h, v7.8h
1061 smlal v20.4s, v6.4h, v16.4h // += (top-bottom)*weights_ver
1062 smlal2 v21.4s, v6.8h, v16.8h
1063 smlal v22.4s, v6.4h, v17.4h
1064 smlal2 v23.4s, v6.8h, v17.8h
1065 smlal v24.4s, v6.4h, v18.4h
1066 smlal2 v25.4s, v6.8h, v18.8h
1067 smlal v26.4s, v6.4h, v19.4h
1068 smlal2 v27.4s, v6.8h, v19.8h
1069 rshrn v20.4h, v20.4s, #9
1070 rshrn2 v20.8h, v21.4s, #9
1071 rshrn v21.4h, v22.4s, #9
1072 rshrn2 v21.8h, v23.4s, #9
1073 rshrn v22.4h, v24.4s, #9
1074 rshrn2 v22.8h, v25.4s, #9
1075 rshrn v23.4h, v26.4s, #9
1076 rshrn2 v23.8h, v27.4s, #9
1077 st1 {v20.8h}, [x0], x1
1078 st1 {v21.8h}, [x6], x1
1080 st1 {v22.8h}, [x0], x1
1081 st1 {v23.8h}, [x6], x1
1090 ld1r {v5.8h}, [x12] // right
1094 add v31.4h, v4.4h, v5.4h // bottom+right
1097 ld2r {v0.8h, v1.8h}, [x2], x7 // left
1099 sub v0.8h, v0.8h, v5.8h // left-right
1100 sub v1.8h, v1.8h, v5.8h
1101 uxtl v16.8h, v16.8b // weights_ver
1102 uxtl v17.8h, v17.8b
1105 ld1 {v2.8h, v3.8h}, [x8], #32 // top
1106 ushll v20.4s, v31.4h, #8 // (bottom+right)*256
1107 ushll v21.4s, v31.4h, #8
1108 ushll v22.4s, v31.4h, #8
1109 ushll v23.4s, v31.4h, #8
1110 ushll v24.4s, v31.4h, #8
1111 ushll v25.4s, v31.4h, #8
1112 ushll v26.4s, v31.4h, #8
1113 ushll v27.4s, v31.4h, #8
1114 uxtl v6.8h, v7.8b // weights_hor
1115 uxtl2 v7.8h, v7.16b
1116 sub v2.8h, v2.8h, v4.8h // top-bottom
1117 sub v3.8h, v3.8h, v4.8h
1118 smlal v20.4s, v1.4h, v6.4h // += (left-right)*weights_hor
1119 smlal2 v21.4s, v1.8h, v6.8h // (left flipped)
1120 smlal v22.4s, v1.4h, v7.4h
1121 smlal2 v23.4s, v1.8h, v7.8h
1122 smlal v24.4s, v0.4h, v6.4h
1123 smlal2 v25.4s, v0.8h, v6.8h
1124 smlal v26.4s, v0.4h, v7.4h
1125 smlal2 v27.4s, v0.8h, v7.8h
1126 smlal v20.4s, v2.4h, v16.4h // += (top-bottom)*weights_ver
1127 smlal2 v21.4s, v2.8h, v16.8h
1128 smlal v22.4s, v3.4h, v16.4h
1129 smlal2 v23.4s, v3.8h, v16.8h
1130 smlal v24.4s, v2.4h, v17.4h
1131 smlal2 v25.4s, v2.8h, v17.8h
1132 smlal v26.4s, v3.4h, v17.4h
1133 smlal2 v27.4s, v3.8h, v17.8h
1134 rshrn v20.4h, v20.4s, #9
1135 rshrn2 v20.8h, v21.4s, #9
1136 rshrn v21.4h, v22.4s, #9
1137 rshrn2 v21.8h, v23.4s, #9
1138 rshrn v22.4h, v24.4s, #9
1139 rshrn2 v22.8h, v25.4s, #9
1140 rshrn v23.4h, v26.4s, #9
1141 rshrn2 v23.8h, v27.4s, #9
1143 st1 {v20.8h, v21.8h}, [x0], #32
1144 st1 {v22.8h, v23.8h}, [x6], #32
1178 ld1r {v4.8h}, [x8] // bottom
1187 sub v6.8h, v6.8h, v4.8h // top-bottom
1192 ushll v16.8h, v16.8b, #7 // weights_ver << 7
1193 ushll v18.8h, v18.8b, #7
1194 sqrdmulh v20.8h, v6.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8
1195 sqrdmulh v21.8h, v6.8h, v18.8h
1196 add v20.8h, v20.8h, v4.8h
1197 add v21.8h, v21.8h, v4.8h
1207 ld1 {v6.8h}, [x2] // top
1208 sub v6.8h, v6.8h, v4.8h // top-bottom
1211 ushll v16.8h, v16.8b, #7 // weights_ver << 7
1212 ushll v17.8h, v17.8b, #7
1213 ushll v18.8h, v18.8b, #7
1214 ushll v19.8h, v19.8b, #7
1215 sqrdmulh v20.8h, v6.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8
1216 sqrdmulh v21.8h, v6.8h, v17.8h
1217 sqrdmulh v22.8h, v6.8h, v18.8h
1218 sqrdmulh v23.8h, v6.8h, v19.8h
1219 add v20.8h, v20.8h, v4.8h
1220 add v21.8h, v21.8h, v4.8h
1221 add v22.8h, v22.8h, v4.8h
1222 add v23.8h, v23.8h, v4.8h
1223 st1 {v20.8h}, [x0], x1
1224 st1 {v21.8h}, [x6], x1
1226 st1 {v22.8h}, [x0], x1
1227 st1 {v23.8h}, [x6], x1
1243 ushll v16.8h, v16.8b, #7 // weights_ver << 7
1244 ushll v17.8h, v17.8b, #7
1245 ushll v18.8h, v18.8b, #7
1246 ushll v19.8h, v19.8b, #7
1248 ld1 {v2.8h, v3.8h}, [x2], #32 // top
1249 sub v2.8h, v2.8h, v4.8h // top-bottom
1250 sub v3.8h, v3.8h, v4.8h
1251 sqrdmulh v20.8h, v2.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8
1252 sqrdmulh v21.8h, v3.8h, v16.8h
1253 sqrdmulh v22.8h, v2.8h, v17.8h
1254 sqrdmulh v23.8h, v3.8h, v17.8h
1255 sqrdmulh v24.8h, v2.8h, v18.8h
1256 sqrdmulh v25.8h, v3.8h, v18.8h
1257 sqrdmulh v26.8h, v2.8h, v19.8h
1258 sqrdmulh v27.8h, v3.8h, v19.8h
1259 add v20.8h, v20.8h, v4.8h
1260 add v21.8h, v21.8h, v4.8h
1261 add v22.8h, v22.8h, v4.8h
1262 add v23.8h, v23.8h, v4.8h
1263 add v24.8h, v24.8h, v4.8h
1264 add v25.8h, v25.8h, v4.8h
1265 add v26.8h, v26.8h, v4.8h
1266 add v27.8h, v27.8h, v4.8h
1268 st1 {v20.8h, v21.8h}, [x0], #32
1269 st1 {v22.8h, v23.8h}, [x6], #32
1270 st1 {v24.8h, v25.8h}, [x5], #32
1271 st1 {v26.8h, v27.8h}, [x8], #32
1306 ld1r {v5.8h}, [x12] // right
1316 ushll v7.8h, v7.8b, #7 // weights_hor << 7
1318 ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 // left
1321 sub v0.8h, v0.8h, v5.8h // left-right
1322 sub v1.8h, v1.8h, v5.8h
1323 sqrdmulh v20.8h, v0.8h, v7.8h // ((left-right)*weights_hor + 128) >> 8
1324 sqrdmulh v21.8h, v1.8h, v7.8h
1325 add v20.8h, v20.8h, v5.8h
1326 add v21.8h, v21.8h, v5.8h
1339 ushll v7.8h, v7.8b, #7 // weights_hor << 7
1341 ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left
1342 sub v3.8h, v3.8h, v5.8h // left-right
1343 sub v2.8h, v2.8h, v5.8h
1344 sub v1.8h, v1.8h, v5.8h
1345 sub v0.8h, v0.8h, v5.8h
1346 sqrdmulh v20.8h, v3.8h, v7.8h // ((left-right)*weights_hor + 128) >> 8
1347 sqrdmulh v21.8h, v2.8h, v7.8h // (left flipped)
1348 sqrdmulh v22.8h, v1.8h, v7.8h
1349 sqrdmulh v23.8h, v0.8h, v7.8h
1350 add v20.8h, v20.8h, v5.8h
1351 add v21.8h, v21.8h, v5.8h
1352 add v22.8h, v22.8h, v5.8h
1353 add v23.8h, v23.8h, v5.8h
1354 st1 {v20.8h}, [x0], x1
1355 st1 {v21.8h}, [x6], x1
1357 st1 {v22.8h}, [x0], x1
1358 st1 {v23.8h}, [x6], x1
1375 ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left
1376 sub v0.8h, v0.8h, v5.8h // left-right
1377 sub v1.8h, v1.8h, v5.8h
1378 sub v2.8h, v2.8h, v5.8h
1379 sub v3.8h, v3.8h, v5.8h
1382 ushll v6.8h, v7.8b, #7 // weights_hor << 7
1383 ushll2 v7.8h, v7.16b, #7
1384 sqrdmulh v20.8h, v3.8h, v6.8h // ((left-right)*weights_hor + 128) >> 8
1385 sqrdmulh v21.8h, v3.8h, v7.8h // (left flipped)
1386 sqrdmulh v22.8h, v2.8h, v6.8h
1387 sqrdmulh v23.8h, v2.8h, v7.8h
1388 sqrdmulh v24.8h, v1.8h, v6.8h
1389 sqrdmulh v25.8h, v1.8h, v7.8h
1390 sqrdmulh v26.8h, v0.8h, v6.8h
1391 sqrdmulh v27.8h, v0.8h, v7.8h
1392 add v20.8h, v20.8h, v5.8h
1393 add v21.8h, v21.8h, v5.8h
1394 add v22.8h, v22.8h, v5.8h
1395 add v23.8h, v23.8h, v5.8h
1396 add v24.8h, v24.8h, v5.8h
1397 add v25.8h, v25.8h, v5.8h
1398 add v26.8h, v26.8h, v5.8h
1399 add v27.8h, v27.8h, v5.8h
1401 st1 {v20.8h, v21.8h}, [x0], #32
1402 st1 {v22.8h, v23.8h}, [x6], #32
1403 st1 {v24.8h, v25.8h}, [x5], #32
1404 st1 {v26.8h, v27.8h}, [x10], #32
1447 dup v30.8h, w4 // bitdepth_max
1449 ld1 {v0.8h, v1.8h}, [x2] // in[]
1453 ld1r {v2.8h}, [x5] // padding
1454 ld1 {v3.8h, v4.8h}, [x4] // padding_mask
1456 movi v31.8h, #9
1468 add v18.8h, v4.8h, v6.8h // in[i+1] + in[i+2]
1469 add v19.8h, v5.8h, v7.8h
1470 add v20.8h, v0.8h, v16.8h
1471 add v21.8h, v1.8h, v17.8h
1472 umull v22.4s, v18.4h, v31.4h // 9*(in[i+1] + in[i+2])
1473 umull2 v23.4s, v18.8h, v31.8h
1474 umull v24.4s, v19.4h, v31.4h
1475 umull2 v25.4s, v19.8h, v31.8h
1476 usubw v22.4s, v22.4s, v20.4h
1477 usubw2 v23.4s, v23.4s, v20.8h
1478 usubw v24.4s, v24.4s, v21.4h
1479 usubw2 v25.4s, v25.4s, v21.8h
1481 sqrshrun v16.4h, v22.4s, #4
1482 sqrshrun2 v16.8h, v23.4s, #4
1483 sqrshrun v17.4h, v24.4s, #4
1484 sqrshrun2 v17.8h, v25.4s, #4
1486 smin v16.8h, v16.8h, v30.8h
1487 smin v17.8h, v17.8h, v30.8h
1489 zip1 v0.8h, v4.8h, v16.8h
1490 zip2 v1.8h, v4.8h, v16.8h
1491 zip1 v2.8h, v5.8h, v17.8h
1492 zip2 v3.8h, v5.8h, v17.8h
1494 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
1503 dup v30.8h, w3 // bitdepth_max
1506 ld1 {v0.8h, v1.8h}, [x2] // in[]
1510 ld1r {v3.8h}, [x2] // in[0] for padding
1511 ld1r {v2.8h}, [x5] // padding
1512 ld1 {v4.8h, v5.8h}, [x4] // padding_mask
1514 movi v31.8h, #9
1523 add v16.8h, v0.8h, v5.8h // in[i+0] + in[i+1]
1524 add v17.8h, v4.8h, v6.8h // in[i-1] + in[i+2]
1525 umull v18.4s, v16.4h, v31.4h // 9*(in[i+0] + in[i+1])
1526 umull2 v19.4s, v16.8h, v31.8h
1527 usubw v18.4s, v18.4s, v17.4h
1528 usubw2 v19.4s, v19.4s, v17.8h
1530 sqrshrun v16.4h, v18.4s, #4
1531 sqrshrun2 v16.8h, v19.4s, #4
1535 smin v16.8h, v16.8h, v30.8h
1537 zip1 v4.8h, v0.8h, v16.8h
1538 zip2 v5.8h, v0.8h, v16.8h
1540 st1 {v2.h}[0], [x5]
1542 st1 {v4.8h, v5.8h}, [x0]
1566 ld1 {v0.8h}, [x2], #16
1568 dup v30.8h, v31.h[0]
1569 dup v31.8h, v31.h[1]
1575 ld1 {v1.8h, v2.8h}, [x2], #32
1581 mul v16.8h, v0.8h, v30.8h
1582 mla v16.8h, v3.8h, v31.8h
1583 mla v16.8h, v5.8h, v30.8h
1584 mul v17.8h, v1.8h, v30.8h
1585 mla v17.8h, v4.8h, v31.8h
1586 mla v17.8h, v6.8h, v30.8h
1589 urshr v16.8h, v16.8h, #4
1590 urshr v17.8h, v17.8h, #4
1592 st1 {v16.8h, v17.8h}, [x0], #32
1604 ld1 {v3.8h, v4.8h}, [x5] // padding_mask
1606 ld1r {v2.8h}, [x6]
1615 mul v16.8h, v0.8h, v30.8h
1616 mla v16.8h, v3.8h, v31.8h
1617 mla v16.8h, v5.8h, v30.8h
1618 mul v17.8h, v1.8h, v30.8h
1619 mla v17.8h, v4.8h, v31.8h
1620 mla v17.8h, v6.8h, v30.8h
1622 urshr v16.8h, v16.8h, #4
1623 urshr v17.8h, v17.8h, #4
1624 st1 {v16.8h, v17.8h}, [x0], #32
1637 movi v29.8h, #2
1638 ld1 {v0.8h}, [x2], #16
1639 movi v30.8h, #4
1640 movi v31.8h, #4
1641 ins v0.h[0], v0.h[1]
1647 ld1 {v1.8h, v2.8h}, [x2], #32
1657 mul v20.8h, v0.8h, v29.8h
1658 mla v20.8h, v3.8h, v30.8h
1659 mla v20.8h, v5.8h, v31.8h
1660 mla v20.8h, v16.8h, v30.8h
1661 mla v20.8h, v18.8h, v29.8h
1662 mul v21.8h, v1.8h, v29.8h
1663 mla v21.8h, v4.8h, v30.8h
1664 mla v21.8h, v6.8h, v31.8h
1665 mla v21.8h, v17.8h, v30.8h
1666 mla v21.8h, v19.8h, v29.8h
1669 urshr v20.8h, v20.8h, #4
1670 urshr v21.8h, v21.8h, #4
1672 st1 {v20.8h, v21.8h}, [x0], #32
1684 ld1 {v3.8h, v4.8h, v5.8h}, [x5] // padding_mask
1686 ld1r {v28.8h}, [x6]
1700 mul v20.8h, v0.8h, v29.8h
1701 mla v20.8h, v3.8h, v30.8h
1702 mla v20.8h, v5.8h, v31.8h
1703 mla v20.8h, v16.8h, v30.8h
1704 mla v20.8h, v18.8h, v29.8h
1705 mul v21.8h, v1.8h, v29.8h
1706 mla v21.8h, v4.8h, v30.8h
1707 mla v21.8h, v6.8h, v31.8h
1708 mla v21.8h, v17.8h, v30.8h
1709 mla v21.8h, v19.8h, v29.8h
1714 urshr v20.8h, v20.8h, #4
1715 urshr v21.8h, v21.8h, #4
1717 st1 {v20.8h, v21.8h}, [x0], #32
1727 st1 {v28.8h}, [x0], #16
1736 dup v0.8h, w1
1739 st1 {v0.8h}, [x0], #16
1755 ld1r {v31.8h}, [x10] // padding
1773 dup v4.4h, w9 // frac
1774 dup v5.4h, w11
1777 sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base]
1778 sub v7.4h, v3.4h, v2.4h
1779 ushll v16.4s, v0.4h, #6 // top[base]*64
1780 ushll v17.4s, v2.4h, #6
1781 smlal v16.4s, v6.4h, v4.4h // + (top[base+1]-top[base])*frac
1782 smlal v17.4s, v7.4h, v5.4h
1783 rshrn v16.4h, v16.4s, #6
1784 rshrn v17.4h, v17.4s, #6
1785 st1 {v16.4h}, [x0], x1
1788 st1 {v17.4h}, [x0], x1
1793 st1 {v31.4h}, [x0], x1
1795 st1 {v31.4h}, [x0], x1
1811 dup v4.8h, w9 // frac
1812 dup v5.8h, w11
1813 ld1 {v0.8h}, [x8] // top[base]
1814 ld1 {v2.8h}, [x10]
1819 dup v6.8h, w9 // 64 - frac
1820 dup v7.8h, w11
1823 umull v16.4s, v0.4h, v6.4h // top[base]*(64-frac)
1824 umlal v16.4s, v1.4h, v4.4h // + top[base+1]*frac
1825 umull2 v17.4s, v0.8h, v6.8h
1826 umlal2 v17.4s, v1.8h, v4.8h
1827 umull v18.4s, v2.4h, v7.4h
1828 umlal v18.4s, v3.4h, v5.4h
1829 umull2 v19.4s, v2.8h, v7.8h
1830 umlal2 v19.4s, v3.8h, v5.8h
1831 rshrn v16.4h, v16.4s, #6
1832 rshrn2 v16.8h, v17.4s, #6
1833 rshrn v17.4h, v18.4s, #6
1834 rshrn2 v17.8h, v19.4s, #6
1835 st1 {v16.8h}, [x0], x1
1838 st1 {v17.8h}, [x0], x1
1843 st1 {v31.8h}, [x0], x1
1845 st1 {v31.8h}, [x0], x1
1869 dup v6.8h, w9 // frac
1870 dup v7.8h, w11
1871 ld1 {v0.8h, v1.8h, v2.8h}, [x8], #48 // top[base]
1872 ld1 {v3.8h, v4.8h, v5.8h}, [x10], #48
1875 dup v16.8h, w9 // 64 - frac
1876 dup v17.8h, w11
1884 umull v22.4s, v0.4h, v16.4h // top[base]*(64-frac)
1885 umlal v22.4s, v18.4h, v6.4h // + top[base+1]*frac
1886 umull2 v23.4s, v0.8h, v16.8h
1887 umlal2 v23.4s, v18.8h, v6.8h
1888 umull v24.4s, v1.4h, v16.4h
1889 umlal v24.4s, v19.4h, v6.4h
1890 umull2 v25.4s, v1.8h, v16.8h
1891 umlal2 v25.4s, v19.8h, v6.8h
1892 umull v26.4s, v3.4h, v17.4h
1893 umlal v26.4s, v20.4h, v7.4h
1894 umull2 v27.4s, v3.8h, v17.8h
1895 umlal2 v27.4s, v20.8h, v7.8h
1896 umull v28.4s, v4.4h, v17.4h
1897 umlal v28.4s, v21.4h, v7.4h
1898 umull2 v29.4s, v4.8h, v17.8h
1899 umlal2 v29.4s, v21.8h, v7.8h
1900 rshrn v22.4h, v22.4s, #6
1901 rshrn2 v22.8h, v23.4s, #6
1902 rshrn v23.4h, v24.4s, #6
1903 rshrn2 v23.8h, v25.4s, #6
1904 rshrn v24.4h, v26.4s, #6
1905 rshrn2 v24.8h, v27.4s, #6
1906 rshrn v25.4h, v28.4s, #6
1907 rshrn2 v25.8h, v29.4s, #6
1908 st1 {v22.8h, v23.8h}, [x0], #32
1909 st1 {v24.8h, v25.8h}, [x13], #32
1912 ld1 {v1.8h, v2.8h}, [x8], #32 // top[base]
1914 ld1 {v4.8h, v5.8h}, [x10], #32
1928 st1 {v31.8h}, [x0], #16
1930 st1 {v31.8h}, [x13], #16
1968 dup v4.4h, w9 // frac
1969 dup v5.4h, w11
1970 uzp2 v1.8h, v0.8h, v0.8h // top[base+1]
1971 uzp1 v0.8h, v0.8h, v0.8h // top[base]
1972 uzp2 v3.8h, v2.8h, v2.8h
1973 uzp1 v2.8h, v2.8h, v2.8h
1974 sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base]
1975 sub v7.4h, v3.4h, v2.4h
1976 ushll v16.4s, v0.4h, #6 // top[base]*64
1977 ushll v17.4s, v2.4h, #6
1978 smlal v16.4s, v6.4h, v4.4h // + (top[base+1]-top[base])*frac
1979 smlal v17.4s, v7.4h, v5.4h
1980 rshrn v16.4h, v16.4s, #6
1981 rshrn v17.4h, v17.4s, #6
1982 st1 {v16.4h}, [x0], x1
1985 st1 {v17.4h}, [x0], x1
1990 st1 {v31.4h}, [x0], x1
1992 st1 {v31.4h}, [x0], x1
2006 dup v4.8h, w9 // frac
2007 dup v5.8h, w11
2008 ld1 {v0.8h, v1.8h}, [x8] // top[base]
2009 ld1 {v2.8h, v3.8h}, [x10]
2012 dup v6.8h, w9 // 64 - frac
2013 dup v7.8h, w11
2014 uzp2 v20.8h, v0.8h, v1.8h // top[base+1]
2015 uzp1 v0.8h, v0.8h, v1.8h // top[base]
2016 uzp2 v21.8h, v2.8h, v3.8h
2017 uzp1 v2.8h, v2.8h, v3.8h
2018 umull v16.4s, v0.4h, v6.4h // top[base]*(64-frac)
2019 umlal v16.4s, v20.4h, v4.4h // + top[base+1]*frac
2020 umull2 v17.4s, v0.8h, v6.8h
2021 umlal2 v17.4s, v20.8h, v4.8h
2022 umull v18.4s, v2.4h, v7.4h
2023 umlal v18.4s, v21.4h, v5.4h
2024 umull2 v19.4s, v2.8h, v7.8h
2025 umlal2 v19.4s, v21.8h, v5.8h
2026 rshrn v16.4h, v16.4s, #6
2027 rshrn2 v16.8h, v17.4s, #6
2028 rshrn v17.4h, v18.4s, #6
2029 rshrn2 v17.8h, v19.4s, #6
2030 st1 {v16.8h}, [x0], x1
2033 st1 {v17.8h}, [x0], x1
2038 st1 {v31.8h}, [x0], x1
2040 st1 {v31.8h}, [x0], x1
2052 ld1 {v0.8h}, [x1]
2054 rev64 v0.8h, v0.8h
2081 ld1 {v31.8h}, [x11] // increments
2088 dup v30.4h, w7 // -dy
2091 mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy
2092 movi v25.8h, #0x3e
2093 add v30.4h, v16.4h, v30.4h // -= dy
2096 ld1 {v0.8h, v1.8h, v2.8h}, [x3] // left[]
2098 movi v26.8h, #64
2101 shrn v29.8b, v30.8h, #6 // ypos >> 6
2106 movi v23.4h, #1, lsl #8
2119 sub v28.4h, v26.4h, v27.4h // 64 - frac_y
2129 dup v16.4h, w8 // xpos
2138 dup v17.4h, w8 // xpos
2150 sshr v20.8h, v16.8h, #6 // first base_x for each row
2162 sub v17.8h, v26.8h, v16.8h // 64 - frac_x
2164 add v20.8h, v20.8h, v31.8h // actual base_x
2166 umull v21.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
2167 umlal v21.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
2168 umull2 v22.4s, v18.8h, v28.8h
2169 umlal2 v22.4s, v19.8h, v27.8h
2171 umull v23.4s, v4.4h, v17.4h // top[base_x]*(64-frac_x)
2172 umlal v23.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x
2173 umull2 v24.4s, v4.8h, v17.8h
2174 umlal2 v24.4s, v5.8h, v16.8h
2176 cmge v20.8h, v20.8h, #0
2178 rshrn v21.4h, v21.4s, #6
2179 rshrn2 v21.8h, v22.4s, #6
2180 rshrn v22.4h, v23.4s, #6
2181 rshrn2 v22.8h, v24.4s, #6
2200 umull v20.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
2201 umlal v20.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
2202 umull2 v21.4s, v18.8h, v28.8h
2203 umlal2 v21.4s, v19.8h, v27.8h
2205 rshrn v20.4h, v20.4s, #6
2206 rshrn2 v20.8h, v21.4s, #6
2228 dup v18.8h, w7 // -dy
2231 mul v16.8h, v31.8h, v18.8h // {0,1,2,3,4,5,6,7}* -dy
2232 movi v25.8h, #0x3e
2233 add v16.8h, v16.8h, v18.8h // -= dy
2236 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x3] // left[]
2237 ld1r {v15.8h}, [x2] // left[0] == top[0]
2239 movi v26.8h, #64
2242 shrn v29.8b, v16.8h, #6 // ypos >> 6
2245 movi v23.8h, #1, lsl #8
2259 sub v28.8h, v26.8h, v27.8h // 64 - frac_y
2264 dup v16.8h, w8 // xpos
2270 dup v17.8h, w8 // xpos
2275 ld1 {v4.8h, v5.8h}, [x9] // top[base_x]
2277 ld1 {v6.8h, v7.8h}, [x11]
2283 sshr v21.8h, v16.8h, #6 // first base_x
2284 sshr v22.8h, v17.8h, #6
2294 umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
2295 umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
2297 sub v8.8h, v26.8h, v16.8h // 64 - frac_x
2298 sub v9.8h, v26.8h, v17.8h
2300 umull2 v11.4s, v18.8h, v28.8h
2301 umlal2 v11.4s, v19.8h, v27.8h
2303 add v21.8h, v21.8h, v31.8h // actual base_x
2304 add v22.8h, v22.8h, v31.8h
2306 umull v12.4s, v19.4h, v28.4h
2307 umlal v12.4s, v20.4h, v27.4h
2308 umull2 v13.4s, v19.8h, v28.8h
2309 umlal2 v13.4s, v20.8h, v27.8h
2311 rshrn v10.4h, v10.4s, #6
2312 rshrn2 v10.8h, v11.4s, #6
2313 rshrn v11.4h, v12.4s, #6
2314 rshrn2 v11.8h, v13.4s, #6
2316 umull v12.4s, v4.4h, v8.4h // top[base_x]*(64-frac_x)
2317 umlal v12.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x
2318 umull2 v13.4s, v4.8h, v8.8h
2319 umlal2 v13.4s, v5.8h, v16.8h
2320 umull v14.4s, v6.4h, v9.4h
2321 umlal v14.4s, v7.4h, v17.4h
2322 umull2 v18.4s, v6.8h, v9.8h
2323 umlal2 v18.4s, v7.8h, v17.8h
2325 cmge v21.8h, v21.8h, #0
2326 cmge v22.8h, v22.8h, #0
2328 rshrn v12.4h, v12.4s, #6
2329 rshrn2 v12.8h, v13.4s, #6
2330 rshrn v13.4h, v14.4s, #6
2331 rshrn2 v13.8h, v18.4s, #6
2336 st1 {v10.8h}, [x0], x1
2339 st1 {v11.8h}, [x0], x1
2353 umull v4.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
2354 umlal v4.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
2355 umull2 v5.4s, v18.8h, v28.8h
2356 umlal2 v5.4s, v19.8h, v27.8h
2357 umull v6.4s, v19.4h, v28.4h
2358 umlal v6.4s, v20.4h, v27.4h
2359 umull2 v7.4s, v19.8h, v28.8h
2360 umlal2 v7.4s, v20.8h, v27.8h
2362 rshrn v4.4h, v4.4s, #6
2363 rshrn2 v4.8h, v5.4s, #6
2364 rshrn v5.4h, v6.4s, #6
2365 rshrn2 v5.8h, v7.4s, #6
2367 st1 {v4.8h}, [x0], x1
2369 st1 {v5.8h}, [x0], x1
2394 dup v25.8h, w7 // -dy
2401 movi v11.8h, #8
2402 mul v26.8h, v31.8h, v25.8h // {0,1,2,3,4,5,6,7}* -dy
2403 add v26.8h, v26.8h, v25.8h // -= dy
2404 mul v25.8h, v25.8h, v11.8h // -8*dy
2409 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x3] // left[]
2410 ld1r {v15.8h}, [x2] // left[0] == top[0]
2419 dup v16.8h, w8 // xpos
2425 dup v17.8h, w8 // xpos
2431 sshr v21.8h, v16.8h, #6 // first base_x
2432 sshr v22.8h, v17.8h, #6
2434 ld1 {v4.8h}, [x9], #16 // top[base_x]
2435 ld1 {v6.8h}, [x11], #16
2437 movi v10.8h, #0x3e
2438 movi v11.8h, #64
2443 sub v8.8h, v11.8h, v16.8h // 64 - frac_x
2444 sub v9.8h, v11.8h, v17.8h
2446 add v21.8h, v21.8h, v31.8h // actual base_x
2447 add v22.8h, v22.8h, v31.8h
2450 smov w10, v22.h[0]
2452 shrn v29.8b, v23.8h, #6 // ypos >> 6
2453 movi v12.8h, #64
2456 movi v10.8h, #0x3e
2463 sub v28.8h, v12.8h, v27.8h // 64 - frac_y
2469 movi v11.8h, #1, lsl #8
2496 ld3 {v18.h, v19.h, v20.h}[0], [x10]
2499 ld3 {v18.h, v19.h, v20.h}[1], [x15]
2502 ld3 {v18.h, v19.h, v20.h}[2], [x16]
2505 ld3 {v18.h, v19.h, v20.h}[3], [x17]
2509 ld3 {v18.h, v19.h, v20.h}[4], [x10]
2511 ld3 {v18.h, v19.h, v20.h}[5], [x15]
2512 ld3 {v18.h, v19.h, v20.h}[6], [x16]
2513 ld3 {v18.h, v19.h, v20.h}[7], [x17]
2517 ld1 {v5.8h}, [x9], #16 // top[base_x]
2518 ld1 {v7.8h}, [x11], #16
2520 add v23.8h, v23.8h, v25.8h // ypos -= 8*dy
2522 umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
2523 umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
2524 umull2 v11.4s, v18.8h, v28.8h
2525 umlal2 v11.4s, v19.8h, v27.8h
2526 umull v12.4s, v19.4h, v28.4h
2527 umlal v12.4s, v20.4h, v27.4h
2528 umull2 v13.4s, v19.8h, v28.8h
2529 umlal2 v13.4s, v20.8h, v27.8h
2534 rshrn v10.4h, v10.4s, #6
2535 rshrn2 v10.8h, v11.4s, #6
2536 rshrn v11.4h, v12.4s, #6
2537 rshrn2 v11.8h, v13.4s, #6
2539 umull v12.4s, v4.4h, v8.4h // top[base_x]-*(64-frac_x)
2540 umlal v12.4s, v18.4h, v16.4h // + top[base_x+1]*frac_x
2541 umull2 v13.4s, v4.8h, v8.8h
2542 umlal2 v13.4s, v18.8h, v16.8h
2543 umull v14.4s, v6.4h, v9.4h
2544 umlal v14.4s, v19.4h, v17.4h
2545 umull2 v20.4s, v6.8h, v9.8h
2546 umlal2 v20.4s, v19.8h, v17.8h
2548 cmge v18.8h, v21.8h, #0
2549 cmge v19.8h, v22.8h, #0
2551 rshrn v12.4h, v12.4s, #6
2552 rshrn2 v12.8h, v13.4s, #6
2553 rshrn v13.4h, v14.4s, #6
2554 rshrn2 v13.8h, v20.4s, #6
2559 st1 {v10.8h}, [x0], #16
2561 st1 {v11.8h}, [x13], #16
2564 movi v10.8h, #8
2567 add v21.8h, v21.8h, v10.8h // base_x += 8
2568 add v22.8h, v22.8h, v10.8h
2574 movi v10.8h, #128
2578 add v26.8h, v26.8h, v10.8h // ypos += 2*(1<<6)
2582 ld1 {v5.8h}, [x9], #16 // top[base_x]
2583 ld1 {v7.8h}, [x11], #16
2588 umull v12.4s, v4.4h, v8.4h // top[base_x]-*(64-frac_x)
2589 umlal v12.4s, v18.4h, v16.4h // + top[base_x+1]*frac_x
2590 umull2 v13.4s, v4.8h, v8.8h
2591 umlal2 v13.4s, v18.8h, v16.8h
2592 umull v14.4s, v6.4h, v9.4h
2593 umlal v14.4s, v19.4h, v17.4h
2594 umull2 v20.4s, v6.8h, v9.8h
2595 umlal2 v20.4s, v19.8h, v17.8h
2597 rshrn v12.4h, v12.4s, #6
2598 rshrn2 v12.8h, v13.4s, #6
2599 rshrn v13.4h, v14.4s, #6
2600 rshrn2 v13.8h, v20.4s, #6
2602 st1 {v12.8h}, [x0], #16
2604 st1 {v13.8h}, [x13], #16
2615 movi v12.8h, #64
2616 movi v10.8h, #0x3e
2618 shrn v29.8b, v23.8h, #6 // ypos >> 6
2624 movi v11.8h, #1, lsl #8
2626 add v23.8h, v23.8h, v25.8h // ypos -= 8*dy
2634 sub v28.8h, v12.8h, v27.8h // 64 - frac_y
2653 umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
2654 umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
2655 umull2 v11.4s, v18.8h, v28.8h
2656 umlal2 v11.4s, v19.8h, v27.8h
2657 umull v12.4s, v19.4h, v28.4h
2658 umlal v12.4s, v20.4h, v27.4h
2659 umull2 v13.4s, v19.8h, v28.8h
2660 umlal2 v13.4s, v20.8h, v27.8h
2662 rshrn v10.4h, v10.4s, #6
2663 rshrn2 v10.8h, v11.4s, #6
2664 rshrn v11.4h, v12.4s, #6
2665 rshrn2 v11.8h, v13.4s, #6
2667 st1 {v10.8h}, [x0], x1
2669 st1 {v11.8h}, [x13], x1
2683 ld1 {v18.h}[0], [x10]
2686 ld1 {v18.h}[1], [x15]
2689 ld1 {v18.h}[2], [x16]
2692 ld1 {v18.h}[3], [x17]
2696 ld1 {v18.h}[4], [x10]
2698 ld1 {v18.h}[5], [x15]
2700 ld1 {v18.h}[6], [x16]
2701 ld1 {v18.h}[7], [x17]
2715 ld4 {v19.h, v20.h, v21.h, v22.h}[0], [x10]
2718 ld4 {v19.h, v20.h, v21.h, v22.h}[1], [x15]
2721 ld4 {v19.h, v20.h, v21.h, v22.h}[2], [x16]
2724 ld4 {v19.h, v20.h, v21.h, v22.h}[3], [x17]
2728 ld4 {v19.h, v20.h, v21.h, v22.h}[4], [x10]
2730 ld4 {v19.h, v20.h, v21.h, v22.h}[5], [x15]
2731 ld4 {v19.h, v20.h, v21.h, v22.h}[6], [x16]
2733 ld4 {v19.h, v20.h, v21.h, v22.h}[7], [x17]
2735 umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
2736 umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
2737 umull2 v11.4s, v18.8h, v28.8h
2738 umlal2 v11.4s, v19.8h, v27.8h
2739 umull v12.4s, v19.4h, v28.4h
2740 umlal v12.4s, v20.4h, v27.4h
2741 umull2 v13.4s, v19.8h, v28.8h
2742 umlal2 v13.4s, v20.8h, v27.8h
2744 rshrn v10.4h, v10.4s, #6
2745 rshrn2 v10.8h, v11.4s, #6
2746 rshrn v11.4h, v12.4s, #6
2747 rshrn2 v11.8h, v13.4s, #6
2749 umull v12.4s, v20.4h, v28.4h // left[base_y]*(64-frac_y)
2750 umlal v12.4s, v21.4h, v27.4h // + left[base_y+1]*frac_y
2751 umull2 v13.4s, v20.8h, v28.8h
2752 umlal2 v13.4s, v21.8h, v27.8h
2753 umull v14.4s, v21.4h, v28.4h
2754 umlal v14.4s, v22.4h, v27.4h
2755 umull2 v18.4s, v21.8h, v28.8h
2756 umlal2 v18.4s, v22.8h, v27.8h
2758 rshrn v12.4h, v12.4s, #6
2759 rshrn2 v12.8h, v13.4s, #6
2760 rshrn v13.4h, v14.4s, #6
2761 rshrn2 v13.8h, v18.4s, #6
2763 st1 {v10.8h}, [x0], x1
2765 st1 {v11.8h}, [x13], x1
2766 st1 {v12.8h}, [x0], x1
2767 st1 {v13.8h}, [x13], x1
2780 ld2 {v19.h, v20.h}[0], [x10]
2783 ld2 {v19.h, v20.h}[1], [x15]
2786 ld2 {v19.h, v20.h}[2], [x16]
2789 ld2 {v19.h, v20.h}[3], [x17]
2793 ld2 {v19.h, v20.h}[4], [x10]
2795 ld2 {v19.h, v20.h}[5], [x15]
2796 ld2 {v19.h, v20.h}[6], [x16]
2798 ld2 {v19.h, v20.h}[7], [x17]
2800 umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
2801 umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
2802 umull2 v11.4s, v18.8h, v28.8h
2803 umlal2 v11.4s, v19.8h, v27.8h
2804 umull v12.4s, v19.4h, v28.4h
2805 umlal v12.4s, v20.4h, v27.4h
2806 umull2 v13.4s, v19.8h, v28.8h
2807 umlal2 v13.4s, v20.8h, v27.8h
2809 rshrn v10.4h, v10.4s, #6
2810 rshrn2 v10.8h, v11.4s, #6
2811 rshrn v11.4h, v12.4s, #6
2812 rshrn2 v11.8h, v13.4s, #6
2814 st1 {v10.8h}, [x0], x1
2815 st1 {v11.8h}, [x13], x1
2853 ld1 {v31.8h}, [x11] // increments
2858 dup v30.4h, w7 // -dy
2861 mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy
2862 movi v25.8h, #0x3e
2863 add v30.4h, v16.4h, v30.4h // -= dy
2867 ld1 {v0.8h, v1.8h}, [x3] // left[]
2869 movi v26.8h, #64
2872 shrn v29.8b, v30.8h, #6 // ypos >> 6
2877 movi v23.4h, #1, lsl #8
2890 sub v28.4h, v26.4h, v27.4h // 64 - frac_y
2898 add v31.8h, v31.8h, v31.8h // {0,2,4,6,0,2,4,6}
2901 dup v16.4h, w8 // xpos
2910 dup v17.4h, w8 // xpos
2919 sshr v20.8h, v16.8h, #6 // first base_x for each row
2921 uzp2 v5.8h, v4.8h, v6.8h // top[base_x+1]
2922 uzp1 v4.8h, v4.8h, v6.8h // top[base_x]
2928 sub v17.8h, v26.8h, v16.8h // 64 - frac_x
2930 add v20.8h, v20.8h, v31.8h // actual base_x
2932 umull v21.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
2933 umlal v21.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
2934 umull2 v22.4s, v18.8h, v28.8h
2935 umlal2 v22.4s, v19.8h, v27.8h
2937 umull v23.4s, v4.4h, v17.4h // top[base_x]-*(64-frac_x)
2938 umlal v23.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x
2939 umull2 v24.4s, v4.8h, v17.8h
2940 umlal2 v24.4s, v5.8h, v16.8h
2942 cmge v20.8h, v20.8h, #0
2944 rshrn v21.4h, v21.4s, #6
2945 rshrn2 v21.8h, v22.4s, #6
2946 rshrn v22.4h, v23.4s, #6
2947 rshrn2 v22.8h, v24.4s, #6
2966 umull v20.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
2967 umlal v20.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
2968 umull2 v21.4s, v18.8h, v28.8h
2969 umlal2 v21.4s, v19.8h, v27.8h
2971 rshrn v20.4h, v20.4s, #6
2972 rshrn2 v20.8h, v21.4s, #6
2992 dup v18.8h, w7 // -dy
2995 mul v16.8h, v31.8h, v18.8h // {0,1,2,3,4,5,6,7}* -dy
2996 movi v25.8h, #0x3e
2997 add v16.8h, v16.8h, v18.8h // -= dy
3001 ld1 {v0.8h, v1.8h}, [x3] // left[]
3003 movi v26.8h, #64
3006 shrn v29.8b, v16.8h, #6 // ypos >> 6
3011 movi v23.8h, #1, lsl #8
3024 sub v28.8h, v26.8h, v27.8h // 64 - frac_y
3030 dup v16.8h, w8 // xpos
3036 dup v17.8h, w8 // xpos
3041 ld1 {v4.8h, v5.8h}, [x9] // top[base_x]
3042 ld1 {v6.8h, v7.8h}, [x11]
3046 sshr v21.8h, v16.8h, #6 // first base_x
3047 sshr v22.8h, v17.8h, #6
3051 uzp2 v2.8h, v4.8h, v5.8h // top[base_x+1]
3052 uzp1 v4.8h, v4.8h, v5.8h // top[base_x]
3053 uzp2 v3.8h, v6.8h, v7.8h
3054 uzp1 v6.8h, v6.8h, v7.8h
3061 umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
3062 umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
3064 sub v8.8h, v26.8h, v16.8h // 64 - frac_x
3065 sub v9.8h, v26.8h, v17.8h
3067 umull2 v11.4s, v18.8h, v28.8h
3068 umlal2 v11.4s, v19.8h, v27.8h
3070 add v21.8h, v21.8h, v31.8h // actual base_x
3071 add v22.8h, v22.8h, v31.8h
3073 umull v12.4s, v19.4h, v28.4h
3074 umlal v12.4s, v20.4h, v27.4h
3075 umull2 v13.4s, v19.8h, v28.8h
3076 umlal2 v13.4s, v20.8h, v27.8h
3078 rshrn v10.4h, v10.4s, #6
3079 rshrn2 v10.8h, v11.4s, #6
3080 rshrn v11.4h, v12.4s, #6
3081 rshrn2 v11.8h, v13.4s, #6
3083 umull v12.4s, v4.4h, v8.4h // top[base_x]-*(64-frac_x)
3084 umlal v12.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x
3085 umull2 v13.4s, v4.8h, v8.8h
3086 umlal2 v13.4s, v5.8h, v16.8h
3087 umull v14.4s, v6.4h, v9.4h
3088 umlal v14.4s, v7.4h, v17.4h
3089 umull2 v18.4s, v6.8h, v9.8h
3090 umlal2 v18.4s, v7.8h, v17.8h
3092 cmge v21.8h, v21.8h, #0
3093 cmge v22.8h, v22.8h, #0
3095 rshrn v12.4h, v12.4s, #6
3096 rshrn2 v12.8h, v13.4s, #6
3097 rshrn v13.4h, v14.4s, #6
3098 rshrn2 v13.8h, v18.4s, #6
3103 st1 {v10.8h}, [x0], x1
3106 st1 {v11.8h}, [x0], x1
3118 umull v4.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
3119 umlal v4.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
3120 umull2 v5.4s, v18.8h, v28.8h
3121 umlal2 v5.4s, v19.8h, v27.8h
3122 umull v6.4s, v19.4h, v28.4h
3123 umlal v6.4s, v20.4h, v27.4h
3124 umull2 v7.4s, v19.8h, v28.8h
3125 umlal2 v7.4s, v20.8h, v27.8h
3127 rshrn v4.4h, v4.4s, #6
3128 rshrn2 v4.8h, v5.4s, #6
3129 rshrn v5.4h, v6.4s, #6
3130 rshrn2 v5.8h, v7.4s, #6
3132 st1 {v4.8h}, [x0], x1
3134 st1 {v5.8h}, [x0], x1
3156 ld1 {v31.8h}, [x11] // increments
3161 dup v30.4h, w7 // -dy
3164 mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy
3165 movi v25.8h, #0x3e
3166 add v30.4h, v16.4h, v30.4h // -= dy
3169 ld1 {v0.8h, v1.8h, v2.8h}, [x3] // left[]
3171 movi v26.8h, #64
3174 shrn v29.8b, v30.8h, #6 // ypos >> 6
3179 movi v23.4h, #1, lsl #8
3196 sub v28.4h, v26.4h, v27.4h // 64 - frac_y
3204 dup v16.4h, w8 // xpos
3213 dup v17.4h, w8 // xpos
3223 sshr v20.8h, v16.8h, #6 // first base_x for each row
3233 sub v17.8h, v26.8h, v16.8h // 64 - frac_x
3235 add v20.8h, v20.8h, v31.8h // actual base_x
3237 umull v21.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
3238 umlal v21.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
3239 umull2 v22.4s, v18.8h, v28.8h
3240 umlal2 v22.4s, v19.8h, v27.8h
3242 umull v23.4s, v4.4h, v17.4h // top[base_x]-*(64-frac_x)
3243 umlal v23.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x
3244 umull2 v24.4s, v4.8h, v17.8h
3245 umlal2 v24.4s, v5.8h, v16.8h
3247 cmge v20.8h, v20.8h, #0
3249 rshrn v21.4h, v21.4s, #6
3250 rshrn2 v21.8h, v22.4s, #6
3251 rshrn v22.4h, v23.4s, #6
3252 rshrn2 v22.8h, v24.4s, #6
3272 umull v20.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
3273 umlal v20.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
3274 umull2 v21.4s, v18.8h, v28.8h
3275 umlal2 v21.4s, v19.8h, v27.8h
3277 rshrn v20.4h, v20.4s, #6
3278 rshrn2 v20.8h, v21.4s, #6
3298 dup v18.8h, w7 // -dy
3301 mul v16.8h, v31.8h, v18.8h // {0,1,2,3,4,5,6,7}* -dy
3302 movi v25.8h, #0x3e
3303 add v16.8h, v16.8h, v18.8h // -= dy
3306 ld1 {v0.8h, v1.8h, v2.8h}, [x3] // left[]
3308 movi v26.8h, #64
3311 shrn v29.8b, v16.8h, #6 // ypos >> 6
3316 movi v23.8h, #1, lsl #8
3324 sub v28.8h, v26.8h, v27.8h // 64 - frac_y
3329 dup v16.8h, w8 // xpos
3335 dup v17.8h, w8 // xpos
3340 ld1 {v4.8h, v5.8h}, [x9] // top[base_x]
3341 ld1 {v6.8h, v7.8h}, [x11]
3348 sshr v22.8h, v16.8h, #6 // first base_x
3350 sshr v23.8h, v17.8h, #6
3359 umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
3360 umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
3362 sub v8.8h, v26.8h, v16.8h // 64 - frac_x
3363 sub v9.8h, v26.8h, v17.8h
3365 umull2 v11.4s, v18.8h, v28.8h
3366 umlal2 v11.4s, v19.8h, v27.8h
3368 add v22.8h, v22.8h, v31.8h // actual base_x
3369 add v23.8h, v23.8h, v31.8h
3371 umull v12.4s, v20.4h, v28.4h
3372 umlal v12.4s, v21.4h, v27.4h
3373 umull2 v13.4s, v20.8h, v28.8h
3374 umlal2 v13.4s, v21.8h, v27.8h
3376 rshrn v10.4h, v10.4s, #6
3377 rshrn2 v10.8h, v11.4s, #6
3378 rshrn v11.4h, v12.4s, #6
3379 rshrn2 v11.8h, v13.4s, #6
3381 umull v12.4s, v4.4h, v8.4h // top[base_x]-*(64-frac_x)
3382 umlal v12.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x
3383 umull2 v13.4s, v4.8h, v8.8h
3384 umlal2 v13.4s, v5.8h, v16.8h
3385 umull v14.4s, v6.4h, v9.4h
3386 umlal v14.4s, v7.4h, v17.4h
3387 umull2 v18.4s, v6.8h, v9.8h
3388 umlal2 v18.4s, v7.8h, v17.8h
3390 cmge v22.8h, v22.8h, #0
3391 cmge v23.8h, v23.8h, #0
3393 rshrn v12.4h, v12.4s, #6
3394 rshrn2 v12.8h, v13.4s, #6
3395 rshrn v13.4h, v14.4s, #6
3396 rshrn2 v13.8h, v18.4s, #6
3401 st1 {v10.8h}, [x0], x1
3404 st1 {v11.8h}, [x0], x1
3419 umull v4.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
3420 umlal v4.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
3421 umull2 v5.4s, v18.8h, v28.8h
3422 umlal2 v5.4s, v19.8h, v27.8h
3423 umull v6.4s, v20.4h, v28.4h
3424 umlal v6.4s, v21.4h, v27.4h
3425 umull2 v7.4s, v20.8h, v28.8h
3426 umlal2 v7.4s, v21.8h, v27.8h
3428 rshrn v4.4h, v4.4s, #6
3429 rshrn2 v4.8h, v5.4s, #6
3430 rshrn v5.4h, v6.4s, #6
3431 rshrn2 v5.8h, v7.4s, #6
3433 st1 {v4.8h}, [x0], x1
3435 st1 {v5.8h}, [x0], x1
3461 ld1r {v31.8h}, [x10] // padding
3482 dup v4.8h, w9 // frac
3483 dup v5.8h, w11
3486 sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base]
3487 sub v7.4h, v3.4h, v2.4h
3488 ushll v16.4s, v0.4h, #6 // top[base]*64
3489 ushll v17.4s, v2.4h, #6
3490 smlal v16.4s, v6.4h, v4.4h // + top[base+1]*frac
3491 smlal v17.4s, v7.4h, v5.4h
3492 rshrn v16.4h, v16.4s, #6
3493 rshrn v17.4h, v17.4s, #6
3495 zip1 v18.8h, v16.8h, v17.8h
3522 dup v4.8h, w9 // frac
3523 dup v5.8h, w11
3524 ld1 {v0.8h}, [x8] // left[base]
3525 ld1 {v2.8h}, [x10]
3530 dup v6.8h, w9 // 64 - frac
3531 dup v7.8h, w11
3534 umull v16.4s, v0.4h, v6.4h // left[base]*(64-frac)
3535 umlal v16.4s, v1.4h, v4.4h // + left[base+1]*frac
3536 umull2 v17.4s, v0.8h, v6.8h
3537 umlal2 v17.4s, v1.8h, v4.8h
3538 umull v18.4s, v2.4h, v7.4h
3539 umlal v18.4s, v3.4h, v5.4h
3540 umull2 v19.4s, v2.8h, v7.8h
3541 umlal2 v19.4s, v3.8h, v5.8h
3542 rshrn v16.4h, v16.4s, #6
3543 rshrn2 v16.8h, v17.4s, #6
3544 rshrn v17.4h, v18.4s, #6
3545 rshrn2 v17.8h, v19.4s, #6
3547 zip1 v18.8h, v16.8h, v17.8h
3548 zip2 v19.8h, v16.8h, v17.8h
3582 dup v6.8h, w9 // frac
3583 dup v7.8h, w11
3584 ld1 {v0.8h, v1.8h, v2.8h}, [x8], #48 // left[base]
3585 ld1 {v3.8h, v4.8h, v5.8h}, [x10], #48
3588 dup v16.8h, w9 // 64 - frac
3589 dup v17.8h, w11
3597 umull v22.4s, v0.4h, v16.4h // left[base]*(64-frac)
3598 umlal v22.4s, v18.4h, v6.4h // + left[base+1]*frac
3599 umull2 v23.4s, v0.8h, v16.8h
3600 umlal2 v23.4s, v18.8h, v6.8h
3601 umull v24.4s, v1.4h, v16.4h
3602 umlal v24.4s, v19.4h, v6.4h
3603 umull2 v25.4s, v1.8h, v16.8h
3604 umlal2 v25.4s, v19.8h, v6.8h
3605 umull v26.4s, v3.4h, v17.4h
3606 umlal v26.4s, v20.4h, v7.4h
3607 umull2 v27.4s, v3.8h, v17.8h
3608 umlal2 v27.4s, v20.8h, v7.8h
3609 umull v28.4s, v4.4h, v17.4h
3610 umlal v28.4s, v21.4h, v7.4h
3611 umull2 v29.4s, v4.8h, v17.8h
3612 umlal2 v29.4s, v21.8h, v7.8h
3613 rshrn v22.4h, v22.4s, #6
3614 rshrn2 v22.8h, v23.4s, #6
3615 rshrn v23.4h, v24.4s, #6
3616 rshrn2 v23.8h, v25.4s, #6
3617 rshrn v24.4h, v26.4s, #6
3618 rshrn2 v24.8h, v27.4s, #6
3619 rshrn v25.4h, v28.4s, #6
3620 rshrn2 v25.8h, v29.4s, #6
3621 zip1 v18.8h, v22.8h, v24.8h
3622 zip2 v19.8h, v22.8h, v24.8h
3623 zip1 v20.8h, v23.8h, v25.8h
3624 zip2 v21.8h, v23.8h, v25.8h
3643 ld1 {v1.8h, v2.8h}, [x8], #32 // left[base]
3645 ld1 {v4.8h, v5.8h}, [x10], #32
3711 st1 {v31.4h}, [x0], x1
3713 st1 {v31.4h}, [x13], x1
3714 st1 {v31.4h}, [x0], x1
3715 st1 {v31.4h}, [x13], x1
3734 st1 {v31.8h}, [x0], x1
3736 st1 {v31.8h}, [x13], x1
3737 st1 {v31.8h}, [x0], x1
3738 st1 {v31.8h}, [x13], x1
3775 st1 {v31.8h}, [x0]
3780 st1 {v31.8h}, [x0], #16
3813 dup v4.4h, w9 // frac
3814 dup v5.4h, w11
3815 uzp2 v1.8h, v0.8h, v0.8h // top[base+1]
3816 uzp1 v0.8h, v0.8h, v0.8h // top[base]
3817 uzp2 v3.8h, v2.8h, v2.8h
3818 uzp1 v2.8h, v2.8h, v2.8h
3819 sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base]
3820 sub v7.4h, v3.4h, v2.4h
3821 ushll v16.4s, v0.4h, #6 // top[base]*64
3822 ushll v17.4s, v2.4h, #6
3823 smlal v16.4s, v6.4h, v4.4h // + top[base+1]*frac
3824 smlal v17.4s, v7.4h, v5.4h
3825 rshrn v16.4h, v16.4s, #6
3826 rshrn v17.4h, v17.4s, #6
3828 zip1 v18.8h, v16.8h, v17.8h
3853 dup v4.8h, w9 // frac
3854 dup v5.8h, w11
3855 ld1 {v0.8h, v1.8h}, [x8] // top[base]
3856 ld1 {v2.8h, v3.8h}, [x10]
3859 dup v6.8h, w9 // 64 - frac
3860 dup v7.8h, w11
3861 uzp2 v20.8h, v0.8h, v1.8h // top[base+1]
3862 uzp1 v0.8h, v0.8h, v1.8h // top[base]
3863 uzp2 v21.8h, v2.8h, v3.8h
3864 uzp1 v2.8h, v2.8h, v3.8h
3865 umull v16.4s, v0.4h, v6.4h // top[base]*(64-frac)
3866 umlal v16.4s, v20.4h, v4.4h // + top[base+1]*frac
3867 umull2 v17.4s, v0.8h, v6.8h
3868 umlal2 v17.4s, v20.8h, v4.8h
3869 umull v18.4s, v2.4h, v7.4h
3870 umlal v18.4s, v21.4h, v5.4h
3871 umull2 v19.4s, v2.8h, v7.8h
3872 umlal2 v19.4s, v21.8h, v5.8h
3873 rshrn v16.4h, v16.4s, #6
3874 rshrn2 v16.8h, v17.4s, #6
3875 rshrn v17.4h, v18.4s, #6
3876 rshrn2 v17.8h, v19.4s, #6
3878 zip1 v18.8h, v16.8h, v17.8h
3879 zip2 v19.8h, v16.8h, v17.8h
3917 sxtl v16.8h, v16.8b
3918 sxtl v17.8h, v17.8b
3920 sxtl v18.8h, v18.8b
3921 sxtl v19.8h, v19.8b
3924 sxtl v20.8h, v20.8b
3925 sxtl v21.8h, v21.8b
3926 sxtl v22.8h, v22.8b
3927 dup v31.8h, w8
3929 movi v30.8h, #0
3938 ld1 {v1.4h}, [x2], x7 // left (0-1) + topleft (2)
3940 mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1)
3941 mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2)
3942 mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3)
3943 mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4)
3944 mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0)
3945 mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5)
3946 mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6)
3947 srshr v2.8h, v2.8h, #4
3948 smax v2.8h, v2.8h, v30.8h
3950 smull v2.4s, v17.4h, v0.h[0] // p1(top[0]) * filter(1)
3951 smlal v2.4s, v18.4h, v0.h[1] // p2(top[1]) * filter(2)
3952 smlal v2.4s, v19.4h, v0.h[2] // p3(top[2]) * filter(3)
3953 smlal v2.4s, v20.4h, v0.h[3] // p4(top[3]) * filter(4)
3954 smlal v2.4s, v16.4h, v1.h[2] // p0(topleft) * filter(0)
3955 smlal v2.4s, v21.4h, v1.h[1] // p5(left[0]) * filter(5)
3956 smlal v2.4s, v22.4h, v1.h[0] // p6(left[1]) * filter(6)
3957 smull2 v3.4s, v17.8h, v0.h[0] // p1(top[0]) * filter(1)
3958 smlal2 v3.4s, v18.8h, v0.h[1] // p2(top[1]) * filter(2)
3959 smlal2 v3.4s, v19.8h, v0.h[2] // p3(top[2]) * filter(3)
3960 smlal2 v3.4s, v20.8h, v0.h[3] // p4(top[3]) * filter(4)
3961 smlal2 v3.4s, v16.8h, v1.h[2] // p0(topleft) * filter(0)
3962 smlal2 v3.4s, v21.8h, v1.h[1] // p5(left[0]) * filter(5)
3963 smlal2 v3.4s, v22.8h, v1.h[0] // p6(left[1]) * filter(6)
3964 sqrshrun v2.4h, v2.4s, #4
3965 sqrshrun2 v2.8h, v3.4s, #4
3967 smin v2.8h, v2.8h, v31.8h
3980 ld1 {v1.4h}, [x2], x7 // left (0-1) + topleft (2)
3982 mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1)
3983 mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2)
3984 mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3)
3985 mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4)
3986 mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0)
3987 mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5)
3988 mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6)
3989 mul v3.8h, v17.8h, v0.h[4] // p1(top[0]) * filter(1)
3990 mla v3.8h, v18.8h, v0.h[5] // p2(top[1]) * filter(2)
3991 mla v3.8h, v19.8h, v0.h[6] // p3(top[2]) * filter(3)
3992 srshr v2.8h, v2.8h, #4
3993 smax v2.8h, v2.8h, v30.8h
3994 smin v2.8h, v2.8h, v31.8h
3995 mla v3.8h, v20.8h, v0.h[7] // p4(top[3]) * filter(4)
3996 mla v3.8h, v16.8h, v0.h[3] // p0(topleft) * filter(0)
3997 mla v3.8h, v21.8h, v2.h[3] // p5(left[0]) * filter(5)
3998 mla v3.8h, v22.8h, v2.h[7] // p6(left[1]) * filter(6)
3999 srshr v3.8h, v3.8h, #4
4000 smax v3.8h, v3.8h, v30.8h
4002 smull v2.4s, v17.4h, v0.h[0] // p1(top[0]) * filter(1)
4003 smlal v2.4s, v18.4h, v0.h[1] // p2(top[1]) * filter(2)
4004 smlal v2.4s, v19.4h, v0.h[2] // p3(top[2]) * filter(3)
4005 smlal v2.4s, v20.4h, v0.h[3] // p4(top[3]) * filter(4)
4006 smlal v2.4s, v16.4h, v1.h[2] // p0(topleft) * filter(0)
4007 smlal v2.4s, v21.4h, v1.h[1] // p5(left[0]) * filter(5)
4008 smlal v2.4s, v22.4h, v1.h[0] // p6(left[1]) * filter(6)
4009 smull2 v3.4s, v17.8h, v0.h[0] // p1(top[0]) * filter(1)
4010 smlal2 v3.4s, v18.8h, v0.h[1] // p2(top[1]) * filter(2)
4011 smlal2 v3.4s, v19.8h, v0.h[2] // p3(top[2]) * filter(3)
4012 smlal2 v3.4s, v20.8h, v0.h[3] // p4(top[3]) * filter(4)
4013 smlal2 v3.4s, v16.8h, v1.h[2] // p0(topleft) * filter(0)
4014 smlal2 v3.4s, v21.8h, v1.h[1] // p5(left[0]) * filter(5)
4015 smlal2 v3.4s, v22.8h, v1.h[0] // p6(left[1]) * filter(6)
4016 smull v4.4s, v17.4h, v0.h[4] // p1(top[0]) * filter(1)
4017 smlal v4.4s, v18.4h, v0.h[5] // p2(top[1]) * filter(2)
4018 smlal v4.4s, v19.4h, v0.h[6] // p3(top[2]) * filter(3)
4019 sqrshrun v2.4h, v2.4s, #4
4020 sqrshrun2 v2.8h, v3.4s, #4
4021 smin v2.8h, v2.8h, v31.8h
4022 smlal v4.4s, v20.4h, v0.h[7] // p4(top[3]) * filter(4)
4023 smlal v4.4s, v16.4h, v0.h[3] // p0(topleft) * filter(0)
4024 smlal v4.4s, v21.4h, v2.h[3] // p5(left[0]) * filter(5)
4025 smlal v4.4s, v22.4h, v2.h[7] // p6(left[1]) * filter(6)
4026 smull2 v5.4s, v17.8h, v0.h[4] // p1(top[0]) * filter(1)
4027 smlal2 v5.4s, v18.8h, v0.h[5] // p2(top[1]) * filter(2)
4028 smlal2 v5.4s, v19.8h, v0.h[6] // p3(top[2]) * filter(3)
4029 smlal2 v5.4s, v20.8h, v0.h[7] // p4(top[3]) * filter(4)
4030 smlal2 v5.4s, v16.8h, v0.h[3] // p0(topleft) * filter(0)
4031 smlal2 v5.4s, v21.8h, v2.h[3] // p5(left[0]) * filter(5)
4032 smlal2 v5.4s, v22.8h, v2.h[7] // p6(left[1]) * filter(6)
4033 sqrshrun v3.4h, v4.4s, #4
4034 sqrshrun2 v3.8h, v5.4s, #4
4036 smin v3.8h, v3.8h, v31.8h
4053 ld1 {v0.4h}, [x2], x7 // left (0-1) + topleft (2)
4055 ld1 {v1.8h, v2.8h}, [x8], #32 // top(0-15)
4057 mul v3.8h, v16.8h, v0.h[2] // p0(topleft) * filter(0)
4058 mla v3.8h, v21.8h, v0.h[1] // p5(left[0]) * filter(5)
4059 mla v3.8h, v22.8h, v0.h[0] // p6(left[1]) * filter(6)
4060 mla v3.8h, v17.8h, v1.h[0] // p1(top[0]) * filter(1)
4061 mla v3.8h, v18.8h, v1.h[1] // p2(top[1]) * filter(2)
4062 mla v3.8h, v19.8h, v1.h[2] // p3(top[2]) * filter(3)
4063 mla v3.8h, v20.8h, v1.h[3] // p4(top[3]) * filter(4)
4065 mul v4.8h, v17.8h, v1.h[4] // p1(top[0]) * filter(1)
4066 mla v4.8h, v18.8h, v1.h[5] // p2(top[1]) * filter(2)
4067 mla v4.8h, v19.8h, v1.h[6] // p3(top[2]) * filter(3)
4068 srshr v3.8h, v3.8h, #4
4069 smax v3.8h, v3.8h, v30.8h
4070 smin v3.8h, v3.8h, v31.8h
4071 mla v4.8h, v20.8h, v1.h[7] // p4(top[3]) * filter(4)
4072 mla v4.8h, v16.8h, v1.h[3] // p0(topleft) * filter(0)
4073 mla v4.8h, v21.8h, v3.h[3] // p5(left[0]) * filter(5)
4074 mla v4.8h, v22.8h, v3.h[7] // p6(left[1]) * filter(6)
4076 mul v5.8h, v17.8h, v2.h[0] // p1(top[0]) * filter(1)
4077 mla v5.8h, v18.8h, v2.h[1] // p2(top[1]) * filter(2)
4078 mla v5.8h, v19.8h, v2.h[2] // p3(top[2]) * filter(3)
4079 srshr v4.8h, v4.8h, #4
4080 smax v4.8h, v4.8h, v30.8h
4081 smin v4.8h, v4.8h, v31.8h
4082 mla v5.8h, v20.8h, v2.h[3] // p4(top[3]) * filter(4)
4083 mla v5.8h, v16.8h, v1.h[7] // p0(topleft) * filter(0)
4084 mla v5.8h, v21.8h, v4.h[3] // p5(left[0]) * filter(5)
4085 mla v5.8h, v22.8h, v4.h[7] // p6(left[1]) * filter(6)
4087 mul v6.8h, v17.8h, v2.h[4] // p1(top[0]) * filter(1)
4088 mla v6.8h, v18.8h, v2.h[5] // p2(top[1]) * filter(2)
4089 mla v6.8h, v19.8h, v2.h[6] // p3(top[2]) * filter(3)
4090 srshr v5.8h, v5.8h, #4
4091 smax v5.8h, v5.8h, v30.8h
4092 smin v5.8h, v5.8h, v31.8h
4093 mla v6.8h, v20.8h, v2.h[7] // p4(top[3]) * filter(4)
4094 mla v6.8h, v16.8h, v2.h[3] // p0(topleft) * filter(0)
4095 mla v6.8h, v21.8h, v5.h[3] // p5(left[0]) * filter(5)
4096 mla v6.8h, v22.8h, v5.h[7] // p6(left[1]) * filter(6)
4099 srshr v6.8h, v6.8h, #4
4100 smax v6.8h, v6.8h, v30.8h
4102 smull v3.4s, v16.4h, v0.h[2] // p0(topleft) * filter(0)
4103 smlal v3.4s, v21.4h, v0.h[1] // p5(left[0]) * filter(5)
4104 smlal v3.4s, v22.4h, v0.h[0] // p6(left[1]) * filter(6)
4105 smlal v3.4s, v17.4h, v1.h[0] // p1(top[0]) * filter(1)
4106 smlal v3.4s, v18.4h, v1.h[1] // p2(top[1]) * filter(2)
4107 smlal v3.4s, v19.4h, v1.h[2] // p3(top[2]) * filter(3)
4108 smlal v3.4s, v20.4h, v1.h[3] // p4(top[3]) * filter(4)
4109 smull2 v4.4s, v16.8h, v0.h[2] // p0(topleft) * filter(0)
4110 smlal2 v4.4s, v21.8h, v0.h[1] // p5(left[0]) * filter(5)
4111 smlal2 v4.4s, v22.8h, v0.h[0] // p6(left[1]) * filter(6)
4112 smlal2 v4.4s, v17.8h, v1.h[0] // p1(top[0]) * filter(1)
4113 smlal2 v4.4s, v18.8h, v1.h[1] // p2(top[1]) * filter(2)
4114 smlal2 v4.4s, v19.8h, v1.h[2] // p3(top[2]) * filter(3)
4115 smlal2 v4.4s, v20.8h, v1.h[3] // p4(top[3]) * filter(4)
4117 smull v5.4s, v17.4h, v1.h[4] // p1(top[0]) * filter(1)
4118 smlal v5.4s, v18.4h, v1.h[5] // p2(top[1]) * filter(2)
4119 smlal v5.4s, v19.4h, v1.h[6] // p3(top[2]) * filter(3)
4120 sqrshrun v3.4h, v3.4s, #4
4121 sqrshrun2 v3.8h, v4.4s, #4
4122 smin v3.8h, v3.8h, v31.8h
4123 smlal v5.4s, v20.4h, v1.h[7] // p4(top[3]) * filter(4)
4124 smlal v5.4s, v16.4h, v1.h[3] // p0(topleft) * filter(0)
4125 smlal v5.4s, v21.4h, v3.h[3] // p5(left[0]) * filter(5)
4126 smlal v5.4s, v22.4h, v3.h[7] // p6(left[1]) * filter(6)
4127 smull2 v6.4s, v17.8h, v1.h[4] // p1(top[0]) * filter(1)
4128 smlal2 v6.4s, v18.8h, v1.h[5] // p2(top[1]) * filter(2)
4129 smlal2 v6.4s, v19.8h, v1.h[6] // p3(top[2]) * filter(3)
4130 smlal2 v6.4s, v20.8h, v1.h[7] // p4(top[3]) * filter(4)
4131 smlal2 v6.4s, v16.8h, v1.h[3] // p0(topleft) * filter(0)
4132 smlal2 v6.4s, v21.8h, v3.h[3] // p5(left[0]) * filter(5)
4133 smlal2 v6.4s, v22.8h, v3.h[7] // p6(left[1]) * filter(6)
4135 smull v24.4s, v17.4h, v2.h[0] // p1(top[0]) * filter(1)
4136 smlal v24.4s, v18.4h, v2.h[1] // p2(top[1]) * filter(2)
4137 smlal v24.4s, v19.4h, v2.h[2] // p3(top[2]) * filter(3)
4138 sqrshrun v4.4h, v5.4s, #4
4139 sqrshrun2 v4.8h, v6.4s, #4
4140 smin v4.8h, v4.8h, v31.8h
4141 smlal v24.4s, v20.4h, v2.h[3] // p4(top[3]) * filter(4)
4142 smlal v24.4s, v16.4h, v1.h[7] // p0(topleft) * filter(0)
4143 smlal v24.4s, v21.4h, v4.h[3] // p5(left[0]) * filter(5)
4144 smlal v24.4s, v22.4h, v4.h[7] // p6(left[1]) * filter(6)
4145 smull2 v25.4s, v17.8h, v2.h[0] // p1(top[0]) * filter(1)
4146 smlal2 v25.4s, v18.8h, v2.h[1] // p2(top[1]) * filter(2)
4147 smlal2 v25.4s, v19.8h, v2.h[2] // p3(top[2]) * filter(3)
4148 smlal2 v25.4s, v20.8h, v2.h[3] // p4(top[3]) * filter(4)
4149 smlal2 v25.4s, v16.8h, v1.h[7] // p0(topleft) * filter(0)
4150 smlal2 v25.4s, v21.8h, v4.h[3] // p5(left[0]) * filter(5)
4151 smlal2 v25.4s, v22.8h, v4.h[7] // p6(left[1]) * filter(6)
4153 smull v26.4s, v17.4h, v2.h[4] // p1(top[0]) * filter(1)
4154 smlal v26.4s, v18.4h, v2.h[5] // p2(top[1]) * filter(2)
4155 smlal v26.4s, v19.4h, v2.h[6] // p3(top[2]) * filter(3)
4156 sqrshrun v5.4h, v24.4s, #4
4157 sqrshrun2 v5.8h, v25.4s, #4
4158 smin v5.8h, v5.8h, v31.8h
4159 smlal v26.4s, v20.4h, v2.h[7] // p4(top[3]) * filter(4)
4160 smlal v26.4s, v16.4h, v2.h[3] // p0(topleft) * filter(0)
4161 smlal v26.4s, v21.4h, v5.h[3] // p5(left[0]) * filter(5)
4162 smlal v26.4s, v22.4h, v5.h[7] // p6(left[1]) * filter(6)
4163 smull2 v27.4s, v17.8h, v2.h[4] // p1(top[0]) * filter(1)
4164 smlal2 v27.4s, v18.8h, v2.h[5] // p2(top[1]) * filter(2)
4165 smlal2 v27.4s, v19.8h, v2.h[6] // p3(top[2]) * filter(3)
4166 smlal2 v27.4s, v20.8h, v2.h[7] // p4(top[3]) * filter(4)
4167 smlal2 v27.4s, v16.8h, v2.h[3] // p0(topleft) * filter(0)
4168 smlal2 v27.4s, v21.8h, v5.h[3] // p5(left[0]) * filter(5)
4169 smlal2 v27.4s, v22.8h, v5.h[7] // p6(left[1]) * filter(6)
4172 sqrshrun v6.4h, v26.4s, #4
4173 sqrshrun2 v6.8h, v27.4s, #4
4175 smin v6.8h, v6.8h, v31.8h
4177 ins v0.h[2], v2.h[7]
4179 ins v0.h[0], v6.h[7]
4181 ins v0.h[1], v6.h[3]
4216 ld1 {v30.8h}, [x2]
4222 movi v31.8h, #1, lsl #8
4239 add v0.8h, v0.8h, v31.8h
4240 add v1.8h, v1.8h, v31.8h
4266 add v0.8h, v0.8h, v31.8h
4267 add v1.8h, v1.8h, v31.8h
4268 add v2.8h, v2.8h, v31.8h
4269 add v3.8h, v3.8h, v31.8h
4272 st1 {v0.8h}, [x0], x1
4274 st1 {v1.8h}, [x2], x1
4276 st1 {v2.8h}, [x0], x1
4277 st1 {v3.8h}, [x2], x1
4307 add v0.8h, v0.8h, v31.8h
4308 add v1.8h, v1.8h, v31.8h
4309 add v2.8h, v2.8h, v31.8h
4310 add v3.8h, v3.8h, v31.8h
4311 add v4.8h, v4.8h, v31.8h
4313 add v5.8h, v5.8h, v31.8h
4315 add v6.8h, v6.8h, v31.8h
4317 add v7.8h, v7.8h, v31.8h
4321 st1 {v0.8h, v1.8h}, [x0], x1
4323 st1 {v2.8h, v3.8h}, [x2], x1
4325 st1 {v4.8h, v5.8h}, [x0], x1
4326 st1 {v6.8h, v7.8h}, [x2], x1
4356 add v0.8h, v0.8h, v31.8h
4357 add v1.8h, v1.8h, v31.8h
4358 add v2.8h, v2.8h, v31.8h
4359 add v3.8h, v3.8h, v31.8h
4360 add v4.8h, v4.8h, v31.8h
4362 add v5.8h, v5.8h, v31.8h
4364 add v6.8h, v6.8h, v31.8h
4366 add v7.8h, v7.8h, v31.8h
4370 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
4373 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1
4402 add v0.8h, v0.8h, v31.8h
4403 add v1.8h, v1.8h, v31.8h
4404 add v2.8h, v2.8h, v31.8h
4405 add v3.8h, v3.8h, v31.8h
4406 add v4.8h, v4.8h, v31.8h
4408 add v5.8h, v5.8h, v31.8h
4410 add v6.8h, v6.8h, v31.8h
4412 add v7.8h, v7.8h, v31.8h
4416 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
4419 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1
4438 dup v31.8h, w7 // bitdepth_max
4443 urshr v0.8h, v31.8h, #1
4444 dup v1.8h, w6 // alpha
4448 movi v30.8h, #0
4453 ld1 {v4.8h, v5.8h}, [x5], #32
4455 smull v2.4s, v4.4h, v1.4h // diff = ac * alpha
4456 smull2 v3.4s, v4.8h, v1.8h
4457 smull v4.4s, v5.4h, v1.4h
4458 smull2 v5.4s, v5.8h, v1.8h
4467 rshrn v2.4h, v2.4s, #6 // (diff + sign + 32) >> 6 = apply_sign()
4468 rshrn2 v2.8h, v3.4s, #6
4469 rshrn v3.4h, v4.4s, #6
4470 rshrn2 v3.8h, v5.4s, #6
4471 add v2.8h, v2.8h, v0.8h // dc + apply_sign()
4472 add v3.8h, v3.8h, v0.8h
4473 smax v2.8h, v2.8h, v30.8h
4474 smax v3.8h, v3.8h, v30.8h
4475 smin v2.8h, v2.8h, v31.8h
4476 smin v3.8h, v3.8h, v31.8h
4486 ld1 {v4.8h, v5.8h}, [x5], #32
4488 smull v2.4s, v4.4h, v1.4h // diff = ac * alpha
4489 smull2 v3.4s, v4.8h, v1.8h
4490 smull v4.4s, v5.4h, v1.4h
4491 smull2 v5.4s, v5.8h, v1.8h
4500 rshrn v2.4h, v2.4s, #6 // (diff + sign + 32) >> 6 = apply_sign()
4501 rshrn2 v2.8h, v3.4s, #6
4502 rshrn v3.4h, v4.4s, #6
4503 rshrn2 v3.8h, v5.4s, #6
4504 add v2.8h, v2.8h, v0.8h // dc + apply_sign()
4505 add v3.8h, v3.8h, v0.8h
4506 smax v2.8h, v2.8h, v30.8h
4507 smax v3.8h, v3.8h, v30.8h
4508 smin v2.8h, v2.8h, v31.8h
4509 smin v3.8h, v3.8h, v31.8h
4510 st1 {v2.8h}, [x0], x1
4511 st1 {v3.8h}, [x6], x1
4520 ld1 {v2.8h, v3.8h}, [x5], #32
4521 ld1 {v4.8h, v5.8h}, [x7], #32
4523 smull v16.4s, v2.4h, v1.4h // diff = ac * alpha
4524 smull2 v17.4s, v2.8h, v1.8h
4525 smull v18.4s, v3.4h, v1.4h
4526 smull2 v19.4s, v3.8h, v1.8h
4527 smull v2.4s, v4.4h, v1.4h
4528 smull2 v3.4s, v4.8h, v1.8h
4529 smull v4.4s, v5.4h, v1.4h
4530 smull2 v5.4s, v5.8h, v1.8h
4547 rshrn v16.4h, v16.4s, #6 // (diff + sign + 32) >> 6 = apply_sign()
4548 rshrn2 v16.8h, v17.4s, #6
4549 rshrn v17.4h, v18.4s, #6
4550 rshrn2 v17.8h, v19.4s, #6
4551 rshrn v6.4h, v2.4s, #6
4552 rshrn2 v6.8h, v3.4s, #6
4553 rshrn v7.4h, v4.4s, #6
4554 rshrn2 v7.8h, v5.4s, #6
4555 add v2.8h, v16.8h, v0.8h // dc + apply_sign()
4556 add v3.8h, v17.8h, v0.8h
4557 add v4.8h, v6.8h, v0.8h
4558 add v5.8h, v7.8h, v0.8h
4559 smax v2.8h, v2.8h, v30.8h
4560 smax v3.8h, v3.8h, v30.8h
4561 smax v4.8h, v4.8h, v30.8h
4562 smax v5.8h, v5.8h, v30.8h
4563 smin v2.8h, v2.8h, v31.8h
4564 smin v3.8h, v3.8h, v31.8h
4565 smin v4.8h, v4.8h, v31.8h
4566 smin v5.8h, v5.8h, v31.8h
4567 st1 {v2.8h, v3.8h}, [x0], #32
4568 st1 {v4.8h, v5.8h}, [x6], #32
4594 dup v31.8h, w7 // bitdepth_max
4599 dup v1.8h, w6 // alpha
4604 movi v30.8h, #0
4608 ld1 {v0.4h}, [x2]
4609 addv h0, v0.4h
4610 urshr v0.4h, v0.4h, #2
4611 dup v0.8h, v0.h[0]
4615 ld1 {v0.8h}, [x2]
4616 addv h0, v0.8h
4617 urshr v0.4h, v0.4h, #3
4618 dup v0.8h, v0.h[0]
4622 ld1 {v2.8h, v3.8h}, [x2]
4623 addp v0.8h, v2.8h, v3.8h
4624 addv h0, v0.8h
4625 urshr v0.4h, v0.4h, #4
4626 dup v0.8h, v0.h[0]
4630 ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
4631 addp v2.8h, v2.8h, v3.8h
4632 addp v4.8h, v4.8h, v5.8h
4633 addp v0.8h, v2.8h, v4.8h
4634 uaddlv s0, v0.8h
4635 rshrn v0.4h, v0.4s, #5
4636 dup v0.8h, v0.h[0]
4653 dup v31.8h, w7 // bitdepth_max
4663 dup v1.8h, w6 // alpha
4668 movi v30.8h, #0
4673 ld1 {v0.4h}, [x2]
4674 addv h0, v0.4h
4675 urshr v0.4h, v0.4h, #2
4676 dup v0.8h, v0.h[0]
4681 ld1 {v0.8h}, [x2]
4682 addv h0, v0.8h
4683 urshr v0.4h, v0.4h, #3
4684 dup v0.8h, v0.h[0]
4689 ld1 {v2.8h, v3.8h}, [x2]
4690 addp v0.8h, v2.8h, v3.8h
4691 addv h0, v0.8h
4692 urshr v0.4h, v0.4h, #4
4693 dup v0.8h, v0.h[0]
4698 ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
4699 addp v2.8h, v2.8h, v3.8h
4700 addp v4.8h, v4.8h, v5.8h
4701 addp v0.8h, v2.8h, v4.8h
4702 uaddlv s0, v0.8h
4703 rshrn v0.4h, v0.4s, #5
4704 dup v0.8h, v0.h[0]
4721 dup v31.8h, w7 // bitdepth_max
4724 dup v1.8h, w6 // alpha
4742 movi v30.8h, #0
4747 ld1 {v0.4h}, [x2], #8
4748 uaddlv s0, v0.4h
4753 ld1 {v2.4h}, [x2]
4755 uaddlv s2, v2.4h
4769 dup v0.8h, v0.h[0]
4774 ld1 {v0.8h}, [x2], #16
4775 uaddlv s0, v0.8h
4780 ld1 {v2.8h}, [x2]
4782 uaddlv s2, v2.8h
4796 dup v0.8h, v0.h[0]
4801 ld1 {v2.8h, v3.8h}, [x2], #32
4802 addp v0.8h, v2.8h, v3.8h
4804 uaddlv s0, v0.8h
4808 ld1 {v2.8h, v3.8h}, [x2]
4810 addp v2.8h, v2.8h, v3.8h
4811 uaddlv s2, v2.8h
4825 dup v0.8h, v0.h[0]
4830 ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2], #64
4831 addp v2.8h, v2.8h, v3.8h
4832 addp v4.8h, v4.8h, v5.8h
4833 addp v0.8h, v2.8h, v4.8h
4835 uaddlv s0, v0.8h
4839 ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
4841 addp v2.8h, v2.8h, v3.8h
4842 addp v4.8h, v4.8h, v5.8h
4843 addp v2.8h, v2.8h, v4.8h
4845 uaddlv s2, v2.8h
4858 dup v0.8h, v0.h[0]
4902 ld1 {v0.8h}, [x1], x2
4903 ld1 {v1.8h}, [x10], x2
4904 ld1 {v2.8h}, [x1], x2
4905 ld1 {v3.8h}, [x10], x2
4906 addp v0.8h, v0.8h, v2.8h
4907 addp v1.8h, v1.8h, v3.8h
4908 add v0.8h, v0.8h, v1.8h
4909 shl v0.8h, v0.8h, #1
4911 st1 {v0.8h}, [x0], #16
4912 uaddw v24.4s, v24.4s, v0.4h
4913 uaddw2 v25.4s, v25.4s, v0.8h
4921 st1 {v0.8h, v1.8h}, [x0], #32
4922 uaddw v24.4s, v24.4s, v0.4h
4923 uaddw2 v25.4s, v25.4s, v0.8h
4924 uaddw v26.4s, v26.4s, v1.4h
4925 uaddw2 v27.4s, v27.4s, v1.8h
4936 dup v4.8h, v4.h[0]
4938 ld1 {v0.8h, v1.8h}, [x0]
4940 sub v0.8h, v0.8h, v4.8h
4941 sub v1.8h, v1.8h, v4.8h
4942 st1 {v0.8h, v1.8h}, [x0], #32
4950 ld1 {v0.8h, v1.8h}, [x1], x2
4951 ld1 {v2.8h, v3.8h}, [x10], x2
4952 ld1 {v4.8h, v5.8h}, [x1], x2
4953 addp v0.8h, v0.8h, v1.8h
4954 ld1 {v6.8h, v7.8h}, [x10], x2
4955 addp v2.8h, v2.8h, v3.8h
4956 addp v4.8h, v4.8h, v5.8h
4957 addp v6.8h, v6.8h, v7.8h
4958 add v0.8h, v0.8h, v2.8h
4959 add v4.8h, v4.8h, v6.8h
4960 shl v0.8h, v0.8h, #1
4961 shl v1.8h, v4.8h, #1
4963 st1 {v0.8h, v1.8h}, [x0], #32
4964 uaddw v24.4s, v24.4s, v0.4h
4965 uaddw2 v25.4s, v25.4s, v0.8h
4966 uaddw v26.4s, v26.4s, v1.4h
4967 uaddw2 v27.4s, v27.4s, v1.8h
4974 ld1 {v0.8h}, [x1], x2
4975 ld1 {v1.8h}, [x10], x2
4976 ld1 {v2.8h}, [x1], x2
4977 ld1 {v3.8h}, [x10], x2
4978 addp v0.8h, v0.8h, v2.8h
4979 addp v1.8h, v1.8h, v3.8h
4980 add v0.8h, v0.8h, v1.8h
4981 shl v0.8h, v0.8h, #1
4982 dup v1.4h, v0.h[3]
4983 dup v3.4h, v0.h[7]
4986 st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
4987 uaddw v24.4s, v24.4s, v0.4h
4988 uaddw v25.4s, v25.4s, v1.4h
4989 uaddw v26.4s, v26.4s, v2.4h
4990 uaddw v27.4s, v27.4s, v3.4h
4999 st1 {v0.8h, v1.8h}, [x0], #32
5000 uaddw v24.4s, v24.4s, v0.4h
5001 uaddw2 v25.4s, v25.4s, v0.8h
5002 uaddw v26.4s, v26.4s, v1.4h
5003 uaddw2 v27.4s, v27.4s, v1.8h
5004 st1 {v0.8h, v1.8h}, [x0], #32
5005 uaddw v24.4s, v24.4s, v0.4h
5006 uaddw2 v25.4s, v25.4s, v0.8h
5007 uaddw v26.4s, v26.4s, v1.4h
5008 uaddw2 v27.4s, v27.4s, v1.8h
5026 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2
5027 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2
5028 addp v0.8h, v0.8h, v1.8h
5029 addp v2.8h, v2.8h, v3.8h
5030 addp v4.8h, v4.8h, v5.8h
5031 addp v6.8h, v6.8h, v7.8h
5032 ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x1], x2
5033 add v0.8h, v0.8h, v4.8h
5034 ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x10], x2
5035 add v2.8h, v2.8h, v6.8h
5036 addp v16.8h, v16.8h, v17.8h
5037 addp v18.8h, v18.8h, v19.8h
5038 addp v20.8h, v20.8h, v21.8h
5039 addp v22.8h, v22.8h, v23.8h
5040 add v16.8h, v16.8h, v20.8h
5041 add v18.8h, v18.8h, v22.8h
5042 shl v0.8h, v0.8h, #1
5043 shl v1.8h, v2.8h, #1
5044 shl v2.8h, v16.8h, #1
5045 shl v3.8h, v18.8h, #1
5047 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5048 uaddw v24.4s, v24.4s, v0.4h
5049 uaddw2 v25.4s, v25.4s, v0.8h
5050 uaddw v26.4s, v26.4s, v1.4h
5051 uaddw2 v27.4s, v27.4s, v1.8h
5052 uaddw v24.4s, v24.4s, v2.4h
5053 uaddw2 v25.4s, v25.4s, v2.8h
5054 uaddw v26.4s, v26.4s, v3.4h
5055 uaddw2 v27.4s, v27.4s, v3.8h
5065 ld1 {v0.8h, v1.8h}, [x1], x2
5067 ld1 {v3.8h, v4.8h}, [x10], x2
5068 addp v2.8h, v2.8h, v2.8h
5069 addp v0.8h, v0.8h, v1.8h
5070 addp v5.8h, v5.8h, v5.8h
5071 addp v3.8h, v3.8h, v4.8h
5073 add v2.4h, v2.4h, v5.4h
5074 ld1 {v16.8h, v17.8h}, [x1], x2
5075 add v0.8h, v0.8h, v3.8h
5077 ld1 {v19.8h, v20.8h}, [x10], x2
5078 addp v18.8h, v18.8h, v18.8h
5079 addp v16.8h, v16.8h, v17.8h
5080 addp v21.8h, v21.8h, v21.8h
5081 addp v19.8h, v19.8h, v20.8h
5082 add v18.4h, v18.4h, v21.4h
5083 add v16.8h, v16.8h, v19.8h
5084 shl v1.4h, v2.4h, #1
5085 shl v0.8h, v0.8h, #1
5086 shl v3.4h, v18.4h, #1
5087 shl v2.8h, v16.8h, #1
5088 dup v4.4h, v1.h[3]
5089 dup v5.4h, v3.h[3]
5093 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5094 uaddw v24.4s, v24.4s, v0.4h
5095 uaddw2 v25.4s, v25.4s, v0.8h
5096 uaddw v26.4s, v26.4s, v1.4h
5097 uaddw2 v27.4s, v27.4s, v1.8h
5098 uaddw v24.4s, v24.4s, v2.4h
5099 uaddw2 v25.4s, v25.4s, v2.8h
5100 uaddw v26.4s, v26.4s, v3.4h
5101 uaddw2 v27.4s, v27.4s, v3.8h
5110 ld1 {v0.8h, v1.8h}, [x1], x2
5111 ld1 {v2.8h, v3.8h}, [x10], x2
5112 ld1 {v4.8h, v5.8h}, [x1], x2
5113 addp v0.8h, v0.8h, v1.8h
5114 ld1 {v6.8h, v7.8h}, [x10], x2
5115 addp v2.8h, v2.8h, v3.8h
5116 addp v4.8h, v4.8h, v5.8h
5117 addp v6.8h, v6.8h, v7.8h
5118 add v0.8h, v0.8h, v2.8h
5119 add v4.8h, v4.8h, v6.8h
5120 shl v0.8h, v0.8h, #1
5121 shl v2.8h, v4.8h, #1
5122 dup v1.8h, v0.h[7]
5123 dup v3.8h, v2.h[7]
5125 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5126 uaddw v24.4s, v24.4s, v0.4h
5127 uaddw2 v25.4s, v25.4s, v0.8h
5128 uaddw v26.4s, v26.4s, v1.4h
5129 uaddw2 v27.4s, v27.4s, v1.8h
5130 uaddw v24.4s, v24.4s, v2.4h
5131 uaddw2 v25.4s, v25.4s, v2.8h
5132 uaddw v26.4s, v26.4s, v3.4h
5133 uaddw2 v27.4s, v27.4s, v3.8h
5142 ld1 {v0.8h}, [x1], x2
5143 ld1 {v2.8h}, [x10], x2
5144 ld1 {v4.8h}, [x1], x2
5145 ld1 {v6.8h}, [x10], x2
5146 addp v0.8h, v0.8h, v4.8h
5147 addp v2.8h, v2.8h, v6.8h
5148 add v0.8h, v0.8h, v2.8h
5149 shl v0.8h, v0.8h, #1
5150 dup v1.8h, v0.h[3]
5151 dup v3.8h, v0.h[7]
5155 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5156 uaddw v24.4s, v24.4s, v0.4h
5157 uaddw2 v25.4s, v25.4s, v0.8h
5158 uaddw v26.4s, v26.4s, v1.4h
5159 uaddw2 v27.4s, v27.4s, v1.8h
5160 uaddw v24.4s, v24.4s, v2.4h
5161 uaddw2 v25.4s, v25.4s, v2.8h
5162 uaddw v26.4s, v26.4s, v3.4h
5163 uaddw2 v27.4s, v27.4s, v3.8h
5172 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5173 uaddw v24.4s, v24.4s, v0.4h
5174 uaddw2 v25.4s, v25.4s, v0.8h
5175 uaddw v26.4s, v26.4s, v1.4h
5176 uaddw2 v27.4s, v27.4s, v1.8h
5177 uaddw v24.4s, v24.4s, v2.4h
5178 uaddw2 v25.4s, v25.4s, v2.8h
5179 uaddw v26.4s, v26.4s, v3.4h
5180 uaddw2 v27.4s, v27.4s, v3.8h
5181 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5182 uaddw v24.4s, v24.4s, v0.4h
5183 uaddw2 v25.4s, v25.4s, v0.8h
5184 uaddw v26.4s, v26.4s, v1.4h
5185 uaddw2 v27.4s, v27.4s, v1.8h
5186 uaddw v24.4s, v24.4s, v2.4h
5187 uaddw2 v25.4s, v25.4s, v2.8h
5188 uaddw v26.4s, v26.4s, v3.4h
5189 uaddw2 v27.4s, v27.4s, v3.8h
5240 ld1 {v0.8h}, [x1], x2
5241 ld1 {v1.8h}, [x10], x2
5242 ld1 {v2.8h}, [x1], x2
5243 ld1 {v3.8h}, [x10], x2
5244 addp v0.8h, v0.8h, v1.8h
5245 addp v2.8h, v2.8h, v3.8h
5246 shl v0.8h, v0.8h, #2
5247 shl v1.8h, v2.8h, #2
5249 st1 {v0.8h, v1.8h}, [x0], #32
5250 uaddw v24.4s, v24.4s, v0.4h
5251 uaddw2 v25.4s, v25.4s, v0.8h
5252 uaddw v26.4s, v26.4s, v1.4h
5253 uaddw2 v27.4s, v27.4s, v1.8h
5263 ld1 {v0.8h, v1.8h}, [x1], x2
5264 ld1 {v2.8h, v3.8h}, [x10], x2
5265 ld1 {v4.8h, v5.8h}, [x1], x2
5266 addp v0.8h, v0.8h, v1.8h
5267 ld1 {v6.8h, v7.8h}, [x10], x2
5268 addp v2.8h, v2.8h, v3.8h
5269 addp v4.8h, v4.8h, v5.8h
5270 addp v6.8h, v6.8h, v7.8h
5271 shl v0.8h, v0.8h, #2
5272 shl v1.8h, v2.8h, #2
5273 shl v2.8h, v4.8h, #2
5274 shl v3.8h, v6.8h, #2
5276 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5277 uaddw v24.4s, v24.4s, v0.4h
5278 uaddw2 v25.4s, v25.4s, v0.8h
5279 uaddw v26.4s, v26.4s, v1.4h
5280 uaddw2 v27.4s, v27.4s, v1.8h
5281 uaddw v24.4s, v24.4s, v2.4h
5282 uaddw2 v25.4s, v25.4s, v2.8h
5283 uaddw v26.4s, v26.4s, v3.4h
5284 uaddw2 v27.4s, v27.4s, v3.8h
5292 ld1 {v0.8h}, [x1], x2
5293 ld1 {v1.8h}, [x10], x2
5294 ld1 {v2.8h}, [x1], x2
5295 ld1 {v3.8h}, [x10], x2
5296 addp v0.8h, v0.8h, v1.8h
5297 addp v2.8h, v2.8h, v3.8h
5298 shl v0.8h, v0.8h, #2
5299 shl v2.8h, v2.8h, #2
5300 dup v4.4h, v0.h[3]
5301 dup v5.8h, v0.h[7]
5302 dup v6.4h, v2.h[3]
5303 dup v7.8h, v2.h[7]
5309 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5310 uaddw v24.4s, v24.4s, v0.4h
5311 uaddw2 v25.4s, v25.4s, v0.8h
5312 uaddw v26.4s, v26.4s, v1.4h
5313 uaddw2 v27.4s, v27.4s, v1.8h
5314 uaddw v24.4s, v24.4s, v2.4h
5315 uaddw2 v25.4s, v25.4s, v2.8h
5316 uaddw v26.4s, v26.4s, v3.4h
5317 uaddw2 v27.4s, v27.4s, v3.8h
5333 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2
5334 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2
5335 addp v0.8h, v0.8h, v1.8h
5336 addp v2.8h, v2.8h, v3.8h
5337 addp v4.8h, v4.8h, v5.8h
5338 addp v6.8h, v6.8h, v7.8h
5339 shl v0.8h, v0.8h, #2
5340 shl v1.8h, v2.8h, #2
5341 shl v2.8h, v4.8h, #2
5342 shl v3.8h, v6.8h, #2
5344 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5345 uaddw v24.4s, v24.4s, v0.4h
5346 uaddw2 v25.4s, v25.4s, v0.8h
5347 uaddw v26.4s, v26.4s, v1.4h
5348 uaddw2 v27.4s, v27.4s, v1.8h
5349 uaddw v24.4s, v24.4s, v2.4h
5350 uaddw2 v25.4s, v25.4s, v2.8h
5351 uaddw v26.4s, v26.4s, v3.4h
5352 uaddw2 v27.4s, v27.4s, v3.8h
5362 ld1 {v0.8h, v1.8h}, [x1], x2
5364 ld1 {v4.8h, v5.8h}, [x10], x2
5365 addp v2.8h, v2.8h, v2.8h
5366 addp v0.8h, v0.8h, v1.8h
5367 addp v6.8h, v6.8h, v6.8h
5368 addp v4.8h, v4.8h, v5.8h
5369 shl v1.4h, v2.4h, #2
5370 shl v0.8h, v0.8h, #2
5371 shl v3.4h, v6.4h, #2
5372 shl v2.8h, v4.8h, #2
5373 dup v4.4h, v1.h[3]
5374 dup v5.4h, v3.h[3]
5378 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5379 uaddw v24.4s, v24.4s, v0.4h
5380 uaddw2 v25.4s, v25.4s, v0.8h
5381 uaddw v26.4s, v26.4s, v1.4h
5382 uaddw2 v27.4s, v27.4s, v1.8h
5383 uaddw v24.4s, v24.4s, v2.4h
5384 uaddw2 v25.4s, v25.4s, v2.8h
5385 uaddw v26.4s, v26.4s, v3.4h
5386 uaddw2 v27.4s, v27.4s, v3.8h
5395 ld1 {v0.8h, v1.8h}, [x1], x2
5396 ld1 {v2.8h, v3.8h}, [x10], x2
5397 addp v0.8h, v0.8h, v1.8h
5398 addp v2.8h, v2.8h, v3.8h
5399 shl v0.8h, v0.8h, #2
5400 shl v2.8h, v2.8h, #2
5401 dup v1.8h, v0.h[7]
5402 dup v3.8h, v2.h[7]
5404 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5405 uaddw v24.4s, v24.4s, v0.4h
5406 uaddw2 v25.4s, v25.4s, v0.8h
5407 uaddw v26.4s, v26.4s, v1.4h
5408 uaddw2 v27.4s, v27.4s, v1.8h
5409 uaddw v24.4s, v24.4s, v2.4h
5410 uaddw2 v25.4s, v25.4s, v2.8h
5411 uaddw v26.4s, v26.4s, v3.4h
5412 uaddw2 v27.4s, v27.4s, v3.8h
5421 ld1 {v0.8h}, [x1], x2
5422 ld1 {v2.8h}, [x10], x2
5423 addp v0.8h, v0.8h, v0.8h
5424 addp v2.8h, v2.8h, v2.8h
5425 shl v0.4h, v0.4h, #2
5426 shl v2.4h, v2.4h, #2
5427 dup v1.8h, v0.h[3]
5428 dup v3.8h, v2.h[3]
5432 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5433 uaddw v24.4s, v24.4s, v0.4h
5434 uaddw2 v25.4s, v25.4s, v0.8h
5435 uaddw v26.4s, v26.4s, v1.4h
5436 uaddw2 v27.4s, v27.4s, v1.8h
5437 uaddw v24.4s, v24.4s, v2.4h
5438 uaddw2 v25.4s, v25.4s, v2.8h
5439 uaddw v26.4s, v26.4s, v3.4h
5440 uaddw2 v27.4s, v27.4s, v3.8h
5489 ld1 {v0.4h}, [x1], x2
5491 ld1 {v1.4h}, [x1], x2
5493 shl v0.8h, v0.8h, #3
5494 shl v1.8h, v1.8h, #3
5496 st1 {v0.8h, v1.8h}, [x0], #32
5497 uaddw v24.4s, v24.4s, v0.4h
5498 uaddw2 v25.4s, v25.4s, v0.8h
5499 uaddw v26.4s, v26.4s, v1.4h
5500 uaddw2 v27.4s, v27.4s, v1.8h
5509 ld1 {v0.8h}, [x1], x2
5510 ld1 {v1.8h}, [x10], x2
5511 ld1 {v2.8h}, [x1], x2
5512 shl v0.8h, v0.8h, #3
5513 ld1 {v3.8h}, [x10], x2
5514 shl v1.8h, v1.8h, #3
5515 shl v2.8h, v2.8h, #3
5516 shl v3.8h, v3.8h, #3
5518 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5519 uaddw v24.4s, v24.4s, v0.4h
5520 uaddw2 v25.4s, v25.4s, v0.8h
5521 uaddw v26.4s, v26.4s, v1.4h
5522 uaddw2 v27.4s, v27.4s, v1.8h
5523 uaddw v24.4s, v24.4s, v2.4h
5524 uaddw2 v25.4s, v25.4s, v2.8h
5525 uaddw v26.4s, v26.4s, v3.4h
5526 uaddw2 v27.4s, v27.4s, v3.8h
5536 ld1 {v0.8h, v1.8h}, [x1], x2
5537 ld1 {v2.8h, v3.8h}, [x10], x2
5538 shl v0.8h, v0.8h, #3
5539 shl v1.8h, v1.8h, #3
5540 shl v2.8h, v2.8h, #3
5541 shl v3.8h, v3.8h, #3
5543 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5544 uaddw v24.4s, v24.4s, v0.4h
5545 uaddw2 v25.4s, v25.4s, v0.8h
5546 uaddw v26.4s, v26.4s, v1.4h
5547 uaddw2 v27.4s, v27.4s, v1.8h
5548 uaddw v24.4s, v24.4s, v2.4h
5549 uaddw2 v25.4s, v25.4s, v2.8h
5550 uaddw v26.4s, v26.4s, v3.4h
5551 uaddw2 v27.4s, v27.4s, v3.8h
5559 ld1 {v0.8h}, [x1], x2
5560 ld1 {v2.8h}, [x10], x2
5561 shl v0.8h, v0.8h, #3
5562 shl v2.8h, v2.8h, #3
5563 dup v1.8h, v0.h[7]
5564 dup v3.8h, v2.h[7]
5566 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5567 uaddw v24.4s, v24.4s, v0.4h
5568 uaddw2 v25.4s, v25.4s, v0.8h
5569 uaddw v26.4s, v26.4s, v1.4h
5570 uaddw2 v27.4s, v27.4s, v1.8h
5571 uaddw v24.4s, v24.4s, v2.4h
5572 uaddw2 v25.4s, v25.4s, v2.8h
5573 uaddw v26.4s, v26.4s, v3.4h
5574 uaddw2 v27.4s, v27.4s, v3.8h
5592 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2
5593 shl v0.8h, v0.8h, #3
5594 shl v1.8h, v1.8h, #3
5595 shl v2.8h, v2.8h, #3
5596 shl v3.8h, v3.8h, #3
5598 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5599 uaddw v24.4s, v24.4s, v0.4h
5600 uaddw2 v25.4s, v25.4s, v0.8h
5601 uaddw v26.4s, v26.4s, v1.4h
5602 uaddw2 v27.4s, v27.4s, v1.8h
5603 uaddw v24.4s, v24.4s, v2.4h
5604 uaddw2 v25.4s, v25.4s, v2.8h
5605 uaddw v26.4s, v26.4s, v3.4h
5606 uaddw2 v27.4s, v27.4s, v3.8h
5613 ld1 {v0.8h, v1.8h, v2.8h}, [x1], x2
5614 shl v2.8h, v2.8h, #3
5615 shl v0.8h, v0.8h, #3
5616 shl v1.8h, v1.8h, #3
5617 dup v3.8h, v2.h[7]
5619 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5620 uaddw v24.4s, v24.4s, v0.4h
5621 uaddw2 v25.4s, v25.4s, v0.8h
5622 uaddw v26.4s, v26.4s, v1.4h
5623 uaddw2 v27.4s, v27.4s, v1.8h
5624 uaddw v24.4s, v24.4s, v2.4h
5625 uaddw2 v25.4s, v25.4s, v2.8h
5626 uaddw v26.4s, v26.4s, v3.4h
5627 uaddw2 v27.4s, v27.4s, v3.8h
5634 ld1 {v0.8h, v1.8h}, [x1], x2
5635 shl v1.8h, v1.8h, #3
5636 shl v0.8h, v0.8h, #3
5637 dup v2.8h, v1.h[7]
5638 dup v3.8h, v1.h[7]
5640 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5641 uaddw v24.4s, v24.4s, v0.4h
5642 uaddw2 v25.4s, v25.4s, v0.8h
5643 uaddw v26.4s, v26.4s, v1.4h
5644 uaddw2 v27.4s, v27.4s, v1.8h
5645 uaddw v24.4s, v24.4s, v2.4h
5646 uaddw2 v25.4s, v25.4s, v2.8h
5647 uaddw v26.4s, v26.4s, v3.4h
5648 uaddw2 v27.4s, v27.4s, v3.8h
5655 ld1 {v0.8h}, [x1], x2
5656 shl v0.8h, v0.8h, #3
5657 dup v1.8h, v0.h[7]
5658 dup v2.8h, v0.h[7]
5659 dup v3.8h, v0.h[7]
5661 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5662 uaddw v24.4s, v24.4s, v0.4h
5663 uaddw2 v25.4s, v25.4s, v0.8h
5664 uaddw v26.4s, v26.4s, v1.4h
5665 uaddw2 v27.4s, v27.4s, v1.8h
5666 uaddw v24.4s, v24.4s, v2.4h
5667 uaddw2 v25.4s, v25.4s, v2.8h
5668 uaddw v26.4s, v26.4s, v3.4h
5669 uaddw2 v27.4s, v27.4s, v3.8h
5676 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5677 uaddw v24.4s, v24.4s, v0.4h
5678 uaddw2 v25.4s, v25.4s, v0.8h
5679 uaddw v26.4s, v26.4s, v1.4h
5680 uaddw2 v27.4s, v27.4s, v1.8h
5681 uaddw v24.4s, v24.4s, v2.4h
5682 uaddw2 v25.4s, v25.4s, v2.8h
5683 uaddw v26.4s, v26.4s, v3.4h
5684 uaddw2 v27.4s, v27.4s, v3.8h
5685 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5686 uaddw v24.4s, v24.4s, v0.4h
5687 uaddw2 v25.4s, v25.4s, v0.8h
5688 uaddw v26.4s, v26.4s, v1.4h
5689 uaddw2 v27.4s, v27.4s, v1.8h
5690 uaddw v24.4s, v24.4s, v2.4h
5691 uaddw2 v25.4s, v25.4s, v2.8h
5692 uaddw v26.4s, v26.4s, v3.4h
5693 uaddw2 v27.4s, v27.4s, v3.8h