/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include <private/bionic_asm.h>

#ifndef L
# define L(label) .L##label
#endif

#ifndef ALIGN
# define ALIGN(n) .p2align n
#endif

  .section .text.avx2,"ax",@progbits

ENTRY(__memset_chk_avx2)
  # %rdi = dst, %rsi = byte, %rdx = n, %rcx = dst_len
  cmp %rcx, %rdx
  ja __memset_chk_fail
  // Fall through to memset...
END(__memset_chk_avx2)

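/*
 * memset_avx2 below dispatches on the length n (in %rdx); a summary of
 * the strategy, as implemented by the code that follows:
 *
 *   n < 16:         scalar stores selected by the individual bits of n.
 *   16 <= n <= 256: overlapping unaligned 16-byte (SSE) and 32-byte (AVX)
 *                   stores issued from both ends of the buffer.
 *   n > 256:        the first and last 128 bytes are written unaligned,
 *                   then a loop of 128-byte-aligned stores fills the
 *                   middle; when n exceeds the shared cache size, the
 *                   loop switches to non-temporal stores and ends with
 *                   an sfence.
 */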
ENTRY(memset_avx2)
  movq %rdi, %rax               # memset returns dst.
  # Splat the fill byte into all eight bytes of %rcx.
  and $0xff, %rsi
  mov $0x0101010101010101, %rcx
  imul %rsi, %rcx
  cmpq $16, %rdx
  jae L(16bytesormore)
  # n < 16: choose scalar stores from the low bits of n.
  testb $8, %dl
  jnz L(8_15bytes)
  testb $4, %dl
  jnz L(4_7bytes)
  testb $2, %dl
  jnz L(2_3bytes)
  testb $1, %dl
  jz 1f
  movb %cl, (%rdi)
1: ret

L(8_15bytes):
  movq %rcx, (%rdi)
  movq %rcx, -8(%rdi, %rdx)
  ret

L(4_7bytes):
  movl %ecx, (%rdi)
  movl %ecx, -4(%rdi, %rdx)
  ret

L(2_3bytes):
  movw %cx, (%rdi)
  movw %cx, -2(%rdi, %rdx)
  ret

  ALIGN (4)
L(16bytesormore):
  movd %rcx, %xmm0
  pshufd $0, %xmm0, %xmm0
  # Store from both ends with overlap; n >= 16 guarantees full coverage.
  movdqu %xmm0, (%rdi)
  movdqu %xmm0, -16(%rdi, %rdx)
  cmpq $32, %rdx
  jbe L(done)
  movdqu %xmm0, 16(%rdi)
  movdqu %xmm0, -32(%rdi, %rdx)
  cmpq $64, %rdx
  jbe L(done)
  movdqu %xmm0, 32(%rdi)
  movdqu %xmm0, 48(%rdi)
  movdqu %xmm0, -64(%rdi, %rdx)
  movdqu %xmm0, -48(%rdi, %rdx)
  cmpq $128, %rdx
  jbe L(done)
  vpbroadcastb %xmm0, %ymm0
  vmovdqu %ymm0, 64(%rdi)
  vmovdqu %ymm0, 96(%rdi)
  vmovdqu %ymm0, -128(%rdi, %rdx)
  vmovdqu %ymm0, -96(%rdi, %rdx)
  cmpq $256, %rdx
  jbe L(done)

  ALIGN (4)
  # The first and last 128 bytes are already set; fill the middle with an
  # aligned loop over [%rcx, %rdx), both rounded to 128-byte boundaries.
  # %r8 keeps the original n for the cache-size check below.
  leaq 128(%rdi), %rcx
  andq $-128, %rcx
  movq %rdx, %r8
  addq %rdi, %rdx
  andq $-128, %rdx
  cmpq %rcx, %rdx
  je L(done)

  cmp __x86_shared_cache_size(%rip), %r8

  ja L(non_temporal_loop)

  ALIGN (4)
L(normal_loop):
  vmovdqa %ymm0, (%rcx)
  vmovdqa %ymm0, 32(%rcx)
  vmovdqa %ymm0, 64(%rcx)
  vmovdqa %ymm0, 96(%rcx)
  addq $128, %rcx
  cmpq %rcx, %rdx
  jne L(normal_loop)
  jmp L(done)

  ALIGN (4)
L(non_temporal_loop):
  movntdq %xmm0, (%rcx)
  movntdq %xmm0, 16(%rcx)
  movntdq %xmm0, 32(%rcx)
  movntdq %xmm0, 48(%rcx)
  movntdq %xmm0, 64(%rcx)
  movntdq %xmm0, 80(%rcx)
  movntdq %xmm0, 96(%rcx)
  movntdq %xmm0, 112(%rcx)
  leaq 128(%rcx), %rcx
  cmpq %rcx, %rdx
  jne L(non_temporal_loop)
  # We used non-temporal stores, so we need a fence here.
  sfence

L(done):
  # We used the ymm registers, and that can break SSE2 performance
  # unless you do this.
  vzeroupper
  ret

END(memset_avx2)