xref: /aosp_15_r20/bionic/libc/arch-x86/string/sse2-memset-slm.S (revision 8d67ca893c1523eb926b9080dbe4e2ffd2a27ba1)
/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include <private/bionic_asm.h>

/* Marker macro; nothing in this file tests it.  */
#define FOR_SILVERMONT

/* L(name) builds an assembler-local label (.Lname); the .L prefix keeps
   the label out of the object's symbol table.  */
#ifndef L
# define L(label)	.L##label
#endif

/* ALIGN(n) pads the location counter to a 2^n-byte boundary.  */
#ifndef ALIGN
# define ALIGN(n)	.p2align n
#endif

/* Unwind annotations matching a 4-byte push of REG: the CFA moves by 4
   and REG is recorded as saved at the new top of stack.  */
#define CFI_PUSH(REG)						\
  .cfi_adjust_cfa_offset 4;					\
  .cfi_rel_offset REG, 0

/* Unwind annotations matching a 4-byte pop of REG.  */
#define CFI_POP(REG)						\
  .cfi_adjust_cfa_offset -4;					\
  .cfi_restore REG

/* Push/pop REG and keep the unwind information in step.  */
#define PUSH(REG)	pushl REG; CFI_PUSH(REG)
#define POP(REG)	popl REG; CFI_POP(REG)

/* Offsets of the stack arguments relative to ESP once ENTRANCE has run:
   4 bytes of return address plus 4 bytes of saved EBX.  They are only
   valid while exactly one extra word (that EBX) is on the stack.  */
#define PARMS 8 /* Preserve EBX. */
#define DST PARMS
#define CHR (DST+4)
#define LEN (CHR+4)
#define CHK_DST_LEN (LEN+4)
/* Load the return value (the original destination pointer) into EAX.  */
#define SETRTNVAL	movl DST(%esp), %eax

/* ENTRANCE saves EBX, which the PIC/jump-table code below clobbers.
   RETURN_END restores it and returns.  RETURN does the same but then
   re-declares EBX as pushed, so the unwind state stays correct for the
   code that follows the `ret' in the same function.  */
# define ENTRANCE	PUSH(%ebx);
# define RETURN_END	POP(%ebx); ret
# define RETURN		RETURN_END; CFI_PUSH(%ebx)
/* A jump-table entry is the offset of target I from table base B.  */
# define JMPTBL(I, B)	I - B

/* Call the PC thunk for register x (thunk is defined outside this file).  */
#define SETUP_PIC_REG(x)	call	__x86.get_pc_thunk.x

/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
   jump table with relative offsets.  On entry ECX is the table index
   (the number of bytes still to store) and EDX the current write
   pointer; EDX is advanced by ECX so the target can store at negative
   offsets from the end of the buffer.  Clobbers EBX and flags.  */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
    /* We first load PC into EBX.  */				\
    call	__x86.get_pc_thunk.bx;				\
    /* Get the address of the jump table.  */			\
    add		$(TABLE - .), %ebx;				\
    /* Get the entry and convert the relative offset to the	\
       absolute address.  */					\
    add		(%ebx,%ecx,4), %ebx;				\
    add		%ecx, %edx;					\
    /* We loaded the jump table and adjusted EDX. Go.  */	\
    jmp		*%ebx

/* __memset_chk: length-checked entry point for memset.
   Stack arguments after ENTRANCE: DST, CHR, LEN, CHK_DST_LEN.
   If LEN <= CHK_DST_LEN (unsigned) control joins memset at
   L(memset_length_loaded) with ECX = LEN and EBX already pushed.
   Otherwise the push is undone and control tail-jumps to
   __memset_chk_fail (defined outside this file).  */
ENTRY(__memset_chk)
  ENTRANCE

  movl LEN(%esp), %ecx
  cmpl CHK_DST_LEN(%esp), %ecx
  jna L(memset_length_loaded)

  POP(%ebx) // Undo ENTRANCE without returning.
  jmp __memset_chk_fail
END(__memset_chk)

/* memset: store the low byte of CHR into LEN bytes starting at DST and
   return DST in EAX.  Arguments are on the stack (DST, CHR, LEN).

   Register roles throughout the function:
     ECX  = number of bytes still to store
     EDX  = write pointer
     EAX  = fill byte replicated into all four bytes
     XMM0 = fill byte replicated into all sixteen bytes (sizes >= 16)
     EBX  = scratch for PIC / jump-table dispatch (saved by ENTRANCE)  */
	.section .text.sse2,"ax",@progbits
	ALIGN(4)
ENTRY(memset)
	ENTRANCE

	movl	LEN(%esp), %ecx
/* __memset_chk joins here with ECX = LEN and EBX pushed.  */
L(memset_length_loaded):
	/* Zero length: nothing to store, just return DST.  */
	test	%ecx, %ecx
	jnz	L(1byteormore)
	SETRTNVAL
	RETURN

L(1byteormore):
	movzbl	CHR(%esp), %eax
	movb	%al, %ah
	/* Fill the whole EAX with pattern.  */
	movl	%eax, %edx
	shl	$16, %eax
	or	%edx, %eax
	movl	DST(%esp), %edx
	cmp	$1, %ecx
	je	L(1byte)
	cmp	$16, %ecx
	jae	L(16bytesormore)

	/* 2..15 bytes: cover the range with stores anchored at both ends;
	   the stores may overlap in the middle.  */
	cmp	$4, %ecx
	jb	L(4bytesless)
	/* 4..15 bytes: first and last dword.  */
	movl	%eax, (%edx)
	movl	%eax, -4(%edx, %ecx)
	cmp	$8, %ecx
	jb	L(8bytesless)
	/* 8..15 bytes: also the second and the second-to-last dword.  */
	movl	%eax, 4(%edx)
	movl	%eax, -8(%edx, %ecx)
L(8bytesless):
	SETRTNVAL
	RETURN

	/* 2..3 bytes: first and last word.  */
L(4bytesless):
	movw	%ax, (%edx)
	movw	%ax, -2(%edx, %ecx)
	SETRTNVAL
	RETURN

L(1byte):
	movb	%al, (%edx)
	SETRTNVAL
	RETURN

141	ALIGN(4)
142L(16bytesormore):
143	movd	%eax, %xmm0
144	pshufd	$0, %xmm0, %xmm0
145
146	cmp	$64, %ecx
147	ja	L(64bytesmore)
148	movdqu	%xmm0, (%edx)
149	movdqu	%xmm0, -16(%edx, %ecx)
150	cmp	$32, %ecx
151	jbe	L(32bytesless)
152	movdqu	%xmm0, 16(%edx)
153	movdqu	%xmm0, -32(%edx, %ecx)
154L(32bytesless):
155	SETRTNVAL
156	RETURN
157
/* LEN > 64.  Bring EDX to a 16-byte boundary so that every later store
   can use the aligned forms, then dispatch on the remaining length.  */
L(64bytesmore):
	testl	$0xf, %edx
	jz	L(aligned_16)
L(not_aligned_16):
	/* Fill the first 16 bytes unaligned, then step EDX up to the next
	   16-byte boundary.  EAX is borrowed as scratch: it ends up as
	   (old EDX - new EDX), i.e. minus the number of bytes skipped,
	   which is added to ECX to shrink the remaining length.  */
	movdqu	%xmm0, (%edx)
	movl	%edx, %eax
	and	$-16, %edx
	add	$16, %edx
	sub	%edx, %eax
	add	%eax, %ecx
	/* Put the 4x fill pattern back into EAX for the tail stores.  */
	movd	%xmm0, %eax

	ALIGN(4)
/* EDX is 16-byte aligned here.  */
L(aligned_16):
	cmp	$128, %ecx
	jae	L(128bytesormore)

	/* Fewer than 128 bytes left: one jump-table dispatch finishes.  */
L(aligned_16_less128bytes):
	BRANCH_TO_JMPTBL_ENTRY(L(table_16_128bytes))

178	ALIGN(4)
179L(128bytesormore):
180	PUSH(%ebx)
181	SETUP_PIC_REG(bx)
182	add	$_GLOBAL_OFFSET_TABLE_, %ebx
183	mov	__x86_shared_cache_size@GOTOFF(%ebx), %ebx
184	cmp	%ebx, %ecx
185	jae	L(128bytesormore_nt_start)
186
187	POP(%ebx)
188
189	PUSH(%ebx)
190	SETUP_PIC_REG(bx)
191	add	$_GLOBAL_OFFSET_TABLE_, %ebx
192	mov	__x86_data_cache_size@GOTOFF(%ebx), %ebx
193
194	cmp	%ebx, %ecx
195	jae	L(128bytes_L2_normal)
196	subl	$128, %ecx
197L(128bytesormore_normal):
198	sub	$128, %ecx
199	movdqa	%xmm0, (%edx)
200	movaps	%xmm0, 0x10(%edx)
201	movaps	%xmm0, 0x20(%edx)
202	movaps	%xmm0, 0x30(%edx)
203	movaps	%xmm0, 0x40(%edx)
204	movaps	%xmm0, 0x50(%edx)
205	movaps	%xmm0, 0x60(%edx)
206	movaps	%xmm0, 0x70(%edx)
207	lea	128(%edx), %edx
208	jb	L(128bytesless_normal)
209
210
211	sub	$128, %ecx
212	movdqa	%xmm0, (%edx)
213	movaps	%xmm0, 0x10(%edx)
214	movaps	%xmm0, 0x20(%edx)
215	movaps	%xmm0, 0x30(%edx)
216	movaps	%xmm0, 0x40(%edx)
217	movaps	%xmm0, 0x50(%edx)
218	movaps	%xmm0, 0x60(%edx)
219	movaps	%xmm0, 0x70(%edx)
220	lea	128(%edx), %edx
221	jae	L(128bytesormore_normal)
222
223L(128bytesless_normal):
224	lea	128(%ecx), %ecx
225	POP(%ebx)
226	BRANCH_TO_JMPTBL_ENTRY(L(table_16_128bytes))
227
228	ALIGN(4)
229L(128bytes_L2_normal):
230	prefetchnta	0x380(%edx)
231	prefetchnta	0x3c0(%edx)
232	sub	$128, %ecx
233	movdqa	%xmm0, (%edx)
234	movaps	%xmm0, 0x10(%edx)
235	movaps	%xmm0, 0x20(%edx)
236	movaps	%xmm0, 0x30(%edx)
237	movaps	%xmm0, 0x40(%edx)
238	movaps	%xmm0, 0x50(%edx)
239	movaps	%xmm0, 0x60(%edx)
240	movaps	%xmm0, 0x70(%edx)
241	add	$128, %edx
242	cmp	$128, %ecx
243	jae	L(128bytes_L2_normal)
244
245L(128bytesless_L2_normal):
246	POP(%ebx)
247	BRANCH_TO_JMPTBL_ENTRY(L(table_16_128bytes))
248
/* Remaining length is at least __x86_shared_cache_size.
   In: EBX = __x86_shared_cache_size, ECX = bytes to store (>= EBX, >= 128),
   EDX 16-byte aligned, EBX pushed twice.
   The first EBX bytes are written with ordinary aligned stores; the rest
   (now counted in ECX) are written with non-temporal stores.
   NOTE(review): the first loop stores a chunk before testing EBX, so it
   relies on __x86_shared_cache_size being at least 128.  */
L(128bytesormore_nt_start):
	sub	%ebx, %ecx
	ALIGN(4)
L(128bytesormore_shared_cache_loop):
	prefetchnta	0x3c0(%edx)
	prefetchnta	0x380(%edx)
	sub	$0x80, %ebx
	movdqa	%xmm0, (%edx)
	movaps	%xmm0, 0x10(%edx)
	movaps	%xmm0, 0x20(%edx)
	movaps	%xmm0, 0x30(%edx)
	movaps	%xmm0, 0x40(%edx)
	movaps	%xmm0, 0x50(%edx)
	movaps	%xmm0, 0x60(%edx)
	movaps	%xmm0, 0x70(%edx)
	add	$0x80, %edx
	cmp	$0x80, %ebx
	jae	L(128bytesormore_shared_cache_loop)
	/* EBX now holds what is left of the cached budget (0..127).  It is
	   non-zero only when the shared cache size is not a multiple of
	   128; fold it back into ECX so those bytes are still written.  */
	add	%ebx, %ecx
	cmp	$0x80, %ecx
	jb	L(shared_cache_loop_end)
	ALIGN(4)
/* Non-temporal 128-byte chunks; the sfence after the loop orders these
   stores before the function continues.  */
L(128bytesormore_nt):
	sub	$0x80, %ecx
	movntdq	%xmm0, (%edx)
	movntdq	%xmm0, 0x10(%edx)
	movntdq	%xmm0, 0x20(%edx)
	movntdq	%xmm0, 0x30(%edx)
	movntdq	%xmm0, 0x40(%edx)
	movntdq	%xmm0, 0x50(%edx)
	movntdq	%xmm0, 0x60(%edx)
	movntdq	%xmm0, 0x70(%edx)
	add	$0x80, %edx
	cmp	$0x80, %ecx
	jae	L(128bytesormore_nt)
	sfence
	/* 0..127 bytes left: drop the extra EBX and finish via the table.  */
L(shared_cache_loop_end):
	POP(%ebx)
	BRANCH_TO_JMPTBL_ENTRY(L(table_16_128bytes))


/* Jump table used by BRANCH_TO_JMPTBL_ENTRY.  Entry n (n = 0..127) is
   the offset, relative to the table base, of the code that stores the
   final n bytes of the buffer.  It lives in read-only data.  */
	.pushsection .rodata.sse2,"a",@progbits
	ALIGN(2)
L(table_16_128bytes):
	.int	JMPTBL(L(aligned_16_0bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_1bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_2bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_3bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_4bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_5bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_6bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_7bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_8bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_9bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_10bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_11bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_12bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_13bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_14bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_15bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_16bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_17bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_18bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_19bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_20bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_21bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_22bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_23bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_24bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_25bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_26bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_27bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_28bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_29bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_30bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_31bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_32bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_33bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_34bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_35bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_36bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_37bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_38bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_39bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_40bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_41bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_42bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_43bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_44bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_45bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_46bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_47bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_48bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_49bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_50bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_51bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_52bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_53bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_54bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_55bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_56bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_57bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_58bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_59bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_60bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_61bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_62bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_63bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_64bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_65bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_66bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_67bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_68bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_69bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_70bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_71bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_72bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_73bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_74bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_75bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_76bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_77bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_78bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_79bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_80bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_81bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_82bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_83bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_84bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_85bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_86bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_87bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_88bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_89bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_90bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_91bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_92bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_93bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_94bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_95bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_96bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_97bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_98bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_99bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_100bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_101bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_102bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_103bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_104bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_105bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_106bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_107bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_108bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_109bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_110bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_111bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_112bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_113bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_114bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_115bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_116bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_117bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_118bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_119bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_120bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_121bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_122bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_123bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_124bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_125bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_126bytes), L(table_16_128bytes))
	.int	JMPTBL(L(aligned_16_127bytes), L(table_16_128bytes))
	.popsection

/* Tail stores reached through L(table_16_128bytes).
   On entry to L(aligned_16_Nbytes): EDX points one past the last byte to
   store, N bytes remain, (EDX - N) is 16-byte aligned, XMM0 holds the
   fill byte x16 and EAX holds it x4.  Each chain falls through from the
   largest N to the smallest, storing one aligned 16-byte block per
   label, and then writes the last N mod 16 bytes with narrower stores.
   This group handles N mod 16 = 0, 1, 2 and 3.  */
	ALIGN(4)
L(aligned_16_112bytes):
	movdqa	%xmm0, -112(%edx)
L(aligned_16_96bytes):
	movdqa	%xmm0, -96(%edx)
L(aligned_16_80bytes):
	movdqa	%xmm0, -80(%edx)
L(aligned_16_64bytes):
	movdqa	%xmm0, -64(%edx)
L(aligned_16_48bytes):
	movdqa	%xmm0, -48(%edx)
L(aligned_16_32bytes):
	movdqa	%xmm0, -32(%edx)
L(aligned_16_16bytes):
	movdqa	%xmm0, -16(%edx)
L(aligned_16_0bytes):
	SETRTNVAL
	RETURN

	ALIGN(4)
L(aligned_16_113bytes):
	movdqa	%xmm0, -113(%edx)
L(aligned_16_97bytes):
	movdqa	%xmm0, -97(%edx)
L(aligned_16_81bytes):
	movdqa	%xmm0, -81(%edx)
L(aligned_16_65bytes):
	movdqa	%xmm0, -65(%edx)
L(aligned_16_49bytes):
	movdqa	%xmm0, -49(%edx)
L(aligned_16_33bytes):
	movdqa	%xmm0, -33(%edx)
L(aligned_16_17bytes):
	movdqa	%xmm0, -17(%edx)
L(aligned_16_1bytes):
	/* Last byte.  */
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	ALIGN(4)
L(aligned_16_114bytes):
	movdqa	%xmm0, -114(%edx)
L(aligned_16_98bytes):
	movdqa	%xmm0, -98(%edx)
L(aligned_16_82bytes):
	movdqa	%xmm0, -82(%edx)
L(aligned_16_66bytes):
	movdqa	%xmm0, -66(%edx)
L(aligned_16_50bytes):
	movdqa	%xmm0, -50(%edx)
L(aligned_16_34bytes):
	movdqa	%xmm0, -34(%edx)
L(aligned_16_18bytes):
	movdqa	%xmm0, -18(%edx)
L(aligned_16_2bytes):
	/* Last 2 bytes.  */
	movw	%ax, -2(%edx)
	SETRTNVAL
	RETURN

	ALIGN(4)
L(aligned_16_115bytes):
	movdqa	%xmm0, -115(%edx)
L(aligned_16_99bytes):
	movdqa	%xmm0, -99(%edx)
L(aligned_16_83bytes):
	movdqa	%xmm0, -83(%edx)
L(aligned_16_67bytes):
	movdqa	%xmm0, -67(%edx)
L(aligned_16_51bytes):
	movdqa	%xmm0, -51(%edx)
L(aligned_16_35bytes):
	movdqa	%xmm0, -35(%edx)
L(aligned_16_19bytes):
	movdqa	%xmm0, -19(%edx)
L(aligned_16_3bytes):
	/* Last 3 bytes: 2 + 1.  */
	movw	%ax, -3(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

/* Tail-store chains for N mod 16 = 4, 5, 6 and 7.
   On entry to L(aligned_16_Nbytes): EDX points one past the last byte to
   store, N bytes remain, (EDX - N) is 16-byte aligned, XMM0 holds the
   fill byte x16 and EAX holds it x4.  Each label stores one aligned
   16-byte block and falls through; the final 4..7 bytes are written
   with a dword store plus word/byte stores as needed.  */
	ALIGN(4)
L(aligned_16_116bytes):
	movdqa	%xmm0, -116(%edx)
L(aligned_16_100bytes):
	movdqa	%xmm0, -100(%edx)
L(aligned_16_84bytes):
	movdqa	%xmm0, -84(%edx)
L(aligned_16_68bytes):
	movdqa	%xmm0, -68(%edx)
L(aligned_16_52bytes):
	movdqa	%xmm0, -52(%edx)
L(aligned_16_36bytes):
	movdqa	%xmm0, -36(%edx)
L(aligned_16_20bytes):
	movdqa	%xmm0, -20(%edx)
L(aligned_16_4bytes):
	/* Last 4 bytes.  */
	movl	%eax, -4(%edx)
	SETRTNVAL
	RETURN

	ALIGN(4)
L(aligned_16_117bytes):
	movdqa	%xmm0, -117(%edx)
L(aligned_16_101bytes):
	movdqa	%xmm0, -101(%edx)
L(aligned_16_85bytes):
	movdqa	%xmm0, -85(%edx)
L(aligned_16_69bytes):
	movdqa	%xmm0, -69(%edx)
L(aligned_16_53bytes):
	movdqa	%xmm0, -53(%edx)
L(aligned_16_37bytes):
	movdqa	%xmm0, -37(%edx)
L(aligned_16_21bytes):
	movdqa	%xmm0, -21(%edx)
L(aligned_16_5bytes):
	/* Last 5 bytes: 4 + 1.  */
	movl	%eax, -5(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	ALIGN(4)
L(aligned_16_118bytes):
	movdqa	%xmm0, -118(%edx)
L(aligned_16_102bytes):
	movdqa	%xmm0, -102(%edx)
L(aligned_16_86bytes):
	movdqa	%xmm0, -86(%edx)
L(aligned_16_70bytes):
	movdqa	%xmm0, -70(%edx)
L(aligned_16_54bytes):
	movdqa	%xmm0, -54(%edx)
L(aligned_16_38bytes):
	movdqa	%xmm0, -38(%edx)
L(aligned_16_22bytes):
	movdqa	%xmm0, -22(%edx)
L(aligned_16_6bytes):
	/* Last 6 bytes: 4 + 2.  */
	movl	%eax, -6(%edx)
	movw	%ax, -2(%edx)
	SETRTNVAL
	RETURN

	ALIGN(4)
L(aligned_16_119bytes):
	movdqa	%xmm0, -119(%edx)
L(aligned_16_103bytes):
	movdqa	%xmm0, -103(%edx)
L(aligned_16_87bytes):
	movdqa	%xmm0, -87(%edx)
L(aligned_16_71bytes):
	movdqa	%xmm0, -71(%edx)
L(aligned_16_55bytes):
	movdqa	%xmm0, -55(%edx)
L(aligned_16_39bytes):
	movdqa	%xmm0, -39(%edx)
L(aligned_16_23bytes):
	movdqa	%xmm0, -23(%edx)
L(aligned_16_7bytes):
	/* Last 7 bytes: 4 + 2 + 1.  */
	movl	%eax, -7(%edx)
	movw	%ax, -3(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

/* Tail-store chains for N mod 16 = 8, 9, 10 and 11.
   On entry to L(aligned_16_Nbytes): EDX points one past the last byte to
   store, N bytes remain, (EDX - N) is 16-byte aligned, XMM0 holds the
   fill byte x16 and EAX holds it x4.  Each label stores one aligned
   16-byte block and falls through; the final 8..11 bytes are written
   with an 8-byte store from XMM0 plus word/byte stores as needed.  */
	ALIGN(4)
L(aligned_16_120bytes):
	movdqa	%xmm0, -120(%edx)
L(aligned_16_104bytes):
	movdqa	%xmm0, -104(%edx)
L(aligned_16_88bytes):
	movdqa	%xmm0, -88(%edx)
L(aligned_16_72bytes):
	movdqa	%xmm0, -72(%edx)
L(aligned_16_56bytes):
	movdqa	%xmm0, -56(%edx)
L(aligned_16_40bytes):
	movdqa	%xmm0, -40(%edx)
L(aligned_16_24bytes):
	movdqa	%xmm0, -24(%edx)
L(aligned_16_8bytes):
	/* Last 8 bytes.  */
	movq	%xmm0, -8(%edx)
	SETRTNVAL
	RETURN

	ALIGN(4)
L(aligned_16_121bytes):
	movdqa	%xmm0, -121(%edx)
L(aligned_16_105bytes):
	movdqa	%xmm0, -105(%edx)
L(aligned_16_89bytes):
	movdqa	%xmm0, -89(%edx)
L(aligned_16_73bytes):
	movdqa	%xmm0, -73(%edx)
L(aligned_16_57bytes):
	movdqa	%xmm0, -57(%edx)
L(aligned_16_41bytes):
	movdqa	%xmm0, -41(%edx)
L(aligned_16_25bytes):
	movdqa	%xmm0, -25(%edx)
L(aligned_16_9bytes):
	/* Last 9 bytes: 8 + 1.  */
	movq	%xmm0, -9(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	ALIGN(4)
L(aligned_16_122bytes):
	movdqa	%xmm0, -122(%edx)
L(aligned_16_106bytes):
	movdqa	%xmm0, -106(%edx)
L(aligned_16_90bytes):
	movdqa	%xmm0, -90(%edx)
L(aligned_16_74bytes):
	movdqa	%xmm0, -74(%edx)
L(aligned_16_58bytes):
	movdqa	%xmm0, -58(%edx)
L(aligned_16_42bytes):
	movdqa	%xmm0, -42(%edx)
L(aligned_16_26bytes):
	movdqa	%xmm0, -26(%edx)
L(aligned_16_10bytes):
	/* Last 10 bytes: 8 + 2.  */
	movq	%xmm0, -10(%edx)
	movw	%ax, -2(%edx)
	SETRTNVAL
	RETURN

	ALIGN(4)
L(aligned_16_123bytes):
	movdqa	%xmm0, -123(%edx)
L(aligned_16_107bytes):
	movdqa	%xmm0, -107(%edx)
L(aligned_16_91bytes):
	movdqa	%xmm0, -91(%edx)
L(aligned_16_75bytes):
	movdqa	%xmm0, -75(%edx)
L(aligned_16_59bytes):
	movdqa	%xmm0, -59(%edx)
L(aligned_16_43bytes):
	movdqa	%xmm0, -43(%edx)
L(aligned_16_27bytes):
	movdqa	%xmm0, -27(%edx)
L(aligned_16_11bytes):
	/* Last 11 bytes: 8 + 2 + 1.  */
	movq	%xmm0, -11(%edx)
	movw	%ax, -3(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

/* Tail-store chains for N mod 16 = 12, 13, 14 and 15, followed by the
   end of memset.
   On entry to L(aligned_16_Nbytes): EDX points one past the last byte to
   store, N bytes remain, (EDX - N) is 16-byte aligned, XMM0 holds the
   fill byte x16 and EAX holds it x4.  Each label stores one aligned
   16-byte block and falls through; the final 12..15 bytes are written
   with an 8-byte store, a dword store and word/byte stores as needed.  */
	ALIGN(4)
L(aligned_16_124bytes):
	movdqa	%xmm0, -124(%edx)
L(aligned_16_108bytes):
	movdqa	%xmm0, -108(%edx)
L(aligned_16_92bytes):
	movdqa	%xmm0, -92(%edx)
L(aligned_16_76bytes):
	movdqa	%xmm0, -76(%edx)
L(aligned_16_60bytes):
	movdqa	%xmm0, -60(%edx)
L(aligned_16_44bytes):
	movdqa	%xmm0, -44(%edx)
L(aligned_16_28bytes):
	movdqa	%xmm0, -28(%edx)
L(aligned_16_12bytes):
	/* Last 12 bytes: 8 + 4.  */
	movq	%xmm0, -12(%edx)
	movl	%eax, -4(%edx)
	SETRTNVAL
	RETURN

	ALIGN(4)
L(aligned_16_125bytes):
	movdqa	%xmm0, -125(%edx)
L(aligned_16_109bytes):
	movdqa	%xmm0, -109(%edx)
L(aligned_16_93bytes):
	movdqa	%xmm0, -93(%edx)
L(aligned_16_77bytes):
	movdqa	%xmm0, -77(%edx)
L(aligned_16_61bytes):
	movdqa	%xmm0, -61(%edx)
L(aligned_16_45bytes):
	movdqa	%xmm0, -45(%edx)
L(aligned_16_29bytes):
	movdqa	%xmm0, -29(%edx)
L(aligned_16_13bytes):
	/* Last 13 bytes: 8 + 4 + 1.  */
	movq	%xmm0, -13(%edx)
	movl	%eax, -5(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	ALIGN(4)
L(aligned_16_126bytes):
	movdqa	%xmm0, -126(%edx)
L(aligned_16_110bytes):
	movdqa	%xmm0, -110(%edx)
L(aligned_16_94bytes):
	movdqa	%xmm0, -94(%edx)
L(aligned_16_78bytes):
	movdqa	%xmm0, -78(%edx)
L(aligned_16_62bytes):
	movdqa	%xmm0, -62(%edx)
L(aligned_16_46bytes):
	movdqa	%xmm0, -46(%edx)
L(aligned_16_30bytes):
	movdqa	%xmm0, -30(%edx)
L(aligned_16_14bytes):
	/* Last 14 bytes: 8 + 4 + 2.  */
	movq	%xmm0, -14(%edx)
	movl	%eax, -6(%edx)
	movw	%ax, -2(%edx)
	SETRTNVAL
	RETURN

	ALIGN(4)
L(aligned_16_127bytes):
	movdqa	%xmm0, -127(%edx)
L(aligned_16_111bytes):
	movdqa	%xmm0, -111(%edx)
L(aligned_16_95bytes):
	movdqa	%xmm0, -95(%edx)
L(aligned_16_79bytes):
	movdqa	%xmm0, -79(%edx)
L(aligned_16_63bytes):
	movdqa	%xmm0, -63(%edx)
L(aligned_16_47bytes):
	movdqa	%xmm0, -47(%edx)
L(aligned_16_31bytes):
	movdqa	%xmm0, -31(%edx)
L(aligned_16_15bytes):
	/* Last 15 bytes: 8 + 4 + 2 + 1.  */
	movq	%xmm0, -15(%edx)
	movl	%eax, -7(%edx)
	movw	%ax, -3(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	/* Final return of the function: no code follows, so the unwind
	   state is not re-armed (RETURN_END rather than RETURN).  */
	RETURN_END

END(memset)