xref: /openbmc/linux/arch/sparc/lib/M7memcpy.S (revision 34060b8f)
1/*
2 * M7memcpy: Optimized SPARC M7 memcpy
3 *
4 * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
5 */
6
7	.file	"M7memcpy.S"
8
9/*
10 * memcpy(s1, s2, len)
11 *
12 * Copy s2 to s1, always copy n bytes.
13 * Note: this C code does not work for overlapped copies.
14 *
15 * Fast assembler language version of the following C-program for memcpy
16 * which represents the `standard' for the C-library.
17 *
18 *	void *
19 *	memcpy(void *s, const void *s0, size_t n)
20 *	{
21 *		if (n != 0) {
22 *		    char *s1 = s;
23 *		    const char *s2 = s0;
24 *		    do {
25 *			*s1++ = *s2++;
26 *		    } while (--n != 0);
27 *		}
28 *		return (s);
29 *	}
30 *
31 *
32 * SPARC T7/M7 Flow :
33 *
34 * if (count < SMALL_MAX) {
35 *   if count < SHORTCOPY              (SHORTCOPY=3)
36 *	copy bytes; exit with dst addr
37 *   if src & dst aligned on word boundary but not long word boundary,
38 *     copy with ldw/stw; branch to finish_up
39 *   if src & dst aligned on long word boundary
40 *     copy with ldx/stx; branch to finish_up
41 *   if src & dst not aligned and length <= SHORTCHECK   (SHORTCHECK=14)
42 *     copy bytes; exit with dst addr
43 *   move enough bytes to get src to word boundary
44 *   if dst now on word boundary
45 * move_words:
46 *     copy words; branch to finish_up
47 *   if dst now on half word boundary
48 *     load words, shift half words, store words; branch to finish_up
49 *   if dst on byte 1
50 *     load words, shift 3 bytes, store words; branch to finish_up
51 *   if dst on byte 3
52 *     load words, shift 1 byte, store words; branch to finish_up
53 * finish_up:
54 *     copy bytes; exit with dst addr
55 * } else {                                         More than SMALL_MAX bytes
56 *   move bytes until dst is on long word boundary
57 *   if( src is on long word boundary ) {
58 *     if (count < MED_MAX) {
59 * finish_long:					   src/dst aligned on 8 bytes
60 *       copy with ldx/stx in 8-way unrolled loop;
61 *       copy final 0-63 bytes; exit with dst addr
62 *     } else {				     src/dst aligned; count > MED_MAX
63 *       align dst on 64 byte boundary; for main data movement:
64 *       prefetch src data to L2 cache; let HW prefetch move data to L1 cache
65 *       Use BIS (block initializing store) to avoid copying store cache
66 *       lines from memory. But pre-store first element of each cache line
67 *       ST_CHUNK lines in advance of the rest of that cache line. That
68 *       gives time for replacement cache lines to be written back without
69 *       excess STQ and Miss Buffer filling. Repeat until near the end,
70 *       then finish up storing before going to finish_long.
71 *     }
72 *   } else {                                   src/dst not aligned on 8 bytes
73 *     if src is word aligned and count < MED_WMAX
74 *       move words in 8-way unrolled loop
75 *       move final 0-31 bytes; exit with dst addr
76 *     if count < MED_UMAX
77 *       use alignaddr/faligndata combined with ldd/std in 8-way
78 *       unrolled loop to move data.
79 *       go to unalign_done
80 *     else
81 *       setup alignaddr for faligndata instructions
82 *       align dst on 64 byte boundary; prefetch src data to L1 cache
83 *       loadx8, falign, block-store, prefetch loop
84 *	 (only use block-init-store when src/dst on 8 byte boundaries.)
85 * unalign_done:
86 *       move remaining bytes for unaligned cases. exit with dst addr.
87 * }
88 *
89 */
90
91#include <asm/visasm.h>
92#include <asm/asi.h>
93
94#if !defined(EX_LD) && !defined(EX_ST)
95#define NON_USER_COPY
96#endif
97
98#ifndef EX_LD
99#define EX_LD(x,y)	x
100#endif
101#ifndef EX_LD_FP
102#define EX_LD_FP(x,y)	x
103#endif
104
105#ifndef EX_ST
106#define EX_ST(x,y)	x
107#endif
108#ifndef EX_ST_FP
109#define EX_ST_FP(x,y)	x
110#endif
111
112#ifndef EX_RETVAL
113#define EX_RETVAL(x)    x
114#endif
115
116#ifndef LOAD
117#define LOAD(type,addr,dest)	type [addr], dest
118#endif
119
120#ifndef STORE
121#define STORE(type,src,addr)	type src, [addr]
122#endif
123
124/*
125 * ASI_BLK_INIT_QUAD_LDD_P/ASI_BLK_INIT_QUAD_LDD_S marks the cache
126 * line as "least recently used" which means if many threads are
127 * active, it has a high probability of being pushed out of the cache
128 * between the first initializing store and the final stores.
129 * Thus, we use ASI_ST_BLKINIT_MRU_P/ASI_ST_BLKINIT_MRU_S which
130 * marks the cache line as "most recently used" for all
131 * but the last store to each cache line
132 */
133#ifndef STORE_ASI
134#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
135#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
136#else
137#define STORE_ASI	0x80		/* ASI_P */
138#endif
139#endif
140
141#ifndef STORE_MRU_ASI
142#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
143#define STORE_MRU_ASI	ASI_ST_BLKINIT_MRU_P
144#else
145#define STORE_MRU_ASI	0x80		/* ASI_P */
146#endif
147#endif
148
149#ifndef STORE_INIT
150#define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI
151#endif
152
153#ifndef STORE_INIT_MRU
154#define STORE_INIT_MRU(src,addr)	stxa src, [addr] STORE_MRU_ASI
155#endif
156
157#ifndef FUNC_NAME
158#define FUNC_NAME	M7memcpy
159#endif
160
161#ifndef PREAMBLE
162#define PREAMBLE
163#endif
164
165#define	BLOCK_SIZE	64
166#define	SHORTCOPY	3
167#define	SHORTCHECK	14
168#define	SHORT_LONG	64	/* max copy for short longword-aligned case */
169				/* must be at least 64 */
170#define	SMALL_MAX	128
171#define	MED_UMAX	1024	/* max copy for medium un-aligned case */
172#define	MED_WMAX	1024	/* max copy for medium word-aligned case */
173#define	MED_MAX		1024	/* max copy for medium longword-aligned case */
174#define ST_CHUNK	24	/* ST_CHUNK - block of values for BIS Store */
175#define ALIGN_PRE	24	/* distance for aligned prefetch loop */
176
177	.register	%g2,#scratch
178
179	.section	".text"
180	.global		FUNC_NAME
181	.type		FUNC_NAME, #function
182	.align		16
/*
 * Entry point.  In: %o0 = dst, %o1 = src, %o2 = byte count.
 * %g1 keeps the original dst so that every exit can return it through
 * EX_RETVAL.  Dispatch on the byte count:
 *   0                   -> .Lsmallx   (return at once)
 *   1 .. 3              -> .Ltiny_cp
 *   4 .. 19             -> .Lsmall_cp
 *   20 .. SMALL_MAX-1   -> .Lmedium_cp
 *   SMALL_MAX and up    -> falls through to .Lmedium
 * %g2 = dst | src is set up in a delay slot for the alignment tests
 * made by .Lsmall_cp and .Lmedium_cp.
 */
183FUNC_NAME:
184	srlx            %o2, 31, %g2	! %g2 = count >> 31
185	cmp             %g2, 0
186	tne             %xcc, 5		! trap 5 if count has bit 31 or above set
187	PREAMBLE
188	mov		%o0, %g1	! save %o0
189	brz,pn          %o2, .Lsmallx	! nothing to copy
190	 cmp            %o2, 3		! (delay slot) compare for next branch
191	ble,pn          %icc, .Ltiny_cp	! count <= 3
192	 cmp            %o2, 19		! (delay slot) compare for next branch
193	ble,pn          %icc, .Lsmall_cp	! count <= 19
194	 or             %o0, %o1, %g2	! (delay slot) combined alignment bits
195	cmp             %o2, SMALL_MAX
196	bl,pn           %icc, .Lmedium_cp	! count < SMALL_MAX
197	 nop
198
	! Count is at least SMALL_MAX here.  Copy single bytes until dst
	! (%o0) is 8-byte aligned, then select the aligned or unaligned
	! source path from the low three bits of src (%o1).
199.Lmedium:
200	neg	%o0, %o5
201	andcc	%o5, 7, %o5		! bytes till DST 8 byte aligned
202	brz,pt	%o5, .Ldst_aligned_on_8
203
204	! %o5 has the bytes to be written in partial store.
	! The sub below sits in the delay slot of the brz above, so it also
	! runs when the branch is taken; %o5 is zero then and %o2 is unchanged.
205	 sub	%o2, %o5, %o2
206	sub	%o1, %o0, %o1		! %o1 gets the difference
2077:					! dst aligning loop
208	add	%o1, %o0, %o4		! %o4 = current src address
209	EX_LD(LOAD(ldub, %o4, %o4), memcpy_retl_o2_plus_o5)	! load one byte
210	subcc	%o5, 1, %o5
211	EX_ST(STORE(stb, %o4, %o0), memcpy_retl_o2_plus_o5_plus_1)
212	bgu,pt	%xcc, 7b
213	 add	%o0, 1, %o0		! advance dst
214	add	%o1, %o0, %o1		! restore %o1
215.Ldst_aligned_on_8:
216	andcc	%o1, 7, %o5		! is src 8-byte aligned as well?
217	brnz,pt	%o5, .Lsrc_dst_unaligned_on_8
218	 nop
219
220.Lsrc_dst_aligned_on_8:
221	! check if we are copying more than MED_MAX bytes
222	set MED_MAX, %o3
223	cmp %o2, %o3 			! limit to store buffer size
224	bgu,pn	%xcc, .Llarge_align8_copy	! strictly greater than MED_MAX
225	 nop
226
227/*
228 * Special case for handling when src and dest are both long word aligned
229 * and total data to move is at most MED_MAX bytes.
 *
 * The count in %o2 is kept biased while each stage runs (by -63 in the
 * 64-byte loop, -31 in the 32-byte step, -15 in the 16-byte step) so that
 * a signed compare against zero decides whether a full chunk is left.
 * The exception handler names carry the same bias (for example
 * memcpy_retl_o2_plus_63_56 stands for %o2 + 63 + 56).
230 */
231.Lmedlong:
232	subcc	%o2, 63, %o2		! adjust length to allow cc test
233	ble,pn	%xcc, .Lmedl63		! skip big loop if less than 64 bytes
234	 nop
235.Lmedl64:
236	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_63)	! load
237	subcc	%o2, 64, %o2		! decrement length count
238	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_63_64)	! and store
239	EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_63_56)	! a block of 64
240	EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_63_56)
241	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_63_48)
242	EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_63_48)
243	EX_LD(LOAD(ldx, %o1+24, %o3), memcpy_retl_o2_plus_63_40)
244	EX_ST(STORE(stx, %o3, %o0+24), memcpy_retl_o2_plus_63_40)
245	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_63_32)! load and store
246	EX_ST(STORE(stx, %o4, %o0+32), memcpy_retl_o2_plus_63_32)
247	EX_LD(LOAD(ldx, %o1+40, %o3), memcpy_retl_o2_plus_63_24)! a block of 64
248	add	%o1, 64, %o1		! increase src ptr by 64
249	EX_ST(STORE(stx, %o3, %o0+40), memcpy_retl_o2_plus_63_24)
250	EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_63_16)
251	add	%o0, 64, %o0		! increase dst ptr by 64
252	EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_63_16)
253	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_63_8)
254	bgu,pt	%xcc, .Lmedl64		! repeat if at least 64 bytes left
255	 EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_63_8)
256.Lmedl63:
257	addcc	%o2, 32, %o2		! adjust remaining count
258	ble,pt	%xcc, .Lmedl31		! to skip if 31 or fewer bytes left
259	 nop
260	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_31)	! load
261	sub	%o2, 32, %o2		! decrement length count
262	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_31_32)	! and store
263	EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_31_24)	! a block of 32
264	add	%o1, 32, %o1		! increase src ptr by 32
265	EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_31_24)
266	EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_31_16)
267	add	%o0, 32, %o0		! increase dst ptr by 32
268	EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_31_16)
269	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_31_8)
270	EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_31_8)
271.Lmedl31:
272	addcc	%o2, 16, %o2		! adjust remaining count
273	ble,pt	%xcc, .Lmedl15		! skip if 15 or fewer bytes left
274	 nop				!
275	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_15)
276	add	%o1, 16, %o1		! increase src ptr by 16
277	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_15)
278	sub	%o2, 16, %o2		! decrease count by 16
279	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_15_8)
280	add	%o0, 16, %o0		! increase dst ptr by 16
281	EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_15_8)
282.Lmedl15:
283	addcc	%o2, 15, %o2		! restore count
284	bz,pt	%xcc, .Lsmallx	! exit if finished
285	 cmp	%o2, 8
286	blt,pt	%xcc, .Lmedw7		! skip if 7 or fewer bytes left
287	 tst	%o2
288	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)	! load 8 bytes
289	add	%o1, 8, %o1		! increase src ptr by 8
290	add	%o0, 8, %o0		! increase dst ptr by 8
291	subcc	%o2, 8, %o2		! decrease count by 8
292	bnz,pn	%xcc, .Lmedw7		! 1..7 bytes still to copy
293	 EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)	! and store 8
294	retl
295	 mov	EX_RETVAL(%g1), %o0	! restore %o0
296
297	.align 16
298.Lsrc_dst_unaligned_on_8:
299	! DST is 8-byte aligned, src is not
3002:
301	andcc	%o1, 0x3, %o5		! test word alignment
302	bnz,pt	%xcc, .Lunalignsetup	! branch to skip if not word aligned
303	 nop
304
305/*
306 * Handle all cases where src and dest are aligned on word
307 * boundaries. Use unrolled loops for better performance.
308 * This option wins over standard large data move when
309 * source and destination is in cache for medium
310 * to short data moves.
 *
 * Each 8-byte store is assembled from two 4-byte loads:
 * (first word << 32) | second word.
311 */
312	set MED_WMAX, %o3
313	cmp %o2, %o3 			! limit to store buffer size
314	bge,pt	%xcc, .Lunalignrejoin	! otherwise rejoin main loop
315	 nop
316
317	subcc	%o2, 31, %o2		! adjust length to allow cc test
318					! for end of loop
319	ble,pt	%xcc, .Lmedw31		! skip big loop if fewer than 32 bytes
	! No explicit delay slot follows the branch above: the first load
	! of .Lmedw32 executes in it.
320.Lmedw32:
321	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_31)! move a block of 32
322	sllx	%o4, 32, %o5
323	EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_31)
324	or	%o4, %o5, %o5
325	EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_31)
326	subcc	%o2, 32, %o2		! decrement length count
327	EX_LD(LOAD(ld, %o1+8, %o4), memcpy_retl_o2_plus_31_24)
328	sllx	%o4, 32, %o5
329	EX_LD(LOAD(ld, %o1+12, %o4), memcpy_retl_o2_plus_31_24)
330	or	%o4, %o5, %o5
331	EX_ST(STORE(stx, %o5, %o0+8), memcpy_retl_o2_plus_31_24)
332	add	%o1, 32, %o1		! increase src ptr by 32
333	EX_LD(LOAD(ld, %o1-16, %o4), memcpy_retl_o2_plus_31_16)
334	sllx	%o4, 32, %o5
335	EX_LD(LOAD(ld, %o1-12, %o4), memcpy_retl_o2_plus_31_16)
336	or	%o4, %o5, %o5
337	EX_ST(STORE(stx, %o5, %o0+16), memcpy_retl_o2_plus_31_16)
338	add	%o0, 32, %o0		! increase dst ptr by 32
339	EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_31_8)
340	sllx	%o4, 32, %o5
341	EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_31_8)
342	or	%o4, %o5, %o5
343	bgu,pt	%xcc, .Lmedw32		! repeat if at least 32 bytes left
344	 EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_31_8)
345.Lmedw31:
346	addcc	%o2, 31, %o2		! restore count
347
348	bz,pt	%xcc, .Lsmallx	! exit if finished
349	 nop
350	cmp	%o2, 16
351	blt,pt	%xcc, .Lmedw15
352	 nop
353	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)! move a block of 16 bytes
354	sllx	%o4, 32, %o5
355	subcc	%o2, 16, %o2		! decrement length count
356	EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_16)
357	or	%o4, %o5, %o5
358	EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_16)
359	add	%o1, 16, %o1		! increase src ptr by 16
360	EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_8)
361	add	%o0, 16, %o0		! increase dst ptr by 16
362	sllx	%o4, 32, %o5
363	EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_8)
364	or	%o4, %o5, %o5
365	EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_8)
	! The bz below tests the condition codes left by the most recent
	! cmp %o2, 16 (branch-in path) or subcc %o2, 16 (fall-through path).
366.Lmedw15:
367	bz,pt	%xcc, .Lsmallx	! exit if finished
368	 cmp	%o2, 8
369	blt,pn	%xcc, .Lmedw7		! skip if 7 or fewer bytes left
370	 tst	%o2
371	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)	! load 4 bytes
372	subcc	%o2, 8, %o2		! decrease count by 8
373	EX_ST(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_8)! and store 4 bytes
374	add	%o1, 8, %o1		! increase src ptr by 8
375	EX_LD(LOAD(ld, %o1-4, %o3), memcpy_retl_o2_plus_4)	! load 4 bytes
376	add	%o0, 8, %o0		! increase dst ptr by 8
377	EX_ST(STORE(stw, %o3, %o0-4), memcpy_retl_o2_plus_4)! and store 4 bytes
378	bz,pt	%xcc, .Lsmallx	! exit if finished
	! No explicit delay slot follows the branch above: the cmp at
	! .Lmedw7 executes in it.
379.Lmedw7:				! count is ge 1, less than 8
380	cmp	%o2, 4			! check for 4 bytes left
381	blt,pn	%xcc, .Lsmallleft3	! skip if 3 or fewer bytes left
382	 nop				!
383	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)	! load 4 bytes
384	add	%o1, 4, %o1		! increase src ptr by 4
385	add	%o0, 4, %o0		! increase dst ptr by 4
386	subcc	%o2, 4, %o2		! decrease count by 4
387	bnz	.Lsmallleft3
388	 EX_ST(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_4)! and store 4 bytes
389	retl
390	 mov	EX_RETVAL(%g1), %o0
391
392	.align 16
393.Llarge_align8_copy:			! Src and dst share 8 byte alignment
394	! align dst to 64 byte boundary
	! Copy 8, 16 and 32 bytes as selected by bits 3, 4 and 5 of the
	! dst address so that %o0 ends on a 64-byte boundary.
395	andcc	%o0, 0x3f, %o3		! %o3 == 0 means dst is 64 byte aligned
396	brz,pn	%o3, .Laligned_to_64
397	 andcc	%o0, 8, %o3		! odd long words to move?
398	brz,pt	%o3, .Laligned_to_16
399	 nop
400	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
401	sub	%o2, 8, %o2
402	add	%o1, 8, %o1		! increment src ptr
403	add	%o0, 8, %o0		! increment dst ptr
404	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
405.Laligned_to_16:
406	andcc	%o0, 16, %o3		! pair of long words to move?
407	brz,pt	%o3, .Laligned_to_32
408	 nop
409	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
410	sub	%o2, 16, %o2
411	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_16)
412	add	%o1, 16, %o1		! increment src ptr
413	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8)
414	add	%o0, 16, %o0		! increment dst ptr
415	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
416.Laligned_to_32:
417	andcc	%o0, 32, %o3		! four long words to move?
418	brz,pt	%o3, .Laligned_to_64
419	 nop
420	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
421	sub	%o2, 32, %o2
422	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_32)
423	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_24)
424	EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_24)
425	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_16)
426	EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_16)
427	add	%o1, 32, %o1		! increment src ptr
428	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8)
429	add	%o0, 32, %o0		! increment dst ptr
430	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
431.Laligned_to_64:
432!
433!	Using block init store (BIS) instructions to avoid fetching cache
434!	lines from memory. Use ST_CHUNK stores to first element of each cache
435!	line (similar to prefetching) to avoid overfilling STQ or miss buffers.
436!	Gives existing cache lines time to be moved out of L1/L2/L3 cache.
437!	Initial stores using MRU version of BIS to keep cache line in
438!	cache until we are ready to store final element of cache line.
439!	Then store last element using the LRU version of BIS.
440!
441	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
442	and	%o2, 0x3f, %o2		! residue bytes in %o2
443!
444!	We use STORE_MRU_ASI for the first seven stores to each cache line
445!	followed by STORE_ASI (mark as LRU) for the last store. That
446!	mixed approach reduces the probability that the cache line is removed
447!	before we finish setting it, while minimizing the effects on
448!	other cached values during a large memcpy
449!
450!	ST_CHUNK batches up initial BIS operations for several cache lines
451!	to allow multiple requests to not be blocked by overflowing the
452!	the store miss buffer. Then the matching stores for all those
453!	BIS operations are executed.
454!
455
	! From here until .Lalign_done %o0 is kept 8 bytes below the
	! address of the next 64-byte line to be written.
456	sub	%o0, 8, %o0		! adjust %o0 for ASI alignment
457.Lalign_loop:
458	cmp	%o5, ST_CHUNK*64
459	blu,pt	%xcc, .Lalign_loop_fin	! less than one full chunk left
460	 mov	ST_CHUNK,%o3		! (delay slot) lines per chunk
	! First pass over a chunk: copy only the first 8 bytes of each of
	! the next ST_CHUNK lines.  %o3 counts lines; %o0 and %o1 advance
	! by 64 per iteration.
461.Lalign_loop_start:
462	prefetch [%o1 + (ALIGN_PRE * BLOCK_SIZE)], 21
463	subcc	%o3, 1, %o3
464	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5)
465	add	%o1, 64, %o1
466	add	%o0, 8, %o0
467	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
468	bgu	%xcc,.Lalign_loop_start
469	 add	%o0, 56, %o0
470
471	mov	ST_CHUNK,%o3
472	sllx	%o3, 6, %o4		! ST_CHUNK*64
473	sub	%o1, %o4, %o1		! reset %o1
474	sub	%o0, %o4, %o0		! reset %o0
475
	! Second pass over the same chunk: copy bytes 8..63 of each line.
	! The loop branch tests the condition codes of subcc %o3 below;
	! the later sub %o5, 64 does not modify them.
476.Lalign_loop_rest:
477	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5)
478	add	%o0, 16, %o0
479	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
480	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5)
481	add	%o0, 8, %o0
482	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
483	subcc	%o3, 1, %o3
484	EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5)
485	add	%o0, 8, %o0
486	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
487	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5)
488	add	%o0, 8, %o0
489	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
490	EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5)
491	add	%o0, 8, %o0
492	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
493	EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5)
494	add	%o1, 64, %o1
495	add	%o0, 8, %o0
496	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
497	add	%o0, 8, %o0
498	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5)
499	sub	%o5, 64, %o5		! one line done (flags untouched)
500	bgu	%xcc,.Lalign_loop_rest
501	! mark cache line as LRU
502	 EX_ST(STORE_INIT(%o4, %o0), memcpy_retl_o2_plus_o5_plus_64)
503
504	cmp	%o5, ST_CHUNK*64
505	bgu,pt	%xcc, .Lalign_loop_start	! more than one chunk left
506	 mov	ST_CHUNK,%o3
507
508	cmp	%o5, 0
509	beq	.Lalign_done
510	 nop
	! Remaining whole 64-byte lines are copied with ordinary stores.
	! The +8 in each store offset undoes the bias applied to %o0.
511.Lalign_loop_fin:
512	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5)
513	EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_o5)
514	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5)
515	EX_ST(STORE(stx, %o4, %o0+8+8), memcpy_retl_o2_plus_o5)
516	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5)
517	EX_ST(STORE(stx, %o4, %o0+8+16), memcpy_retl_o2_plus_o5)
518	subcc	%o5, 64, %o5
519	EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5_64)
520	EX_ST(STORE(stx, %o4, %o0+8+24), memcpy_retl_o2_plus_o5_64)
521	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5_64)
522	EX_ST(STORE(stx, %o4, %o0+8+32), memcpy_retl_o2_plus_o5_64)
523	EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5_64)
524	EX_ST(STORE(stx, %o4, %o0+8+40), memcpy_retl_o2_plus_o5_64)
525	EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5_64)
526	add	%o1, 64, %o1
527	EX_ST(STORE(stx, %o4, %o0+8+48), memcpy_retl_o2_plus_o5_64)
528	add	%o0, 64, %o0
529	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5_64)
530	bgu	%xcc,.Lalign_loop_fin
531	 EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_o5_64)
532
533.Lalign_done:
534	add	%o0, 8, %o0		! restore %o0 from ASI alignment
535	membar	#StoreStore
536	sub	%o2, 63, %o2		! adjust length to allow cc test
537	ba	.Lmedl63		! in .Lmedl63
538	 nop
539
540	.align 16
541	! Dst is on 8 byte boundary; src is not; remaining count > SMALL_MAX
	! Enter VIS (FP register) mode.  NON_USER_COPY builds use the Fast
	! variant and pass .Lmedium_vis_entry_fail_cp as its argument.
542.Lunalignsetup:
543.Lunalignrejoin:
544	mov	%g1, %o3	! save %g1 as VISEntryHalf clobbers it
545#ifdef NON_USER_COPY
546	VISEntryHalfFast(.Lmedium_vis_entry_fail_cp)
547#else
548	VISEntryHalf
549#endif
550	mov	%o3, %g1	! restore %g1
551
552	set MED_UMAX, %o3
553	cmp %o2, %o3 		! check for medium unaligned limit
554	bge,pt	%xcc,.Lunalign_large
555	 prefetch [%o1 + (4 * BLOCK_SIZE)], 20
556	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
557	and	%o2, 0x3f, %o2		! residue bytes in %o2
558	cmp	%o2, 8			! Insure we do not load beyond
559	bgt	.Lunalign_adjust	! end of source buffer
560	 andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
561	add	%o2, 64, %o2		! adjust to leave loop
562	sub	%o5, 64, %o5		! early if necessary
563.Lunalign_adjust:
564	alignaddr %o1, %g0, %g0		! generate %gsr
565	add	%o1, %o5, %o1		! advance %o1 to after blocks
566	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5)
	! Each iteration produces 64 dst bytes.  Every output doubleword is
	! faligndata of two neighbouring aligned source doublewords; the
	! ninth load (into %f0) is carried into the next iteration.
567.Lunalign_loop:
568	EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5)
569	faligndata %f0, %f2, %f16
570	EX_LD_FP(LOAD(ldd, %o4+16, %f4), memcpy_retl_o2_plus_o5)
571	subcc	%o5, BLOCK_SIZE, %o5
572	EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5_plus_64)
573	faligndata %f2, %f4, %f18
574	EX_LD_FP(LOAD(ldd, %o4+24, %f6), memcpy_retl_o2_plus_o5_plus_56)
575	EX_ST_FP(STORE(std, %f18, %o0+8), memcpy_retl_o2_plus_o5_plus_56)
576	faligndata %f4, %f6, %f20
577	EX_LD_FP(LOAD(ldd, %o4+32, %f8), memcpy_retl_o2_plus_o5_plus_48)
578	EX_ST_FP(STORE(std, %f20, %o0+16), memcpy_retl_o2_plus_o5_plus_48)
579	faligndata %f6, %f8, %f22
580	EX_LD_FP(LOAD(ldd, %o4+40, %f10), memcpy_retl_o2_plus_o5_plus_40)
581	EX_ST_FP(STORE(std, %f22, %o0+24), memcpy_retl_o2_plus_o5_plus_40)
582	faligndata %f8, %f10, %f24
583	EX_LD_FP(LOAD(ldd, %o4+48, %f12), memcpy_retl_o2_plus_o5_plus_32)
584	EX_ST_FP(STORE(std, %f24, %o0+32), memcpy_retl_o2_plus_o5_plus_32)
585	faligndata %f10, %f12, %f26
586	EX_LD_FP(LOAD(ldd, %o4+56, %f14), memcpy_retl_o2_plus_o5_plus_24)
587	add	%o4, BLOCK_SIZE, %o4
588	EX_ST_FP(STORE(std, %f26, %o0+40), memcpy_retl_o2_plus_o5_plus_24)
589	faligndata %f12, %f14, %f28
590	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5_plus_16)
591	EX_ST_FP(STORE(std, %f28, %o0+48), memcpy_retl_o2_plus_o5_plus_16)
592	faligndata %f14, %f0, %f30
593	EX_ST_FP(STORE(std, %f30, %o0+56), memcpy_retl_o2_plus_o5_plus_8)
594	add	%o0, BLOCK_SIZE, %o0
595	bgu,pt	%xcc, .Lunalign_loop
596	 prefetch [%o4 + (5 * BLOCK_SIZE)], 20
597	ba	.Lunalign_done
598	 nop
599
	! Large unaligned copy: first bring dst (already 8-byte aligned) up
	! to a 64-byte boundary, 8 bytes per iteration, using the cheapest
	! integer load sequence that the source alignment allows.  %o3 is
	! the number of bytes still needed to reach the boundary.
600.Lunalign_large:
601	andcc	%o0, 0x3f, %o3		! is dst 64-byte block aligned?
602	bz	%xcc, .Lunalignsrc
603	 sub	%o3, 64, %o3		! %o3 will be multiple of 8
604	neg	%o3			! bytes until dest is 64 byte aligned
605	sub	%o2, %o3, %o2		! update cnt with bytes to be moved
606	! Move bytes according to source alignment
607	andcc	%o1, 0x1, %o5
608	bnz	%xcc, .Lunalignbyte	! check for byte alignment
609	 nop
610	andcc	%o1, 2, %o5		! check for half word alignment
611	bnz	%xcc, .Lunalignhalf
612	 nop
613	! Src is word aligned
614.Lunalignword:
615	EX_LD_FP(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_o3)	! load 4 bytes
616	add	%o1, 8, %o1		! increase src ptr by 8
617	EX_ST_FP(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_o3)	! and store 4
618	subcc	%o3, 8, %o3		! decrease count by 8
619	EX_LD_FP(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_o3_plus_4)! load 4
620	add	%o0, 8, %o0		! increase dst ptr by 8
621	bnz	%xcc, .Lunalignword
622	 EX_ST_FP(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_o3_plus_4)
623	ba	.Lunalignsrc
624	 nop
625
626	! Src is half-word aligned
	! Build one doubleword from a 2-byte, a 4-byte and a 2-byte load.
627.Lunalignhalf:
628	EX_LD_FP(LOAD(lduh, %o1, %o4), memcpy_retl_o2_plus_o3)	! load 2 bytes
629	sllx	%o4, 32, %o5		! shift left
630	EX_LD_FP(LOAD(lduw, %o1+2, %o4), memcpy_retl_o2_plus_o3)
631	or	%o4, %o5, %o5
632	sllx	%o5, 16, %o5
633	EX_LD_FP(LOAD(lduh, %o1+6, %o4), memcpy_retl_o2_plus_o3)
634	or	%o4, %o5, %o5
635	EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3)
636	add	%o1, 8, %o1
637	subcc	%o3, 8, %o3
638	bnz	%xcc, .Lunalignhalf
639	 add	%o0, 8, %o0
640	ba	.Lunalignsrc
641	 nop
642
643	! Src is Byte aligned
	! Build one doubleword from a 1-byte, three 2-byte and a 1-byte
	! load.  %o0 holds dst - src except around the store, so only %o1
	! is advanced inside the loop.
644.Lunalignbyte:
645	sub	%o0, %o1, %o0		! share pointer advance
646.Lunalignbyte_loop:
647	EX_LD_FP(LOAD(ldub, %o1, %o4), memcpy_retl_o2_plus_o3)
648	sllx	%o4, 56, %o5
649	EX_LD_FP(LOAD(lduh, %o1+1, %o4), memcpy_retl_o2_plus_o3)
650	sllx	%o4, 40, %o4
651	or	%o4, %o5, %o5
652	EX_LD_FP(LOAD(lduh, %o1+3, %o4), memcpy_retl_o2_plus_o3)
653	sllx	%o4, 24, %o4
654	or	%o4, %o5, %o5
655	EX_LD_FP(LOAD(lduh, %o1+5, %o4), memcpy_retl_o2_plus_o3)
656	sllx	%o4,  8, %o4
657	or	%o4, %o5, %o5
658	EX_LD_FP(LOAD(ldub, %o1+7, %o4), memcpy_retl_o2_plus_o3)
659	or	%o4, %o5, %o5
660	add	%o0, %o1, %o0		! real dst address for the store
661	EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3)
662	sub	%o0, %o1, %o0		! back to dst - src
663	subcc	%o3, 8, %o3
664	bnz	%xcc, .Lunalignbyte_loop
665	 add	%o1, 8, %o1
666	add	%o0,%o1, %o0 		! restore pointer
667
668	! Destination is now block (64 byte aligned)
	!
	! Main loop for large copies with dst 8-byte aligned and src not.
	! %o5 = bytes to move in whole 64-byte blocks, %o2 = residue.  One
	! block is moved from %o5 to %o2 so that the look-ahead load of the
	! loop never reads past the end of the source buffer.
	!
	! Exception handler naming: the handler gives the number of bytes
	! NOT yet copied.  Before subcc %o5, 64 that is %o2 + %o5; after it,
	! with k bytes of the current block already stored, it is
	! %o2 + %o5 + (64 - k).
669.Lunalignsrc:
670	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
671	and	%o2, 0x3f, %o2		! residue bytes in %o2
672	add	%o2, 64, %o2		! Insure we do not load beyond
673	sub	%o5, 64, %o5		! end of source buffer
674
675	andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
676	alignaddr %o1, %g0, %g0		! generate %gsr
677	add	%o1, %o5, %o1		! advance %o1 to after blocks
678
679	EX_LD_FP(LOAD(ldd, %o4, %f14), memcpy_retl_o2_plus_o5)
680	add	%o4, 8, %o4
681.Lunalign_sloop:
682	EX_LD_FP(LOAD(ldd, %o4, %f16), memcpy_retl_o2_plus_o5)
683	faligndata %f14, %f16, %f0
684	EX_LD_FP(LOAD(ldd, %o4+8, %f18), memcpy_retl_o2_plus_o5)
685	faligndata %f16, %f18, %f2
686	EX_LD_FP(LOAD(ldd, %o4+16, %f20), memcpy_retl_o2_plus_o5)
687	faligndata %f18, %f20, %f4
688	EX_ST_FP(STORE(std, %f0, %o0), memcpy_retl_o2_plus_o5)
689	subcc	%o5, 64, %o5
690	EX_LD_FP(LOAD(ldd, %o4+24, %f22), memcpy_retl_o2_plus_o5_plus_56)
691	faligndata %f20, %f22, %f6
692	EX_ST_FP(STORE(std, %f2, %o0+8), memcpy_retl_o2_plus_o5_plus_56)
693	EX_LD_FP(LOAD(ldd, %o4+32, %f24), memcpy_retl_o2_plus_o5_plus_48)
694	faligndata %f22, %f24, %f8
695	EX_ST_FP(STORE(std, %f4, %o0+16), memcpy_retl_o2_plus_o5_plus_48)
696	EX_LD_FP(LOAD(ldd, %o4+40, %f26), memcpy_retl_o2_plus_o5_plus_40)
697	faligndata %f24, %f26, %f10
698	EX_ST_FP(STORE(std, %f6, %o0+24), memcpy_retl_o2_plus_o5_plus_40)
	! 32 bytes of this block are stored from here on, so 32 remain.
699	EX_LD_FP(LOAD(ldd, %o4+48, %f28), memcpy_retl_o2_plus_o5_plus_32)
700	faligndata %f26, %f28, %f12
701	EX_ST_FP(STORE(std, %f8, %o0+32), memcpy_retl_o2_plus_o5_plus_32)
702	add	%o4, 64, %o4
	! 40 bytes stored, 24 remain.
703	EX_LD_FP(LOAD(ldd, %o4-8, %f30), memcpy_retl_o2_plus_o5_plus_24)
704	faligndata %f28, %f30, %f14
705	EX_ST_FP(STORE(std, %f10, %o0+40), memcpy_retl_o2_plus_o5_plus_24)
	! 48 bytes stored, 16 remain.
706	EX_ST_FP(STORE(std, %f12, %o0+48), memcpy_retl_o2_plus_o5_plus_16)
707	add	%o0, 64, %o0
	! 56 bytes stored, 8 remain.
708	EX_ST_FP(STORE(std, %f14, %o0-8), memcpy_retl_o2_plus_o5_plus_8)
709	fsrc2	%f30, %f14		! carry last source dword to next pass
710	bgu,pt	%xcc, .Lunalign_sloop
711	 prefetch [%o4 + (8 * BLOCK_SIZE)], 20
712
713.Lunalign_done:
714	! Handle trailing bytes, 64 to 127
715	! Dest long word aligned, Src not long word aligned
716	cmp	%o2, 15
717	bleu	%xcc, .Lunalign_short
718
	! The andn below is the delay slot of the bleu above; it also runs
	! when the branch is taken.
719	 andn	%o2, 0x7, %o5		! %o5 is multiple of 8
720	and	%o2, 0x7, %o2		! residue bytes in %o2
721	add	%o2, 8, %o2
722	sub	%o5, 8, %o5		! insure we do not load past end of src
723	andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
724	add	%o1, %o5, %o1		! advance %o1 to after multiple of 8
725	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5)! fetch partialword
726.Lunalign_by8:
727	EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5)
728	add	%o4, 8, %o4
729	faligndata %f0, %f2, %f16
730	subcc	%o5, 8, %o5
	! %o5 was already decremented, but these 8 bytes are not stored yet.
731	EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5_plus_8)
732	fsrc2	%f2, %f0
733	bgu,pt	%xcc, .Lunalign_by8
734	 add	%o0, 8, %o0
735
	! Leave VIS mode; the last few bytes are copied with integer
	! byte moves in .Lsmallrest.
736.Lunalign_short:
737#ifdef NON_USER_COPY
738	VISExitHalfFast
739#else
740	VISExitHalf
741#endif
742	ba	.Lsmallrest
743	 nop
744
745/*
746 * This is a special case of nested memcpy. This can happen when kernel
747 * calls unaligned memcpy back to back without saving FP registers. We need
748 * traps(context switch) to save/restore FP registers. If the kernel calls
749 * memcpy without this trap sequence we will hit FP corruption. Let's use
750 * the normal integer load/store method in this case.
751 */
752
/*
 * Integer-only copy, entered from the entry dispatch for counts below
 * SMALL_MAX, or through .Lmedium_vis_entry_fail_cp (NON_USER_COPY only).
 * %g2 = dst | src: if both are 8-byte aligned, copy 32 bytes per
 * iteration, then 8 bytes per iteration, then one optional 4-byte word;
 * whatever is left (at most 3 bytes) goes to .Ltiny_cp.
 *
 * In the loops %o5 counts the bytes handled by that loop and %o2 the
 * bytes left for later stages; the exception handler named on each access
 * gives the number of bytes not yet copied at that point.
 */
753#ifdef NON_USER_COPY
754.Lmedium_vis_entry_fail_cp:
755	or	%o0, %o1, %g2		! recompute combined alignment bits
756#endif
757.Lmedium_cp:
758	LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
759	andcc	%g2, 0x7, %g0		! src and dst both 8-byte aligned?
760	bne,pn	%xcc, .Lmedium_unaligned_cp
761	 nop
762
763.Lmedium_noprefetch_cp:
764	andncc	%o2, 0x20 - 1, %o5	! %o5 = bytes in whole 32-byte groups
765	be,pn	%xcc, 2f
766	 sub	%o2, %o5, %o2		! %o2 = bytes left after those groups
7671:	EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
768	EX_LD(LOAD(ldx, %o1 + 0x08, %g2), memcpy_retl_o2_plus_o5)
769	EX_LD(LOAD(ldx, %o1 + 0x10, %g7), memcpy_retl_o2_plus_o5)
770	EX_LD(LOAD(ldx, %o1 + 0x18, %o4), memcpy_retl_o2_plus_o5)
771	add	%o1, 0x20, %o1
772	subcc	%o5, 0x20, %o5
	! %o5 is already reduced by 32; each store names what is still
	! unwritten of this group: 32, 24, 16 and 8 bytes.
773	EX_ST(STORE(stx, %o3, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_32)
774	EX_ST(STORE(stx, %g2, %o0 + 0x08), memcpy_retl_o2_plus_o5_plus_24)
775	EX_ST(STORE(stx, %g7, %o0 + 0x10), memcpy_retl_o2_plus_o5_plus_16)
776	EX_ST(STORE(stx, %o4, %o0 + 0x18), memcpy_retl_o2_plus_o5_plus_8)
777	bne,pt	%xcc, 1b
778	 add	%o0, 0x20, %o0
7792:	andcc	%o2, 0x18, %o5		! %o5 = bytes in whole 8-byte words
780	be,pt	%xcc, 3f
781	 sub	%o2, %o5, %o2
7821:	EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
783	add	%o1, 0x08, %o1
784	add	%o0, 0x08, %o0
785	subcc	%o5, 0x08, %o5
786	bne,pt	%xcc, 1b
787	 EX_ST(STORE(stx, %o3, %o0 - 0x08), memcpy_retl_o2_plus_o5_plus_8)
7883:	brz,pt	%o2, .Lexit_cp
789	 cmp	%o2, 0x04
790	bl,pn	%xcc, .Ltiny_cp		! 1..3 bytes left
791	 nop
792	EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2)
793	add	%o1, 0x04, %o1
794	add	%o0, 0x04, %o0
795	subcc	%o2, 0x04, %o2
796	bne,pn	%xcc, .Ltiny_cp
797	 EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_4)
798	ba,a,pt	%xcc, .Lexit_cp
799
800.Lmedium_unaligned_cp:
801	/* First get dest 8 byte aligned.  */
802	sub	%g0, %o0, %o3
803	and	%o3, 0x7, %o3
804	brz,pt	%o3, 2f
805	 sub	%o2, %o3, %o2
806
8071:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2), memcpy_retl_o2_plus_g1)
808	add	%o1, 1, %o1
809	subcc	%o3, 1, %o3
810	add	%o0, 1, %o0
811	bne,pt	%xcc, 1b
812	 EX_ST(STORE(stb, %g2, %o0 - 0x01), memcpy_retl_o2_plus_g1_plus_1)
8132:
814	and	%o1, 0x7, %o3
815	brz,pn	%o3, .Lmedium_noprefetch_cp
816	 sll	%o3, 3, %o3
817	mov	64, %g2
818	sub	%g2, %o3, %g2
819	andn	%o1, 0x7, %o1
820	EX_LD(LOAD(ldx, %o1 + 0x00, %o4), memcpy_retl_o2)
821	sllx	%o4, %o3, %o4
822	andn	%o2, 0x08 - 1, %o5
823	sub	%o2, %o5, %o2
824
8251:	EX_LD(LOAD(ldx, %o1 + 0x08, %g3), memcpy_retl_o2_plus_o5)
826	add	%o1, 0x08, %o1
827	subcc	%o5, 0x08, %o5
828	srlx	%g3, %g2, %g7
829	or	%g7, %o4, %g7
830	EX_ST(STORE(stx, %g7, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_8)
831	add	%o0, 0x08, %o0
832	bne,pt	%xcc, 1b
833	 sllx	%g3, %o3, %o4
834	srl	%o3, 3, %o3
835	add	%o1, %o3, %o1
836	brz,pn	%o2, .Lexit_cp
837	 nop
838	ba,pt	%xcc, .Lsmall_unaligned_cp
839
	! Copy 1 to 3 bytes.  %o2 is the byte count on entry; it is
	! decremented before each of the first two stores, and an early exit
	! is taken as soon as it reaches zero.
840.Ltiny_cp:
841	EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2)
842	subcc	%o2, 1, %o2
843	be,pn	%xcc, .Lexit_cp
844	 EX_ST(STORE(stb, %o3, %o0 + 0x00), memcpy_retl_o2_plus_1)
845	EX_LD(LOAD(ldub, %o1 + 0x01, %o3), memcpy_retl_o2)
846	subcc	%o2, 1, %o2
847	be,pn	%xcc, .Lexit_cp
848	 EX_ST(STORE(stb, %o3, %o0 + 0x01), memcpy_retl_o2_plus_1)
849	EX_LD(LOAD(ldub, %o1 + 0x02, %o3), memcpy_retl_o2)
850	ba,pt	%xcc, .Lexit_cp
851	 EX_ST(STORE(stb, %o3, %o0 + 0x02), memcpy_retl_o2)
852
	! Copy 4 to 19 bytes.  %g2 = dst | src: when both are 4-byte
	! aligned, copy whole words (%o5 = count rounded down to 4) and
	! leave 0..3 bytes for .Ltiny_cp; otherwise copy byte by byte.
853.Lsmall_cp:
854	andcc	%g2, 0x3, %g0
855	bne,pn	%xcc, .Lsmall_unaligned_cp
856	 andn	%o2, 0x4 - 1, %o5
857	sub	%o2, %o5, %o2
8581:
859	EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
860	add	%o1, 0x04, %o1
861	subcc	%o5, 0x04, %o5
862	add	%o0, 0x04, %o0
863	bne,pt	%xcc, 1b
864	 EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_o5_plus_4)
865	brz,pt	%o2, .Lexit_cp
866	 nop
867	ba,a,pt	%xcc, .Ltiny_cp
868
	! Byte-at-a-time copy of %o2 bytes.  The loop tests the count only
	! after the first byte has been moved, so %o2 must not be zero on
	! entry.
869.Lsmall_unaligned_cp:
8701:	EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2)
871	add	%o1, 1, %o1
872	add	%o0, 1, %o0
873	subcc	%o2, 1, %o2
874	bne,pt	%xcc, 1b
875	 EX_ST(STORE(stb, %o3, %o0 - 0x01), memcpy_retl_o2_plus_1)
876	ba,a,pt	%xcc, .Lexit_cp
877
	! Byte-wise tail copy: four bytes per iteration while at least four
	! remain, then 1..3 single bytes.  Inside the loop %o2 is biased by
	! -3 so that the loop branch can test it against zero.
878.Lsmallrest:
879	tst	%o2
880	bz,pt	%xcc, .Lsmallx
881	 cmp	%o2, 4
882	blt,pn	%xcc, .Lsmallleft3
883	 nop
884	sub	%o2, 3, %o2
885.Lsmallnotalign4:
886	EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_3)! read byte
887	subcc	%o2, 4, %o2		! reduce count by 4
888	EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_7)! write byte & repeat
889	EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2_plus_6)! for total of 4
890	add	%o1, 4, %o1		! advance SRC by 4
891	EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_6)
892	EX_LD(LOAD(ldub, %o1-2, %o3), memcpy_retl_o2_plus_5)
893	add	%o0, 4, %o0		! advance DST by 4
894	EX_ST(STORE(stb, %o3, %o0-2), memcpy_retl_o2_plus_5)
895	EX_LD(LOAD(ldub, %o1-1, %o3), memcpy_retl_o2_plus_4)
896	bgu,pt	%xcc, .Lsmallnotalign4	! loop til 3 or fewer bytes remain
	! The store below is the delay slot of the bgu above.
897	EX_ST(STORE(stb, %o3, %o0-1), memcpy_retl_o2_plus_4)
898	addcc	%o2, 3, %o2		! restore count
899	bz,pt	%xcc, .Lsmallx
	! No explicit delay slot follows the bz above: the subcc at
	! .Lsmallleft3 executes in it.  In .Lsmallleft3 itself, the
	! instruction after each bz is that branch delay slot.
900.Lsmallleft3:				! 1, 2, or 3 bytes remain
901	subcc	%o2, 1, %o2
902	EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_1)	! load one byte
903	bz,pt	%xcc, .Lsmallx
904	EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_1)	! store one byte
905	EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2)	! load second byte
906	subcc	%o2, 1, %o2
907	bz,pt	%xcc, .Lsmallx
908	EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_1)! store second byte
909	EX_LD(LOAD(ldub, %o1+2, %o3), memcpy_retl_o2)	! load third byte
910	EX_ST(STORE(stb, %o3, %o0+2), memcpy_retl_o2)	! store third byte
	! Common exit: return the dst pointer saved in %g1 at entry.
911.Lsmallx:
912	retl
913	 mov	EX_RETVAL(%g1), %o0
	! NOTE(review): no branch to .Lsmallfin is visible in this file;
	! confirm it is unused before removing it.
914.Lsmallfin:
915	tst	%o2
916	bnz,pn	%xcc, .Lsmallleft3
917	 nop
918	retl
919	 mov	EX_RETVAL(%g1), %o0	! restore %o0
	! Exit used by the *_cp paths; same return sequence as .Lsmallx.
920.Lexit_cp:
921	retl
922	 mov	EX_RETVAL(%g1), %o0
923	.size  FUNC_NAME, .-FUNC_NAME
924