/*
 * M7memcpy: Optimized SPARC M7 memcpy
 *
 * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
 */

	.file	"M7memcpy.S"

/*
 * memcpy(s1, s2, len)
 *
 * Copy s2 to s1, always copy n bytes.
 * Note: this C code does not work for overlapped copies.
 *
 * Fast assembler language version of the following C-program for memcpy
 * which represents the `standard' for the C-library.
 *
 *	void *
 *	memcpy(void *s, const void *s0, size_t n)
 *	{
 *		if (n != 0) {
 *		    char *s1 = s;
 *		    const char *s2 = s0;
 *		    do {
 *			*s1++ = *s2++;
 *		    } while (--n != 0);
 *		}
 *		return (s);
 *	}
 *
 *
 * SPARC T7/M7 Flow :
 *
 * if (count < SMALL_MAX) {
 *   if count < SHORTCOPY              (SHORTCOPY=3)
 *	copy bytes; exit with dst addr
 *   if src & dst aligned on word boundary but not long word boundary,
 *     copy with ldw/stw; branch to finish_up
 *   if src & dst aligned on long word boundary
 *     copy with ldx/stx; branch to finish_up
 *   if src & dst not aligned and length <= SHORTCHECK   (SHORTCHECK=14)
 *     copy bytes; exit with dst addr
 *   move enough bytes to get src to word boundary
 *   if dst now on word boundary
 * move_words:
 *     copy words; branch to finish_up
 *   if dst now on half word boundary
 *     load words, shift half words, store words; branch to finish_up
 *   if dst on byte 1
 *     load words, shift 3 bytes, store words; branch to finish_up
 *   if dst on byte 3
 *     load words, shift 1 byte, store words; branch to finish_up
 * finish_up:
 *     copy bytes; exit with dst addr
 * } else {                                         More than SMALL_MAX bytes
 *   move bytes until dst is on long word boundary
 *   if( src is on long word boundary ) {
 *     if (count < MED_MAX) {
 * finish_long:					   src/dst aligned on 8 bytes
 *       copy with ldx/stx in 8-way unrolled loop;
 *       copy final 0-63 bytes; exit with dst addr
 *     } else {				     src/dst aligned; count > MED_MAX
 *       align dst on 64 byte boundary; for main data movement:
 *       prefetch src data to L2 cache; let HW prefetch move data to L1 cache
 *       Use BIS (block initializing store) to avoid copying store cache
 *       lines from memory. But pre-store first element of each cache line
 *       ST_CHUNK lines in advance of the rest of that cache line. That
 *       gives time for replacement cache lines to be written back without
 *       excess STQ and Miss Buffer filling. Repeat until near the end,
 *       then finish up storing before going to finish_long.
 *     }
 *   } else {                                   src/dst not aligned on 8 bytes
 *     if src is word aligned and count < MED_WMAX
 *       move words in 8-way unrolled loop
 *       move final 0-31 bytes; exit with dst addr
 *     if count < MED_UMAX
 *       use alignaddr/faligndata combined with ldd/std in 8-way
 *       unrolled loop to move data.
 *       go to unalign_done
 *     else
 *       setup alignaddr for faligndata instructions
 *       align dst on 64 byte boundary; prefetch src data to L1 cache
 *       loadx8, falign, block-store, prefetch loop
 *	 (only use block-init-store when src/dst on 8 byte boundaries.)
 * unalign_done:
 *       move remaining bytes for unaligned cases. exit with dst addr.
 * }
 *
 */
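
/*
 * For illustration only, a rough C equivalent of the longword-aligned
 * "medium" path (.Lmedlong/.Lmedl64 below), assuming src and dst are
 * both 8-byte aligned.  Names are illustrative; the real code also
 * handles the trailing 0-63 bytes and the EX_LD/EX_ST fault fixups.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	static void copy_medium_aligned(uint64_t *dst, const uint64_t *src,
 *					size_t len)
 *	{
 *		while (len >= 64) {		// 8-way unrolled ldx/stx
 *			dst[0] = src[0];
 *			dst[1] = src[1];
 *			dst[2] = src[2];
 *			dst[3] = src[3];
 *			dst[4] = src[4];
 *			dst[5] = src[5];
 *			dst[6] = src[6];
 *			dst[7] = src[7];
 *			src += 8;
 *			dst += 8;
 *			len -= 64;
 *		}
 *	}
 */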

#include <asm/visasm.h>
#include <asm/asi.h>

#if !defined(EX_LD) && !defined(EX_ST)
#define NON_USER_COPY
#endif

#ifndef EX_LD
#define EX_LD(x,y)	x
#endif
#ifndef EX_LD_FP
#define EX_LD_FP(x,y)	x
#endif

#ifndef EX_ST
#define EX_ST(x,y)	x
#endif
#ifndef EX_ST_FP
#define EX_ST_FP(x,y)	x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x)    x
#endif

#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

/*
 * ASI_BLK_INIT_QUAD_LDD_P/ASI_BLK_INIT_QUAD_LDD_S marks the cache
 * line as "least recently used" which means if many threads are
 * active, it has a high probability of being pushed out of the cache
 * between the first initializing store and the final stores.
 * Thus, we use ASI_ST_BLKINIT_MRU_P/ASI_ST_BLKINIT_MRU_S which
 * marks the cache line as "most recently used" for all
 * but the last cache line
 */
#ifndef STORE_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#else
#define STORE_ASI	0x80		/* ASI_P */
#endif
#endif

#ifndef STORE_MRU_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_MRU_ASI	ASI_ST_BLKINIT_MRU_P
#else
#define STORE_MRU_ASI	0x80		/* ASI_P */
#endif
#endif

#ifndef STORE_INIT
#define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI
#endif

#ifndef STORE_INIT_MRU
#define STORE_INIT_MRU(src,addr)	stxa src, [addr] STORE_MRU_ASI
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	M7memcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#define	BLOCK_SIZE	64
#define	SHORTCOPY	3
#define	SHORTCHECK	14
#define	SHORT_LONG	64	/* max copy for short longword-aligned case */
				/* must be at least 64 */
#define	SMALL_MAX	128
#define	MED_UMAX	1024	/* max copy for medium un-aligned case */
#define	MED_WMAX	1024	/* max copy for medium word-aligned case */
#define	MED_MAX		1024	/* max copy for medium longword-aligned case */
#define ST_CHUNK	24	/* ST_CHUNK - block of values for BIS Store */
#define ALIGN_PRE	24	/* distance for aligned prefetch loop */

	.register	%g2,#scratch

	.section	".text"
	.global		FUNC_NAME
	.type		FUNC_NAME, #function
	.align		16
FUNC_NAME:
	srlx            %o2, 31, %g2
	cmp             %g2, 0
	tne             %xcc, 5
	PREAMBLE
	mov		%o0, %g1	! save %o0
	brz,pn          %o2, .Lsmallx
	 cmp            %o2, 3
	ble,pn          %icc, .Ltiny_cp
	 cmp            %o2, 19
	ble,pn          %icc, .Lsmall_cp
	 or             %o0, %o1, %g2
	cmp             %o2, SMALL_MAX
	bl,pn           %icc, .Lmedium_cp
	 nop

.Lmedium:
	neg	%o0, %o5
	andcc	%o5, 7, %o5		! bytes till DST 8 byte aligned
	brz,pt	%o5, .Ldst_aligned_on_8

	! %o5 has the bytes to be written in partial store.
	 sub	%o2, %o5, %o2
	sub	%o1, %o0, %o1		! %o1 gets the difference
7:					! dst aligning loop
	add	%o1, %o0, %o4
	EX_LD(LOAD(ldub, %o4, %o4), memcpy_retl_o2_plus_o5)	! load one byte
	subcc	%o5, 1, %o5
	EX_ST(STORE(stb, %o4, %o0), memcpy_retl_o2_plus_o5_plus_1)
	bgu,pt	%xcc, 7b
	 add	%o0, 1, %o0		! advance dst
	add	%o1, %o0, %o1		! restore %o1
.Ldst_aligned_on_8:
	andcc	%o1, 7, %o5
	brnz,pt	%o5, .Lsrc_dst_unaligned_on_8
	 nop

.Lsrc_dst_aligned_on_8:
	! check if we are copying MED_MAX or more bytes
	set MED_MAX, %o3
	cmp %o2, %o3 			! limit to store buffer size
	bgu,pn	%xcc, .Llarge_align8_copy
	 nop

/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is less than MED_MAX bytes
 */
.Lmedlong:
	subcc	%o2, 63, %o2		! adjust length to allow cc test
	ble,pn	%xcc, .Lmedl63		! skip big loop if less than 64 bytes
	 nop
.Lmedl64:
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_63)	! load
	subcc	%o2, 64, %o2		! decrement length count
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_63_64)	! and store
	EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_63_56)	! a block of 64
	EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_63_56)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_63_48)
	EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_63_48)
	EX_LD(LOAD(ldx, %o1+24, %o3), memcpy_retl_o2_plus_63_40)
	EX_ST(STORE(stx, %o3, %o0+24), memcpy_retl_o2_plus_63_40)
	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_63_32)! load and store
	EX_ST(STORE(stx, %o4, %o0+32), memcpy_retl_o2_plus_63_32)
	EX_LD(LOAD(ldx, %o1+40, %o3), memcpy_retl_o2_plus_63_24)! a block of 64
	add	%o1, 64, %o1		! increase src ptr by 64
	EX_ST(STORE(stx, %o3, %o0+40), memcpy_retl_o2_plus_63_24)
	EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_63_16)
	add	%o0, 64, %o0		! increase dst ptr by 64
	EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_63_16)
	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_63_8)
	bgu,pt	%xcc, .Lmedl64		! repeat if at least 64 bytes left
	 EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_63_8)
.Lmedl63:
	addcc	%o2, 32, %o2		! adjust remaining count
	ble,pt	%xcc, .Lmedl31		! to skip if 31 or fewer bytes left
	 nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_31)	! load
	sub	%o2, 32, %o2		! decrement length count
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_31_32)	! and store
	EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_31_24)	! a block of 32
	add	%o1, 32, %o1		! increase src ptr by 32
	EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_31_24)
	EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_31_16)
	add	%o0, 32, %o0		! increase dst ptr by 32
	EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_31_16)
	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_31_8)
	EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_31_8)
.Lmedl31:
	addcc	%o2, 16, %o2		! adjust remaining count
	ble,pt	%xcc, .Lmedl15		! skip if 15 or fewer bytes left
	 nop				!
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_15)
	add	%o1, 16, %o1		! increase src ptr by 16
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_15)
	sub	%o2, 16, %o2		! decrease count by 16
	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_15_8)
	add	%o0, 16, %o0		! increase dst ptr by 16
	EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_15_8)
.Lmedl15:
	addcc	%o2, 15, %o2		! restore count
	bz,pt	%xcc, .Lsmallx	! exit if finished
	 cmp	%o2, 8
	blt,pt	%xcc, .Lmedw7		! skip if 7 or fewer bytes left
	 tst	%o2
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)	! load 8 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	add	%o0, 8, %o0		! increase dst ptr by 8
	subcc	%o2, 8, %o2		! decrease count by 8
	bnz,pn	%xcc, .Lmedw7
	 EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)	! and store 8
	retl
	 mov	EX_RETVAL(%g1), %o0	! restore %o0

	.align 16
.Lsrc_dst_unaligned_on_8:
	! DST is 8-byte aligned, src is not
2:
	andcc	%o1, 0x3, %o5		! test word alignment
	bnz,pt	%xcc, .Lunalignsetup	! branch to skip if not word aligned
	 nop

/*
 * Handle all cases where src and dest are aligned on word
 * boundaries. Use unrolled loops for better performance.
 * This option wins over the standard large data move when
 * source and destination are in cache, for .Lmedium
 * to short data moves.
 */
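
/*
 * For illustration only, a rough C equivalent of one step of the
 * word-aligned loop below (.Lmedw32): two 32-bit loads are merged into
 * a single 64-bit store, assuming a 4-byte aligned src, an 8-byte
 * aligned dst and big-endian byte order as on SPARC.  Names are
 * illustrative.
 *
 *	#include <stdint.h>
 *
 *	static void copy_word_pair(uint64_t *dst, const uint32_t *src)
 *	{
 *		uint64_t v;
 *
 *		v  = (uint64_t)src[0] << 32;	// sllx	%o4, 32, %o5
 *		v |= src[1];			// or	%o4, %o5, %o5
 *		*dst = v;			// stx	%o5, [%o0]
 *	}
 */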
	set MED_WMAX, %o3
	cmp %o2, %o3 			! limit to store buffer size
	bge,pt	%xcc, .Lunalignrejoin	! otherwise rejoin main loop
	 nop

	subcc	%o2, 31, %o2		! adjust length to allow cc test
					! for end of loop
	ble,pt	%xcc, .Lmedw31		! skip big loop if less than 32
.Lmedw32:
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_31)! move a block of 32
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_31)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_31)
	subcc	%o2, 32, %o2		! decrement length count
	EX_LD(LOAD(ld, %o1+8, %o4), memcpy_retl_o2_plus_31_24)
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1+12, %o4), memcpy_retl_o2_plus_31_24)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0+8), memcpy_retl_o2_plus_31_24)
	add	%o1, 32, %o1		! increase src ptr by 32
	EX_LD(LOAD(ld, %o1-16, %o4), memcpy_retl_o2_plus_31_16)
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1-12, %o4), memcpy_retl_o2_plus_31_16)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0+16), memcpy_retl_o2_plus_31_16)
	add	%o0, 32, %o0		! increase dst ptr by 32
	EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_31_8)
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_31_8)
	or	%o4, %o5, %o5
	bgu,pt	%xcc, .Lmedw32		! repeat if at least 32 bytes left
	 EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_31_8)
.Lmedw31:
	addcc	%o2, 31, %o2		! restore count

	bz,pt	%xcc, .Lsmallx	! exit if finished
	 nop
	cmp	%o2, 16
	blt,pt	%xcc, .Lmedw15
	 nop
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)! move a block of 16 bytes
	sllx	%o4, 32, %o5
	subcc	%o2, 16, %o2		! decrement length count
	EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_16)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_16)
	add	%o1, 16, %o1		! increase src ptr by 16
	EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_8)
	add	%o0, 16, %o0		! increase dst ptr by 16
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_8)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_8)
.Lmedw15:
	bz,pt	%xcc, .Lsmallx	! exit if finished
	 cmp	%o2, 8
	blt,pn	%xcc, .Lmedw7		! skip if 7 or fewer bytes left
	 tst	%o2
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)	! load 4 bytes
	subcc	%o2, 8, %o2		! decrease count by 8
	EX_ST(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_8)! and store 4 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	EX_LD(LOAD(ld, %o1-4, %o3), memcpy_retl_o2_plus_4)	! load 4 bytes
	add	%o0, 8, %o0		! increase dst ptr by 8
	EX_ST(STORE(stw, %o3, %o0-4), memcpy_retl_o2_plus_4)! and store 4 bytes
	bz,pt	%xcc, .Lsmallx	! exit if finished
.Lmedw7:				! count is ge 1, less than 8
	cmp	%o2, 4			! check for 4 bytes left
	blt,pn	%xcc, .Lsmallleft3	! skip if 3 or fewer bytes left
	 nop				!
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)	! load 4 bytes
	add	%o1, 4, %o1		! increase src ptr by 4
	add	%o0, 4, %o0		! increase dst ptr by 4
	subcc	%o2, 4, %o2		! decrease count by 4
	bnz	.Lsmallleft3
	 EX_ST(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_4)! and store 4 bytes
	retl
	 mov	EX_RETVAL(%g1), %o0

	.align 16
.Llarge_align8_copy:			! Src and dst share 8 byte alignment
	! align dst to 64 byte boundary
	andcc	%o0, 0x3f, %o3		! %o3 == 0 means dst is 64 byte aligned
	brz,pn	%o3, .Laligned_to_64
	 andcc	%o0, 8, %o3		! odd long words to move?
	brz,pt	%o3, .Laligned_to_16
	 nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
	sub	%o2, 8, %o2
	add	%o1, 8, %o1		! increment src ptr
	add	%o0, 8, %o0		! increment dst ptr
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_16:
	andcc	%o0, 16, %o3		! pair of long words to move?
	brz,pt	%o3, .Laligned_to_32
	 nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
	sub	%o2, 16, %o2
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_16)
	add	%o1, 16, %o1		! increment src ptr
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8)
	add	%o0, 16, %o0		! increment dst ptr
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_32:
	andcc	%o0, 32, %o3		! four long words to move?
	brz,pt	%o3, .Laligned_to_64
	 nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
	sub	%o2, 32, %o2
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_32)
	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_24)
	EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_24)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_16)
	EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_16)
	add	%o1, 32, %o1		! increment src ptr
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8)
	add	%o0, 32, %o0		! increment dst ptr
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_64:
!
!	Using block init store (BIS) instructions to avoid fetching cache
!	lines from memory. Use ST_CHUNK stores to first element of each cache
!	line (similar to prefetching) to avoid overfilling STQ or miss buffers.
!	Gives existing cache lines time to be moved out of L1/L2/L3 cache.
!	Initial stores using MRU version of BIS to keep cache line in
!	cache until we are ready to store final element of cache line.
!	Then store last element using the LRU version of BIS.
!
	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
	and	%o2, 0x3f, %o2		! residue bytes in %o2
!
!	We use STORE_MRU_ASI for the first seven stores to each cache line
!	followed by STORE_ASI (mark as LRU) for the last store. That
!	mixed approach reduces the probability that the cache line is removed
!	before we finish setting it, while minimizing the effects on
!	other cached values during a large memcpy
!
!	ST_CHUNK batches up initial BIS operations for several cache lines
!	to allow multiple requests to not be blocked by overflowing
!	the store miss buffer. Then the matching stores for all those
!	BIS operations are executed.
!
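!	For illustration only, the batching below is roughly equivalent to
!	the following C (plain stores stand in for the BIS MRU/LRU ASI
!	stores, which have no C equivalent; 64-byte cache lines assumed;
!	names are illustrative):
!
!		#include <stddef.h>
!		#include <stdint.h>
!
!		#define LINE_WORDS	8	// 64-byte line, 8-byte words
!		#define CHUNK		24	// mirrors ST_CHUNK
!
!		static void copy_chunked(uint64_t *dst, const uint64_t *src,
!					 size_t lines)
!		{
!			while (lines >= CHUNK) {
!				size_t i, j;
!
!				// pass 1: one initializing store to the
!				// first word of each of CHUNK cache lines
!				for (i = 0; i < CHUNK; i++)
!					dst[i * LINE_WORDS] = src[i * LINE_WORDS];
!
!				// pass 2: fill words 1..7 of each line;
!				// the last word of a line is the LRU store
!				for (i = 0; i < CHUNK; i++)
!					for (j = 1; j < LINE_WORDS; j++)
!						dst[i * LINE_WORDS + j] =
!							src[i * LINE_WORDS + j];
!
!				dst += CHUNK * LINE_WORDS;
!				src += CHUNK * LINE_WORDS;
!				lines -= CHUNK;
!			}
!		}
!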

	sub	%o0, 8, %o0		! adjust %o0 for ASI alignment
.Lalign_loop:
	cmp	%o5, ST_CHUNK*64
	blu,pt	%xcc, .Lalign_loop_fin
	 mov	ST_CHUNK,%o3
.Lalign_loop_start:
	prefetch [%o1 + (ALIGN_PRE * BLOCK_SIZE)], 21
	subcc	%o3, 1, %o3
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5)
	add	%o1, 64, %o1
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	bgu	%xcc,.Lalign_loop_start
	 add	%o0, 56, %o0

	mov	ST_CHUNK,%o3
	sllx	%o3, 6, %o4		! ST_CHUNK*64
	sub	%o1, %o4, %o1		! reset %o1
	sub	%o0, %o4, %o0		! reset %o0

.Lalign_loop_rest:
	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 16, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	subcc	%o3, 1, %o3
	EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5)
	add	%o1, 64, %o1
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5)
	sub	%o5, 64, %o5
	bgu	%xcc,.Lalign_loop_rest
	! mark cache line as LRU
	 EX_ST(STORE_INIT(%o4, %o0), memcpy_retl_o2_plus_o5_plus_64)

	cmp	%o5, ST_CHUNK*64
	bgu,pt	%xcc, .Lalign_loop_start
	 mov	ST_CHUNK,%o3

	cmp	%o5, 0
	beq	.Lalign_done
	 nop
.Lalign_loop_fin:
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5)
	EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5)
	EX_ST(STORE(stx, %o4, %o0+8+8), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5)
	EX_ST(STORE(stx, %o4, %o0+8+16), memcpy_retl_o2_plus_o5)
	subcc	%o5, 64, %o5
	EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5_64)
	EX_ST(STORE(stx, %o4, %o0+8+24), memcpy_retl_o2_plus_o5_64)
	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5_64)
	EX_ST(STORE(stx, %o4, %o0+8+32), memcpy_retl_o2_plus_o5_64)
	EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5_64)
	EX_ST(STORE(stx, %o4, %o0+8+40), memcpy_retl_o2_plus_o5_64)
	EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5_64)
	add	%o1, 64, %o1
	EX_ST(STORE(stx, %o4, %o0+8+48), memcpy_retl_o2_plus_o5_64)
	add	%o0, 64, %o0
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5_64)
	bgu	%xcc,.Lalign_loop_fin
	 EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_o5_64)

.Lalign_done:
	add	%o0, 8, %o0		! restore %o0 from ASI alignment
	membar	#StoreStore
	sub	%o2, 63, %o2		! adjust length to allow cc test
	ba	.Lmedl63		! in .Lmedl63
	 nop

	.align 16
	! Dst is on 8 byte boundary; src is not; remaining count > SMALL_MAX
.Lunalignsetup:
.Lunalignrejoin:
	mov	%g1, %o3	! save %g1 as VISEntryHalf clobbers it
#ifdef NON_USER_COPY
	VISEntryHalfFast(.Lmedium_vis_entry_fail_cp)
#else
	VISEntryHalf
#endif
	mov	%o3, %g1	! restore %g1

	set MED_UMAX, %o3
	cmp %o2, %o3 		! check for .Lmedium unaligned limit
	bge,pt	%xcc,.Lunalign_large
	 prefetch [%o1 + (4 * BLOCK_SIZE)], 20
	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
	and	%o2, 0x3f, %o2		! residue bytes in %o2
	cmp	%o2, 8			! Ensure we do not load beyond
	bgt	.Lunalign_adjust	! end of source buffer
	 andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
	add	%o2, 64, %o2		! adjust to leave loop
	sub	%o5, 64, %o5		! early if necessary
.Lunalign_adjust:
	alignaddr %o1, %g0, %g0		! generate %gsr
	add	%o1, %o5, %o1		! advance %o1 to after blocks
	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5)
.Lunalign_loop:
	EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5)
	faligndata %f0, %f2, %f16
	EX_LD_FP(LOAD(ldd, %o4+16, %f4), memcpy_retl_o2_plus_o5)
	subcc	%o5, BLOCK_SIZE, %o5
	EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5_plus_64)
	faligndata %f2, %f4, %f18
	EX_LD_FP(LOAD(ldd, %o4+24, %f6), memcpy_retl_o2_plus_o5_plus_56)
	EX_ST_FP(STORE(std, %f18, %o0+8), memcpy_retl_o2_plus_o5_plus_56)
	faligndata %f4, %f6, %f20
	EX_LD_FP(LOAD(ldd, %o4+32, %f8), memcpy_retl_o2_plus_o5_plus_48)
	EX_ST_FP(STORE(std, %f20, %o0+16), memcpy_retl_o2_plus_o5_plus_48)
	faligndata %f6, %f8, %f22
	EX_LD_FP(LOAD(ldd, %o4+40, %f10), memcpy_retl_o2_plus_o5_plus_40)
	EX_ST_FP(STORE(std, %f22, %o0+24), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f8, %f10, %f24
	EX_LD_FP(LOAD(ldd, %o4+48, %f12), memcpy_retl_o2_plus_o5_plus_32)
	EX_ST_FP(STORE(std, %f24, %o0+32), memcpy_retl_o2_plus_o5_plus_32)
	faligndata %f10, %f12, %f26
	EX_LD_FP(LOAD(ldd, %o4+56, %f14), memcpy_retl_o2_plus_o5_plus_24)
	add	%o4, BLOCK_SIZE, %o4
	EX_ST_FP(STORE(std, %f26, %o0+40), memcpy_retl_o2_plus_o5_plus_24)
	faligndata %f12, %f14, %f28
	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5_plus_16)
	EX_ST_FP(STORE(std, %f28, %o0+48), memcpy_retl_o2_plus_o5_plus_16)
	faligndata %f14, %f0, %f30
	EX_ST_FP(STORE(std, %f30, %o0+56), memcpy_retl_o2_plus_o5_plus_8)
	add	%o0, BLOCK_SIZE, %o0
	bgu,pt	%xcc, .Lunalign_loop
	 prefetch [%o4 + (5 * BLOCK_SIZE)], 20
	ba	.Lunalign_done
	 nop

.Lunalign_large:
	andcc	%o0, 0x3f, %o3		! is dst 64-byte block aligned?
	bz	%xcc, .Lunalignsrc
	 sub	%o3, 64, %o3		! %o3 will be multiple of 8
	neg	%o3			! bytes until dest is 64 byte aligned
	sub	%o2, %o3, %o2		! update cnt with bytes to be moved
	! Move bytes according to source alignment
	andcc	%o1, 0x1, %o5
	bnz	%xcc, .Lunalignbyte	! check for byte alignment
	 nop
	andcc	%o1, 2, %o5		! check for half word alignment
	bnz	%xcc, .Lunalignhalf
	 nop
	! Src is word aligned
.Lunalignword:
	EX_LD_FP(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_o3)	! load 4 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	EX_ST_FP(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_o3)	! and store 4
	subcc	%o3, 8, %o3		! decrease count by 8
	EX_LD_FP(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_o3_plus_4)! load 4
	add	%o0, 8, %o0		! increase dst ptr by 8
	bnz	%xcc, .Lunalignword
	 EX_ST_FP(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_o3_plus_4)
	ba	.Lunalignsrc
	 nop

	! Src is half-word aligned
.Lunalignhalf:
	EX_LD_FP(LOAD(lduh, %o1, %o4), memcpy_retl_o2_plus_o3)	! load 2 bytes
	sllx	%o4, 32, %o5		! shift left
	EX_LD_FP(LOAD(lduw, %o1+2, %o4), memcpy_retl_o2_plus_o3)
	or	%o4, %o5, %o5
	sllx	%o5, 16, %o5
	EX_LD_FP(LOAD(lduh, %o1+6, %o4), memcpy_retl_o2_plus_o3)
	or	%o4, %o5, %o5
	EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3)
	add	%o1, 8, %o1
	subcc	%o3, 8, %o3
	bnz	%xcc, .Lunalignhalf
	 add	%o0, 8, %o0
	ba	.Lunalignsrc
	 nop

	! Src is Byte aligned
.Lunalignbyte:
	sub	%o0, %o1, %o0		! share pointer advance
.Lunalignbyte_loop:
	EX_LD_FP(LOAD(ldub, %o1, %o4), memcpy_retl_o2_plus_o3)
	sllx	%o4, 56, %o5
	EX_LD_FP(LOAD(lduh, %o1+1, %o4), memcpy_retl_o2_plus_o3)
	sllx	%o4, 40, %o4
	or	%o4, %o5, %o5
	EX_LD_FP(LOAD(lduh, %o1+3, %o4), memcpy_retl_o2_plus_o3)
	sllx	%o4, 24, %o4
	or	%o4, %o5, %o5
	EX_LD_FP(LOAD(lduh, %o1+5, %o4), memcpy_retl_o2_plus_o3)
	sllx	%o4,  8, %o4
	or	%o4, %o5, %o5
	EX_LD_FP(LOAD(ldub, %o1+7, %o4), memcpy_retl_o2_plus_o3)
	or	%o4, %o5, %o5
	add	%o0, %o1, %o0
	EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3)
	sub	%o0, %o1, %o0
	subcc	%o3, 8, %o3
	bnz	%xcc, .Lunalignbyte_loop
	 add	%o1, 8, %o1
	add	%o0,%o1, %o0 		! restore pointer

	! Destination is now block (64 byte aligned)
.Lunalignsrc:
	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
	and	%o2, 0x3f, %o2		! residue bytes in %o2
	add	%o2, 64, %o2		! Ensure we do not load beyond
	sub	%o5, 64, %o5		! end of source buffer

	andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
	alignaddr %o1, %g0, %g0		! generate %gsr
	add	%o1, %o5, %o1		! advance %o1 to after blocks

	EX_LD_FP(LOAD(ldd, %o4, %f14), memcpy_retl_o2_plus_o5)
	add	%o4, 8, %o4
.Lunalign_sloop:
	EX_LD_FP(LOAD(ldd, %o4, %f16), memcpy_retl_o2_plus_o5)
	faligndata %f14, %f16, %f0
	EX_LD_FP(LOAD(ldd, %o4+8, %f18), memcpy_retl_o2_plus_o5)
	faligndata %f16, %f18, %f2
	EX_LD_FP(LOAD(ldd, %o4+16, %f20), memcpy_retl_o2_plus_o5)
	faligndata %f18, %f20, %f4
	EX_ST_FP(STORE(std, %f0, %o0), memcpy_retl_o2_plus_o5)
	subcc	%o5, 64, %o5
	EX_LD_FP(LOAD(ldd, %o4+24, %f22), memcpy_retl_o2_plus_o5_plus_56)
	faligndata %f20, %f22, %f6
	EX_ST_FP(STORE(std, %f2, %o0+8), memcpy_retl_o2_plus_o5_plus_56)
	EX_LD_FP(LOAD(ldd, %o4+32, %f24), memcpy_retl_o2_plus_o5_plus_48)
	faligndata %f22, %f24, %f8
	EX_ST_FP(STORE(std, %f4, %o0+16), memcpy_retl_o2_plus_o5_plus_48)
	EX_LD_FP(LOAD(ldd, %o4+40, %f26), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f24, %f26, %f10
	EX_ST_FP(STORE(std, %f6, %o0+24), memcpy_retl_o2_plus_o5_plus_40)
	EX_LD_FP(LOAD(ldd, %o4+48, %f28), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f26, %f28, %f12
	EX_ST_FP(STORE(std, %f8, %o0+32), memcpy_retl_o2_plus_o5_plus_40)
	add	%o4, 64, %o4
	EX_LD_FP(LOAD(ldd, %o4-8, %f30), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f28, %f30, %f14
	EX_ST_FP(STORE(std, %f10, %o0+40), memcpy_retl_o2_plus_o5_plus_40)
	EX_ST_FP(STORE(std, %f12, %o0+48), memcpy_retl_o2_plus_o5_plus_40)
	add	%o0, 64, %o0
	EX_ST_FP(STORE(std, %f14, %o0-8), memcpy_retl_o2_plus_o5_plus_40)
	fsrc2	%f30, %f14
	bgu,pt	%xcc, .Lunalign_sloop
	 prefetch [%o4 + (8 * BLOCK_SIZE)], 20

.Lunalign_done:
	! Handle trailing bytes, 64 to 127
	! Dest long word aligned, Src not long word aligned
	cmp	%o2, 15
	bleu	%xcc, .Lunalign_short

	 andn	%o2, 0x7, %o5		! %o5 is multiple of 8
	and	%o2, 0x7, %o2		! residue bytes in %o2
	add	%o2, 8, %o2
	sub	%o5, 8, %o5		! ensure we do not load past end of src
	andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
	add	%o1, %o5, %o1		! advance %o1 to after multiple of 8
	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5)! fetch partialword
.Lunalign_by8:
	EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5)
	add	%o4, 8, %o4
	faligndata %f0, %f2, %f16
	subcc	%o5, 8, %o5
	EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5)
	fsrc2	%f2, %f0
	bgu,pt	%xcc, .Lunalign_by8
	 add	%o0, 8, %o0

.Lunalign_short:
#ifdef NON_USER_COPY
	VISExitHalfFast
#else
	VISExitHalf
#endif
	ba	.Lsmallrest
	 nop

/*
 * This is a special case of nested memcpy. This can happen when kernel
 * calls unaligned memcpy back to back without saving FP registers. We need
 * traps(context switch) to save/restore FP registers. If the kernel calls
 * memcpy without this trap sequence we will hit FP corruption. Let's use
 * the normal integer load/store method in this case.
 */

#ifdef NON_USER_COPY
.Lmedium_vis_entry_fail_cp:
	or	%o0, %o1, %g2
#endif
.Lmedium_cp:
	LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
	andcc	%g2, 0x7, %g0
	bne,pn	%xcc, .Lmedium_unaligned_cp
	 nop

.Lmedium_noprefetch_cp:
	andncc	%o2, 0x20 - 1, %o5
	be,pn	%xcc, 2f
	 sub	%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x08, %g2), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x10, %g7), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x18, %o4), memcpy_retl_o2_plus_o5)
	add	%o1, 0x20, %o1
	subcc	%o5, 0x20, %o5
	EX_ST(STORE(stx, %o3, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_32)
	EX_ST(STORE(stx, %g2, %o0 + 0x08), memcpy_retl_o2_plus_o5_plus_24)
	EX_ST(STORE(stx, %g7, %o0 + 0x10), memcpy_retl_o2_plus_o5_plus_24)
	EX_ST(STORE(stx, %o4, %o0 + 0x18), memcpy_retl_o2_plus_o5_plus_8)
	bne,pt	%xcc, 1b
	 add	%o0, 0x20, %o0
2:	andcc	%o2, 0x18, %o5
	be,pt	%xcc, 3f
	 sub	%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
	add	%o1, 0x08, %o1
	add	%o0, 0x08, %o0
	subcc	%o5, 0x08, %o5
	bne,pt	%xcc, 1b
	 EX_ST(STORE(stx, %o3, %o0 - 0x08), memcpy_retl_o2_plus_o5_plus_8)
3:	brz,pt	%o2, .Lexit_cp
	 cmp	%o2, 0x04
	bl,pn	%xcc, .Ltiny_cp
	 nop
	EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2)
	add	%o1, 0x04, %o1
	add	%o0, 0x04, %o0
	subcc	%o2, 0x04, %o2
	bne,pn	%xcc, .Ltiny_cp
	 EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_4)
	ba,a,pt	%xcc, .Lexit_cp

.Lmedium_unaligned_cp:
	/* First get dest 8 byte aligned.  */
	sub	%g0, %o0, %o3
	and	%o3, 0x7, %o3
	brz,pt	%o3, 2f
	 sub	%o2, %o3, %o2

1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2), memcpy_retl_o2_plus_g1)
	add	%o1, 1, %o1
	subcc	%o3, 1, %o3
	add	%o0, 1, %o0
	bne,pt	%xcc, 1b
	 EX_ST(STORE(stb, %g2, %o0 - 0x01), memcpy_retl_o2_plus_g1_plus_1)
2:
	and	%o1, 0x7, %o3
	brz,pn	%o3, .Lmedium_noprefetch_cp
	 sll	%o3, 3, %o3
	mov	64, %g2
	sub	%g2, %o3, %g2
	andn	%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1 + 0x00, %o4), memcpy_retl_o2)
	sllx	%o4, %o3, %o4
	andn	%o2, 0x08 - 1, %o5
	sub	%o2, %o5, %o2

1:	EX_LD(LOAD(ldx, %o1 + 0x08, %g3), memcpy_retl_o2_plus_o5)
	add	%o1, 0x08, %o1
	subcc	%o5, 0x08, %o5
	srlx	%g3, %g2, %g7
	or	%g7, %o4, %g7
	EX_ST(STORE(stx, %g7, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_8)
	add	%o0, 0x08, %o0
	bne,pt	%xcc, 1b
	 sllx	%g3, %o3, %o4
	srl	%o3, 3, %o3
	add	%o1, %o3, %o1
	brz,pn	%o2, .Lexit_cp
	 nop
	ba,pt	%xcc, .Lsmall_unaligned_cp

.Ltiny_cp:
	EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2)
	subcc	%o2, 1, %o2
	be,pn	%xcc, .Lexit_cp
	 EX_ST(STORE(stb, %o3, %o0 + 0x00), memcpy_retl_o2_plus_1)
	EX_LD(LOAD(ldub, %o1 + 0x01, %o3), memcpy_retl_o2)
	subcc	%o2, 1, %o2
	be,pn	%xcc, .Lexit_cp
	 EX_ST(STORE(stb, %o3, %o0 + 0x01), memcpy_retl_o2_plus_1)
	EX_LD(LOAD(ldub, %o1 + 0x02, %o3), memcpy_retl_o2)
	ba,pt	%xcc, .Lexit_cp
	 EX_ST(STORE(stb, %o3, %o0 + 0x02), memcpy_retl_o2)

.Lsmall_cp:
	andcc	%g2, 0x3, %g0
	bne,pn	%xcc, .Lsmall_unaligned_cp
	 andn	%o2, 0x4 - 1, %o5
	sub	%o2, %o5, %o2
1:
	EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
	add	%o1, 0x04, %o1
	subcc	%o5, 0x04, %o5
	add	%o0, 0x04, %o0
	bne,pt	%xcc, 1b
	 EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_o5_plus_4)
	brz,pt	%o2, .Lexit_cp
	 nop
	ba,a,pt	%xcc, .Ltiny_cp

.Lsmall_unaligned_cp:
1:	EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2)
	add	%o1, 1, %o1
	add	%o0, 1, %o0
	subcc	%o2, 1, %o2
	bne,pt	%xcc, 1b
	 EX_ST(STORE(stb, %o3, %o0 - 0x01), memcpy_retl_o2_plus_1)
	ba,a,pt	%xcc, .Lexit_cp

.Lsmallrest:
	tst	%o2
	bz,pt	%xcc, .Lsmallx
	 cmp	%o2, 4
	blt,pn	%xcc, .Lsmallleft3
	 nop
	sub	%o2, 3, %o2
.Lsmallnotalign4:
	EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_3)! read byte
	subcc	%o2, 4, %o2		! reduce count by 4
	EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_7)! write byte & repeat
	EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2_plus_6)! for total of 4
	add	%o1, 4, %o1		! advance SRC by 4
	EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_6)
	EX_LD(LOAD(ldub, %o1-2, %o3), memcpy_retl_o2_plus_5)
	add	%o0, 4, %o0		! advance DST by 4
	EX_ST(STORE(stb, %o3, %o0-2), memcpy_retl_o2_plus_5)
	EX_LD(LOAD(ldub, %o1-1, %o3), memcpy_retl_o2_plus_4)
	bgu,pt	%xcc, .Lsmallnotalign4	! loop til 3 or fewer bytes remain
	EX_ST(STORE(stb, %o3, %o0-1), memcpy_retl_o2_plus_4)
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%xcc, .Lsmallx
.Lsmallleft3:				! 1, 2, or 3 bytes remain
	subcc	%o2, 1, %o2
	EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_1)	! load one byte
	bz,pt	%xcc, .Lsmallx
	EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_1)	! store one byte
	EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2)	! load second byte
	subcc	%o2, 1, %o2
	bz,pt	%xcc, .Lsmallx
	EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_1)! store second byte
	EX_LD(LOAD(ldub, %o1+2, %o3), memcpy_retl_o2)	! load third byte
	EX_ST(STORE(stb, %o3, %o0+2), memcpy_retl_o2)	! store third byte
.Lsmallx:
	retl
	 mov	EX_RETVAL(%g1), %o0
.Lsmallfin:
	tst	%o2
	bnz,pn	%xcc, .Lsmallleft3
	 nop
	retl
	 mov	EX_RETVAL(%g1), %o0	! restore %o0
.Lexit_cp:
	retl
	 mov	EX_RETVAL(%g1), %o0
	.size  FUNC_NAME, .-FUNC_NAME