xref: /openbmc/linux/arch/sparc/lib/U3memcpy.S (revision 498495dba268b20e8eadd7fe93c140c68b6cc9d2)
1*b2441318SGreg Kroah-Hartman/* SPDX-License-Identifier: GPL-2.0 */
2478b8fecSSam Ravnborg/* U3memcpy.S: UltraSparc-III optimized memcpy.
3478b8fecSSam Ravnborg *
4478b8fecSSam Ravnborg * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com)
5478b8fecSSam Ravnborg */
6478b8fecSSam Ravnborg
/* Build-environment glue.  In the __KERNEL__ branch only GLOBAL_SPARE (%g7)
 * is defined here; VISEntryHalf/VISExitHalf and the ASI constants are not
 * defined in this file (presumably supplied by the headers included below --
 * confirm).  Outside the kernel, local stand-ins are defined and the spare
 * global register is %g5.
 */
7478b8fecSSam Ravnborg#ifdef __KERNEL__
8ee841d0aSDavid S. Miller#include <linux/linkage.h>
9478b8fecSSam Ravnborg#include <asm/visasm.h>
10478b8fecSSam Ravnborg#include <asm/asi.h>
11478b8fecSSam Ravnborg#define GLOBAL_SPARE	%g7
12478b8fecSSam Ravnborg#else
/* Stand-ins used outside the kernel:
 *   VISEntryHalf - saves %fprs in %o5, then writes FPRS_FEF to %fprs.
 *   VISExitHalf  - masks the saved %o5 with FPRS_FEF and writes it to %fprs.
 * The MEMCPY_DEBUG flavour of VISEntryHalf additionally zeroes %g1-%g3 and
 * executes a subcc, which rewrites the condition codes.
 */
13478b8fecSSam Ravnborg#define ASI_BLK_P 0xf0
14478b8fecSSam Ravnborg#define FPRS_FEF  0x04
15478b8fecSSam Ravnborg#ifdef MEMCPY_DEBUG
16478b8fecSSam Ravnborg#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \
17478b8fecSSam Ravnborg		     clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0;
18478b8fecSSam Ravnborg#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
19478b8fecSSam Ravnborg#else
20478b8fecSSam Ravnborg#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
21478b8fecSSam Ravnborg#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
22478b8fecSSam Ravnborg#endif
23478b8fecSSam Ravnborg#define GLOBAL_SPARE	%g5
24478b8fecSSam Ravnborg#endif
25478b8fecSSam Ravnborg
/* Overridable hooks.  Each macro below is defined only if it is not already
 * defined, so a definition made before this point takes precedence.
 *
 *   EX_LD / EX_LD_FP / EX_ST / EX_ST_FP (x, y)
 *       Wrap every load/store in the copy routine.  The defaults expand to
 *       the wrapped instruction x alone and discard y (a U3_retl_* label at
 *       every use in this file).
 *   LOAD(type, addr, dest) / STORE(type, src, addr)
 *       Emit "type [addr], dest" / "type src, [addr]".
 *   STORE_BLK(src, addr)
 *       Emit a stda through ASI_BLK_P.
 *   FUNC_NAME   Name given to the copy routine (default U3memcpy).
 *   PREAMBLE    Expanded once near the top of the routine (default empty).
 *   XCC         Condition-code register named in the %XCC branches
 *               (default xcc).
 */
26478b8fecSSam Ravnborg#ifndef EX_LD
27ee841d0aSDavid S. Miller#define EX_LD(x,y)	x
28478b8fecSSam Ravnborg#endif
29a7c5724bSRob Gardner#ifndef EX_LD_FP
30ee841d0aSDavid S. Miller#define EX_LD_FP(x,y)	x
31a7c5724bSRob Gardner#endif
32478b8fecSSam Ravnborg
33478b8fecSSam Ravnborg#ifndef EX_ST
34ee841d0aSDavid S. Miller#define EX_ST(x,y)	x
35478b8fecSSam Ravnborg#endif
36a7c5724bSRob Gardner#ifndef EX_ST_FP
37ee841d0aSDavid S. Miller#define EX_ST_FP(x,y)	x
38478b8fecSSam Ravnborg#endif
39478b8fecSSam Ravnborg
40478b8fecSSam Ravnborg#ifndef LOAD
41478b8fecSSam Ravnborg#define LOAD(type,addr,dest)	type [addr], dest
42478b8fecSSam Ravnborg#endif
43478b8fecSSam Ravnborg
44478b8fecSSam Ravnborg#ifndef STORE
45478b8fecSSam Ravnborg#define STORE(type,src,addr)	type src, [addr]
46478b8fecSSam Ravnborg#endif
47478b8fecSSam Ravnborg
48478b8fecSSam Ravnborg#ifndef STORE_BLK
49478b8fecSSam Ravnborg#define STORE_BLK(src,addr)	stda src, [addr] ASI_BLK_P
50478b8fecSSam Ravnborg#endif
51478b8fecSSam Ravnborg
52478b8fecSSam Ravnborg#ifndef FUNC_NAME
53478b8fecSSam Ravnborg#define FUNC_NAME	U3memcpy
54478b8fecSSam Ravnborg#endif
55478b8fecSSam Ravnborg
56478b8fecSSam Ravnborg#ifndef PREAMBLE
57478b8fecSSam Ravnborg#define PREAMBLE
58478b8fecSSam Ravnborg#endif
59478b8fecSSam Ravnborg
60478b8fecSSam Ravnborg#ifndef XCC
61478b8fecSSam Ravnborg#define XCC xcc
62478b8fecSSam Ravnborg#endif
63478b8fecSSam Ravnborg
64478b8fecSSam Ravnborg	.register	%g2,#scratch
65478b8fecSSam Ravnborg	.register	%g3,#scratch
66478b8fecSSam Ravnborg
67478b8fecSSam Ravnborg	/* Special/non-trivial issues of this code:
68478b8fecSSam Ravnborg	 *
69478b8fecSSam Ravnborg	 * 1) %o5 is preserved from VISEntryHalf to VISExitHalf
70478b8fecSSam Ravnborg	 * 2) Only low 32 FPU registers are used so that only the
71478b8fecSSam Ravnborg	 *    lower half of the FPU register set is dirtied by this
72478b8fecSSam Ravnborg	 *    code.  This is especially important in the kernel.
73478b8fecSSam Ravnborg	 * 3) This code never prefetches cachelines past the end
74478b8fecSSam Ravnborg	 *    of the source buffer.
75478b8fecSSam Ravnborg	 */
76478b8fecSSam Ravnborg
77478b8fecSSam Ravnborg	.text
78ee841d0aSDavid S. Miller#ifndef EX_RETVAL
79ee841d0aSDavid S. Miller#define EX_RETVAL(x)	x
	/* Return stubs.  These are the labels passed as the second argument
	 * of the EX_LD/EX_ST wrappers in the copy routine.  Each name spells
	 * out the expression the stub leaves in %o0 before returning
	 * (GS = GLOBAL_SPARE); several stubs also modify the registers that
	 * take part in that expression.
	 *
	 * The "_fp" variants branch to __restore_fp, setting %o0 in the
	 * branch delay slot, so that VISExitHalf runs before the return.
	 *
	 * NOTE(review): the copy routine names the "_fp" stubs without the
	 * suffix when calling EX_LD_FP/EX_ST_FP; presumably an overriding
	 * definition of those macros appends "_fp" -- confirm against the
	 * file that overrides them.
	 *
	 * The whole group, together with the identity EX_RETVAL above, is
	 * assembled only when EX_RETVAL was not defined before this point.
	 */

	/* Shared tail of the "_fp" stubs: run VISExitHalf, then return with
	 * %o0 as set by the caller's delay slot.
	 */
80ee841d0aSDavid S. Miller__restore_fp:
81ee841d0aSDavid S. Miller	VISExitHalf
82ee841d0aSDavid S. Miller	retl
83ee841d0aSDavid S. Miller	 nop
84ee841d0aSDavid S. MillerENTRY(U3_retl_o2_plus_g2_plus_g1_plus_1_fp)
85ee841d0aSDavid S. Miller	add	%g1, 1, %g1
86ee841d0aSDavid S. Miller	add	%g2, %g1, %g2
87ee841d0aSDavid S. Miller	ba,pt	%xcc, __restore_fp
88ee841d0aSDavid S. Miller	 add	%o2, %g2, %o0
89ee841d0aSDavid S. MillerENDPROC(U3_retl_o2_plus_g2_plus_g1_plus_1_fp)
90ee841d0aSDavid S. MillerENTRY(U3_retl_o2_plus_g2_fp)
91ee841d0aSDavid S. Miller	ba,pt	%xcc, __restore_fp
92ee841d0aSDavid S. Miller	 add	%o2, %g2, %o0
93ee841d0aSDavid S. MillerENDPROC(U3_retl_o2_plus_g2_fp)
94ee841d0aSDavid S. MillerENTRY(U3_retl_o2_plus_g2_plus_8_fp)
95ee841d0aSDavid S. Miller	add	%g2, 8, %g2
96ee841d0aSDavid S. Miller	ba,pt	%xcc, __restore_fp
97ee841d0aSDavid S. Miller	 add	%o2, %g2, %o0
98ee841d0aSDavid S. MillerENDPROC(U3_retl_o2_plus_g2_plus_8_fp)
99ee841d0aSDavid S. MillerENTRY(U3_retl_o2)
100ee841d0aSDavid S. Miller	retl
101ee841d0aSDavid S. Miller	 mov	%o2, %o0
102ee841d0aSDavid S. MillerENDPROC(U3_retl_o2)
103ee841d0aSDavid S. MillerENTRY(U3_retl_o2_plus_1)
104ee841d0aSDavid S. Miller	retl
105ee841d0aSDavid S. Miller	 add	%o2, 1, %o0
106ee841d0aSDavid S. MillerENDPROC(U3_retl_o2_plus_1)
107ee841d0aSDavid S. MillerENTRY(U3_retl_o2_plus_4)
108ee841d0aSDavid S. Miller	retl
109ee841d0aSDavid S. Miller	 add	%o2, 4, %o0
110ee841d0aSDavid S. MillerENDPROC(U3_retl_o2_plus_4)
111ee841d0aSDavid S. MillerENTRY(U3_retl_o2_plus_8)
112ee841d0aSDavid S. Miller	retl
113ee841d0aSDavid S. Miller	 add	%o2, 8, %o0
114ee841d0aSDavid S. MillerENDPROC(U3_retl_o2_plus_8)
115ee841d0aSDavid S. MillerENTRY(U3_retl_o2_plus_g1_plus_1)
116ee841d0aSDavid S. Miller	add	%g1, 1, %g1
117ee841d0aSDavid S. Miller	retl
118ee841d0aSDavid S. Miller	 add	%o2, %g1, %o0
119ee841d0aSDavid S. MillerENDPROC(U3_retl_o2_plus_g1_plus_1)
120ee841d0aSDavid S. MillerENTRY(U3_retl_o2_fp)
121ee841d0aSDavid S. Miller	ba,pt	%xcc, __restore_fp
122ee841d0aSDavid S. Miller	 mov	%o2, %o0
123ee841d0aSDavid S. MillerENDPROC(U3_retl_o2_fp)
124ee841d0aSDavid S. MillerENTRY(U3_retl_o2_plus_o3_sll_6_plus_0x80_fp)
125ee841d0aSDavid S. Miller	sll	%o3, 6, %o3
126ee841d0aSDavid S. Miller	add	%o3, 0x80, %o3
127ee841d0aSDavid S. Miller	ba,pt	%xcc, __restore_fp
128ee841d0aSDavid S. Miller	 add	%o2, %o3, %o0
129ee841d0aSDavid S. MillerENDPROC(U3_retl_o2_plus_o3_sll_6_plus_0x80_fp)
130ee841d0aSDavid S. MillerENTRY(U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
131ee841d0aSDavid S. Miller	sll	%o3, 6, %o3
132ee841d0aSDavid S. Miller	add	%o3, 0x40, %o3
133ee841d0aSDavid S. Miller	ba,pt	%xcc, __restore_fp
134ee841d0aSDavid S. Miller	 add	%o2, %o3, %o0
135ee841d0aSDavid S. MillerENDPROC(U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
136ee841d0aSDavid S. MillerENTRY(U3_retl_o2_plus_GS_plus_0x10)
137ee841d0aSDavid S. Miller	add	GLOBAL_SPARE, 0x10, GLOBAL_SPARE
138ee841d0aSDavid S. Miller	retl
139ee841d0aSDavid S. Miller	 add	%o2, GLOBAL_SPARE, %o0
140ee841d0aSDavid S. MillerENDPROC(U3_retl_o2_plus_GS_plus_0x10)
141ee841d0aSDavid S. MillerENTRY(U3_retl_o2_plus_GS_plus_0x08)
142ee841d0aSDavid S. Miller	add	GLOBAL_SPARE, 0x08, GLOBAL_SPARE
143ee841d0aSDavid S. Miller	retl
144ee841d0aSDavid S. Miller	 add	%o2, GLOBAL_SPARE, %o0
145ee841d0aSDavid S. MillerENDPROC(U3_retl_o2_plus_GS_plus_0x08)
146ee841d0aSDavid S. MillerENTRY(U3_retl_o2_and_7_plus_GS)
147ee841d0aSDavid S. Miller	and	%o2, 7, %o2
148ee841d0aSDavid S. Miller	retl
1490ede1c40SDavid S. Miller	 add	%o2, GLOBAL_SPARE, %o0
150ee841d0aSDavid S. MillerENDPROC(U3_retl_o2_and_7_plus_GS)
151ee841d0aSDavid S. MillerENTRY(U3_retl_o2_and_7_plus_GS_plus_8)
152ee841d0aSDavid S. Miller	add	GLOBAL_SPARE, 8, GLOBAL_SPARE
153ee841d0aSDavid S. Miller	and	%o2, 7, %o2
154ee841d0aSDavid S. Miller	retl
1550ede1c40SDavid S. Miller	 add	%o2, GLOBAL_SPARE, %o0
156ee841d0aSDavid S. MillerENDPROC(U3_retl_o2_and_7_plus_GS_plus_8)
157ee841d0aSDavid S. Miller#endif
158ee841d0aSDavid S. Miller
159478b8fecSSam Ravnborg	.align		64
160478b8fecSSam Ravnborg
161478b8fecSSam Ravnborg	/* The cheetah's flexible spine, oversized liver, enlarged heart,
162478b8fecSSam Ravnborg	 * slender muscular body, and claws make it the swiftest hunter
163478b8fecSSam Ravnborg	 * in Africa and the fastest animal on land.  Can reach speeds
164478b8fecSSam Ravnborg	 * of up to 2.4GB per second.
165478b8fecSSam Ravnborg	 */
166478b8fecSSam Ravnborg
/* FUNC_NAME(dst = %o0, src = %o1, len = %o2)
 *
 * Copies len bytes from src to dst and returns EX_RETVAL(%o4), where %o4
 * holds the dst value saved at entry.  Dispatch on len:
 *   len == 0   -> end_return
 *   len <  16  -> less_than_16
 *   len < 192  -> less_than_192 (integer loads/stores only)
 *   otherwise  -> the VIS path in this section: bring dst to a 64-byte
 *                 boundary, move 64-byte blocks with faligndata +
 *                 STORE_BLK, then finish the remaining (len & 0x3f) bytes.
 * Raises software trap 5 when any bit of len at or above bit 31 is set.
 */
167478b8fecSSam Ravnborg	.globl	FUNC_NAME
168478b8fecSSam Ravnborg	.type	FUNC_NAME,#function
169478b8fecSSam RavnborgFUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
170478b8fecSSam Ravnborg	srlx		%o2, 31, %g2
171478b8fecSSam Ravnborg	cmp		%g2, 0
172061273f9SSam Ravnborg
173061273f9SSam Ravnborg	/* software trap 5 "Range Check" if len >= 0x80000000 */
174478b8fecSSam Ravnborg	tne		%xcc, 5
175478b8fecSSam Ravnborg	PREAMBLE
	/* %o4 = original dst, kept for the return value */
176478b8fecSSam Ravnborg	mov		%o0, %o4
177061273f9SSam Ravnborg
178061273f9SSam Ravnborg	/* if len == 0 */
	/* (delay slot, always executed: %o3 = dst | src) */
179478b8fecSSam Ravnborg	cmp		%o2, 0
180061273f9SSam Ravnborg	be,pn		%XCC, end_return
181478b8fecSSam Ravnborg	 or		%o0, %o1, %o3
182061273f9SSam Ravnborg
183061273f9SSam Ravnborg	/* if len < 16 */
	/* (annulled delay slot: %o3 |= len only when the branch is taken) */
184478b8fecSSam Ravnborg	cmp		%o2, 16
185061273f9SSam Ravnborg	blu,a,pn	%XCC, less_than_16
186478b8fecSSam Ravnborg	 or		%o3, %o2, %o3
187478b8fecSSam Ravnborg
188061273f9SSam Ravnborg	/* if len < 192 */
	/* (delay slot: condition codes := (dst | src) & 7, which is what
	 * the first branch at less_than_192 tests)
	 */
189478b8fecSSam Ravnborg	cmp		%o2, (3 * 64)
190061273f9SSam Ravnborg	blu,pt		%XCC, less_than_192
191478b8fecSSam Ravnborg	 andcc		%o3, 0x7, %g0
192478b8fecSSam Ravnborg
193478b8fecSSam Ravnborg	/* Clobbers o5/g1/g2/g3/g7/icc/xcc.  We must preserve
194478b8fecSSam Ravnborg	 * o5 from here until we hit VISExitHalf.
195478b8fecSSam Ravnborg	 */
196478b8fecSSam Ravnborg	VISEntryHalf
197478b8fecSSam Ravnborg
198478b8fecSSam Ravnborg	/* Is 'dst' already aligned on a 64-byte boundary? */
199478b8fecSSam Ravnborg	andcc		%o0, 0x3f, %g2
200478b8fecSSam Ravnborg	be,pt		%XCC, 2f
201478b8fecSSam Ravnborg
202478b8fecSSam Ravnborg	/* Compute abs((dst & 0x3f) - 0x40) into %g2.  This is the number
203478b8fecSSam Ravnborg	 * of bytes to copy to make 'dst' 64-byte aligned.  We pre-
204478b8fecSSam Ravnborg	 * subtract this from 'len'.
205478b8fecSSam Ravnborg	 */
	/* The sub below is the delay slot of the be above, so
	 * GLOBAL_SPARE = dst - src is computed on both paths.
	 */
206478b8fecSSam Ravnborg	 sub		%o0, %o1, GLOBAL_SPARE
207478b8fecSSam Ravnborg	sub		%g2, 0x40, %g2
208478b8fecSSam Ravnborg	sub		%g0, %g2, %g2
209478b8fecSSam Ravnborg	sub		%o2, %g2, %o2
	/* Split the alignment count: %g1 = count & 7 is copied bytewise
	 * right here, %g2 = count & 0x38 is copied 8 bytes at a time at 2:.
	 */
210478b8fecSSam Ravnborg	andcc		%g2, 0x7, %g1
211478b8fecSSam Ravnborg	be,pt		%icc, 2f
212478b8fecSSam Ravnborg	 and		%g2, 0x38, %g2
213478b8fecSSam Ravnborg
	/* Byte loop; the store address %o1 + GLOBAL_SPARE is src + (dst - src). */
214478b8fecSSam Ravnborg1:	subcc		%g1, 0x1, %g1
215ee841d0aSDavid S. Miller	EX_LD_FP(LOAD(ldub, %o1 + 0x00, %o3), U3_retl_o2_plus_g2_plus_g1_plus_1)
216ee841d0aSDavid S. Miller	EX_ST_FP(STORE(stb, %o3, %o1 + GLOBAL_SPARE), U3_retl_o2_plus_g2_plus_g1_plus_1)
217478b8fecSSam Ravnborg	bgu,pt		%XCC, 1b
218478b8fecSSam Ravnborg	 add		%o1, 0x1, %o1
219478b8fecSSam Ravnborg
	/* Recompute dst from the advanced src. */
220478b8fecSSam Ravnborg	add		%o1, GLOBAL_SPARE, %o0
221478b8fecSSam Ravnborg
	/* %g1 = src & 7 is remembered for the tail code.  The alignaddr in
	 * the delay slot runs whether or not the branch is taken; the VIS
	 * instruction rounds %o1 down to a doubleword boundary and sets the
	 * byte offset used by the later faligndata instructions.
	 */
222478b8fecSSam Ravnborg2:	cmp		%g2, 0x0
223478b8fecSSam Ravnborg	and		%o1, 0x7, %g1
224478b8fecSSam Ravnborg	be,pt		%icc, 3f
225478b8fecSSam Ravnborg	 alignaddr	%o1, %g0, %o1
226478b8fecSSam Ravnborg
	/* Copy the %g2 alignment bytes one doubleword per store, with %f4
	 * and %f6 alternating as the older/newer input of faligndata.
	 */
227ee841d0aSDavid S. Miller	EX_LD_FP(LOAD(ldd, %o1, %f4), U3_retl_o2_plus_g2)
228ee841d0aSDavid S. Miller1:	EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f6), U3_retl_o2_plus_g2)
229478b8fecSSam Ravnborg	add		%o1, 0x8, %o1
230478b8fecSSam Ravnborg	subcc		%g2, 0x8, %g2
231478b8fecSSam Ravnborg	faligndata	%f4, %f6, %f0
232ee841d0aSDavid S. Miller	EX_ST_FP(STORE(std, %f0, %o0), U3_retl_o2_plus_g2_plus_8)
233478b8fecSSam Ravnborg	be,pn		%icc, 3f
234478b8fecSSam Ravnborg	 add		%o0, 0x8, %o0
235478b8fecSSam Ravnborg
236ee841d0aSDavid S. Miller	EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f4), U3_retl_o2_plus_g2)
237478b8fecSSam Ravnborg	add		%o1, 0x8, %o1
238478b8fecSSam Ravnborg	subcc		%g2, 0x8, %g2
239478b8fecSSam Ravnborg	faligndata	%f6, %f4, %f2
240ee841d0aSDavid S. Miller	EX_ST_FP(STORE(std, %f2, %o0), U3_retl_o2_plus_g2_plus_8)
241478b8fecSSam Ravnborg	bne,pt		%icc, 1b
242478b8fecSSam Ravnborg	 add		%o0, 0x8, %o0
243478b8fecSSam Ravnborg
	/* Prime the block pipeline: issue prefetches, load the first 64
	 * source bytes plus the doubleword that follows them, and start
	 * aligning them into %f16-%f26.  GLOBAL_SPARE = len rounded down
	 * to a multiple of 64.
	 */
244478b8fecSSam Ravnborg3:	LOAD(prefetch, %o1 + 0x000, #one_read)
245478b8fecSSam Ravnborg	LOAD(prefetch, %o1 + 0x040, #one_read)
246478b8fecSSam Ravnborg	andn		%o2, (0x40 - 1), GLOBAL_SPARE
247478b8fecSSam Ravnborg	LOAD(prefetch, %o1 + 0x080, #one_read)
248478b8fecSSam Ravnborg	LOAD(prefetch, %o1 + 0x0c0, #one_read)
249478b8fecSSam Ravnborg	LOAD(prefetch, %o1 + 0x100, #one_read)
250ee841d0aSDavid S. Miller	EX_LD_FP(LOAD(ldd, %o1 + 0x000, %f0), U3_retl_o2)
251478b8fecSSam Ravnborg	LOAD(prefetch, %o1 + 0x140, #one_read)
252ee841d0aSDavid S. Miller	EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2), U3_retl_o2)
253478b8fecSSam Ravnborg	LOAD(prefetch, %o1 + 0x180, #one_read)
254ee841d0aSDavid S. Miller	EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4), U3_retl_o2)
255478b8fecSSam Ravnborg	LOAD(prefetch, %o1 + 0x1c0, #one_read)
256478b8fecSSam Ravnborg	faligndata	%f0, %f2, %f16
257ee841d0aSDavid S. Miller	EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6), U3_retl_o2)
258478b8fecSSam Ravnborg	faligndata	%f2, %f4, %f18
259ee841d0aSDavid S. Miller	EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8), U3_retl_o2)
260478b8fecSSam Ravnborg	faligndata	%f4, %f6, %f20
261ee841d0aSDavid S. Miller	EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10), U3_retl_o2)
262478b8fecSSam Ravnborg	faligndata	%f6, %f8, %f22
263478b8fecSSam Ravnborg
264ee841d0aSDavid S. Miller	EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12), U3_retl_o2)
265478b8fecSSam Ravnborg	faligndata	%f8, %f10, %f24
266ee841d0aSDavid S. Miller	EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14), U3_retl_o2)
267478b8fecSSam Ravnborg	faligndata	%f10, %f12, %f26
268ee841d0aSDavid S. Miller	EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0), U3_retl_o2)
269478b8fecSSam Ravnborg
	/* GLOBAL_SPARE -= 0x80.  Enter the loop at 1f only if the result is
	 * greater than zero, with %o3 = GLOBAL_SPARE >> 6 as its iteration
	 * count; otherwise go straight to the final section at 2f.
	 */
270478b8fecSSam Ravnborg	subcc		GLOBAL_SPARE, 0x80, GLOBAL_SPARE
271478b8fecSSam Ravnborg	add		%o1, 0x40, %o1
272478b8fecSSam Ravnborg	bgu,pt		%XCC, 1f
273478b8fecSSam Ravnborg	 srl		GLOBAL_SPARE, 6, %o3
274478b8fecSSam Ravnborg	ba,pt		%xcc, 2f
275478b8fecSSam Ravnborg	 nop
276478b8fecSSam Ravnborg
277478b8fecSSam Ravnborg	.align		64
	/* Main loop.  Each pass finishes aligning the staged block into
	 * %f16-%f30, block-stores it to dst, and loads and begins aligning
	 * the next 64 source bytes.  %o3 counts the passes down.
	 */
278478b8fecSSam Ravnborg1:
279ee841d0aSDavid S. Miller	EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2), U3_retl_o2_plus_o3_sll_6_plus_0x80)
280478b8fecSSam Ravnborg	faligndata	%f12, %f14, %f28
281ee841d0aSDavid S. Miller	EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4), U3_retl_o2_plus_o3_sll_6_plus_0x80)
282478b8fecSSam Ravnborg	faligndata	%f14, %f0, %f30
283ee841d0aSDavid S. Miller	EX_ST_FP(STORE_BLK(%f16, %o0), U3_retl_o2_plus_o3_sll_6_plus_0x80)
284ee841d0aSDavid S. Miller	EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6), U3_retl_o2_plus_o3_sll_6_plus_0x40)
285478b8fecSSam Ravnborg	faligndata	%f0, %f2, %f16
286478b8fecSSam Ravnborg	add		%o0, 0x40, %o0
287478b8fecSSam Ravnborg
288ee841d0aSDavid S. Miller	EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8), U3_retl_o2_plus_o3_sll_6_plus_0x40)
289478b8fecSSam Ravnborg	faligndata	%f2, %f4, %f18
290ee841d0aSDavid S. Miller	EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10), U3_retl_o2_plus_o3_sll_6_plus_0x40)
291478b8fecSSam Ravnborg	faligndata	%f4, %f6, %f20
292ee841d0aSDavid S. Miller	EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12), U3_retl_o2_plus_o3_sll_6_plus_0x40)
293478b8fecSSam Ravnborg	subcc		%o3, 0x01, %o3
294478b8fecSSam Ravnborg	faligndata	%f6, %f8, %f22
295ee841d0aSDavid S. Miller	EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14), U3_retl_o2_plus_o3_sll_6_plus_0x80)
296478b8fecSSam Ravnborg
297478b8fecSSam Ravnborg	faligndata	%f8, %f10, %f24
298ee841d0aSDavid S. Miller	EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0), U3_retl_o2_plus_o3_sll_6_plus_0x80)
299478b8fecSSam Ravnborg	LOAD(prefetch, %o1 + 0x1c0, #one_read)
300478b8fecSSam Ravnborg	faligndata	%f10, %f12, %f26
301478b8fecSSam Ravnborg	bg,pt		%XCC, 1b
302478b8fecSSam Ravnborg	 add		%o1, 0x40, %o1
303478b8fecSSam Ravnborg
304478b8fecSSam Ravnborg	/* Finally we copy the last full 64-byte block. */
	/* Two STORE_BLKs are issued in this section: one for the block that
	 * was already staged on entry and one for the block loaded here.
	 * The load from %o1 + 0x40 is skipped when %g1 == 0, i.e. when src
	 * was doubleword aligned before the alignaddr.
	 */
305478b8fecSSam Ravnborg2:
306ee841d0aSDavid S. Miller	EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2), U3_retl_o2_plus_o3_sll_6_plus_0x80)
307478b8fecSSam Ravnborg	faligndata	%f12, %f14, %f28
308ee841d0aSDavid S. Miller	EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4), U3_retl_o2_plus_o3_sll_6_plus_0x80)
309478b8fecSSam Ravnborg	faligndata	%f14, %f0, %f30
310ee841d0aSDavid S. Miller	EX_ST_FP(STORE_BLK(%f16, %o0), U3_retl_o2_plus_o3_sll_6_plus_0x80)
311ee841d0aSDavid S. Miller	EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6), U3_retl_o2_plus_o3_sll_6_plus_0x40)
312478b8fecSSam Ravnborg	faligndata	%f0, %f2, %f16
313ee841d0aSDavid S. Miller	EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8), U3_retl_o2_plus_o3_sll_6_plus_0x40)
314478b8fecSSam Ravnborg	faligndata	%f2, %f4, %f18
315ee841d0aSDavid S. Miller	EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10), U3_retl_o2_plus_o3_sll_6_plus_0x40)
316478b8fecSSam Ravnborg	faligndata	%f4, %f6, %f20
317ee841d0aSDavid S. Miller	EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12), U3_retl_o2_plus_o3_sll_6_plus_0x40)
318478b8fecSSam Ravnborg	faligndata	%f6, %f8, %f22
319ee841d0aSDavid S. Miller	EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14), U3_retl_o2_plus_o3_sll_6_plus_0x40)
320478b8fecSSam Ravnborg	faligndata	%f8, %f10, %f24
321478b8fecSSam Ravnborg	cmp		%g1, 0
322478b8fecSSam Ravnborg	be,pt		%XCC, 1f
323478b8fecSSam Ravnborg	 add		%o0, 0x40, %o0
324ee841d0aSDavid S. Miller	EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0), U3_retl_o2_plus_o3_sll_6_plus_0x40)
325478b8fecSSam Ravnborg1:	faligndata	%f10, %f12, %f26
326478b8fecSSam Ravnborg	faligndata	%f12, %f14, %f28
327478b8fecSSam Ravnborg	faligndata	%f14, %f0, %f30
328ee841d0aSDavid S. Miller	EX_ST_FP(STORE_BLK(%f16, %o0), U3_retl_o2_plus_o3_sll_6_plus_0x40)
329478b8fecSSam Ravnborg	add		%o0, 0x40, %o0
330478b8fecSSam Ravnborg	add		%o1, 0x40, %o1
331478b8fecSSam Ravnborg	membar		#Sync
332478b8fecSSam Ravnborg
333478b8fecSSam Ravnborg	/* Now we copy the (len modulo 64) bytes at the end.
334478b8fecSSam Ravnborg	 * Note how we borrow the %f0 loaded above.
335478b8fecSSam Ravnborg	 *
336478b8fecSSam Ravnborg	 * Also notice how this code is careful not to perform a
337478b8fecSSam Ravnborg	 * load past the end of the src buffer.
338478b8fecSSam Ravnborg	 */
	/* %o2 = len & 0x3f.  %g2 = (%o2 & 0x38) - 8 is the amount the
	 * doubleword loop below copies (the subcc is a non-annulled delay
	 * slot, so it executes even when the first be is taken).  That
	 * amount is then subtracted from %o2, leaving the rest for 2:.
	 */
339478b8fecSSam Ravnborg	and		%o2, 0x3f, %o2
340478b8fecSSam Ravnborg	andcc		%o2, 0x38, %g2
341478b8fecSSam Ravnborg	be,pn		%XCC, 2f
342478b8fecSSam Ravnborg	 subcc		%g2, 0x8, %g2
343478b8fecSSam Ravnborg	be,pn		%XCC, 2f
344478b8fecSSam Ravnborg	 cmp		%g1, 0
345478b8fecSSam Ravnborg
	/* The be,a tests the "cmp %g1, 0" above (the sub does not touch the
	 * condition codes): %f0 is reloaded from [%o1] only when src was
	 * doubleword aligned, the case in which the +0x40 load was skipped.
	 */
346478b8fecSSam Ravnborg	sub		%o2, %g2, %o2
347478b8fecSSam Ravnborg	be,a,pt		%XCC, 1f
348ee841d0aSDavid S. Miller	 EX_LD_FP(LOAD(ldd, %o1 + 0x00, %f0), U3_retl_o2_plus_g2)
349478b8fecSSam Ravnborg
350ee841d0aSDavid S. Miller1:	EX_LD_FP(LOAD(ldd, %o1 + 0x08, %f2), U3_retl_o2_plus_g2)
351478b8fecSSam Ravnborg	add		%o1, 0x8, %o1
352478b8fecSSam Ravnborg	subcc		%g2, 0x8, %g2
353478b8fecSSam Ravnborg	faligndata	%f0, %f2, %f8
354ee841d0aSDavid S. Miller	EX_ST_FP(STORE(std, %f8, %o0), U3_retl_o2_plus_g2_plus_8)
355478b8fecSSam Ravnborg	be,pn		%XCC, 2f
356478b8fecSSam Ravnborg	 add		%o0, 0x8, %o0
357ee841d0aSDavid S. Miller	EX_LD_FP(LOAD(ldd, %o1 + 0x08, %f0), U3_retl_o2_plus_g2)
358478b8fecSSam Ravnborg	add		%o1, 0x8, %o1
359478b8fecSSam Ravnborg	subcc		%g2, 0x8, %g2
360478b8fecSSam Ravnborg	faligndata	%f2, %f0, %f8
361ee841d0aSDavid S. Miller	EX_ST_FP(STORE(std, %f8, %o0), U3_retl_o2_plus_g2_plus_8)
362478b8fecSSam Ravnborg	bne,pn		%XCC, 1b
363478b8fecSSam Ravnborg	 add		%o0, 0x8, %o0
364478b8fecSSam Ravnborg
365478b8fecSSam Ravnborg	/* If anything is left, we copy it one byte at a time.
366478b8fecSSam Ravnborg	 * Note that %g1 is (src & 0x7) saved above before the
367478b8fecSSam Ravnborg	 * alignaddr was performed.
368478b8fecSSam Ravnborg	 */
	/* %o1 += %g1 restores the true (unrounded) src position.  The be
	 * below consumes the condition codes of the cmp, so it relies on
	 * VISExitHalf leaving them untouched.  Delay slot: %o3 = dst - src,
	 * so that "%o1 + %o3" addresses dst from here on.
	 */
369478b8fecSSam Ravnborg2:
370478b8fecSSam Ravnborg	cmp		%o2, 0
371478b8fecSSam Ravnborg	add		%o1, %g1, %o1
372478b8fecSSam Ravnborg	VISExitHalf
373061273f9SSam Ravnborg	be,pn		%XCC, end_return
374478b8fecSSam Ravnborg	 sub		%o0, %o1, %o3
375478b8fecSSam Ravnborg
	/* src not doubleword aligned: finish in the byte loop at 90f.
	 * Otherwise copy 8, 4, 2 and 1 bytes as selected by bits 3..0 of %o2.
	 */
376478b8fecSSam Ravnborg	andcc		%g1, 0x7, %g0
377478b8fecSSam Ravnborg	bne,pn		%icc, 90f
378478b8fecSSam Ravnborg	 andcc		%o2, 0x8, %g0
379478b8fecSSam Ravnborg	be,pt		%icc, 1f
380478b8fecSSam Ravnborg	 nop
381ee841d0aSDavid S. Miller	EX_LD(LOAD(ldx, %o1, %o5), U3_retl_o2)
382ee841d0aSDavid S. Miller	EX_ST(STORE(stx, %o5, %o1 + %o3), U3_retl_o2)
383478b8fecSSam Ravnborg	add		%o1, 0x8, %o1
384ee841d0aSDavid S. Miller	sub		%o2, 8, %o2
385478b8fecSSam Ravnborg
386478b8fecSSam Ravnborg1:	andcc		%o2, 0x4, %g0
387478b8fecSSam Ravnborg	be,pt		%icc, 1f
388478b8fecSSam Ravnborg	 nop
389ee841d0aSDavid S. Miller	EX_LD(LOAD(lduw, %o1, %o5), U3_retl_o2)
390ee841d0aSDavid S. Miller	EX_ST(STORE(stw, %o5, %o1 + %o3), U3_retl_o2)
391478b8fecSSam Ravnborg	add		%o1, 0x4, %o1
392ee841d0aSDavid S. Miller	sub		%o2, 4, %o2
393478b8fecSSam Ravnborg
394478b8fecSSam Ravnborg1:	andcc		%o2, 0x2, %g0
395478b8fecSSam Ravnborg	be,pt		%icc, 1f
396478b8fecSSam Ravnborg	 nop
397ee841d0aSDavid S. Miller	EX_LD(LOAD(lduh, %o1, %o5), U3_retl_o2)
398ee841d0aSDavid S. Miller	EX_ST(STORE(sth, %o5, %o1 + %o3), U3_retl_o2)
399478b8fecSSam Ravnborg	add		%o1, 0x2, %o1
400ee841d0aSDavid S. Miller	sub		%o2, 2, %o2
401478b8fecSSam Ravnborg
	/* Last byte, if any; its store sits in the delay slot of the final
	 * branch to end_return.
	 */
402478b8fecSSam Ravnborg1:	andcc		%o2, 0x1, %g0
403061273f9SSam Ravnborg	be,pt		%icc, end_return
404478b8fecSSam Ravnborg	 nop
405ee841d0aSDavid S. Miller	EX_LD(LOAD(ldub, %o1, %o5), U3_retl_o2)
406061273f9SSam Ravnborg	ba,pt		%xcc, end_return
407ee841d0aSDavid S. Miller	 EX_ST(STORE(stb, %o5, %o1 + %o3), U3_retl_o2)
408478b8fecSSam Ravnborg
409478b8fecSSam Ravnborg	.align		64
410061273f9SSam Ravnborg	/* 16 <= len < 192 */
	/* Integer-register copy for medium lengths.  Entered with the
	 * condition codes set from (dst | src) & 7 by the delay slot of the
	 * branch that jumps here.  %o3 = dst - src, so "%o1 + %o3" addresses
	 * the destination -- except inside the shifted loop at 8:, where %o3
	 * is reused as a shift count and %o0 is the destination pointer.
	 */
411061273f9SSam Ravnborgless_than_192:
412478b8fecSSam Ravnborg	bne,pn		%XCC, 75f
413478b8fecSSam Ravnborg	 sub		%o0, %o1, %o3
414478b8fecSSam Ravnborg
	/* src and dst both 8-byte aligned.  GLOBAL_SPARE = len & ~0xf is
	 * moved 16 bytes per pass; %o2 = len & 0xf is left for 73: onwards.
	 */
415478b8fecSSam Ravnborg72:
416478b8fecSSam Ravnborg	andn		%o2, 0xf, GLOBAL_SPARE
417478b8fecSSam Ravnborg	and		%o2, 0xf, %o2
418478b8fecSSam Ravnborg1:	subcc		GLOBAL_SPARE, 0x10, GLOBAL_SPARE
419ee841d0aSDavid S. Miller	EX_LD(LOAD(ldx, %o1 + 0x00, %o5), U3_retl_o2_plus_GS_plus_0x10)
420ee841d0aSDavid S. Miller	EX_LD(LOAD(ldx, %o1 + 0x08, %g1), U3_retl_o2_plus_GS_plus_0x10)
421ee841d0aSDavid S. Miller	EX_ST(STORE(stx, %o5, %o1 + %o3), U3_retl_o2_plus_GS_plus_0x10)
422478b8fecSSam Ravnborg	add		%o1, 0x8, %o1
423ee841d0aSDavid S. Miller	EX_ST(STORE(stx, %g1, %o1 + %o3), U3_retl_o2_plus_GS_plus_0x08)
424478b8fecSSam Ravnborg	bgu,pt		%XCC, 1b
425478b8fecSSam Ravnborg	 add		%o1, 0x8, %o1
	/* Leftover 8-byte and 4-byte pieces, selected by bits 3 and 2 of
	 * %o2.  Anything still remaining after that goes to the byte loop
	 * at 90f.
	 */
426478b8fecSSam Ravnborg73:	andcc		%o2, 0x8, %g0
427478b8fecSSam Ravnborg	be,pt		%XCC, 1f
428478b8fecSSam Ravnborg	 nop
429478b8fecSSam Ravnborg	sub		%o2, 0x8, %o2
430ee841d0aSDavid S. Miller	EX_LD(LOAD(ldx, %o1, %o5), U3_retl_o2_plus_8)
431ee841d0aSDavid S. Miller	EX_ST(STORE(stx, %o5, %o1 + %o3), U3_retl_o2_plus_8)
432478b8fecSSam Ravnborg	add		%o1, 0x8, %o1
433478b8fecSSam Ravnborg1:	andcc		%o2, 0x4, %g0
434478b8fecSSam Ravnborg	be,pt		%XCC, 1f
435478b8fecSSam Ravnborg	 nop
436478b8fecSSam Ravnborg	sub		%o2, 0x4, %o2
437ee841d0aSDavid S. Miller	EX_LD(LOAD(lduw, %o1, %o5), U3_retl_o2_plus_4)
438ee841d0aSDavid S. Miller	EX_ST(STORE(stw, %o5, %o1 + %o3), U3_retl_o2_plus_4)
439478b8fecSSam Ravnborg	add		%o1, 0x4, %o1
440478b8fecSSam Ravnborg1:	cmp		%o2, 0
441061273f9SSam Ravnborg	be,pt		%XCC, end_return
442478b8fecSSam Ravnborg	 nop
443478b8fecSSam Ravnborg	ba,pt		%xcc, 90f
444478b8fecSSam Ravnborg	 nop
445478b8fecSSam Ravnborg
	/* src and/or dst not 8-byte aligned.  First copy 8 - (dst & 7)
	 * bytes singly so that dst becomes 8-byte aligned.  The be tests
	 * the andcc (the intervening sub does not set condition codes) and
	 * skips the byte loop when dst & 7 is already zero.
	 */
446478b8fecSSam Ravnborg75:
447478b8fecSSam Ravnborg	andcc		%o0, 0x7, %g1
448478b8fecSSam Ravnborg	sub		%g1, 0x8, %g1
449478b8fecSSam Ravnborg	be,pn		%icc, 2f
450478b8fecSSam Ravnborg	 sub		%g0, %g1, %g1
451478b8fecSSam Ravnborg	sub		%o2, %g1, %o2
452478b8fecSSam Ravnborg
453478b8fecSSam Ravnborg1:	subcc		%g1, 1, %g1
454ee841d0aSDavid S. Miller	EX_LD(LOAD(ldub, %o1, %o5), U3_retl_o2_plus_g1_plus_1)
455ee841d0aSDavid S. Miller	EX_ST(STORE(stb, %o5, %o1 + %o3), U3_retl_o2_plus_g1_plus_1)
456478b8fecSSam Ravnborg	bgu,pt		%icc, 1b
457478b8fecSSam Ravnborg	 add		%o1, 1, %o1
458478b8fecSSam Ravnborg
	/* %o0 = current dst.  If src is 8-byte aligned as well, rejoin the
	 * aligned code: 72b when at least 16 bytes remain, 73b otherwise.
	 * If not, %g1 = (src & 7) * 8 is the left-shift amount in bits used
	 * by the merge loop at 8f.
	 */
459478b8fecSSam Ravnborg2:	add		%o1, %o3, %o0
460478b8fecSSam Ravnborg	andcc		%o1, 0x7, %g1
461478b8fecSSam Ravnborg	bne,pt		%icc, 8f
462478b8fecSSam Ravnborg	 sll		%g1, 3, %g1
463478b8fecSSam Ravnborg
464478b8fecSSam Ravnborg	cmp		%o2, 16
465478b8fecSSam Ravnborg	bgeu,pt		%icc, 72b
466478b8fecSSam Ravnborg	 nop
467478b8fecSSam Ravnborg	ba,a,pt		%xcc, 73b
468478b8fecSSam Ravnborg
	/* dst aligned, src not: read aligned doublewords from src rounded
	 * down to 8 bytes and merge neighbouring ones with shifts.
	 *   %g1           left-shift amount, in bits
	 *   %o3           64 - %g1, the complementary right shift
	 *   %g2           previous doubleword, already shifted left
	 *   GLOBAL_SPARE  %o2 & ~7, the bytes stored by this loop
	 */
469478b8fecSSam Ravnborg8:	mov		64, %o3
470478b8fecSSam Ravnborg	andn		%o1, 0x7, %o1
471ee841d0aSDavid S. Miller	EX_LD(LOAD(ldx, %o1, %g2), U3_retl_o2)
472478b8fecSSam Ravnborg	sub		%o3, %g1, %o3
473478b8fecSSam Ravnborg	andn		%o2, 0x7, GLOBAL_SPARE
474478b8fecSSam Ravnborg	sllx		%g2, %g1, %g2
475ee841d0aSDavid S. Miller1:	EX_LD(LOAD(ldx, %o1 + 0x8, %g3), U3_retl_o2_and_7_plus_GS)
476478b8fecSSam Ravnborg	subcc		GLOBAL_SPARE, 0x8, GLOBAL_SPARE
477478b8fecSSam Ravnborg	add		%o1, 0x8, %o1
478478b8fecSSam Ravnborg	srlx		%g3, %o3, %o5
479478b8fecSSam Ravnborg	or		%o5, %g2, %o5
480ee841d0aSDavid S. Miller	EX_ST(STORE(stx, %o5, %o0), U3_retl_o2_and_7_plus_GS_plus_8)
481478b8fecSSam Ravnborg	add		%o0, 0x8, %o0
482478b8fecSSam Ravnborg	bgu,pt		%icc, 1b
483478b8fecSSam Ravnborg	 sllx		%g3, %g1, %g2
484478b8fecSSam Ravnborg
	/* Turn %g1 back into a byte offset and add it to %o1 to recover the
	 * true src position.  If (len & 7) bytes remain, hand them to the
	 * byte loop at 90f with %o3 = dst - src re-established.
	 */
485478b8fecSSam Ravnborg	srl		%g1, 3, %g1
486478b8fecSSam Ravnborg	andcc		%o2, 0x7, %o2
487061273f9SSam Ravnborg	be,pn		%icc, end_return
488478b8fecSSam Ravnborg	 add		%o1, %g1, %o1
489478b8fecSSam Ravnborg	ba,pt		%xcc, 90f
490478b8fecSSam Ravnborg	 sub		%o0, %o1, %o3
491478b8fecSSam Ravnborg
492478b8fecSSam Ravnborg	.align		64
493061273f9SSam Ravnborg	/* 0 < len < 16 */
	/* On entry %o3 = dst | src | len (built in the delay slots of the
	 * dispatch branches).  If any of its low two bits is set, copy
	 * bytewise at 90f.  Otherwise dst, src and len are all multiples of
	 * 4 and the word loop below is used.  The delay slot sets
	 * %o3 = dst - src so that "%o1 + %o3" addresses the destination.
	 */
494061273f9SSam Ravnborgless_than_16:
495478b8fecSSam Ravnborg	andcc		%o3, 0x3, %g0
496478b8fecSSam Ravnborg	bne,pn		%XCC, 90f
497478b8fecSSam Ravnborg	 sub		%o0, %o1, %o3
498478b8fecSSam Ravnborg
	/* Word loop: 4 bytes per pass until %o2 reaches zero, then fall
	 * through into end_return.
	 */
499478b8fecSSam Ravnborg1:
500478b8fecSSam Ravnborg	subcc		%o2, 4, %o2
501ee841d0aSDavid S. Miller	EX_LD(LOAD(lduw, %o1, %g1), U3_retl_o2_plus_4)
502ee841d0aSDavid S. Miller	EX_ST(STORE(stw, %g1, %o1 + %o3), U3_retl_o2_plus_4)
503478b8fecSSam Ravnborg	bgu,pt		%XCC, 1b
504478b8fecSSam Ravnborg	 add		%o1, 4, %o1
505478b8fecSSam Ravnborg
	/* Common exit: return EX_RETVAL(%o4), %o4 being the dst value saved
	 * at function entry.
	 */
506061273f9SSam Ravnborgend_return:
507061273f9SSam Ravnborg	retl
508478b8fecSSam Ravnborg	 mov		EX_RETVAL(%o4), %o0
509478b8fecSSam Ravnborg
510478b8fecSSam Ravnborg	.align		32
	/* Byte-at-a-time loop shared by all paths: copies %o2 bytes from
	 * %o1 to %o1 + %o3, then returns the same value as end_return.
	 */
511478b8fecSSam Ravnborg90:
512478b8fecSSam Ravnborg	subcc		%o2, 1, %o2
513ee841d0aSDavid S. Miller	EX_LD(LOAD(ldub, %o1, %g1), U3_retl_o2_plus_1)
514ee841d0aSDavid S. Miller	EX_ST(STORE(stb, %g1, %o1 + %o3), U3_retl_o2_plus_1)
515478b8fecSSam Ravnborg	bgu,pt		%XCC, 90b
516478b8fecSSam Ravnborg	 add		%o1, 1, %o1
517478b8fecSSam Ravnborg	retl
518478b8fecSSam Ravnborg	 mov		EX_RETVAL(%o4), %o0
519478b8fecSSam Ravnborg
520478b8fecSSam Ravnborg	.size		FUNC_NAME, .-FUNC_NAME
521