/* U3memcpy.S: UltraSparc-III optimized memcpy.
 *
 * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com)
 */

#ifdef __KERNEL__
#include <linux/linkage.h>
#include <asm/visasm.h>
#include <asm/asi.h>
#define GLOBAL_SPARE	%g7
#else
#define ASI_BLK_P 0xf0
#define FPRS_FEF  0x04
#ifdef MEMCPY_DEBUG
#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \
		     clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0;
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#else
#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#endif
#define GLOBAL_SPARE	%g5
#endif

#ifndef EX_LD
#define EX_LD(x,y)	x
#endif
#ifndef EX_LD_FP
#define EX_LD_FP(x,y)	x
#endif

#ifndef EX_ST
#define EX_ST(x,y)	x
#endif
#ifndef EX_ST_FP
#define EX_ST_FP(x,y)	x
#endif

#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

#ifndef STORE_BLK
#define STORE_BLK(src,addr)	stda src, [addr] ASI_BLK_P
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	U3memcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif
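	/* The defaults above yield a plain memcpy.  U3copy_{from,to}_user
	 * reuse this body by overriding FUNC_NAME, LOAD/STORE (user-space
	 * ASIs) and EX_LD/EX_ST; the second argument of EX_LD/EX_ST names
	 * a fixup stub that reports how many bytes were left uncopied
	 * when a fault is taken.
	 */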

	.register	%g2,#scratch
	.register	%g3,#scratch

	/* Special/non-trivial issues of this code:
	 *
	 * 1) %o5 is preserved from VISEntryHalf to VISExitHalf
	 * 2) Only low 32 FPU registers are used so that only the
	 *    lower half of the FPU register set is dirtied by this
	 *    code.  This is especially important in the kernel.
	 * 3) This code never prefetches cachelines past the end
	 *    of the source buffer.
	 */

	.text
#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
__restore_fp:
	VISExitHalf
	retl
	 nop
ENTRY(U3_retl_o2_plus_g2_plus_g1_plus_1_fp)
	add	%g1, 1, %g1
	add	%g2, %g1, %g2
	ba,pt	%xcc, __restore_fp
	 add	%o2, %g2, %o0
ENDPROC(U3_retl_o2_plus_g2_plus_g1_plus_1_fp)
ENTRY(U3_retl_o2_plus_g2_fp)
	ba,pt	%xcc, __restore_fp
	 add	%o2, %g2, %o0
ENDPROC(U3_retl_o2_plus_g2_fp)
ENTRY(U3_retl_o2_plus_g2_plus_8_fp)
	add	%g2, 8, %g2
	ba,pt	%xcc, __restore_fp
	 add	%o2, %g2, %o0
ENDPROC(U3_retl_o2_plus_g2_plus_8_fp)
ENTRY(U3_retl_o2)
	retl
	 mov	%o2, %o0
ENDPROC(U3_retl_o2)
ENTRY(U3_retl_o2_plus_1)
	retl
	 add	%o2, 1, %o0
ENDPROC(U3_retl_o2_plus_1)
ENTRY(U3_retl_o2_plus_4)
	retl
	 add	%o2, 4, %o0
ENDPROC(U3_retl_o2_plus_4)
ENTRY(U3_retl_o2_plus_8)
	retl
	 add	%o2, 8, %o0
ENDPROC(U3_retl_o2_plus_8)
ENTRY(U3_retl_o2_plus_g1_plus_1)
	add	%g1, 1, %g1
	retl
	 add	%o2, %g1, %o0
ENDPROC(U3_retl_o2_plus_g1_plus_1)
ENTRY(U3_retl_o2_fp)
	ba,pt	%xcc, __restore_fp
	 mov	%o2, %o0
ENDPROC(U3_retl_o2_fp)
ENTRY(U3_retl_o2_plus_o3_sll_6_plus_0x80_fp)
	sll	%o3, 6, %o3
	add	%o3, 0x80, %o3
	ba,pt	%xcc, __restore_fp
	 add	%o2, %o3, %o0
ENDPROC(U3_retl_o2_plus_o3_sll_6_plus_0x80_fp)
ENTRY(U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
	sll	%o3, 6, %o3
	add	%o3, 0x40, %o3
	ba,pt	%xcc, __restore_fp
	 add	%o2, %o3, %o0
ENDPROC(U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
ENTRY(U3_retl_o2_plus_GS_plus_0x10)
	add	GLOBAL_SPARE, 0x10, GLOBAL_SPARE
	retl
	 add	%o2, GLOBAL_SPARE, %o0
ENDPROC(U3_retl_o2_plus_GS_plus_0x10)
ENTRY(U3_retl_o2_plus_GS_plus_0x08)
	add	GLOBAL_SPARE, 0x08, GLOBAL_SPARE
	retl
	 add	%o2, GLOBAL_SPARE, %o0
ENDPROC(U3_retl_o2_plus_GS_plus_0x08)
ENTRY(U3_retl_o2_and_7_plus_GS)
	and	%o2, 7, %o2
	retl
	 add	%o2, GLOBAL_SPARE, %o0
ENDPROC(U3_retl_o2_and_7_plus_GS)
ENTRY(U3_retl_o2_and_7_plus_GS_plus_8)
	add	GLOBAL_SPARE, 8, GLOBAL_SPARE
	and	%o2, 7, %o2
	retl
	 add	%o2, GLOBAL_SPARE, %o0
ENDPROC(U3_retl_o2_and_7_plus_GS_plus_8)
#endif
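	/* Each U3_retl_* stub above recomputes the "bytes not yet
	 * copied" expression its name encodes (e.g. %o2 + %g2 + %g1 + 1)
	 * and returns it in %o0; the _fp variants additionally restore
	 * the FPU state via __restore_fp first.
	 */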

	.align		64

	/* The cheetah's flexible spine, oversized liver, enlarged heart,
	 * slender muscular body, and claws make it the swiftest hunter
	 * in Africa and the fastest animal on land.  Can reach speeds
	 * of up to 2.4GB per second.
	 */

	.globl	FUNC_NAME
	.type	FUNC_NAME,#function
FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
	srlx		%o2, 31, %g2
	cmp		%g2, 0
	/* software trap 5 "Range Check" if len >= 0x80000000 */
	tne		%xcc, 5
	PREAMBLE
	mov		%o0, %o4

	/* if len == 0 */
	cmp		%o2, 0
	be,pn		%XCC, end_return
	 or		%o0, %o1, %o3

	/* if len < 16 */
	cmp		%o2, 16
	blu,a,pn	%XCC, less_than_16
	 or		%o3, %o2, %o3
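	/* %o3 = dst | src; the annulled 'or' above also folds len in on
	 * the less_than_16 path.  A single alignment test on %o3 then
	 * covers every operand at once.
	 */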

	/* if len < 192 */
	cmp		%o2, (3 * 64)
	blu,pt		%XCC, less_than_192
	 andcc		%o3, 0x7, %g0

	/* Clobbers o5/g1/g2/g3/g7/icc/xcc.  We must preserve
	 * o5 from here until we hit VISExitHalf.
	 */
	VISEntryHalf
	/* Is 'dst' already aligned on a 64-byte boundary? */
	andcc		%o0, 0x3f, %g2
	be,pt		%XCC, 2f

	/* Compute abs((dst & 0x3f) - 0x40) into %g2.  This is the number
	 * of bytes to copy to make 'dst' 64-byte aligned.  We pre-
	 * subtract this from 'len'.
	 */
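	/* Example: dst & 0x3f == 0x2d gives %g2 = 0x40 - 0x2d = 0x13;
	 * %g1 = 0x13 & 0x7 = 3 bytes go through the byte loop below and
	 * %g2 = 0x13 & 0x38 = 0x10 bytes through the doubleword loop,
	 * leaving 'dst' 64-byte aligned.
	 */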
	 sub		%o0, %o1, GLOBAL_SPARE
	sub		%g2, 0x40, %g2
	sub		%g0, %g2, %g2
	sub		%o2, %g2, %o2
	andcc		%g2, 0x7, %g1
	be,pt		%icc, 2f
	 and		%g2, 0x38, %g2
1:	subcc		%g1, 0x1, %g1
	EX_LD_FP(LOAD(ldub, %o1 + 0x00, %o3), U3_retl_o2_plus_g2_plus_g1_plus_1_fp)
	EX_ST_FP(STORE(stb, %o3, %o1 + GLOBAL_SPARE), U3_retl_o2_plus_g2_plus_g1_plus_1_fp)
	bgu,pt		%XCC, 1b
	 add		%o1, 0x1, %o1

	add		%o1, GLOBAL_SPARE, %o0

2:	cmp		%g2, 0x0
	and		%o1, 0x7, %g1
	be,pt		%icc, 3f
	 alignaddr	%o1, %g0, %o1

	EX_LD_FP(LOAD(ldd, %o1, %f4), U3_retl_o2_plus_g2_fp)
1:	EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f6), U3_retl_o2_plus_g2_fp)
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f4, %f6, %f0
	EX_ST_FP(STORE(std, %f0, %o0), U3_retl_o2_plus_g2_plus_8_fp)
	be,pn		%icc, 3f
	 add		%o0, 0x8, %o0

	EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f4), U3_retl_o2_plus_g2_fp)
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f6, %f4, %f2
	EX_ST_FP(STORE(std, %f2, %o0), U3_retl_o2_plus_g2_plus_8_fp)
	bne,pt		%icc, 1b
	 add		%o0, 0x8, %o0

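	/* The 64-byte main loop below runs a software pipeline: eight
	 * ldd's fill %f0-%f14, faligndata merges each adjacent pair into
	 * %f16-%f30 to absorb the source misalignment, and one block
	 * store (stda ASI_BLK_P) commits an aligned 64-byte destination
	 * line per iteration, with prefetches issued well ahead of the
	 * loads.
	 */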
3:	LOAD(prefetch, %o1 + 0x000, #one_read)
	LOAD(prefetch, %o1 + 0x040, #one_read)
	andn		%o2, (0x40 - 1), GLOBAL_SPARE
	LOAD(prefetch, %o1 + 0x080, #one_read)
	LOAD(prefetch, %o1 + 0x0c0, #one_read)
	LOAD(prefetch, %o1 + 0x100, #one_read)
	EX_LD_FP(LOAD(ldd, %o1 + 0x000, %f0), U3_retl_o2_fp)
	LOAD(prefetch, %o1 + 0x140, #one_read)
	EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2), U3_retl_o2_fp)
	LOAD(prefetch, %o1 + 0x180, #one_read)
	EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4), U3_retl_o2_fp)
	LOAD(prefetch, %o1 + 0x1c0, #one_read)
	faligndata	%f0, %f2, %f16
	EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6), U3_retl_o2_fp)
	faligndata	%f2, %f4, %f18
	EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8), U3_retl_o2_fp)
	faligndata	%f4, %f6, %f20
	EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10), U3_retl_o2_fp)
	faligndata	%f6, %f8, %f22

	EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12), U3_retl_o2_fp)
	faligndata	%f8, %f10, %f24
	EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14), U3_retl_o2_fp)
	faligndata	%f10, %f12, %f26
	EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0), U3_retl_o2_fp)

	subcc		GLOBAL_SPARE, 0x80, GLOBAL_SPARE
	add		%o1, 0x40, %o1
	bgu,pt		%XCC, 1f
	 srl		GLOBAL_SPARE, 6, %o3
	ba,pt		%xcc, 2f
	 nop
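	/* GLOBAL_SPARE = len & ~0x3f is the byte count of the block
	 * phase.  One 64-byte line is already loaded into %f0-%f14, so
	 * 0x80 is subtracted before computing %o3 = blocks - 2: the loop
	 * at 1f stores one block per iteration and the last two blocks
	 * are drained starting at 2f.
	 */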

	.align		64
1:
	EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2), U3_retl_o2_plus_o3_sll_6_plus_0x80_fp)
	faligndata	%f12, %f14, %f28
	EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4), U3_retl_o2_plus_o3_sll_6_plus_0x80_fp)
	faligndata	%f14, %f0, %f30
	EX_ST_FP(STORE_BLK(%f16, %o0), U3_retl_o2_plus_o3_sll_6_plus_0x80_fp)
	EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6), U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
	faligndata	%f0, %f2, %f16
	add		%o0, 0x40, %o0

	EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8), U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
	faligndata	%f2, %f4, %f18
	EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10), U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
	faligndata	%f4, %f6, %f20
	EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12), U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
	subcc		%o3, 0x01, %o3
	faligndata	%f6, %f8, %f22
	EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14), U3_retl_o2_plus_o3_sll_6_plus_0x80_fp)

	faligndata	%f8, %f10, %f24
	EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0), U3_retl_o2_plus_o3_sll_6_plus_0x80_fp)
	LOAD(prefetch, %o1 + 0x1c0, #one_read)
	faligndata	%f10, %f12, %f26
	bg,pt		%XCC, 1b
	 add		%o1, 0x40, %o1

	/* Finally we copy the last full 64-byte block. */
2:
	EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2), U3_retl_o2_plus_o3_sll_6_plus_0x80_fp)
	faligndata	%f12, %f14, %f28
	EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4), U3_retl_o2_plus_o3_sll_6_plus_0x80_fp)
	faligndata	%f14, %f0, %f30
	EX_ST_FP(STORE_BLK(%f16, %o0), U3_retl_o2_plus_o3_sll_6_plus_0x80_fp)
	EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6), U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
	faligndata	%f0, %f2, %f16
	EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8), U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
	faligndata	%f2, %f4, %f18
	EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10), U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
	faligndata	%f4, %f6, %f20
	EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12), U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
	faligndata	%f6, %f8, %f22
	EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14), U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
	faligndata	%f8, %f10, %f24
	cmp		%g1, 0
	be,pt		%XCC, 1f
	 add		%o0, 0x40, %o0
	EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0), U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
1:	faligndata	%f10, %f12, %f26
	faligndata	%f12, %f14, %f28
	faligndata	%f14, %f0, %f30
	EX_ST_FP(STORE_BLK(%f16, %o0), U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
	add		%o0, 0x40, %o0
	add		%o1, 0x40, %o1
	membar		#Sync
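	/* Block stores via ASI_BLK_P are not strongly ordered with
	 * respect to later memory operations; the membar #Sync above
	 * ensures the final stda has completed (and its %f source
	 * registers are reusable) before the tail is copied with
	 * ordinary stores.
	 */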

	/* Now we copy the (len modulo 64) bytes at the end.
	 * Note how we borrow the %f0 loaded above.
	 *
	 * Also notice how this code is careful not to perform a
	 * load past the end of the src buffer.
	 */
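	/* Illustration: with (len % 64) == 0x1d and an 8-byte aligned
	 * src, %g2 ends up 0x10, the loop at 1f stores two doublewords,
	 * and the remaining 0xd bytes are finished by the word/byte
	 * code at 2f.
	 */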
	and		%o2, 0x3f, %o2
	andcc		%o2, 0x38, %g2
	be,pn		%XCC, 2f
	 subcc		%g2, 0x8, %g2
	be,pn		%XCC, 2f
	 cmp		%g1, 0

	sub		%o2, %g2, %o2
	be,a,pt		%XCC, 1f
	 EX_LD_FP(LOAD(ldd, %o1 + 0x00, %f0), U3_retl_o2_plus_g2_fp)

1:	EX_LD_FP(LOAD(ldd, %o1 + 0x08, %f2), U3_retl_o2_plus_g2_fp)
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f0, %f2, %f8
	EX_ST_FP(STORE(std, %f8, %o0), U3_retl_o2_plus_g2_plus_8_fp)
	be,pn		%XCC, 2f
	 add		%o0, 0x8, %o0
	EX_LD_FP(LOAD(ldd, %o1 + 0x08, %f0), U3_retl_o2_plus_g2_fp)
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f2, %f0, %f8
	EX_ST_FP(STORE(std, %f8, %o0), U3_retl_o2_plus_g2_plus_8_fp)
	bne,pn		%XCC, 1b
	 add		%o0, 0x8, %o0

	/* If anything is left, we copy it one byte at a time.
	 * Note that %g1 is (src & 0x7) saved above before the
	 * alignaddr was performed.
	 */
2:
	cmp		%o2, 0
	add		%o1, %g1, %o1
	VISExitHalf
	be,pn		%XCC, end_return
	 sub		%o0, %o1, %o3

	andcc		%g1, 0x7, %g0
	bne,pn		%icc, 90f
	 andcc		%o2, 0x8, %g0
	be,pt		%icc, 1f
	 nop
	EX_LD(LOAD(ldx, %o1, %o5), U3_retl_o2)
	EX_ST(STORE(stx, %o5, %o1 + %o3), U3_retl_o2)
	add		%o1, 0x8, %o1
	sub		%o2, 8, %o2

1:	andcc		%o2, 0x4, %g0
	be,pt		%icc, 1f
	 nop
	EX_LD(LOAD(lduw, %o1, %o5), U3_retl_o2)
	EX_ST(STORE(stw, %o5, %o1 + %o3), U3_retl_o2)
	add		%o1, 0x4, %o1
	sub		%o2, 4, %o2

1:	andcc		%o2, 0x2, %g0
	be,pt		%icc, 1f
	 nop
	EX_LD(LOAD(lduh, %o1, %o5), U3_retl_o2)
	EX_ST(STORE(sth, %o5, %o1 + %o3), U3_retl_o2)
	add		%o1, 0x2, %o1
	sub		%o2, 2, %o2

1:	andcc		%o2, 0x1, %g0
	be,pt		%icc, end_return
	 nop
	EX_LD(LOAD(ldub, %o1, %o5), U3_retl_o2)
	ba,pt		%xcc, end_return
	 EX_ST(STORE(stb, %o5, %o1 + %o3), U3_retl_o2)

	.align		64
	/* 16 <= len < 192 */
less_than_192:
	bne,pn		%XCC, 75f
	 sub		%o0, %o1, %o3

72:
	andn		%o2, 0xf, GLOBAL_SPARE
	and		%o2, 0xf, %o2
1:	subcc		GLOBAL_SPARE, 0x10, GLOBAL_SPARE
	EX_LD(LOAD(ldx, %o1 + 0x00, %o5), U3_retl_o2_plus_GS_plus_0x10)
	EX_LD(LOAD(ldx, %o1 + 0x08, %g1), U3_retl_o2_plus_GS_plus_0x10)
	EX_ST(STORE(stx, %o5, %o1 + %o3), U3_retl_o2_plus_GS_plus_0x10)
	add		%o1, 0x8, %o1
	EX_ST(STORE(stx, %g1, %o1 + %o3), U3_retl_o2_plus_GS_plus_0x08)
	bgu,pt		%XCC, 1b
	 add		%o1, 0x8, %o1
73:	andcc		%o2, 0x8, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%o2, 0x8, %o2
	EX_LD(LOAD(ldx, %o1, %o5), U3_retl_o2_plus_8)
	EX_ST(STORE(stx, %o5, %o1 + %o3), U3_retl_o2_plus_8)
	add		%o1, 0x8, %o1
1:	andcc		%o2, 0x4, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%o2, 0x4, %o2
	EX_LD(LOAD(lduw, %o1, %o5), U3_retl_o2_plus_4)
	EX_ST(STORE(stw, %o5, %o1 + %o3), U3_retl_o2_plus_4)
	add		%o1, 0x4, %o1
1:	cmp		%o2, 0
	be,pt		%XCC, end_return
	 nop
	ba,pt		%xcc, 90f
	 nop

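	/* 75: dst and/or src is not 8-byte aligned.  If dst is
	 * misaligned, 8 - (dst & 7) bytes are copied to align it; then
	 * we re-enter the aligned loops at 72b/73b, or drop to 8f when
	 * src is still misaligned mod 8.
	 */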
75:
	andcc		%o0, 0x7, %g1
	sub		%g1, 0x8, %g1
	be,pn		%icc, 2f
	 sub		%g0, %g1, %g1
	sub		%o2, %g1, %o2

1:	subcc		%g1, 1, %g1
	EX_LD(LOAD(ldub, %o1, %o5), U3_retl_o2_plus_g1_plus_1)
	EX_ST(STORE(stb, %o5, %o1 + %o3), U3_retl_o2_plus_g1_plus_1)
	bgu,pt		%icc, 1b
	 add		%o1, 1, %o1

2:	add		%o1, %o3, %o0
	andcc		%o1, 0x7, %g1
	bne,pt		%icc, 8f
	 sll		%g1, 3, %g1

	cmp		%o2, 16
	bgeu,pt		%icc, 72b
	 nop
	ba,a,pt		%xcc, 73b

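	/* 8: src remains misaligned after dst was aligned above.  Read
	 * aligned doublewords and funnel-shift adjacent pairs together:
	 * %g1 = (src & 7) * 8 is the left shift applied to the previous
	 * word, %o3 = 64 - %g1 the right shift for the next one.
	 */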
8:	mov		64, %o3
	andn		%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1, %g2), U3_retl_o2)
	sub		%o3, %g1, %o3
	andn		%o2, 0x7, GLOBAL_SPARE
	sllx		%g2, %g1, %g2
1:	EX_LD(LOAD(ldx, %o1 + 0x8, %g3), U3_retl_o2_and_7_plus_GS)
	subcc		GLOBAL_SPARE, 0x8, GLOBAL_SPARE
	add		%o1, 0x8, %o1
	srlx		%g3, %o3, %o5
	or		%o5, %g2, %o5
	EX_ST(STORE(stx, %o5, %o0), U3_retl_o2_and_7_plus_GS_plus_8)
	add		%o0, 0x8, %o0
	bgu,pt		%icc, 1b
	 sllx		%g3, %g1, %g2

	srl		%g1, 3, %g1
	andcc		%o2, 0x7, %o2
	be,pn		%icc, end_return
	 add		%o1, %g1, %o1
	ba,pt		%xcc, 90f
	 sub		%o0, %o1, %o3

	.align		64
	/* 0 < len < 16 */
less_than_16:
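	/* %o3 here is dst | src | len; any bit in the low two positions
	 * sends us to the byte-at-a-time loop at 90f, otherwise
	 * everything is word aligned and word sized.
	 */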
	andcc		%o3, 0x3, %g0
	bne,pn		%XCC, 90f
	 sub		%o0, %o1, %o3

1:
	subcc		%o2, 4, %o2
	EX_LD(LOAD(lduw, %o1, %g1), U3_retl_o2_plus_4)
	EX_ST(STORE(stw, %g1, %o1 + %o3), U3_retl_o2_plus_4)
	bgu,pt		%XCC, 1b
	 add		%o1, 4, %o1

end_return:
	retl
	 mov		EX_RETVAL(%o4), %o0

	.align		32
90:
	subcc		%o2, 1, %o2
	EX_LD(LOAD(ldub, %o1, %g1), U3_retl_o2_plus_1)
	EX_ST(STORE(stb, %g1, %o1 + %o3), U3_retl_o2_plus_1)
	bgu,pt		%XCC, 90b
	 add		%o1, 1, %o1
	retl
	 mov		EX_RETVAL(%o4), %o0

	.size		FUNC_NAME, .-FUNC_NAME