xref: /openbmc/linux/arch/microblaze/lib/fastcopy.S (revision c13aca79ff3c4af5fd31a5b2743a90eba6e36a26)
1322ae8ebSMichal Simek/*
2322ae8ebSMichal Simek * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
3322ae8ebSMichal Simek * Copyright (C) 2008-2009 PetaLogix
4322ae8ebSMichal Simek * Copyright (C) 2008 Jim Law - Iris LP  All rights reserved.
5322ae8ebSMichal Simek *
6322ae8ebSMichal Simek * This file is subject to the terms and conditions of the GNU General
7322ae8ebSMichal Simek * Public License.  See the file COPYING in the main directory of this
8322ae8ebSMichal Simek * archive for more details.
9322ae8ebSMichal Simek *
10322ae8ebSMichal Simek * Written by Jim Law <jlaw@irispower.com>
11322ae8ebSMichal Simek *
12322ae8ebSMichal Simek * intended to replace:
13322ae8ebSMichal Simek *	memcpy in memcpy.c and
14322ae8ebSMichal Simek *	memmove in memmove.c
15322ae8ebSMichal Simek * ... in arch/microblaze/lib
16322ae8ebSMichal Simek *
17322ae8ebSMichal Simek *
18322ae8ebSMichal Simek * assly_fastcopy.S
19322ae8ebSMichal Simek *
20322ae8ebSMichal Simek * Attempt at quicker memcpy and memmove for MicroBlaze
21322ae8ebSMichal Simek *	Input :	Operand1 in Reg r5 - destination address
22322ae8ebSMichal Simek *		Operand2 in Reg r6 - source address
23322ae8ebSMichal Simek *		Operand3 in Reg r7 - number of bytes to transfer
24322ae8ebSMichal Simek *	Output: Result in Reg r3 - starting destination address
25322ae8ebSMichal Simek *
26322ae8ebSMichal Simek *
27322ae8ebSMichal Simek * Explanation:
28322ae8ebSMichal Simek *	Perform (possibly unaligned) copy of a block of memory
29322ae8ebSMichal Simek *	between mem locations with size of xfer spec'd in bytes
30322ae8ebSMichal Simek */
31322ae8ebSMichal Simek
32322ae8ebSMichal Simek#include <linux/linkage.h>
33*13851966SMichal Simek	.text
34322ae8ebSMichal Simek	.globl	memcpy
35*13851966SMichal Simek	.type  memcpy, @function
36322ae8ebSMichal Simek	.ent	memcpy
37322ae8ebSMichal Simek
/*
 * memcpy / fast_memcpy_ascending
 *
 * Copies c (r7) bytes from s (r6) to d (r5), walking both pointers upward.
 *   In:   r5 = destination, r6 = source, r7 = byte count
 *   Out:  r3 = the destination address as passed in
 *   This entry section uses r4 (n) and r11 (h) as scratch.
 *
 * Phases: (1) byte copies until d is word aligned, (2) 32-byte blocks,
 * (3) single words, (4) trailing bytes.  Counts below 4 go straight to (4).
 * fast_memcpy_ascending is also the branch target used by memmove further
 * down in this file.
 */
38322ae8ebSMichal Simekmemcpy:
39322ae8ebSMichal Simekfast_memcpy_ascending:
40322ae8ebSMichal Simek	/* move d to return register as value of function */
41322ae8ebSMichal Simek	addi	r3, r5, 0
42322ae8ebSMichal Simek
/* fewer than 4 bytes in total: skip every aligned phase */
43322ae8ebSMichal Simek	addi	r4, r0, 4	/* n = 4 */
44322ae8ebSMichal Simek	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
45322ae8ebSMichal Simek	blti	r4, a_xfer_end	/* if n < 0, less than one word to transfer */
46322ae8ebSMichal Simek
47322ae8ebSMichal Simek	/* transfer first 0~3 bytes to get aligned dest address */
48322ae8ebSMichal Simek	andi	r4, r5, 3		/* n = d & 3 */
49322ae8ebSMichal Simek	/* if zero, destination already aligned */
50322ae8ebSMichal Simek	beqi	r4, a_dalign_done
51322ae8ebSMichal Simek	/* n = 4 - n (yields 3, 2, 1 transfers for 1, 2, 3 addr offset) */
52322ae8ebSMichal Simek	rsubi	r4, r4, 4
/* c is charged for the head bytes up front; c >= 4 and n <= 3 here */
53322ae8ebSMichal Simek	rsub	r7, r4, r7		/* c = c - n adjust c */
54322ae8ebSMichal Simek
/* byte loop: runs n times, leaving d word aligned */
55322ae8ebSMichal Simeka_xfer_first_loop:
56322ae8ebSMichal Simek	/* if no bytes left to transfer, transfer the bulk */
57322ae8ebSMichal Simek	beqi	r4, a_dalign_done
58322ae8ebSMichal Simek	lbui	r11, r6, 0		/* h = *s */
59322ae8ebSMichal Simek	sbi	r11, r5, 0		/* *d = h */
60322ae8ebSMichal Simek	addi	r6, r6, 1		/* s++ */
61322ae8ebSMichal Simek	addi	r5, r5, 1		/* d++ */
62322ae8ebSMichal Simek	brid	a_xfer_first_loop	/* loop */
63322ae8ebSMichal Simek	addi	r4, r4, -1		/* n-- (IN DELAY SLOT) */
64322ae8ebSMichal Simek
/*
 * Block phase set-up.  d (r5) is word aligned on every path that reaches
 * this label.  If at least 32 bytes remain, split c into
 *   r4 = n = bytes handled by the 32-byte block loops (multiple of 32)
 *   r7 = c = bytes left for the word and byte phases
 * and pick the aligned or the shift-and-merge loop from s & 3.
 */
65322ae8ebSMichal Simeka_dalign_done:
66322ae8ebSMichal Simek	addi	r4, r0, 32		/* n = 32 */
67322ae8ebSMichal Simek	cmpu	r4, r4, r7		/* n = c - n  (unsigned) */
68322ae8ebSMichal Simek	/* if n < 0, less than one block to transfer */
69322ae8ebSMichal Simek	blti	r4, a_block_done
70322ae8ebSMichal Simek
71322ae8ebSMichal Simeka_block_xfer:
72322ae8ebSMichal Simek	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
73322ae8ebSMichal Simek	rsub	r7, r4, r7		/* c = c - n */
74322ae8ebSMichal Simek
75322ae8ebSMichal Simek	andi	r9, r6, 3		/* t1 = s & 3 */
76322ae8ebSMichal Simek	/* if temp != 0, unaligned transfers needed */
77322ae8ebSMichal Simek	bnei	r9, a_block_unaligned
/* t1 == 0: fall through into the aligned block loop */
78322ae8ebSMichal Simek
/*
 * Aligned block loop: source and destination are both word aligned.
 * Each pass moves 32 bytes as two groups of four word loads followed by
 * four word stores (r9-r12), then advances s and d by 32 and drops n by 32.
 * n (r4) is a non-zero multiple of 32 on entry, so the loop ends exactly
 * when n reaches zero.
 */
79322ae8ebSMichal Simeka_block_aligned:
80322ae8ebSMichal Simek	lwi	r9, r6, 0		/* t1 = *(s + 0) */
81322ae8ebSMichal Simek	lwi	r10, r6, 4		/* t2 = *(s + 4) */
82322ae8ebSMichal Simek	lwi	r11, r6, 8		/* t3 = *(s + 8) */
83322ae8ebSMichal Simek	lwi	r12, r6, 12		/* t4 = *(s + 12) */
84322ae8ebSMichal Simek	swi	r9, r5, 0		/* *(d + 0) = t1 */
85322ae8ebSMichal Simek	swi	r10, r5, 4		/* *(d + 4) = t2 */
86322ae8ebSMichal Simek	swi	r11, r5, 8		/* *(d + 8) = t3 */
87322ae8ebSMichal Simek	swi	r12, r5, 12		/* *(d + 12) = t4 */
88322ae8ebSMichal Simek	lwi	r9, r6, 16		/* t1 = *(s + 16) */
89322ae8ebSMichal Simek	lwi	r10, r6, 20		/* t2 = *(s + 20) */
90322ae8ebSMichal Simek	lwi	r11, r6, 24		/* t3 = *(s + 24) */
91322ae8ebSMichal Simek	lwi	r12, r6, 28		/* t4 = *(s + 28) */
92322ae8ebSMichal Simek	swi	r9, r5, 16		/* *(d + 16) = t1 */
93322ae8ebSMichal Simek	swi	r10, r5, 20		/* *(d + 20) = t2 */
94322ae8ebSMichal Simek	swi	r11, r5, 24		/* *(d + 24) = t3 */
95322ae8ebSMichal Simek	swi	r12, r5, 28		/* *(d + 28) = t4 */
96322ae8ebSMichal Simek	addi	r6, r6, 32		/* s = s + 32 */
97322ae8ebSMichal Simek	addi	r4, r4, -32		/* n = n - 32 */
98322ae8ebSMichal Simek	bneid	r4, a_block_aligned	/* while (n) loop */
99322ae8ebSMichal Simek	addi	r5, r5, 32		/* d = d + 32 (IN DELAY SLOT) */
100322ae8ebSMichal Simek	bri	a_block_done		/* skip over the unaligned variants */
101322ae8ebSMichal Simek
/*
 * Unaligned-source block set-up.  The loops below read only aligned words
 * through as (r8) and never touch s (r6), so s is advanced past the whole
 * block phase here.  h (r11) is primed with the aligned word that contains
 * the first source byte.  t1 (r9) = s & 3 is 1, 2 or 3 on entry and selects
 * the loop with the matching shift amounts.
 */
102322ae8ebSMichal Simeka_block_unaligned:
103322ae8ebSMichal Simek	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
104322ae8ebSMichal Simek	add	r6, r6, r4		/* s = s + n */
105322ae8ebSMichal Simek	lwi	r11, r8, 0		/* h = *(as + 0) */
106322ae8ebSMichal Simek
107322ae8ebSMichal Simek	addi	r9, r9, -1
108322ae8ebSMichal Simek	beqi	r9, a_block_u1		/* t1 was 1 => 1 byte offset */
109322ae8ebSMichal Simek	addi	r9, r9, -1
110322ae8ebSMichal Simek	beqi	r9, a_block_u2		/* t1 was 2 => 2 byte offset */
/* otherwise t1 was 3: fall through into a_block_u3 */
111322ae8ebSMichal Simek
/*
 * Block loop for a source that is 3 bytes past a word boundary.
 * h (r11) carries the low-order byte of the previous aligned word, already
 * shifted into the top byte.  Each of the eight unrolled steps loads the
 * next aligned word v (r12), stores h | (v >> 8) to d, and then keeps
 * v << 24 as the new h.  h stays live across loop iterations.
 * NOTE(review): using the low-order byte of the earlier word as the first
 * byte of the output word matches memory order only on a big-endian core -
 * confirm this file is not built for little-endian configurations.
 */
112322ae8ebSMichal Simeka_block_u3:
113322ae8ebSMichal Simek	bslli	r11, r11, 24	/* h = h << 24 */
114322ae8ebSMichal Simeka_bu3_loop:
115322ae8ebSMichal Simek	lwi	r12, r8, 4	/* v = *(as + 4) */
116322ae8ebSMichal Simek	bsrli	r9, r12, 8	/* t1 = v >> 8 */
117322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
118322ae8ebSMichal Simek	swi	r9, r5, 0	/* *(d + 0) = t1 */
119322ae8ebSMichal Simek	bslli	r11, r12, 24	/* h = v << 24 */
120322ae8ebSMichal Simek	lwi	r12, r8, 8	/* v = *(as + 8) */
121322ae8ebSMichal Simek	bsrli	r9, r12, 8	/* t1 = v >> 8 */
122322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
123322ae8ebSMichal Simek	swi	r9, r5, 4	/* *(d + 4) = t1 */
124322ae8ebSMichal Simek	bslli	r11, r12, 24	/* h = v << 24 */
125322ae8ebSMichal Simek	lwi	r12, r8, 12	/* v = *(as + 12) */
126322ae8ebSMichal Simek	bsrli	r9, r12, 8	/* t1 = v >> 8 */
127322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
128322ae8ebSMichal Simek	swi	r9, r5, 8	/* *(d + 8) = t1 */
129322ae8ebSMichal Simek	bslli	r11, r12, 24	/* h = v << 24 */
130322ae8ebSMichal Simek	lwi	r12, r8, 16	/* v = *(as + 16) */
131322ae8ebSMichal Simek	bsrli	r9, r12, 8	/* t1 = v >> 8 */
132322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
133322ae8ebSMichal Simek	swi	r9, r5, 12	/* *(d + 12) = t1 */
134322ae8ebSMichal Simek	bslli	r11, r12, 24	/* h = v << 24 */
135322ae8ebSMichal Simek	lwi	r12, r8, 20	/* v = *(as + 20) */
136322ae8ebSMichal Simek	bsrli	r9, r12, 8	/* t1 = v >> 8 */
137322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
138322ae8ebSMichal Simek	swi	r9, r5, 16	/* *(d + 16) = t1 */
139322ae8ebSMichal Simek	bslli	r11, r12, 24	/* h = v << 24 */
140322ae8ebSMichal Simek	lwi	r12, r8, 24	/* v = *(as + 24) */
141322ae8ebSMichal Simek	bsrli	r9, r12, 8	/* t1 = v >> 8 */
142322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
143322ae8ebSMichal Simek	swi	r9, r5, 20	/* *(d + 20) = t1 */
144322ae8ebSMichal Simek	bslli	r11, r12, 24	/* h = v << 24 */
145322ae8ebSMichal Simek	lwi	r12, r8, 28	/* v = *(as + 28) */
146322ae8ebSMichal Simek	bsrli	r9, r12, 8	/* t1 = v >> 8 */
147322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
148322ae8ebSMichal Simek	swi	r9, r5, 24	/* *(d + 24) = t1 */
149322ae8ebSMichal Simek	bslli	r11, r12, 24	/* h = v << 24 */
150322ae8ebSMichal Simek	lwi	r12, r8, 32	/* v = *(as + 32) */
151322ae8ebSMichal Simek	bsrli	r9, r12, 8	/* t1 = v >> 8 */
152322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
153322ae8ebSMichal Simek	swi	r9, r5, 28	/* *(d + 28) = t1 */
154322ae8ebSMichal Simek	bslli	r11, r12, 24	/* h = v << 24 */
155322ae8ebSMichal Simek	addi	r8, r8, 32	/* as = as + 32 */
156322ae8ebSMichal Simek	addi	r4, r4, -32	/* n = n - 32 */
157322ae8ebSMichal Simek	bneid	r4, a_bu3_loop	/* while (n) loop */
158322ae8ebSMichal Simek	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
159322ae8ebSMichal Simek	bri	a_block_done
160322ae8ebSMichal Simek
/*
 * Block loop for a source that is 1 byte past a word boundary.
 * h (r11) carries the three low-order bytes of the previous aligned word,
 * shifted up by 8.  Each of the eight unrolled steps loads the next aligned
 * word v (r12), stores h | (v >> 24) to d, and then keeps v << 8 as the
 * new h.  h stays live across loop iterations.
 * NOTE(review): this byte selection matches memory order only on a
 * big-endian core - confirm against the supported configurations.
 */
161322ae8ebSMichal Simeka_block_u1:
162322ae8ebSMichal Simek	bslli	r11, r11, 8	/* h = h << 8 */
163322ae8ebSMichal Simeka_bu1_loop:
164322ae8ebSMichal Simek	lwi	r12, r8, 4	/* v = *(as + 4) */
165322ae8ebSMichal Simek	bsrli	r9, r12, 24	/* t1 = v >> 24 */
166322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
167322ae8ebSMichal Simek	swi	r9, r5, 0	/* *(d + 0) = t1 */
168322ae8ebSMichal Simek	bslli	r11, r12, 8	/* h = v << 8 */
169322ae8ebSMichal Simek	lwi	r12, r8, 8	/* v = *(as + 8) */
170322ae8ebSMichal Simek	bsrli	r9, r12, 24	/* t1 = v >> 24 */
171322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
172322ae8ebSMichal Simek	swi	r9, r5, 4	/* *(d + 4) = t1 */
173322ae8ebSMichal Simek	bslli	r11, r12, 8	/* h = v << 8 */
174322ae8ebSMichal Simek	lwi	r12, r8, 12	/* v = *(as + 12) */
175322ae8ebSMichal Simek	bsrli	r9, r12, 24	/* t1 = v >> 24 */
176322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
177322ae8ebSMichal Simek	swi	r9, r5, 8	/* *(d + 8) = t1 */
178322ae8ebSMichal Simek	bslli	r11, r12, 8	/* h = v << 8 */
179322ae8ebSMichal Simek	lwi	r12, r8, 16	/* v = *(as + 16) */
180322ae8ebSMichal Simek	bsrli	r9, r12, 24	/* t1 = v >> 24 */
181322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
182322ae8ebSMichal Simek	swi	r9, r5, 12	/* *(d + 12) = t1 */
183322ae8ebSMichal Simek	bslli	r11, r12, 8	/* h = v << 8 */
184322ae8ebSMichal Simek	lwi	r12, r8, 20	/* v = *(as + 20) */
185322ae8ebSMichal Simek	bsrli	r9, r12, 24	/* t1 = v >> 24 */
186322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
187322ae8ebSMichal Simek	swi	r9, r5, 16	/* *(d + 16) = t1 */
188322ae8ebSMichal Simek	bslli	r11, r12, 8	/* h = v << 8 */
189322ae8ebSMichal Simek	lwi	r12, r8, 24	/* v = *(as + 24) */
190322ae8ebSMichal Simek	bsrli	r9, r12, 24	/* t1 = v >> 24 */
191322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
192322ae8ebSMichal Simek	swi	r9, r5, 20	/* *(d + 20) = t1 */
193322ae8ebSMichal Simek	bslli	r11, r12, 8	/* h = v << 8 */
194322ae8ebSMichal Simek	lwi	r12, r8, 28	/* v = *(as + 28) */
195322ae8ebSMichal Simek	bsrli	r9, r12, 24	/* t1 = v >> 24 */
196322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
197322ae8ebSMichal Simek	swi	r9, r5, 24	/* *(d + 24) = t1 */
198322ae8ebSMichal Simek	bslli	r11, r12, 8	/* h = v << 8 */
199322ae8ebSMichal Simek	lwi	r12, r8, 32	/* v = *(as + 32) */
200322ae8ebSMichal Simek	bsrli	r9, r12, 24	/* t1 = v >> 24 */
201322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
202322ae8ebSMichal Simek	swi	r9, r5, 28	/* *(d + 28) = t1 */
203322ae8ebSMichal Simek	bslli	r11, r12, 8	/* h = v << 8 */
204322ae8ebSMichal Simek	addi	r8, r8, 32	/* as = as + 32 */
205322ae8ebSMichal Simek	addi	r4, r4, -32	/* n = n - 32 */
206322ae8ebSMichal Simek	bneid	r4, a_bu1_loop	/* while (n) loop */
207322ae8ebSMichal Simek	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
208322ae8ebSMichal Simek	bri	a_block_done
209322ae8ebSMichal Simek
/*
 * Block loop for a source that is 2 bytes past a word boundary.
 * h (r11) carries the two low-order bytes of the previous aligned word,
 * shifted up by 16.  Each of the eight unrolled steps loads the next
 * aligned word v (r12), stores h | (v >> 16) to d, and then keeps v << 16
 * as the new h.  h stays live across loop iterations.
 * Unlike the u3/u1 variants there is no closing branch: when n reaches
 * zero execution falls through into a_block_done.
 * NOTE(review): this byte selection matches memory order only on a
 * big-endian core - confirm against the supported configurations.
 */
210322ae8ebSMichal Simeka_block_u2:
211322ae8ebSMichal Simek	bslli	r11, r11, 16	/* h = h << 16 */
212322ae8ebSMichal Simeka_bu2_loop:
213322ae8ebSMichal Simek	lwi	r12, r8, 4	/* v = *(as + 4) */
214322ae8ebSMichal Simek	bsrli	r9, r12, 16	/* t1 = v >> 16 */
215322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
216322ae8ebSMichal Simek	swi	r9, r5, 0	/* *(d + 0) = t1 */
217322ae8ebSMichal Simek	bslli	r11, r12, 16	/* h = v << 16 */
218322ae8ebSMichal Simek	lwi	r12, r8, 8	/* v = *(as + 8) */
219322ae8ebSMichal Simek	bsrli	r9, r12, 16	/* t1 = v >> 16 */
220322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
221322ae8ebSMichal Simek	swi	r9, r5, 4	/* *(d + 4) = t1 */
222322ae8ebSMichal Simek	bslli	r11, r12, 16	/* h = v << 16 */
223322ae8ebSMichal Simek	lwi	r12, r8, 12	/* v = *(as + 12) */
224322ae8ebSMichal Simek	bsrli	r9, r12, 16	/* t1 = v >> 16 */
225322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
226322ae8ebSMichal Simek	swi	r9, r5, 8	/* *(d + 8) = t1 */
227322ae8ebSMichal Simek	bslli	r11, r12, 16	/* h = v << 16 */
228322ae8ebSMichal Simek	lwi	r12, r8, 16	/* v = *(as + 16) */
229322ae8ebSMichal Simek	bsrli	r9, r12, 16	/* t1 = v >> 16 */
230322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
231322ae8ebSMichal Simek	swi	r9, r5, 12	/* *(d + 12) = t1 */
232322ae8ebSMichal Simek	bslli	r11, r12, 16	/* h = v << 16 */
233322ae8ebSMichal Simek	lwi	r12, r8, 20	/* v = *(as + 20) */
234322ae8ebSMichal Simek	bsrli	r9, r12, 16	/* t1 = v >> 16 */
235322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
236322ae8ebSMichal Simek	swi	r9, r5, 16	/* *(d + 16) = t1 */
237322ae8ebSMichal Simek	bslli	r11, r12, 16	/* h = v << 16 */
238322ae8ebSMichal Simek	lwi	r12, r8, 24	/* v = *(as + 24) */
239322ae8ebSMichal Simek	bsrli	r9, r12, 16	/* t1 = v >> 16 */
240322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
241322ae8ebSMichal Simek	swi	r9, r5, 20	/* *(d + 20) = t1 */
242322ae8ebSMichal Simek	bslli	r11, r12, 16	/* h = v << 16 */
243322ae8ebSMichal Simek	lwi	r12, r8, 28	/* v = *(as + 28) */
244322ae8ebSMichal Simek	bsrli	r9, r12, 16	/* t1 = v >> 16 */
245322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
246322ae8ebSMichal Simek	swi	r9, r5, 24	/* *(d + 24) = t1 */
247322ae8ebSMichal Simek	bslli	r11, r12, 16	/* h = v << 16 */
248322ae8ebSMichal Simek	lwi	r12, r8, 32	/* v = *(as + 32) */
249322ae8ebSMichal Simek	bsrli	r9, r12, 16	/* t1 = v >> 16 */
250322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
251322ae8ebSMichal Simek	swi	r9, r5, 28	/* *(d + 28) = t1 */
252322ae8ebSMichal Simek	bslli	r11, r12, 16	/* h = v << 16 */
253322ae8ebSMichal Simek	addi	r8, r8, 32	/* as = as + 32 */
254322ae8ebSMichal Simek	addi	r4, r4, -32	/* n = n - 32 */
255322ae8ebSMichal Simek	bneid	r4, a_bu2_loop	/* while (n) loop */
256322ae8ebSMichal Simek	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
257322ae8ebSMichal Simek
/*
 * Word phase set-up.  c (r7) holds what is left after the block phase.
 * With fewer than 4 bytes left, go straight to the trailing byte loop.
 * Otherwise r4 = n = bytes to move as whole words and r10 = running byte
 * offset, used as the index register by the word loops.  s, d and c are
 * left untouched by those loops and are fixed up at a_word_done.
 */
258322ae8ebSMichal Simeka_block_done:
259322ae8ebSMichal Simek	addi	r4, r0, 4	/* n = 4 */
260322ae8ebSMichal Simek	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
261322ae8ebSMichal Simek	blti	r4, a_xfer_end	/* if n < 0, less than one word to transfer */
262322ae8ebSMichal Simek
263322ae8ebSMichal Simeka_word_xfer:
264322ae8ebSMichal Simek	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
265322ae8ebSMichal Simek	addi	r10, r0, 0		/* offset = 0 */
266322ae8ebSMichal Simek
267322ae8ebSMichal Simek	andi	r9, r6, 3		/* t1 = s & 3 */
268322ae8ebSMichal Simek	/* if temp != 0, unaligned transfers needed */
269322ae8ebSMichal Simek	bnei	r9, a_word_unaligned
/* t1 == 0: fall through into the aligned word loop */
270322ae8ebSMichal Simek
/*
 * Aligned word loop: copy one word per pass using register-indexed
 * load/store at s + offset and d + offset.  n (r4) counts remaining bytes
 * in steps of 4; offset (r10) advances by 4 in the branch delay slot.
 */
271322ae8ebSMichal Simeka_word_aligned:
272322ae8ebSMichal Simek	lw	r9, r6, r10		/* t1 = *(s+offset) */
273322ae8ebSMichal Simek	sw	r9, r5, r10		/* *(d+offset) = t1 */
274322ae8ebSMichal Simek	addi	r4, r4,-4		/* n = n - 4 */
275322ae8ebSMichal Simek	bneid	r4, a_word_aligned	/* loop */
276322ae8ebSMichal Simek	addi	r10, r10, 4		/* offset = offset + 4 (IN DELAY SLOT) */
277322ae8ebSMichal Simek
278322ae8ebSMichal Simek	bri	a_word_done
279322ae8ebSMichal Simek
/*
 * Unaligned-source word set-up.  h (r11) is primed with the aligned word
 * containing the first source byte; as (r8) is then moved one word ahead
 * so that as + offset addresses the next aligned word inside the loops.
 * t1 (r9) = s & 3 is 1, 2 or 3 on entry and selects the matching loop.
 */
280322ae8ebSMichal Simeka_word_unaligned:
281322ae8ebSMichal Simek	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
282322ae8ebSMichal Simek	lwi	r11, r8, 0		/* h = *(as + 0) */
283322ae8ebSMichal Simek	addi	r8, r8, 4		/* as = as + 4 */
284322ae8ebSMichal Simek
285322ae8ebSMichal Simek	addi	r9, r9, -1
286322ae8ebSMichal Simek	beqi	r9, a_word_u1		/* t1 was 1 => 1 byte offset */
287322ae8ebSMichal Simek	addi	r9, r9, -1
288322ae8ebSMichal Simek	beqi	r9, a_word_u2		/* t1 was 2 => 2 byte offset */
/* otherwise t1 was 3: fall through into a_word_u3 */
289322ae8ebSMichal Simek
/*
 * Word loop for a source that is 3 bytes past a word boundary.
 * Per pass: load the next aligned word v, store h | (v >> 8) at
 * d + offset, then keep v << 24 as the carry h for the next pass.
 * NOTE(review): same byte-order assumption as the block variants
 * (appears to be big-endian only) - confirm.
 */
290322ae8ebSMichal Simeka_word_u3:
291322ae8ebSMichal Simek	bslli	r11, r11, 24	/* h = h << 24 */
292322ae8ebSMichal Simeka_wu3_loop:
293322ae8ebSMichal Simek	lw	r12, r8, r10	/* v = *(as + offset) */
294322ae8ebSMichal Simek	bsrli	r9, r12, 8	/* t1 = v >> 8 */
295322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
296322ae8ebSMichal Simek	sw	r9, r5, r10	/* *(d + offset) = t1 */
297322ae8ebSMichal Simek	bslli	r11, r12, 24	/* h = v << 24 */
298322ae8ebSMichal Simek	addi	r4, r4,-4	/* n = n - 4 */
299322ae8ebSMichal Simek	bneid	r4, a_wu3_loop	/* while (n) loop */
300322ae8ebSMichal Simek	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */
301322ae8ebSMichal Simek
302322ae8ebSMichal Simek	bri	a_word_done
303322ae8ebSMichal Simek
/*
 * Word loop for a source that is 1 byte past a word boundary.
 * Per pass: load the next aligned word v, store h | (v >> 24) at
 * d + offset, then keep v << 8 as the carry h for the next pass.
 * NOTE(review): same byte-order assumption as the block variants
 * (appears to be big-endian only) - confirm.
 */
304322ae8ebSMichal Simeka_word_u1:
305322ae8ebSMichal Simek	bslli	r11, r11, 8	/* h = h << 8 */
306322ae8ebSMichal Simeka_wu1_loop:
307322ae8ebSMichal Simek	lw	r12, r8, r10	/* v = *(as + offset) */
308322ae8ebSMichal Simek	bsrli	r9, r12, 24	/* t1 = v >> 24 */
309322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
310322ae8ebSMichal Simek	sw	r9, r5, r10	/* *(d + offset) = t1 */
311322ae8ebSMichal Simek	bslli	r11, r12, 8	/* h = v << 8 */
312322ae8ebSMichal Simek	addi	r4, r4,-4	/* n = n - 4 */
313322ae8ebSMichal Simek	bneid	r4, a_wu1_loop	/* while (n) loop */
314322ae8ebSMichal Simek	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */
315322ae8ebSMichal Simek
316322ae8ebSMichal Simek	bri	a_word_done
317322ae8ebSMichal Simek
/*
 * Word loop for a source that is 2 bytes past a word boundary.
 * Per pass: load the next aligned word v, store h | (v >> 16) at
 * d + offset, then keep v << 16 as the carry h for the next pass.
 * There is no closing branch: when n reaches zero execution falls
 * through into a_word_done.
 * NOTE(review): same byte-order assumption as the block variants
 * (appears to be big-endian only) - confirm.
 */
318322ae8ebSMichal Simeka_word_u2:
319322ae8ebSMichal Simek	bslli	r11, r11, 16	/* h = h << 16 */
320322ae8ebSMichal Simeka_wu2_loop:
321322ae8ebSMichal Simek	lw	r12, r8, r10	/* v = *(as + offset) */
322322ae8ebSMichal Simek	bsrli	r9, r12, 16	/* t1 = v >> 16 */
323322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
324322ae8ebSMichal Simek	sw	r9, r5, r10	/* *(d + offset) = t1 */
325322ae8ebSMichal Simek	bslli	r11, r12, 16	/* h = v << 16 */
326322ae8ebSMichal Simek	addi	r4, r4,-4	/* n = n - 4 */
327322ae8ebSMichal Simek	bneid	r4, a_wu2_loop	/* while (n) loop */
328322ae8ebSMichal Simek	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */
329322ae8ebSMichal Simek
/*
 * The word loops indexed through offset (r10) without moving the
 * pointers; apply the accumulated offset to d, s and c in one step.
 */
330322ae8ebSMichal Simeka_word_done:
331322ae8ebSMichal Simek	add	r5, r5, r10	/* d = d + offset */
332322ae8ebSMichal Simek	add	r6, r6, r10	/* s = s + offset */
333322ae8ebSMichal Simek	rsub	r7, r10, r7	/* c = c - offset */
334322ae8ebSMichal Simek
/*
 * Trailing byte loop: copy the remaining c (r7) bytes one at a time.
 * Reached with c < 4, either directly from the size checks or after the
 * word phase.  The test sits at the top, so c == 0 copies nothing.
 */
335322ae8ebSMichal Simeka_xfer_end:
336322ae8ebSMichal Simeka_xfer_end_loop:
337322ae8ebSMichal Simek	beqi	r7, a_done		/* while (c) */
338322ae8ebSMichal Simek	lbui	r9, r6, 0		/* t1 = *s */
339322ae8ebSMichal Simek	addi	r6, r6, 1		/* s++ */
340322ae8ebSMichal Simek	sbi	r9, r5, 0		/* *d = t1 */
341322ae8ebSMichal Simek	addi	r7, r7, -1		/* c-- */
342322ae8ebSMichal Simek	brid	a_xfer_end_loop		/* loop */
343322ae8ebSMichal Simek	addi	r5, r5, 1		/* d++ (IN DELAY SLOT) */
344322ae8ebSMichal Simek
/*
 * Common exit for the ascending copy.  r3 was loaded with the original
 * destination at entry and is not written again before this point.
 */
345322ae8ebSMichal Simeka_done:
346322ae8ebSMichal Simek	rtsd	r15, 8		/* return to the address in r15 plus 8 */
347322ae8ebSMichal Simek	nop			/* fills the rtsd delay slot */
348322ae8ebSMichal Simek
349*13851966SMichal Simek.size  memcpy, . - memcpy
350322ae8ebSMichal Simek.end memcpy
351322ae8ebSMichal Simek/*----------------------------------------------------------------------------*/
352322ae8ebSMichal Simek	.globl	memmove
353*13851966SMichal Simek	.type  memmove, @function
354322ae8ebSMichal Simek	.ent	memmove
355322ae8ebSMichal Simek
356322ae8ebSMichal Simekmemmove:
357322ae8ebSMichal Simek	cmpu	r4, r5, r6	/* n = s - d */
358322ae8ebSMichal Simek	bgei	r4,fast_memcpy_ascending
359322ae8ebSMichal Simek
360322ae8ebSMichal Simekfast_memcpy_descending:
361322ae8ebSMichal Simek	/* move d to return register as value of function */
362322ae8ebSMichal Simek	addi	r3, r5, 0
363322ae8ebSMichal Simek
364322ae8ebSMichal Simek	add	r5, r5, r7	/* d = d + c */
365322ae8ebSMichal Simek	add	r6, r6, r7	/* s = s + c */
366322ae8ebSMichal Simek
367322ae8ebSMichal Simek	addi	r4, r0, 4	/* n = 4 */
368322ae8ebSMichal Simek	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
369322ae8ebSMichal Simek	blti	r4,d_xfer_end	/* if n < 0, less than one word to transfer */
370322ae8ebSMichal Simek
371322ae8ebSMichal Simek	/* transfer first 0~3 bytes to get aligned dest address */
372322ae8ebSMichal Simek	andi	r4, r5, 3		/* n = d & 3 */
373322ae8ebSMichal Simek	/* if zero, destination already aligned */
374322ae8ebSMichal Simek	beqi	r4,d_dalign_done
375322ae8ebSMichal Simek	rsub	r7, r4, r7		/* c = c - n adjust c */
376322ae8ebSMichal Simek
377322ae8ebSMichal Simekd_xfer_first_loop:
378322ae8ebSMichal Simek	/* if no bytes left to transfer, transfer the bulk */
379322ae8ebSMichal Simek	beqi	r4,d_dalign_done
380322ae8ebSMichal Simek	addi	r6, r6, -1		/* s-- */
381322ae8ebSMichal Simek	addi	r5, r5, -1		/* d-- */
382322ae8ebSMichal Simek	lbui	r11, r6, 0		/* h = *s */
383322ae8ebSMichal Simek	sbi	r11, r5, 0		/* *d = h */
384322ae8ebSMichal Simek	brid	d_xfer_first_loop	/* loop */
385322ae8ebSMichal Simek	addi	r4, r4, -1		/* n-- (IN DELAY SLOT) */
386322ae8ebSMichal Simek
387322ae8ebSMichal Simekd_dalign_done:
388322ae8ebSMichal Simek	addi	r4, r0, 32	/* n = 32 */
389322ae8ebSMichal Simek	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
390322ae8ebSMichal Simek	/* if n < 0, less than one block to transfer */
391322ae8ebSMichal Simek	blti	r4, d_block_done
392322ae8ebSMichal Simek
393322ae8ebSMichal Simekd_block_xfer:
394322ae8ebSMichal Simek	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
395322ae8ebSMichal Simek	rsub	r7, r4, r7		/* c = c - n */
396322ae8ebSMichal Simek
397322ae8ebSMichal Simek	andi	r9, r6, 3		/* t1 = s & 3 */
398322ae8ebSMichal Simek	/* if temp != 0, unaligned transfers needed */
399322ae8ebSMichal Simek	bnei	r9, d_block_unaligned
400322ae8ebSMichal Simek
401322ae8ebSMichal Simekd_block_aligned:
402322ae8ebSMichal Simek	addi	r6, r6, -32		/* s = s - 32 */
403322ae8ebSMichal Simek	addi	r5, r5, -32		/* d = d - 32 */
404322ae8ebSMichal Simek	lwi	r9, r6, 28		/* t1 = *(s + 28) */
405322ae8ebSMichal Simek	lwi	r10, r6, 24		/* t2 = *(s + 24) */
406322ae8ebSMichal Simek	lwi	r11, r6, 20		/* t3 = *(s + 20) */
407322ae8ebSMichal Simek	lwi	r12, r6, 16		/* t4 = *(s + 16) */
408322ae8ebSMichal Simek	swi	r9, r5, 28		/* *(d + 28) = t1 */
409322ae8ebSMichal Simek	swi	r10, r5, 24		/* *(d + 24) = t2 */
410322ae8ebSMichal Simek	swi	r11, r5, 20		/* *(d + 20) = t3 */
411322ae8ebSMichal Simek	swi	r12, r5, 16		/* *(d + 16) = t4 */
412322ae8ebSMichal Simek	lwi	r9, r6, 12		/* t1 = *(s + 12) */
413322ae8ebSMichal Simek	lwi	r10, r6, 8		/* t2 = *(s + 8) */
414322ae8ebSMichal Simek	lwi	r11, r6, 4		/* t3 = *(s + 4) */
415322ae8ebSMichal Simek	lwi	r12, r6, 0		/* t4 = *(s + 0) */
416322ae8ebSMichal Simek	swi	r9, r5, 12		/* *(d + 12) = t1 */
417322ae8ebSMichal Simek	swi	r10, r5, 8		/* *(d + 8) = t2 */
418322ae8ebSMichal Simek	swi	r11, r5, 4		/* *(d + 4) = t3 */
419322ae8ebSMichal Simek	addi	r4, r4, -32		/* n = n - 32 */
420322ae8ebSMichal Simek	bneid	r4, d_block_aligned	/* while (n) loop */
421322ae8ebSMichal Simek	swi	r12, r5, 0		/* *(d + 0) = t4 (IN DELAY SLOT) */
422322ae8ebSMichal Simek	bri	d_block_done
423322ae8ebSMichal Simek
424322ae8ebSMichal Simekd_block_unaligned:
425322ae8ebSMichal Simek	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
426322ae8ebSMichal Simek	rsub	r6, r4, r6		/* s = s - n */
427322ae8ebSMichal Simek	lwi	r11, r8, 0		/* h = *(as + 0) */
428322ae8ebSMichal Simek
429322ae8ebSMichal Simek	addi	r9, r9, -1
430322ae8ebSMichal Simek	beqi	r9,d_block_u1		/* t1 was 1 => 1 byte offset */
431322ae8ebSMichal Simek	addi	r9, r9, -1
432322ae8ebSMichal Simek	beqi	r9,d_block_u2		/* t1 was 2 => 2 byte offset */
433322ae8ebSMichal Simek
434322ae8ebSMichal Simekd_block_u3:
435322ae8ebSMichal Simek	bsrli	r11, r11, 8	/* h = h >> 8 */
436322ae8ebSMichal Simekd_bu3_loop:
437322ae8ebSMichal Simek	addi	r8, r8, -32	/* as = as - 32 */
438322ae8ebSMichal Simek	addi	r5, r5, -32	/* d = d - 32 */
439322ae8ebSMichal Simek	lwi	r12, r8, 28	/* v = *(as + 28) */
440322ae8ebSMichal Simek	bslli	r9, r12, 24	/* t1 = v << 24 */
441322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
442322ae8ebSMichal Simek	swi	r9, r5, 28	/* *(d + 28) = t1 */
443322ae8ebSMichal Simek	bsrli	r11, r12, 8	/* h = v >> 8 */
444322ae8ebSMichal Simek	lwi	r12, r8, 24	/* v = *(as + 24) */
445322ae8ebSMichal Simek	bslli	r9, r12, 24	/* t1 = v << 24 */
446322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
447322ae8ebSMichal Simek	swi	r9, r5, 24	/* *(d + 24) = t1 */
448322ae8ebSMichal Simek	bsrli	r11, r12, 8	/* h = v >> 8 */
449322ae8ebSMichal Simek	lwi	r12, r8, 20	/* v = *(as + 20) */
450322ae8ebSMichal Simek	bslli	r9, r12, 24	/* t1 = v << 24 */
451322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
452322ae8ebSMichal Simek	swi	r9, r5, 20	/* *(d + 20) = t1 */
453322ae8ebSMichal Simek	bsrli	r11, r12, 8	/* h = v >> 8 */
454322ae8ebSMichal Simek	lwi	r12, r8, 16	/* v = *(as + 16) */
455322ae8ebSMichal Simek	bslli	r9, r12, 24	/* t1 = v << 24 */
456322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
457322ae8ebSMichal Simek	swi	r9, r5, 16	/* *(d + 16) = t1 */
458322ae8ebSMichal Simek	bsrli	r11, r12, 8	/* h = v >> 8 */
459322ae8ebSMichal Simek	lwi	r12, r8, 12	/* v = *(as + 12) */
460322ae8ebSMichal Simek	bslli	r9, r12, 24	/* t1 = v << 24 */
461322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
462322ae8ebSMichal Simek	swi	r9, r5, 12	/* *(d + 112) = t1 */
463322ae8ebSMichal Simek	bsrli	r11, r12, 8	/* h = v >> 8 */
464322ae8ebSMichal Simek	lwi	r12, r8, 8	/* v = *(as + 8) */
465322ae8ebSMichal Simek	bslli	r9, r12, 24	/* t1 = v << 24 */
466322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
467322ae8ebSMichal Simek	swi	r9, r5, 8	/* *(d + 8) = t1 */
468322ae8ebSMichal Simek	bsrli	r11, r12, 8	/* h = v >> 8 */
469322ae8ebSMichal Simek	lwi	r12, r8, 4	/* v = *(as + 4) */
470322ae8ebSMichal Simek	bslli	r9, r12, 24	/* t1 = v << 24 */
471322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
472322ae8ebSMichal Simek	swi	r9, r5, 4	/* *(d + 4) = t1 */
473322ae8ebSMichal Simek	bsrli	r11, r12, 8	/* h = v >> 8 */
474322ae8ebSMichal Simek	lwi	r12, r8, 0	/* v = *(as + 0) */
475322ae8ebSMichal Simek	bslli	r9, r12, 24	/* t1 = v << 24 */
476322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
477322ae8ebSMichal Simek	swi	r9, r5, 0	/* *(d + 0) = t1 */
478322ae8ebSMichal Simek	addi	r4, r4, -32	/* n = n - 32 */
479322ae8ebSMichal Simek	bneid	r4, d_bu3_loop	/* while (n) loop */
480322ae8ebSMichal Simek	bsrli	r11, r12, 8	/* h = v >> 8 (IN DELAY SLOT) */
481322ae8ebSMichal Simek	bri	d_block_done
482322ae8ebSMichal Simek
483322ae8ebSMichal Simekd_block_u1:
484322ae8ebSMichal Simek	bsrli	r11, r11, 24	/* h = h >> 24 */
485322ae8ebSMichal Simekd_bu1_loop:
486322ae8ebSMichal Simek	addi	r8, r8, -32	/* as = as - 32 */
487322ae8ebSMichal Simek	addi	r5, r5, -32	/* d = d - 32 */
488322ae8ebSMichal Simek	lwi	r12, r8, 28	/* v = *(as + 28) */
489322ae8ebSMichal Simek	bslli	r9, r12, 8	/* t1 = v << 8 */
490322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
491322ae8ebSMichal Simek	swi	r9, r5, 28	/* *(d + 28) = t1 */
492322ae8ebSMichal Simek	bsrli	r11, r12, 24	/* h = v >> 24 */
493322ae8ebSMichal Simek	lwi	r12, r8, 24	/* v = *(as + 24) */
494322ae8ebSMichal Simek	bslli	r9, r12, 8	/* t1 = v << 8 */
495322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
496322ae8ebSMichal Simek	swi	r9, r5, 24	/* *(d + 24) = t1 */
497322ae8ebSMichal Simek	bsrli	r11, r12, 24	/* h = v >> 24 */
498322ae8ebSMichal Simek	lwi	r12, r8, 20	/* v = *(as + 20) */
499322ae8ebSMichal Simek	bslli	r9, r12, 8	/* t1 = v << 8 */
500322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
501322ae8ebSMichal Simek	swi	r9, r5, 20	/* *(d + 20) = t1 */
502322ae8ebSMichal Simek	bsrli	r11, r12, 24	/* h = v >> 24 */
503322ae8ebSMichal Simek	lwi	r12, r8, 16	/* v = *(as + 16) */
504322ae8ebSMichal Simek	bslli	r9, r12, 8	/* t1 = v << 8 */
505322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
506322ae8ebSMichal Simek	swi	r9, r5, 16	/* *(d + 16) = t1 */
507322ae8ebSMichal Simek	bsrli	r11, r12, 24	/* h = v >> 24 */
508322ae8ebSMichal Simek	lwi	r12, r8, 12	/* v = *(as + 12) */
509322ae8ebSMichal Simek	bslli	r9, r12, 8	/* t1 = v << 8 */
510322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
511322ae8ebSMichal Simek	swi	r9, r5, 12	/* *(d + 112) = t1 */
512322ae8ebSMichal Simek	bsrli	r11, r12, 24	/* h = v >> 24 */
513322ae8ebSMichal Simek	lwi	r12, r8, 8	/* v = *(as + 8) */
514322ae8ebSMichal Simek	bslli	r9, r12, 8	/* t1 = v << 8 */
515322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
516322ae8ebSMichal Simek	swi	r9, r5, 8	/* *(d + 8) = t1 */
517322ae8ebSMichal Simek	bsrli	r11, r12, 24	/* h = v >> 24 */
518322ae8ebSMichal Simek	lwi	r12, r8, 4	/* v = *(as + 4) */
519322ae8ebSMichal Simek	bslli	r9, r12, 8	/* t1 = v << 8 */
520322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
521322ae8ebSMichal Simek	swi	r9, r5, 4	/* *(d + 4) = t1 */
522322ae8ebSMichal Simek	bsrli	r11, r12, 24	/* h = v >> 24 */
523322ae8ebSMichal Simek	lwi	r12, r8, 0	/* v = *(as + 0) */
524322ae8ebSMichal Simek	bslli	r9, r12, 8	/* t1 = v << 8 */
525322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
526322ae8ebSMichal Simek	swi	r9, r5, 0	/* *(d + 0) = t1 */
527322ae8ebSMichal Simek	addi	r4, r4, -32	/* n = n - 32 */
528322ae8ebSMichal Simek	bneid	r4, d_bu1_loop	/* while (n) loop */
529322ae8ebSMichal Simek	bsrli	r11, r12, 24	/* h = v >> 24 (IN DELAY SLOT) */
530322ae8ebSMichal Simek	bri	d_block_done
531322ae8ebSMichal Simek
532322ae8ebSMichal Simekd_block_u2:
533322ae8ebSMichal Simek	bsrli	r11, r11, 16	/* h = h >> 16 */
534322ae8ebSMichal Simekd_bu2_loop:
535322ae8ebSMichal Simek	addi	r8, r8, -32	/* as = as - 32 */
536322ae8ebSMichal Simek	addi	r5, r5, -32	/* d = d - 32 */
537322ae8ebSMichal Simek	lwi	r12, r8, 28	/* v = *(as + 28) */
538322ae8ebSMichal Simek	bslli	r9, r12, 16	/* t1 = v << 16 */
539322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
540322ae8ebSMichal Simek	swi	r9, r5, 28	/* *(d + 28) = t1 */
541322ae8ebSMichal Simek	bsrli	r11, r12, 16	/* h = v >> 16 */
542322ae8ebSMichal Simek	lwi	r12, r8, 24	/* v = *(as + 24) */
543322ae8ebSMichal Simek	bslli	r9, r12, 16	/* t1 = v << 16 */
544322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
545322ae8ebSMichal Simek	swi	r9, r5, 24	/* *(d + 24) = t1 */
546322ae8ebSMichal Simek	bsrli	r11, r12, 16	/* h = v >> 16 */
547322ae8ebSMichal Simek	lwi	r12, r8, 20	/* v = *(as + 20) */
548322ae8ebSMichal Simek	bslli	r9, r12, 16	/* t1 = v << 16 */
549322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
550322ae8ebSMichal Simek	swi	r9, r5, 20	/* *(d + 20) = t1 */
551322ae8ebSMichal Simek	bsrli	r11, r12, 16	/* h = v >> 16 */
552322ae8ebSMichal Simek	lwi	r12, r8, 16	/* v = *(as + 16) */
553322ae8ebSMichal Simek	bslli	r9, r12, 16	/* t1 = v << 16 */
554322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
555322ae8ebSMichal Simek	swi	r9, r5, 16	/* *(d + 16) = t1 */
556322ae8ebSMichal Simek	bsrli	r11, r12, 16	/* h = v >> 16 */
557322ae8ebSMichal Simek	lwi	r12, r8, 12	/* v = *(as + 12) */
558322ae8ebSMichal Simek	bslli	r9, r12, 16	/* t1 = v << 16 */
559322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
560322ae8ebSMichal Simek	swi	r9, r5, 12	/* *(d + 112) = t1 */
561322ae8ebSMichal Simek	bsrli	r11, r12, 16	/* h = v >> 16 */
562322ae8ebSMichal Simek	lwi	r12, r8, 8	/* v = *(as + 8) */
563322ae8ebSMichal Simek	bslli	r9, r12, 16	/* t1 = v << 16 */
564322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
565322ae8ebSMichal Simek	swi	r9, r5, 8	/* *(d + 8) = t1 */
566322ae8ebSMichal Simek	bsrli	r11, r12, 16	/* h = v >> 16 */
567322ae8ebSMichal Simek	lwi	r12, r8, 4	/* v = *(as + 4) */
568322ae8ebSMichal Simek	bslli	r9, r12, 16	/* t1 = v << 16 */
569322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
570322ae8ebSMichal Simek	swi	r9, r5, 4	/* *(d + 4) = t1 */
571322ae8ebSMichal Simek	bsrli	r11, r12, 16	/* h = v >> 16 */
572322ae8ebSMichal Simek	lwi	r12, r8, 0	/* v = *(as + 0) */
573322ae8ebSMichal Simek	bslli	r9, r12, 16	/* t1 = v << 16 */
574322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
575322ae8ebSMichal Simek	swi	r9, r5, 0	/* *(d + 0) = t1 */
576322ae8ebSMichal Simek	addi	r4, r4, -32	/* n = n - 32 */
577322ae8ebSMichal Simek	bneid	r4, d_bu2_loop	/* while (n) loop */
578322ae8ebSMichal Simek	bsrli	r11, r12, 16	/* h = v >> 16 (IN DELAY SLOT) */
579322ae8ebSMichal Simek
580322ae8ebSMichal Simekd_block_done:
581322ae8ebSMichal Simek	addi	r4, r0, 4	/* n = 4 */
582322ae8ebSMichal Simek	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
583322ae8ebSMichal Simek	blti	r4,d_xfer_end	/* if n < 0, less than one word to transfer */
584322ae8ebSMichal Simek
585322ae8ebSMichal Simekd_word_xfer:
586322ae8ebSMichal Simek	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
587322ae8ebSMichal Simek	rsub	r5, r4, r5		/* d = d - n */
588322ae8ebSMichal Simek	rsub	r6, r4, r6		/* s = s - n */
589322ae8ebSMichal Simek	rsub	r7, r4, r7		/* c = c - n */
590322ae8ebSMichal Simek
591322ae8ebSMichal Simek	andi	r9, r6, 3		/* t1 = s & 3 */
592322ae8ebSMichal Simek	/* if t1 != 0, s is not word-aligned: unaligned transfers needed */
593322ae8ebSMichal Simek	bnei	r9, d_word_unaligned
594322ae8ebSMichal Simek
595322ae8ebSMichal Simekd_word_aligned:
596322ae8ebSMichal Simek	addi	r4, r4,-4		/* n = n - 4 */
597322ae8ebSMichal Simek	lw	r9, r6, r4		/* t1 = *(s+n) */
598322ae8ebSMichal Simek	bneid	r4, d_word_aligned	/* loop */
599322ae8ebSMichal Simek	sw	r9, r5, r4		/* *(d+n) = t1 (IN DELAY SLOT) */
600322ae8ebSMichal Simek
601322ae8ebSMichal Simek	bri	d_word_done
602322ae8ebSMichal Simek
603322ae8ebSMichal Simekd_word_unaligned:
604322ae8ebSMichal Simek	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
605322ae8ebSMichal Simek	lw	r11, r8, r4		/* h = *(as + n) */
606322ae8ebSMichal Simek
607322ae8ebSMichal Simek	addi	r9, r9, -1
608322ae8ebSMichal Simek	beqi	r9,d_word_u1		/* t1 was 1 => 1 byte offset */
609322ae8ebSMichal Simek	addi	r9, r9, -1
610322ae8ebSMichal Simek	beqi	r9,d_word_u2		/* t1 was 2 => 2 byte offset */
611322ae8ebSMichal Simek
612322ae8ebSMichal Simekd_word_u3:
613322ae8ebSMichal Simek	bsrli	r11, r11, 8	/* h = h >> 8 */
614322ae8ebSMichal Simekd_wu3_loop:
615322ae8ebSMichal Simek	addi	r4, r4,-4	/* n = n - 4 */
616322ae8ebSMichal Simek	lw	r12, r8, r4	/* v = *(as + n) */
617322ae8ebSMichal Simek	bslli	r9, r12, 24	/* t1 = v << 24 */
618322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
619322ae8ebSMichal Simek	sw	r9, r5, r4	/* *(d + n) = t1 */
620322ae8ebSMichal Simek	bneid	r4, d_wu3_loop	/* while (n) loop */
621322ae8ebSMichal Simek	bsrli	r11, r12, 8	/* h = v >> 8 (IN DELAY SLOT) */
622322ae8ebSMichal Simek
623322ae8ebSMichal Simek	bri	d_word_done
624322ae8ebSMichal Simek
625322ae8ebSMichal Simekd_word_u1:
626322ae8ebSMichal Simek	bsrli	r11, r11, 24	/* h = h >> 24 */
627322ae8ebSMichal Simekd_wu1_loop:
628322ae8ebSMichal Simek	addi	r4, r4,-4	/* n = n - 4 */
629322ae8ebSMichal Simek	lw	r12, r8, r4	/* v = *(as + n) */
630322ae8ebSMichal Simek	bslli	r9, r12, 8	/* t1 = v << 8 */
631322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
632322ae8ebSMichal Simek	sw	r9, r5, r4	/* *(d + n) = t1 */
633322ae8ebSMichal Simek	bneid	r4, d_wu1_loop	/* while (n) loop */
634322ae8ebSMichal Simek	bsrli	r11, r12, 24	/* h = v >> 24 (IN DELAY SLOT) */
635322ae8ebSMichal Simek
636322ae8ebSMichal Simek	bri	d_word_done
637322ae8ebSMichal Simek
638322ae8ebSMichal Simekd_word_u2:
639322ae8ebSMichal Simek	bsrli	r11, r11, 16	/* h = h >> 16 */
640322ae8ebSMichal Simekd_wu2_loop:
641322ae8ebSMichal Simek	addi	r4, r4,-4	/* n = n - 4 */
642322ae8ebSMichal Simek	lw	r12, r8, r4	/* v = *(as + n) */
643322ae8ebSMichal Simek	bslli	r9, r12, 16	/* t1 = v << 16 */
644322ae8ebSMichal Simek	or	r9, r11, r9	/* t1 = h | t1 */
645322ae8ebSMichal Simek	sw	r9, r5, r4	/* *(d + n) = t1 */
646322ae8ebSMichal Simek	bneid	r4, d_wu2_loop	/* while (n) loop */
647322ae8ebSMichal Simek	bsrli	r11, r12, 16	/* h = v >> 16 (IN DELAY SLOT) */
648322ae8ebSMichal Simek
649322ae8ebSMichal Simekd_word_done:
650322ae8ebSMichal Simek
651322ae8ebSMichal Simekd_xfer_end:
652322ae8ebSMichal Simekd_xfer_end_loop:
653322ae8ebSMichal Simek	beqi	r7, a_done		/* while (c); c == 0 branches to a_done, not d_done */
654322ae8ebSMichal Simek	addi	r6, r6, -1		/* s-- */
655322ae8ebSMichal Simek	lbui	r9, r6, 0		/* t1 = *s */
656322ae8ebSMichal Simek	addi	r5, r5, -1		/* d-- */
657322ae8ebSMichal Simek	sbi	r9, r5, 0		/* *d = t1 */
658322ae8ebSMichal Simek	brid	d_xfer_end_loop		/* loop */
659322ae8ebSMichal Simek	addi	r7, r7, -1		/* c-- (IN DELAY SLOT) */
660322ae8ebSMichal Simek
661322ae8ebSMichal Simekd_done:
662322ae8ebSMichal Simek	rtsd	r15, 8
663322ae8ebSMichal Simek	nop
664322ae8ebSMichal Simek
665*13851966SMichal Simek.size  memmove, . - memmove
666322ae8ebSMichal Simek.end memmove
667