/*
 * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
 * Copyright (C) 2008-2009 PetaLogix
 * Copyright (C) 2008 Jim Law - Iris LP  All rights reserved.
 *
 * This file is subject to the terms and conditions of the GNU General
 * Public License.  See the file COPYING in the main directory of this
 * archive for more details.
 *
 * Written by Jim Law <jlaw@irispower.com>
 *
 * intended to replace:
 *	memcpy in memcpy.c and
 *	memmove in memmove.c
 * ... in arch/microblaze/lib
 *
 *
 * assly_fastcopy.S
 *
 * Attempt at quicker memcpy and memmove for MicroBlaze
 *	Input :	Operand1 in Reg r5 - destination address
 *		Operand2 in Reg r6 - source address
 *		Operand3 in Reg r7 - number of bytes to transfer
 *	Output: Result in Reg r3 - starting destination address
 *
 *
 * Explanation:
 *	Perform a (possibly unaligned) copy of a block of memory
 *	between memory locations, with the transfer size given in bytes
 */
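/*
 * Overview (editorial note): the ascending path below behaves roughly
 * like this C sketch.  The sketch is illustrative only -- the names are
 * invented here, and the real code additionally runs an unrolled
 * 32-byte block loop and shift-and-merge word loops when the source is
 * misaligned, rather than dropping to byte copies.  Also note that
 * brid/bneid are delayed branches: the following instruction (marked
 * IN DELAY SLOT) executes before the branch takes effect.
 *
 *	void *fast_memcpy_sketch(void *dst, const void *src,
 *				 unsigned long c)
 *	{
 *		char *d = dst;
 *		const char *s = src;
 *
 *		if (c >= 4) {
 *			while ((unsigned long)d & 3) {
 *				*d++ = *s++;		// align destination
 *				c--;
 *			}
 *			if (((unsigned long)s & 3) == 0)
 *				for (; c >= 4; c -= 4, d += 4, s += 4)
 *					*(unsigned int *)d =
 *						*(const unsigned int *)s;
 *		}
 *		while (c--)
 *			*d++ = *s++;			// 0..3 byte tail
 *		return dst;
 *	}
 */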

#include <linux/linkage.h>

	.globl	memcpy
	.ent	memcpy

memcpy:
fast_memcpy_ascending:
	/* move d to return register as value of function */
	addi	r3, r5, 0

	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	blti	r4, a_xfer_end	/* if n < 0, less than one word to transfer */

	/* transfer first 0~3 bytes to get aligned dest address */
	andi	r4, r5, 3		/* n = d & 3 */
	/* if zero, destination already aligned */
	beqi	r4, a_dalign_done
	/* n = 4 - n (yields 3, 2, 1 transfers for 1, 2, 3 addr offset) */
	rsubi	r4, r4, 4
	rsub	r7, r4, r7		/* c = c - n adjust c */

a_xfer_first_loop:
	/* if no bytes left to transfer, transfer the bulk */
	beqi	r4, a_dalign_done
	lbui	r11, r6, 0		/* h = *s */
	sbi	r11, r5, 0		/* *d = h */
	addi	r6, r6, 1		/* s++ */
	addi	r5, r5, 1		/* d++ */
	brid	a_xfer_first_loop	/* loop */
	addi	r4, r4, -1		/* n-- (IN DELAY SLOT) */

a_dalign_done:
	addi	r4, r0, 32		/* n = 32 */
	cmpu	r4, r4, r7		/* n = c - n  (unsigned) */
	/* if n < 0, less than one block to transfer */
	blti	r4, a_block_done

a_block_xfer:
	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, a_block_unaligned

a_block_aligned:
	lwi	r9, r6, 0		/* t1 = *(s + 0) */
	lwi	r10, r6, 4		/* t2 = *(s + 4) */
	lwi	r11, r6, 8		/* t3 = *(s + 8) */
	lwi	r12, r6, 12		/* t4 = *(s + 12) */
	swi	r9, r5, 0		/* *(d + 0) = t1 */
	swi	r10, r5, 4		/* *(d + 4) = t2 */
	swi	r11, r5, 8		/* *(d + 8) = t3 */
	swi	r12, r5, 12		/* *(d + 12) = t4 */
	lwi	r9, r6, 16		/* t1 = *(s + 16) */
	lwi	r10, r6, 20		/* t2 = *(s + 20) */
	lwi	r11, r6, 24		/* t3 = *(s + 24) */
	lwi	r12, r6, 28		/* t4 = *(s + 28) */
	swi	r9, r5, 16		/* *(d + 16) = t1 */
	swi	r10, r5, 20		/* *(d + 20) = t2 */
	swi	r11, r5, 24		/* *(d + 24) = t3 */
	swi	r12, r5, 28		/* *(d + 28) = t4 */
	addi	r6, r6, 32		/* s = s + 32 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, a_block_aligned	/* while (n) loop */
	addi	r5, r5, 32		/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

a_block_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	add	r6, r6, r4		/* s = s + n */
	lwi	r11, r8, 0		/* h = *(as + 0) */

	addi	r9, r9, -1
	beqi	r9, a_block_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, a_block_u2		/* t1 was 2 => 2 byte offset */

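/*
 * Shift-and-merge for a misaligned source (editorial note, big-endian):
 * whole words are read from the aligned base `as`, and each destination
 * word is built from the tail of one source word and the head of the
 * next.  For a source k bytes past word alignment, one step of the
 * pattern is, in C (a sketch; h/v/t1 match the register comments):
 *
 *	h = *as++ << (8 * k);			// surviving stream bytes
 *	while (n) {
 *		v  = *as++;
 *		t1 = h | (v >> (8 * (4 - k)));	// merge adjacent words
 *		*d++ = t1;
 *		h  = v << (8 * k);		// carry remainder forward
 *		n -= 4;
 *	}
 *
 * The three loops below are this pattern specialised for k = 3, 1, 2,
 * each unrolled to move 32 bytes per iteration.
 */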
a_block_u3:
	bslli	r11, r11, 24	/* h = h << 24 */
a_bu3_loop:
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 32	/* v = *(as + 32) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	addi	r8, r8, 32	/* as = as + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_bu3_loop	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

a_block_u1:
	bslli	r11, r11, 8	/* h = h << 8 */
a_bu1_loop:
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 32	/* v = *(as + 32) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	addi	r8, r8, 32	/* as = as + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_bu1_loop	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

a_block_u2:
	bslli	r11, r11, 16	/* h = h << 16 */
a_bu2_loop:
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 32	/* v = *(as + 32) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	addi	r8, r8, 32	/* as = as + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_bu2_loop	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */

a_block_done:
	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	blti	r4, a_xfer_end	/* if n < 0, less than one word to transfer */

a_word_xfer:
	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
	addi	r10, r0, 0		/* offset = 0 */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, a_word_unaligned

a_word_aligned:
	lw	r9, r6, r10		/* t1 = *(s+offset) */
	sw	r9, r5, r10		/* *(d+offset) = t1 */
	addi	r4, r4, -4		/* n = n - 4 */
	bneid	r4, a_word_aligned	/* loop */
	addi	r10, r10, 4		/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	lwi	r11, r8, 0		/* h = *(as + 0) */
	addi	r8, r8, 4		/* as = as + 4 */

	addi	r9, r9, -1
	beqi	r9, a_word_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, a_word_u2		/* t1 was 2 => 2 byte offset */

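/*
 * Word-at-a-time tail of the unaligned case: the same shift-and-merge
 * scheme as the a_bu* block loops above, but one destination word per
 * iteration (k = 3, 1, 2 below).  Note `as` was already advanced by 4,
 * so each load picks up the next source word.
 */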
a_word_u3:
	bslli	r11, r11, 24	/* h = h << 24 */
a_wu3_loop:
	lw	r12, r8, r10	/* v = *(as + offset) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r10	/* *(d + offset) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	addi	r4, r4, -4	/* n = n - 4 */
	bneid	r4, a_wu3_loop	/* while (n) loop */
	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_u1:
	bslli	r11, r11, 8	/* h = h << 8 */
a_wu1_loop:
	lw	r12, r8, r10	/* v = *(as + offset) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r10	/* *(d + offset) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	addi	r4, r4, -4	/* n = n - 4 */
	bneid	r4, a_wu1_loop	/* while (n) loop */
	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_u2:
	bslli	r11, r11, 16	/* h = h << 16 */
a_wu2_loop:
	lw	r12, r8, r10	/* v = *(as + offset) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r10	/* *(d + offset) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	addi	r4, r4, -4	/* n = n - 4 */
	bneid	r4, a_wu2_loop	/* while (n) loop */
	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */

a_word_done:
	add	r5, r5, r10	/* d = d + offset */
	add	r6, r6, r10	/* s = s + offset */
	rsub	r7, r10, r7	/* c = c - offset */

a_xfer_end:
a_xfer_end_loop:
	beqi	r7, a_done		/* while (c) */
	lbui	r9, r6, 0		/* t1 = *s */
	addi	r6, r6, 1		/* s++ */
	sbi	r9, r5, 0		/* *d = t1 */
	addi	r7, r7, -1		/* c-- */
	brid	a_xfer_end_loop		/* loop */
	addi	r5, r5, 1		/* d++ (IN DELAY SLOT) */

a_done:
	rtsd	r15, 8
	nop

.end memcpy
/*----------------------------------------------------------------------------*/
	.globl	memmove
	.ent	memmove

memmove:
	cmpu	r4, r5, r6	/* n = s - d */
	bgei	r4, fast_memcpy_ascending

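/*
 * memmove overlap handling (editorial note): if s - d >= 0 the regions
 * either do not overlap or the destination sits below the source, so
 * the ascending memcpy above is safe; otherwise copy top-down.
 * Roughly, as a sketch:
 *
 *	if (d <= s)
 *		return fast_memcpy_ascending(d, s, c);
 *	// else fall through to the descending copy below
 */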
fast_memcpy_descending:
	/* move d to return register as value of function */
	addi	r3, r5, 0

	add	r5, r5, r7	/* d = d + c */
	add	r6, r6, r7	/* s = s + c */

	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	blti	r4, d_xfer_end	/* if n < 0, less than one word to transfer */

	/* transfer first 0~3 bytes to get aligned dest address */
	andi	r4, r5, 3		/* n = d & 3 */
	/* if zero, destination already aligned */
	beqi	r4, d_dalign_done
	rsub	r7, r4, r7		/* c = c - n adjust c */

d_xfer_first_loop:
	/* if no bytes left to transfer, transfer the bulk */
	beqi	r4, d_dalign_done
	addi	r6, r6, -1		/* s-- */
	addi	r5, r5, -1		/* d-- */
	lbui	r11, r6, 0		/* h = *s */
	sbi	r11, r5, 0		/* *d = h */
	brid	d_xfer_first_loop	/* loop */
	addi	r4, r4, -1		/* n-- (IN DELAY SLOT) */

d_dalign_done:
	addi	r4, r0, 32	/* n = 32 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	/* if n < 0, less than one block to transfer */
	blti	r4, d_block_done

d_block_xfer:
	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, d_block_unaligned

d_block_aligned:
	addi	r6, r6, -32		/* s = s - 32 */
	addi	r5, r5, -32		/* d = d - 32 */
	lwi	r9, r6, 28		/* t1 = *(s + 28) */
	lwi	r10, r6, 24		/* t2 = *(s + 24) */
	lwi	r11, r6, 20		/* t3 = *(s + 20) */
	lwi	r12, r6, 16		/* t4 = *(s + 16) */
	swi	r9, r5, 28		/* *(d + 28) = t1 */
	swi	r10, r5, 24		/* *(d + 24) = t2 */
	swi	r11, r5, 20		/* *(d + 20) = t3 */
	swi	r12, r5, 16		/* *(d + 16) = t4 */
	lwi	r9, r6, 12		/* t1 = *(s + 12) */
	lwi	r10, r6, 8		/* t2 = *(s + 8) */
	lwi	r11, r6, 4		/* t3 = *(s + 4) */
	lwi	r12, r6, 0		/* t4 = *(s + 0) */
	swi	r9, r5, 12		/* *(d + 12) = t1 */
	swi	r10, r5, 8		/* *(d + 8) = t2 */
	swi	r11, r5, 4		/* *(d + 4) = t3 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, d_block_aligned	/* while (n) loop */
	swi	r12, r5, 0		/* *(d + 0) = t4 (IN DELAY SLOT) */
	bri	d_block_done

d_block_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	rsub	r6, r4, r6		/* s = s - n */
	lwi	r11, r8, 0		/* h = *(as + 0) */

	addi	r9, r9, -1
	beqi	r9, d_block_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, d_block_u2		/* t1 was 2 => 2 byte offset */

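/*
 * Descending mirror of the a_bu* loops (editorial note): the walk runs
 * from high to low addresses, so the shift roles swap relative to the
 * ascending case.  For a source k bytes past word alignment, each step
 * is conceptually t1 = h | (v << (8 * k)) with the carry
 * h = v >> (8 * (4 - k)), again specialised for k = 3, 1, 2 and
 * unrolled to 32 bytes per pass.
 */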
d_block_u3:
	bsrli	r11, r11, 8	/* h = h >> 8 */
d_bu3_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu3_loop	/* while (n) loop */
	bsrli	r11, r12, 8	/* h = v >> 8 (IN DELAY SLOT) */
	bri	d_block_done

d_block_u1:
	bsrli	r11, r11, 24	/* h = h >> 24 */
d_bu1_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu1_loop	/* while (n) loop */
	bsrli	r11, r12, 24	/* h = v >> 24 (IN DELAY SLOT) */
	bri	d_block_done

d_block_u2:
	bsrli	r11, r11, 16	/* h = h >> 16 */
d_bu2_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu2_loop	/* while (n) loop */
	bsrli	r11, r12, 16	/* h = v >> 16 (IN DELAY SLOT) */

d_block_done:
	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	blti	r4, d_xfer_end	/* if n < 0, less than one word to transfer */

d_word_xfer:
	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
	rsub	r5, r4, r5		/* d = d - n */
	rsub	r6, r4, r6		/* s = s - n */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, d_word_unaligned

d_word_aligned:
	addi	r4, r4, -4		/* n = n - 4 */
	lw	r9, r6, r4		/* t1 = *(s+n) */
	bneid	r4, d_word_aligned	/* loop */
	sw	r9, r5, r4		/* *(d+n) = t1 (IN DELAY SLOT) */

	bri	d_word_done

d_word_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	lw	r11, r8, r4		/* h = *(as + n) */

	addi	r9, r9, -1
	beqi	r9, d_word_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, d_word_u2		/* t1 was 2 => 2 byte offset */

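/*
 * Word-at-a-time tail of the descending unaligned case: the same merge
 * pattern as the d_bu* loops above, one word per iteration, indexed by
 * the shrinking count n so the copy walks downwards.
 */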
d_word_u3:
	bsrli	r11, r11, 8	/* h = h >> 8 */
d_wu3_loop:
	addi	r4, r4, -4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu3_loop	/* while (n) loop */
	bsrli	r11, r12, 8	/* h = v >> 8 (IN DELAY SLOT) */

	bri	d_word_done

d_word_u1:
	bsrli	r11, r11, 24	/* h = h >> 24 */
d_wu1_loop:
	addi	r4, r4, -4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu1_loop	/* while (n) loop */
	bsrli	r11, r12, 24	/* h = v >> 24 (IN DELAY SLOT) */

	bri	d_word_done

d_word_u2:
	bsrli	r11, r11, 16	/* h = h >> 16 */
d_wu2_loop:
	addi	r4, r4, -4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu2_loop	/* while (n) loop */
	bsrli	r11, r12, 16	/* h = v >> 16 (IN DELAY SLOT) */

d_word_done:

d_xfer_end:
d_xfer_end_loop:
	beqi	r7, a_done		/* while (c) */
	addi	r6, r6, -1		/* s-- */
	lbui	r9, r6, 0		/* t1 = *s */
	addi	r5, r5, -1		/* d-- */
	sbi	r9, r5, 0		/* *d = t1 */
	brid	d_xfer_end_loop		/* loop */
	addi	r7, r7, -1		/* c-- (IN DELAY SLOT) */

d_done:
	rtsd	r15, 8
	nop

.end memmove
