/* xref: /openbmc/linux/arch/sh/lib/memcpy-sh4.S (revision bc5aa3a0) */
/*
 * "memcpy" implementation of SuperH
 *
 * Copyright (C) 1999  Niibe Yutaka
 * Copyright (c) 2002  STMicroelectronics Ltd
 *   Modified from memcpy.S and micro-optimised for SH4
 *   Stuart Menefy (stuart.menefy@st.com)
 *
 */
#include <linux/linkage.h>

/*
 * void *memcpy(void *dst, const void *src, size_t n);
 *
 * It is assumed that there is no overlap between src and dst.
 * If there is an overlap, then the results are undefined.
 */

/*
 * .Lcase1: bulk copy for (src - dst) == 1 modulo 4.
 *
 * Reached from the memcpy dispatcher with:
 *   r0 = current destination pointer; the copy runs downwards and every
 *        store pre-decrements r0.  The dispatcher has already brought r0
 *        to a long word boundary.
 *   r4 = start of the destination
 *   r5 = src - dst, so that r0 + r5 addresses the matching source data
 *
 * Each loop iteration loads one aligned source long word, merges it by
 * shifting with the long word loaded on the previous iteration (kept in
 * r7) and stores one aligned destination long word.  The long word loop
 * runs while r0 > r4 + 7; whatever is left is copied a byte at a time
 * until r0 == r4.
 *
 * Scratch: r1, r2, r3, r6, r7.
 * The instruction after bt/s or rts (written with one extra leading
 * space) is a delay slot and executes before the branch takes effect.
 */
	!
	!	GHIJ KLMN OPQR -->  ...G HIJK LMNO PQR.
	!

	! Size is 16 or greater, and may have trailing bytes

	.balign	32
.Lcase1:
	! Read a long word and write a long word at once
	! At the start of each iteration, r7 contains last long load
	add	#-1,r5		!  79 EX
	mov	r4,r2		!   5 MT (0 cycles latency)

	mov.l	@(r0,r5),r7	!  21 LS (2 cycles latency)
	add	#-4,r5		!  50 EX

	add	#7,r2		!  79 EX
	! r2 = r4 + 7 is the loop limit: keep going while r0 > r2
	!
#ifdef CONFIG_CPU_LITTLE_ENDIAN
	! 6 cycles, 4 bytes per iteration
3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
	mov	r7, r3		!   5 MT (latency=0)	! RQPO

	cmp/hi	r2,r0		!  57 MT
	shll16	r3		! 103 EX

	mov	r1,r6		!   5 MT (latency=0)
	shll8	r3		! 102 EX		! Oxxx

	shlr8	r6		! 106 EX		! xNML
	mov	r1, r7		!   5 MT (latency=0)

	or	r6,r3		!  82 EX		! ONML
	bt/s	3b		! 109 BR

	 mov.l	r3,@-r0		!  30 LS
#else
3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! KLMN
	mov	r7,r3		!   5 MT (latency=0)	! OPQR

	cmp/hi	r2,r0		!  57 MT
	shlr16	r3		! 107 EX

	shlr8	r3		! 106 EX		! xxxO
	mov	r1,r6		!   5 MT (latency=0)

	shll8	r6		! 102 EX		! LMNx
	mov	r1,r7		!   5 MT (latency=0)

	or	r6,r3		!  82 EX		! LMNO
	bt/s	3b		! 109 BR

	 mov.l	r3,@-r0		!  30 LS
#endif
	! Finally, copy a byte at once, if necessary
	! r5 was lowered by 5 in total above; adding 4 leaves src - dst - 1,
	! so @(r0,r5) reads the source byte for the next pre-decrement store.
	! r2 becomes r4 + 1, the limit for the byte loop.

	add	#4,r5		!  50 EX
	cmp/eq	r4,r0		!  54 MT

	add	#-6,r2		!  50 EX
	bt	9f		! 109 BR

8:	cmp/hi	r2,r0		!  57 MT
	mov.b	@(r0,r5),r1	!  20 LS (latency=2)

	bt/s	8b		! 109 BR

	 mov.b	r1,@-r0		!  29 LS

9:	rts
	 nop


/*
 * .Lcase3: bulk copy for (src - dst) == 3 modulo 4.
 *
 * Reached from the memcpy dispatcher with:
 *   r0 = current destination pointer; the copy runs downwards and every
 *        store pre-decrements r0.  The dispatcher has already brought r0
 *        to a long word boundary.
 *   r4 = start of the destination
 *   r5 = src - dst, so that r0 + r5 addresses the matching source data
 *
 * Same scheme as .Lcase1 with the shift amounts swapped: one aligned
 * source long word is loaded per iteration and merged with the previous
 * one (kept in r7) to form one aligned destination long word.  The long
 * word loop runs while r0 > r4 + 7; the rest is copied a byte at a time
 * until r0 == r4.
 *
 * Scratch: r1, r2, r3, r6, r7.
 * The instruction after bt/s or rts (written with one extra leading
 * space) is a delay slot and executes before the branch takes effect.
 */
	!
	!	GHIJ KLMN OPQR -->  .GHI JKLM NOPQ R...
	!

	! Size is 16 or greater, and may have trailing bytes

	.balign	32
.Lcase3:
	! Read a long word and write a long word at once
	! At the start of each iteration, r7 contains last long load
	add	#-3,r5		! 79 EX
	mov	r4,r2		!  5 MT (0 cycles latency)

	mov.l	@(r0,r5),r7	! 21 LS (2 cycles latency)
	add	#-4,r5		! 50 EX

	add	#7,r2		!  79 EX
	! r2 = r4 + 7 is the loop limit: keep going while r0 > r2
	!
#ifdef CONFIG_CPU_LITTLE_ENDIAN
	! 6 cycles, 4 bytes per iteration
3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
	mov	r7, r3		!   5 MT (latency=0)	! RQPO

	cmp/hi	r2,r0		!  57 MT
	shll8	r3		! 102 EX		! QPOx

	mov	r1,r6		!   5 MT (latency=0)
	shlr16	r6		! 107 EX

	shlr8	r6		! 106 EX		! xxxN
	mov	r1, r7		!   5 MT (latency=0)

	or	r6,r3		!  82 EX		! QPON
	bt/s	3b		! 109 BR

	 mov.l	r3,@-r0		!  30 LS
#else
	! Big endian variant; here the new long word is loaded straight
	! into r7 after the old value has been copied to r3.
3:	mov	r7,r3		! OPQR
	shlr8	r3		! xOPQ
	mov.l	@(r0,r5),r7	! KLMN
	mov	r7,r6
	shll16	r6
	shll8	r6		! Nxxx
	or	r6,r3		! NOPQ
	cmp/hi	r2,r0
	bt/s	3b
	 mov.l	r3,@-r0
#endif

	! Finally, copy a byte at once, if necessary
	! r5 was lowered by 7 in total above; adding 6 leaves src - dst - 1,
	! so @(r0,r5) reads the source byte for the next pre-decrement store.
	! r2 becomes r4 + 1, the limit for the byte loop.

	add	#6,r5		!  50 EX
	cmp/eq	r4,r0		!  54 MT

	add	#-6,r2		!  50 EX
	bt	9f		! 109 BR

8:	cmp/hi	r2,r0		!  57 MT
	mov.b	@(r0,r5),r1	!  20 LS (latency=2)

	bt/s	8b		! 109 BR

	 mov.b	r1,@-r0		!  29 LS

9:	rts
	 nop

/*
 * memcpy entry point, small-copy path and dispatcher.
 *
 * In:   r4 = dst, r5 = src, r6 = n
 * Out:  r0 = dst.  The zero length path sets it explicitly; on every
 *       other path r0 starts at dst + n and is pre-decremented once per
 *       stored unit until it reaches r4.
 *
 * The copy runs from the end of the buffers towards the start.  After
 * the set-up below r0 is the destination pointer and r5 holds the
 * constant distance src - dst (biased by a small amount in some paths),
 * so @(r0,r5) addresses the source data that belongs at r0.
 *
 * Paths taken from here:
 *   n == 0                           -> return dst
 *   dst, src and n all multiples of 4 -> .Lcase00
 *   n < 16                           -> byte loop below
 *   otherwise                        -> align r0 at label 6, then pick
 *                                       .Lcase0/.Lcase0b, .Lcase1,
 *                                       .Lcase2/.Lcase2b or .Lcase3 from
 *                                       the low two bits of src - dst
 *
 * The instruction after bt/s, bf/s, bra or rts (written with one extra
 * leading space) is a delay slot and executes before the branch takes
 * effect.
 */
ENTRY(memcpy)

	! Calculate the invariants which will be used in the remainder
	! of the code:
	!
	!      r4   -->  [ ...  ] DST             [ ...  ] SRC
	!	         [ ...  ]                 [ ...  ]
	!	           :                        :
	!      r0   -->  [ ...  ]       r0+r5 --> [ ...  ]
	!
	!

	! Short circuit the common case of src, dst and len being 32 bit aligned
	! and test for zero length move

	mov	r6, r0		!   5 MT (0 cycle latency)
	or	r4, r0		!  82 EX

	or	r5, r0		!  82 EX
	tst	r6, r6		!  86 MT

	bt/s	99f		! 111 BR		(zero len)
	 tst	#3, r0		!  87 MT

	! T now says whether dst, src and len are all multiples of 4.  The
	! mov and add instructions below leave T alone, so the bt/s to
	! .Lcase00 still acts on that test.  r0 becomes dst + len and the
	! delay slot turns r5 into src - dst.

	mov	r4, r0		!   5 MT (0 cycle latency)
	add	r6, r0		!  49 EX

	mov	#16, r1		!   6 EX
	bt/s	.Lcase00	! 111 BR		(aligned)

	 sub	r4, r5		!  75 EX

	! Arguments are not nicely long word aligned or zero len.
	! Check for small copies, and if so do a simple byte at a time copy.
	!
	! Deciding on an exact value of 'small' is not easy, as the point at which
	! using the optimised routines becomes worthwhile varies (these are the
	! cycle counts for different sizes using byte-at-a-time vs. optimised):
	!	size	byte-at-time	long	word	byte
	!	16	42		39-40	46-50	50-55
	!	24	58		43-44	54-58	62-67
	!	36	82		49-50	66-70	80-85
	! However the penalty for getting it 'wrong' is much higher for long word
	! aligned data (and this is more common), so use a value of 16.

	cmp/gt	r6,r1		!  56 MT

	add	#-1,r5		!  50 EX
	bf/s	6f		! 108 BR		(not small)

	 mov	r5, r3		!   5 MT (latency=0)

	! Small copy (len < 16).  r6 becomes the number of byte pairs and T
	! the odd bit of len.  An even len enters the loop at 4 with the
	! first byte already in r1; an odd len stores one byte first and
	! returns straight away if that was the only one.
	shlr	r6		! 104 EX

	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
	bf/s	4f		! 111 BR

	 add	#-1,r3		!  50 EX
	tst	r6, r6		!  86 MT

	bt/s	98f		! 110 BR
	 mov.b	r1,@-r0		!  29 LS

	! 4 cycles, 2 bytes per iteration
3:	mov.b	@(r0,r5),r1	!  20 LS (latency=2)

4:	mov.b	@(r0,r3),r2	!  20 LS (latency=2)
	dt	r6		!  67 EX

	mov.b	r1,@-r0		!  29 LS
	bf/s	3b		! 111 BR

	 mov.b	r2,@-r0		!  29 LS
98:
	rts
	 nop

	! Zero length: return dst in r0
99:	rts
	 mov	r4, r0

	! Size is not small, so it is worthwhile looking for optimisations.
	! First align destination to a long word boundary.
	! (r0 is the end of the destination at this point; r0 & 3 bytes are
	! copied one at a time and r6 is reduced to match.)
	!
	! r5 = normal value -1

6:	tst	#3, r0		!  87 MT
	mov	#3, r3		!   6 EX

	bt/s	2f		! 111 BR
	 and	r0,r3		!  78 EX

	! 3 cycles, 1 byte per iteration
1:	dt	r3		!  67 EX
	mov.b	@(r0,r5),r1	!  19 LS (latency=2)

	add	#-1, r6		!  79 EX
	bf/s	1b		! 109 BR

	 mov.b	r1,@-r0		!  28 LS

	! Undo the -1 bias: r5 = src - dst again
2:	add	#1, r5		!  79 EX

	! Now select the appropriate bulk transfer code based on relative
	! alignment of src and dst.
	! tst with an immediate only works on r0, so the destination
	! pointer is parked in r3 while r5 is examined and is put back
	! into r0 in the delay slot of each dispatch branch.

	mov	r0, r3		!   5 MT (latency=0)

	mov	r5, r0		!   5 MT (latency=0)
	tst	#1, r0		!  87 MT

	bf/s	1f		! 111 BR
	 mov	#64, r7		!   6 EX

	! bit 0 clear

	cmp/ge	r7, r6		!  55 MT

	bt/s	2f		! 111 BR
	 tst	#2, r0		!  87 MT

	! small
	bt/s	.Lcase0
	 mov	r3, r0

	bra	.Lcase2
	 nop

	! big
2:	bt/s	.Lcase0b
	 mov	r3, r0

	bra	.Lcase2b
	 nop

	! bit 0 set
1:	tst	#2, r0		! 87 MT

	bt/s	.Lcase1
	 mov	r3, r0

	bra	.Lcase3
	 nop


/*
 * .Lcase00: src, dst and size are all long word aligned.
 *
 * Reached straight from the memcpy entry code with:
 *   r0 = dst + n (the copy runs downwards, stores pre-decrement r0)
 *   r4 = dst
 *   r5 = src - dst
 *   r6 = n, non-zero
 *
 * Sizes of 64 bytes or more are handed to .Lcase00b.  Smaller sizes are
 * copied here two long words per iteration, with r6 counting pairs; an
 * odd long word, if any, is stored before the loop.
 *
 * Scratch: r1, r2, r3.
 * The instruction after bt/s, bf/s or rts (written with one extra
 * leading space) is a delay slot; plain bf has none.
 */
	!
	!	GHIJ KLMN OPQR -->  GHIJ KLMN OPQR
	!

	! src, dst and size are all long word aligned
	! size is non-zero

	.balign	32
.Lcase00:
	mov	#64, r1		!   6 EX
	mov	r5, r3		!   5 MT (latency=0)

	cmp/gt	r6, r1		!  56 MT
	add	#-4, r5		!  50 EX

	bf	.Lcase00b	! 108 BR		(big loop)

	! r6 = number of long words, then number of pairs with T holding
	! the odd bit.  r5 = src - dst - 4 and r3 = src - dst - 8 address
	! the two long words of a pair.
	shlr2	r6		! 105 EX

	shlr	r6		! 104 EX
	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

	bf/s	4f		! 111 BR
	 add	#-8, r3		!  50 EX

	! Odd count: store the single long word; done if there are no pairs
	tst	r6, r6		!  86 MT
	bt/s	5f		! 110 BR

	 mov.l	r1,@-r0		!  30 LS

	! 4 cycles, 2 long words per iteration
3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
	dt	r6		!  67 EX

	mov.l	r1, @-r0	!  30 LS
	bf/s	3b		! 109 BR

	 mov.l	r2, @-r0	!  30 LS

5:	rts
	 nop


/*
 * .Lcase0: src - dst is a multiple of 4 and fewer than 64 bytes remain.
 *
 * Reached from the memcpy dispatcher with:
 *   r0 = current destination pointer, long word aligned; the copy runs
 *        downwards and every store pre-decrements r0
 *   r4 = start of the destination
 *   r5 = src - dst
 *   r6 = bytes still to copy
 *
 * Copies two long words per iteration while r0 > r4 + 11, after storing
 * one long word up front when bit 2 of r6 is set, then finishes with a
 * byte loop until r0 == r4.
 *
 * Scratch: r1, r2, r3, r7.
 * The instruction after bt/s or rts (written with one extra leading
 * space) is a delay slot; plain bt has none.
 */
	! Size is 16 or greater and less than 64, but may have trailing bytes

	.balign	32
.Lcase0:
	add	#-4, r5		!  50 EX
	mov	r4, r7		!   5 MT (latency=0)

	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
	mov	#4, r2		!   6 EX

	add	#11, r7		!  50 EX
	tst	r2, r6		!  86 MT

	mov	r5, r3		!   5 MT (latency=0)
	bt/s	4f		! 111 BR

	 add	#-4, r3		!  50 EX
	mov.l	r1,@-r0		!  30 LS

	! 4 cycles, 2 long words per iteration
3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
	cmp/hi	r7, r0

	mov.l	r1, @-r0	!  30 LS
	bt/s	3b		! 109 BR

	 mov.l	r2, @-r0	!  30 LS

	! Copy the final 0-3 bytes
	! r5 goes from src - dst - 4 to src - dst - 1 for byte access and
	! r7 from r4 + 11 to r4 + 1, the limit for the byte loop.

	add	#3,r5		!  50 EX

	cmp/eq	r0, r4		!  54 MT
	add	#-10, r7	!  50 EX

	bt	9f		! 110 BR

	! 3 cycles, 1 byte per iteration
1:	mov.b	@(r0,r5),r1	!  19 LS
	cmp/hi	r7,r0		!  57 MT

	bt/s	1b		! 111 BR
	 mov.b	r1,@-r0		!  28 LS

9:	rts
	 nop

/*
 * .Lcase0b / .Lcase00b: src - dst is a multiple of 4 and at least 64
 * bytes remain.
 *
 * .Lcase0b is reached from the memcpy dispatcher with r5 = src - dst and
 * lowers it by 4.  .Lcase00b is reached from .Lcase00, which has already
 * done that.  In both cases:
 *   r0 = current destination pointer, long word aligned; the copy runs
 *        downwards and every store pre-decrements r0
 *   r4 = start of the destination
 *
 * Three phases:
 *   1. copy long words until r0 is a multiple of 32
 *   2. copy 32 byte blocks, using movca.l for the first store of each
 *      block, until r0 reaches r4 rounded up to a multiple of 32
 *   3. copy the remaining long words and then the remaining bytes
 *
 * r8-r11 are saved on the stack around phase 2 and restored afterwards.
 * Scratch: r1, r2, r3, r6, r7.
 * The instruction after bt/s, bf/s or rts (written with one extra
 * leading space) is a delay slot; plain bt has none.
 */
	! Size is at least 64 bytes, so will be going round the big loop at least once.
	!
	!   r2 = rounded up r4
	!   r3 = rounded down r0

	.balign	32
.Lcase0b:
	add	#-4, r5		!  50 EX

.Lcase00b:
	mov	r0, r3		!   5 MT (latency=0)
	mov	#(~0x1f), r1	!   6 EX

	and	r1, r3		!  78 EX
	mov	r4, r2		!   5 MT (latency=0)

	cmp/eq	r3, r0		!  54 MT
	add	#0x1f, r2	!  50 EX

	bt/s	1f		! 110 BR
	 and	r1, r2		!  78 EX

	! copy initial words until cache line aligned
	! The pair loop tests r0 before its two stores, so the limit r3 is
	! raised by 8.  When bit 2 of r0 is set a single long word is stored
	! first, and if that alone reaches the 32 byte boundary the pair
	! loop is skipped.

	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
	tst	#4, r0		!  87 MT

	mov	r5, r6		!   5 MT (latency=0)
	add	#-4, r6		!  50 EX

	bt/s	4f		! 111 BR
	 add	#8, r3		!  50 EX

	tst	#0x18, r0	!  87 MT

	bt/s	1f		! 109 BR
	 mov.l	r1,@-r0		!  30 LS

	! 4 cycles, 2 long words per iteration
3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

4:	mov.l	@(r0, r6), r7	!  21 LS (latency=2)
	cmp/eq	r3, r0		!  54 MT

	mov.l	r1, @-r0	!  30 LS
	bf/s	3b		! 109 BR

	 mov.l	r7, @-r0	!  30 LS

	! Copy the cache line aligned blocks
	!
	! In use: r0, r2, r4, r5
	! Scratch: r1, r3, r6, r7
	!
	! We could do this with the four scratch registers, but if src
	! and dest hit the same cache line, this will thrash, so make
	! use of additional registers.
	!
	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
	!   r5:	 src (was r0+r5)
	!   r1:	 dest (was r0)
	! this can be reversed at the end, so we don't need to save any extra
	! state.
	!
1:	mov.l	r8, @-r15	!  30 LS
	add	r0, r5		!  49 EX

	mov.l	r9, @-r15	!  30 LS
	mov	r0, r1		!   5 MT (latency=0)

	mov.l	r10, @-r15	!  30 LS
	add	#-0x1c, r5	!  50 EX

	mov.l	r11, @-r15	!  30 LS

	! 16 cycles, 32 bytes per iteration
2:	mov.l	@(0x00,r5),r0	! 18 LS (latency=2)
	add	#-0x20, r1	! 50 EX
	mov.l	@(0x04,r5),r3	! 18 LS (latency=2)
	mov.l	@(0x08,r5),r6	! 18 LS (latency=2)
	mov.l	@(0x0c,r5),r7	! 18 LS (latency=2)
	mov.l	@(0x10,r5),r8	! 18 LS (latency=2)
	mov.l	@(0x14,r5),r9	! 18 LS (latency=2)
	mov.l	@(0x18,r5),r10	! 18 LS (latency=2)
	mov.l	@(0x1c,r5),r11	! 18 LS (latency=2)
	movca.l	r0,@r1		! 40 LS (latency=3-7)
	mov.l	r3,@(0x04,r1)	! 33 LS
	mov.l	r6,@(0x08,r1)	! 33 LS
	mov.l	r7,@(0x0c,r1)	! 33 LS

	mov.l	r8,@(0x10,r1)	! 33 LS
	add	#-0x20, r5	! 50 EX

	mov.l	r9,@(0x14,r1)	! 33 LS
	cmp/eq	r2,r1		! 54 MT

	mov.l	r10,@(0x18,r1)	!  33 LS
	bf/s	2b		! 109 BR

	 mov.l	r11,@(0x1c,r1)	!  33 LS

	! Restore the r0/r5 invariant and the saved registers.  The
	! instruction at label 1 below is both the delay slot of the rts
	! (taken when r0 == r4, i.e. nothing is left) and the landing point
	! of the bf/s, so r8 is restored on either path.

	mov	r1, r0		!   5 MT (latency=0)

	mov.l	@r15+, r11	!  15 LS
	sub	r1, r5		!  75 EX

	mov.l	@r15+, r10	!  15 LS
	cmp/eq	r4, r0		!  54 MT

	bf/s	1f		! 109 BR
	 mov.l	 @r15+, r9	!  15 LS

	rts
1:	 mov.l	@r15+, r8	!  15 LS
	sub	r4, r1		!  75 EX		(len remaining)

	! number of trailing bytes is non-zero
	!
	! invariants restored (r5 already decremented by 4)
	! also r1=num bytes remaining

	mov	#4, r2		!   6 EX
	mov	r4, r7		!   5 MT (latency=0)

	add	#0x1c, r5	!  50 EX		(back to -4)
	cmp/hs	r2, r1		!  58 MT

	bf/s	5f		! 108 BR
	 add	 #11, r7	!  50 EX

	mov.l	@(r0, r5), r6	!  21 LS (latency=2)
	tst	r2, r1		!  86 MT

	mov	r5, r3		!   5 MT (latency=0)
	bt/s	4f		! 111 BR

	 add	#-4, r3		!  50 EX

	! NOTE(review): r1 >= 4 was already established by the bf/s above and
	! neither r1 nor r2 has changed since, so this compare always sets T
	! and the branch below is always taken: after the single long word
	! store the rest is copied by the byte loop at 5.  The result is
	! still correct, but the long word loop at 3 is never entered from
	! this path.  Presumably the test was meant to check for at least 8
	! more bytes - confirm before changing.
	cmp/hs	r2, r1		!  58 MT

	bt/s	5f		! 111 BR
	 mov.l	r6,@-r0		!  30 LS

	! 4 cycles, 2 long words per iteration
3:	mov.l	@(r0, r5), r6	!  21 LS (latency=2)

4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
	cmp/hi	r7, r0

	mov.l	r6, @-r0	!  30 LS
	bt/s	3b		! 109 BR

	 mov.l	r2, @-r0	!  30 LS

	! Copy the final 0-3 bytes
	! r7 goes from r4 + 11 to r4 + 1, the limit for the byte loop.  The
	! add to r5 follows a plain bt (no delay slot), so it only runs
	! when bytes remain; it leaves r5 = src - dst - 1 for byte access.

5:	cmp/eq	r0, r4		!  54 MT
	add	#-10, r7	!  50 EX

	bt	9f		! 110 BR
	add	#3,r5		!  50 EX

	! 3 cycles, 1 byte per iteration
1:	mov.b	@(r0,r5),r1	!  19 LS
	cmp/hi	r7,r0		!  57 MT

	bt/s	1b		! 111 BR
	 mov.b	r1,@-r0		!  28 LS

9:	rts
	 nop

/*
 * .Lcase2: (src - dst) == 2 modulo 4 and fewer than 64 bytes remain.
 *
 * Reached from the memcpy dispatcher with:
 *   r0 = current destination pointer, long word aligned; the copy runs
 *        downwards and every store pre-decrements r0
 *   r4 = start of the destination
 *   r5 = src - dst
 *
 * Copies two short words per iteration while r0 > r4 + 7, then branches
 * to the shared short word / byte tail at label 10 with r5 equal to
 * src - dst - 2, as that code expects.
 *
 * Scratch: r1, r2, r3, r6.
 * The instruction after bt/s or bra (written with one extra leading
 * space) is a delay slot and executes before the branch takes effect.
 */
	!
	!	GHIJ KLMN OPQR -->  ..GH IJKL MNOP QR..
	!

	.balign	32
.Lcase2:
	! Size is 16 or greater and less than 64, but may have trailing bytes

	! r5 = src - dst - 2 and r6 = src - dst - 4 address the two short
	! words of each pair; r2 = r4 + 7 is the loop limit.
2:	mov	r5, r6		!   5 MT (latency=0)
	add	#-2,r5		!  50 EX

	mov	r4,r2		!   5 MT (latency=0)
	add	#-4,r6		!  50 EX

	add	#7,r2		!  50 EX
3:	mov.w	@(r0,r5),r1	!  20 LS (latency=2)

	mov.w	@(r0,r6),r3	!  20 LS (latency=2)
	cmp/hi	r2,r0		!  57 MT

	mov.w	r1,@-r0		!  29 LS
	bt/s	3b		! 111 BR

	 mov.w	r3,@-r0		!  29 LS

	bra	10f
	 nop


/*
 * .Lcase2b: (src - dst) == 2 modulo 4 and at least 64 bytes remain,
 * followed by the short word / byte tail (label 10) that .Lcase2 also
 * branches to.
 *
 * Reached from the memcpy dispatcher with:
 *   r0 = current destination pointer, long word aligned; the copy runs
 *        downwards and every store pre-decrements r0
 *   r4 = start of the destination
 *   r5 = src - dst
 *
 * Three phases:
 *   1. copy short words until r0 is a multiple of 32
 *   2. copy 32 byte blocks until r0 reaches r4 rounded up to a multiple
 *      of 32.  Each block is read as a short word, seven long words and
 *      another short word, and xtrct joins neighbouring halves into the
 *      eight long words that are stored; the first store of each block
 *      uses movca.l.
 *   3. copy the remaining short words and, if needed, one last byte
 *
 * r8-r12 are saved on the stack around phase 2 and restored afterwards.
 * Scratch: r1, r2, r3, r6, r7.
 * The instruction after bt/s, bf/s or rts (written with one extra
 * leading space) is a delay slot and executes before the branch takes
 * effect.
 */
	.balign	32
.Lcase2b:
	! Size is at least 64 bytes, so will be going round the big loop at least once.
	!
	!   r2 = rounded up r4
	!   r3 = rounded down r0

	mov	r0, r3		!   5 MT (latency=0)
	mov	#(~0x1f), r1	!   6 EX

	and	r1, r3		!  78 EX
	mov	r4, r2		!   5 MT (latency=0)

	cmp/eq	r3, r0		!  54 MT
	add	#0x1f, r2	!  50 EX

	add	#-2, r5		!  50 EX
	bt/s	1f		! 110 BR
	 and	r1, r2		!  78 EX

	! Copy a short word one at a time until we are cache line aligned
	!   Normal values: r0, r2, r3, r4
	!   Unused: r1, r6, r7
	!   Mod: r5 (=r5-2)
	!
	! The loop tests r0 before its store, so the limit r3 is raised by 2.
	add	#2, r3		!  50 EX

2:	mov.w	@(r0,r5),r1	!  20 LS (latency=2)
	cmp/eq	r3,r0		!  54 MT

	bf/s	2b		! 111 BR

	 mov.w	r1,@-r0		!  29 LS

	! Copy the cache line aligned blocks
	!
	! In use: r0, r2, r4, r5 (=r5-2)
	! Scratch: r1, r3, r6, r7
	!
	! We could do this with the four scratch registers, but if src
	! and dest hit the same cache line, this will thrash, so make
	! use of additional registers.
	!
	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
	!   r5:	 src (was r0+r5)
	!   r1:	 dest (was r0)
	! this can be reversed at the end, so we don't need to save any extra
	! state.
	!
1:	mov.l	r8, @-r15	!  30 LS
	add	r0, r5		!  49 EX

	mov.l	r9, @-r15	!  30 LS
	mov	r0, r1		!   5 MT (latency=0)

	mov.l	r10, @-r15	!  30 LS
	add	#-0x1e, r5	!  50 EX

	mov.l	r11, @-r15	!  30 LS

	mov.l	r12, @-r15	!  30 LS

	! 17 cycles, 32 bytes per iteration
#ifdef CONFIG_CPU_LITTLE_ENDIAN
	! The post-increment loads advance r5 by 0x20 in total; the add of
	! -0x40 further down leaves it 0x20 lower for the next block.
2:	mov.w	@r5+, r0	!  14 LS (latency=2)		..JI
	add	#-0x20, r1	!  50 EX

	mov.l	@r5+, r3	!  15 LS (latency=2)		NMLK

	mov.l	@r5+, r6	!  15 LS (latency=2)		RQPO
	shll16	r0		! 103 EX			JI..

	mov.l	@r5+, r7	!  15 LS (latency=2)
	xtrct	r3, r0		!  48 EX			LKJI

	mov.l	@r5+, r8	!  15 LS (latency=2)
	xtrct	r6, r3		!  48 EX			PONM

	mov.l	@r5+, r9	!  15 LS (latency=2)
	xtrct	r7, r6		!  48 EX

	mov.l	@r5+, r10	!  15 LS (latency=2)
	xtrct	r8, r7		!  48 EX

	mov.l	@r5+, r11	!  15 LS (latency=2)
	xtrct	r9, r8		!  48 EX

	mov.w	@r5+, r12	!  15 LS (latency=2)
	xtrct	r10, r9		!  48 EX

	movca.l	r0,@r1		!  40 LS (latency=3-7)
	xtrct	r11, r10	!  48 EX

	mov.l	r3, @(0x04,r1)	!  33 LS
	xtrct	r12, r11	!  48 EX

	mov.l	r6, @(0x08,r1)	!  33 LS

	mov.l	r7, @(0x0c,r1)	!  33 LS

	mov.l	r8, @(0x10,r1)	!  33 LS
	add	#-0x40, r5	!  50 EX

	mov.l	r9, @(0x14,r1)	!  33 LS
	cmp/eq	r2,r1		!  54 MT

	mov.l	r10, @(0x18,r1)	!  33 LS
	bf/s	2b		! 109 BR

	 mov.l	r11, @(0x1c,r1)	!  33 LS
#else
	! Big endian variant: the block is read from its top down with
	! displacement addressing.  r5 is lowered by 2 and then by 0x1e,
	! and r1 by 4 and then by 0x1c, so both move 0x20 per block.
2:	mov.w	@(0x1e,r5), r0	!  17 LS (latency=2)
	add	#-2, r5		!  50 EX

	mov.l	@(0x1c,r5), r3	!  18 LS (latency=2)
	add	#-4, r1		!  50 EX

	mov.l	@(0x18,r5), r6	!  18 LS (latency=2)
	shll16	r0		! 103 EX

	mov.l	@(0x14,r5), r7	!  18 LS (latency=2)
	xtrct	r3, r0		!  48 EX

	mov.l	@(0x10,r5), r8	!  18 LS (latency=2)
	xtrct	r6, r3		!  48 EX

	mov.l	@(0x0c,r5), r9	!  18 LS (latency=2)
	xtrct	r7, r6		!  48 EX

	mov.l	@(0x08,r5), r10	!  18 LS (latency=2)
	xtrct	r8, r7		!  48 EX

	mov.l	@(0x04,r5), r11	!  18 LS (latency=2)
	xtrct	r9, r8		!  48 EX

	mov.l	@(0x00,r5), r12	!  18 LS (latency=2)
	xtrct	r10, r9		!  48 EX

	movca.l	r0,@r1		!  40 LS (latency=3-7)
	add	#-0x1c, r1	!  50 EX

	mov.l	r3, @(0x18,r1)	!  33 LS
	xtrct	r11, r10	!  48 EX

	mov.l	r6, @(0x14,r1)	!  33 LS
	xtrct	r12, r11	!  48 EX

	mov.l	r7, @(0x10,r1)	!  33 LS

	mov.l	r8, @(0x0c,r1)	!  33 LS
	add	#-0x1e, r5	!  50 EX

	mov.l	r9, @(0x08,r1)	!  33 LS
	cmp/eq	r2,r1		!  54 MT

	mov.l	r10, @(0x04,r1)	!  33 LS
	bf/s	2b		! 109 BR

	 mov.l	r11, @(0x00,r1)	!  33 LS
#endif

	! Restore the r0/r5 invariant and the saved registers.  The
	! instruction at label 1 below is both the delay slot of the rts
	! (taken when r0 == r4, i.e. nothing is left) and the landing point
	! of the bf/s, so r8 is restored on either path.

	mov.l	@r15+, r12
	mov	r1, r0		!   5 MT (latency=0)

	mov.l	@r15+, r11	!  15 LS
	sub	r1, r5		!  75 EX

	mov.l	@r15+, r10	!  15 LS
	cmp/eq	r4, r0		!  54 MT

	bf/s	1f		! 109 BR
	 mov.l	 @r15+, r9	!  15 LS

	rts
1:	 mov.l	@r15+, r8	!  15 LS

	add	#0x1e, r5	!  50 EX

	! Finish off a short word at a time
	! r5 must be invariant - 2
	! Short words are copied only if r0 > r4 + 1, and then for as long
	! as r0 > r4 + 3 before each store.
10:	mov	r4,r2		!   5 MT (latency=0)
	add	#1,r2		!  50 EX

	cmp/hi	r2, r0		!  57 MT
	bf/s	1f		! 109 BR

	 add	#2, r2		!  50 EX

3:	mov.w	@(r0,r5),r1	!  20 LS
	cmp/hi	r2,r0		!  57 MT

	bt/s	3b		! 109 BR

	 mov.w	r1,@-r0		!  29 LS
1:

	!
	! Finally, copy the last byte if necessary
	! 9b is the nearest preceding label 9, an rts/nop pair.  The delay
	! slot moves r5 to src - dst - 1 for the byte access.
	cmp/eq	r4,r0		!  54 MT
	bt/s	9b
	 add	#1,r5
	mov.b	@(r0,r5),r1
	rts
	 mov.b	r1,@-r0


800