xref: /openbmc/linux/arch/powerpc/lib/memcpy_64.S (revision 8fa5723aa7e053d498336b48448b292fc2e0458b)
1/*
2 * Copyright (C) 2002 Paul Mackerras, IBM Corp.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 */
9#include <asm/processor.h>
10#include <asm/ppc_asm.h>
11
12	.align	7
/*
 * void *memcpy(void *dest, const void *src, size_t n)
 *
 * In:    r3 = dest, r4 = src, r5 = n (bytes to copy)
 * Out:   r3 = dest, reloaded from the 48(r1) slot written on entry
 * Uses:  r0, r6-r12, ctr, cr0, cr1, cr6, cr7
 *
 * Outline:
 *   - n < 16: .Lshort_copy moves 8/4/2/1 bytes as selected by the low
 *     four bits of n.
 *   - dest not 8-byte aligned: .Ldst_unaligned copies the 1-7 leading
 *     bytes needed to align dest, then rejoins .Ldst_aligned.
 *   - dest and src both 8-byte aligned: doubleword loop at 1:/2:, two
 *     doublewords per iteration.
 *   - dest aligned, src not: .Lsrc_unaligned loads aligned doublewords
 *     and merges neighbouring ones with shifts before each store.
 *   - the final n & 7 bytes are written by .Ldo_tail.
 *
 * cr7 holds the low four bits of the byte count currently being worked
 * on: cr7*4+0 is the 8s bit, +1 the 4s bit, +2 the 2s bit, +3 the 1s bit.
 *
 * The numeric local labels are shared between the sections below; in
 * particular the "beq 4f" in .Lsrc_unaligned resolves to the 4: return
 * label at the end of .Lshort_copy.
 *
 * NOTE(review): 48(r1) is presumably the caller-provided save slot for
 * r3 in the 64-bit ELF stack frame - confirm against the ABI in use.
 * NOTE(review): .Ldo_tail (rotldi + store of the low bytes) and the
 * sld/srd merge in .Lsrc_unaligned look big-endian specific - confirm
 * before building this for a little-endian target.
 */
_GLOBAL(memcpy)
	std	r3,48(r1)	/* save destination pointer for return value */
	PPC_MTOCRF	0x01,r5		/* cr7 = low four bits of n */
	cmpldi	cr1,r5,16	/* cr1: n versus 16 (unsigned) */
	neg	r6,r3		# LS 3 bits = # bytes to 8-byte dest bdry
	andi.	r6,r6,7		/* cr0.eq set when dest is already 8-byte aligned */
	dcbt	0,r4		/* cache-touch hint for the start of the source */
	blt	cr1,.Lshort_copy
	bne	.Ldst_unaligned
.Ldst_aligned:
	andi.	r0,r4,7		/* r0 = src & 7 */
	addi	r3,r3,-16	/* bias dest for the 8(r3)/16(r3) stores below */
	bne	.Lsrc_unaligned
	srdi	r7,r5,4		/* r7 = n / 16 = loop iterations */
	ld	r9,0(r4)	/* first source doubleword */
	addi	r4,r4,-8
	mtctr	r7
	andi.	r5,r5,7		/* r5 = tail byte count; cr0.eq set if none */
	bf	cr7*4+0,2f	/* 8s bit clear: even doubleword count, enter at 2: */
	addi	r3,r3,8		/* odd doubleword count: undo half of the bias ... */
	addi	r4,r4,8
	mr	r8,r9		/* ... and treat the first doubleword as pending in r8 */
	blt	cr1,3f		/* n < 16 (only after .Ldst_unaligned): one doubleword only */
1:	ld	r9,8(r4)
	std	r8,8(r3)
2:	ldu	r8,16(r4)
	stdu	r9,16(r3)
	bdnz	1b
3:	std	r8,8(r3)	/* store the last pending doubleword */
	beq	3f		/* no tail bytes: return */
	addi	r3,r3,16	/* r3 = first byte after the doublewords stored */
	ld	r9,8(r4)	/* doubleword that contains the tail bytes */
/*
 * Tail store.  r9 = doubleword whose leading (most significant) bytes are
 * the remaining data, r3 = next destination byte, cr7 bits 1-3 = n & 7.
 * Each rotldi brings the next group of bytes down to the low end of r9 so
 * that stw/sth/stb can write it.
 */
.Ldo_tail:
	bf	cr7*4+1,1f
	rotldi	r9,r9,32
	stw	r9,0(r3)
	addi	r3,r3,4
1:	bf	cr7*4+2,2f
	rotldi	r9,r9,16
	sth	r9,0(r3)
	addi	r3,r3,2
2:	bf	cr7*4+3,3f
	rotldi	r9,r9,8
	stb	r9,0(r3)
3:	ld	r3,48(r1)	/* return dest pointer */
	blr

/*
 * dest is 8-byte aligned, src is not (r0 = src & 7, non-zero).
 * The source is read as aligned doublewords; every destination doubleword
 * is (previous << r10) | (next >> r11).
 */
.Lsrc_unaligned:
	srdi	r6,r5,3		/* r6 = n / 8 (whole doublewords) */
	addi	r5,r5,-16
	subf	r4,r0,r4	/* round src down to an 8-byte boundary */
	srdi	r7,r5,4		/* r7 = (n - 16) / 16 = loop iterations */
	sldi	r10,r0,3	/* r10 = left shift in bits (8 * src offset) */
	cmpdi	cr6,r6,3	/* cr6: doubleword count versus 3 */
	andi.	r5,r5,7		/* tail byte count; cr0.eq set if none */
	mtctr	r7
	subfic	r11,r10,64	/* r11 = complementary right shift (64 - r10) */
	add	r5,r5,r0	/* r5 = tail bytes + src offset, tested against 8 below */

	bt	cr7*4+0,0f	/* odd doubleword count takes the 0: entry */

	ld	r9,0(r4)	# 3+2n loads, 2+2n stores
	ld	r0,8(r4)
	sld	r6,r9,r10
	ldu	r9,16(r4)
	srd	r7,r0,r11
	sld	r8,r0,r10
	or	r7,r7,r6
	blt	cr6,4f		/* fewer than 3 doublewords: skip the loop */
	ld	r0,8(r4)
	# s1<< in r8, d0=(s0<<|s1>>) in r7, s3 in r0, s2 in r9, nix in r6 & r12
	b	2f

0:	ld	r0,0(r4)	# 4+2n loads, 3+2n stores
	ldu	r9,8(r4)
	sld	r8,r0,r10
	addi	r3,r3,-8
	blt	cr6,5f		/* fewer than 3 doublewords: single store at 5: */
	ld	r0,8(r4)
	srd	r12,r9,r11
	sld	r6,r9,r10
	ldu	r9,16(r4)
	or	r12,r8,r12
	srd	r7,r0,r11
	sld	r8,r0,r10
	addi	r3,r3,16
	beq	cr6,3f		/* exactly 3 doublewords: skip the loop */

	# d0=(s0<<|s1>>) in r12, s1<< in r6, s2>> in r7, s2<< in r8, s3 in r9
1:	or	r7,r7,r6
	ld	r0,8(r4)
	std	r12,8(r3)
2:	srd	r12,r9,r11
	sld	r6,r9,r10
	ldu	r9,16(r4)
	or	r12,r8,r12
	stdu	r7,16(r3)
	srd	r7,r0,r11
	sld	r8,r0,r10
	bdnz	1b

3:	std	r12,8(r3)
	or	r7,r7,r6
4:	std	r7,16(r3)
5:	srd	r12,r9,r11
	or	r12,r8,r12
	std	r12,24(r3)
	beq	4f		/* no tail bytes: return via 4: at the end of .Lshort_copy */
	cmpwi	cr1,r5,8
	addi	r3,r3,32	/* r3 = first byte after the doublewords stored */
	sld	r9,r9,r10	/* left-justify the unread bytes of the last doubleword */
	ble	cr1,.Ldo_tail	/* tail + offset <= 8: r9 already holds every tail byte */
	ld	r0,8(r4)	/* otherwise the tail spills into one more doubleword */
	srd	r7,r0,r11
	or	r9,r7,r9
	b	.Ldo_tail

/*
 * dest is not 8-byte aligned (r6 = bytes needed to reach the boundary,
 * 1-7).  Copy those bytes, advance both pointers by r6 and rejoin the
 * aligned path with the reduced count.
 */
.Ldst_unaligned:
	PPC_MTOCRF	0x01,r6		# put #bytes to 8B bdry into cr7
	subf	r5,r6,r5	/* n -= leading bytes */
	li	r7,0		/* r7 = running offset into src and dest */
	/*
	 * Refresh cr1 for the reduced n: .Ldst_aligned tests it with
	 * "blt cr1,3f".  The first operand is a condition-register field,
	 * so it is written cr1 rather than with a GPR name.
	 */
	cmpldi	cr1,r5,16
	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	stb	r0,0(r3)
	addi	r7,r7,1
1:	bf	cr7*4+2,2f
	lhzx	r0,r7,r4
	sthx	r0,r7,r3
	addi	r7,r7,2
2:	bf	cr7*4+1,3f
	lwzx	r0,r7,r4
	stwx	r0,r7,r3
3:	PPC_MTOCRF	0x01,r5		/* cr7 = low four bits of the remaining n */
	add	r4,r6,r4
	add	r3,r6,r3
	b	.Ldst_aligned

/*
 * n < 16: copy 8, 4, 2 and 1 bytes as selected by the bits of n in cr7.
 * The 8-byte step is done as two word accesses.
 */
.Lshort_copy:
	bf	cr7*4+0,1f
	lwz	r0,0(r4)
	lwz	r9,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r9,4(r3)
	addi	r3,r3,8
1:	bf	cr7*4+1,2f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4
2:	bf	cr7*4+2,3f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2
3:	bf	cr7*4+3,4f
	lbz	r0,0(r4)
	stb	r0,0(r3)
4:	ld	r3,48(r1)	/* return dest pointer */
	blr
174