xref: /openbmc/linux/arch/powerpc/lib/memcpy_64.S (revision b8bb76713ec50df2f11efee386e16f93d51e1076)
1/*
2 * Copyright (C) 2002 Paul Mackerras, IBM Corp.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 */
9#include <asm/processor.h>
10#include <asm/ppc_asm.h>
11
12	.align	7
/*
 * memcpy: copy r5 bytes from the address in r4 to the address in r3.
 *
 * In:   r3 = destination, r4 = source, r5 = byte count
 * Out:  r3 = the destination pointer as passed in (saved at 48(r1) on entry
 *       and reloaded in front of every blr)
 * Uses: r0, r4-r12, ctr, cr0, cr1, cr6, cr7
 *
 * NOTE(review): 48(r1) is stored to without creating a stack frame here;
 * presumably that slot is reserved for the callee by the ABI - confirm.
 *
 * cr7 carries the low four bits of the byte count still to be copied:
 * cr7 bit 0 = 8, bit 1 = 4, bit 2 = 2, bit 3 = 1.  The tail and short-copy
 * code tests these bits with bf/bt instead of comparing the count again.
 * (.Ldst_unaligned temporarily loads cr7 with the alignment byte count.)
 *
 * Paths:
 *   count < 16                  -> .Lshort_copy
 *   destination not 8B aligned  -> .Ldst_unaligned, then .Ldst_aligned
 *                                  (unless the feature section nops the bne)
 *   source not 8B aligned       -> .Lsrc_unaligned, a shift-and-merge copy
 *                                  (only when CPU_FTR_UNALIGNED_LD_STD is clear)
 *   otherwise                   -> doubleword loop after .Ldst_aligned
 */
13_GLOBAL(memcpy)
14	std	r3,48(r1)	/* save destination pointer for return value */
15	PPC_MTOCRF	0x01,r5	/* cr7 = low 4 bits of the byte count */
16	cmpldi	cr1,r5,16	/* cr1 = count vs 16 (unsigned) */
17	neg	r6,r3		# LS 3 bits = # bytes to 8-byte dest bdry
18	andi.	r6,r6,7	/* r6 = 0..7; cr0.eq set if dest is already 8B aligned */
19	dcbt	0,r4	/* cache touch hint for the start of the source */
20	blt	cr1,.Lshort_copy	/* fewer than 16 bytes: simple path */
21/* Below we want to nop out the bne if we're on a CPU that has the
22   CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
23   cleared.
24   At the time of writing the only CPU that has this combination of bits
25   set is Power6. */
26BEGIN_FTR_SECTION
27	nop
28FTR_SECTION_ELSE
29	bne	.Ldst_unaligned
30ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
31                    CPU_FTR_UNALIGNED_LD_STD)
/*
 * Main doubleword copy.  Reached by falling through from above or by the
 * branch at the end of .Ldst_unaligned.  r3 is biased by -16 and r4 by -8 so
 * that the loop can use the 8(rX) and update-form 16(rX) displacements.
 */
32.Ldst_aligned:
33	addi	r3,r3,-16
34BEGIN_FTR_SECTION
35	andi.	r0,r4,7	/* r0 = source misalignment within a doubleword */
36	bne	.Lsrc_unaligned
37END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
38	srdi	r7,r5,4	/* r7 = number of 16-byte loop iterations */
39	ld	r9,0(r4)	/* first source doubleword */
40	addi	r4,r4,-8
41	mtctr	r7
42	andi.	r5,r5,7	/* r5 = tail bytes (0..7); cr0.eq set if none */
/* 8 bit of the count clear: even number of doublewords, enter the loop at 2 */
43	bf	cr7*4+0,2f
/* odd number of doublewords: the first one moves to r8 and is stored at 1 or 3 */
44	addi	r3,r3,8
45	addi	r4,r4,8
46	mr	r8,r9
/* cr1 (set at entry or in .Ldst_unaligned) says count < 16: one doubleword only */
47	blt	cr1,3f
/* 16 bytes per iteration; each store writes a doubleword loaded earlier */
481:	ld	r9,8(r4)
49	std	r8,8(r3)
502:	ldu	r8,16(r4)
51	stdu	r9,16(r3)
52	bdnz	1b
533:	std	r8,8(r3)	/* last doubleword still held in r8 */
54	beq	3f	/* cr0 from andi. r5,r5,7 above: no tail, return */
55	addi	r3,r3,16	/* r3 = next destination byte */
/*
 * Copy the remaining 0..7 bytes as word / halfword / byte according to cr7
 * bits 1..3.  r4 is 8 bytes behind the next unread source byte, hence the
 * 8(r4) displacements.
 */
56.Ldo_tail:
57	bf	cr7*4+1,1f
58	lwz	r9,8(r4)
59	addi	r4,r4,4
60	stw	r9,0(r3)
61	addi	r3,r3,4
621:	bf	cr7*4+2,2f
63	lhz	r9,8(r4)
64	addi	r4,r4,2
65	sth	r9,0(r3)
66	addi	r3,r3,2
672:	bf	cr7*4+3,3f
68	lbz	r9,8(r4)
69	stb	r9,0(r3)
703:	ld	r3,48(r1)	/* return dest pointer */
71	blr
72
/*
 * Source is not 8-byte aligned (r0 = source & 7, computed just before the
 * branch here; r3 is still biased by -16).  The source is read as aligned
 * doublewords and every stored doubleword is assembled as
 *     (earlier doubleword << r10) | (following doubleword >> r11)
 * where r10 = 8 * r0 and r11 = 64 - r10.
 */
73.Lsrc_unaligned:
74	srdi	r6,r5,3	/* r6 = count / 8 */
75	addi	r5,r5,-16
76	subf	r4,r0,r4	/* r4 = source rounded down to an 8-byte boundary */
77	srdi	r7,r5,4	/* r7 = (count - 16) / 16, the loop count */
78	sldi	r10,r0,3	/* r10 = left shift in bits */
79	cmpdi	cr6,r6,3	/* cr6 = doubleword count vs 3 */
80	andi.	r5,r5,7	/* r5 = tail bytes (0..7); cr0.eq set if none */
81	mtctr	r7
82	subfic	r11,r10,64	/* r11 = right shift in bits */
83	add	r5,r5,r0	/* r5 = tail bytes + source misalignment, tested against 8 below */
84
/* 8 bit of the count set: odd number of doublewords, use the prologue at 0 */
85	bt	cr7*4+0,0f
86
87	ld	r9,0(r4)	# 3+2n loads, 2+2n stores
88	ld	r0,8(r4)
89	sld	r6,r9,r10
90	ldu	r9,16(r4)
91	srd	r7,r0,r11
92	sld	r8,r0,r10
93	or	r7,r7,r6
94	blt	cr6,4f	/* fewer than 3 doublewords: skip the loop */
95	ld	r0,8(r4)
96	# s1<< in r8, d0=(s0<<|s1>>) in r7, s3 in r0, s2 in r9, nix in r6 & r12
97	b	2f
98
990:	ld	r0,0(r4)	# 4+2n loads, 3+2n stores
100	ldu	r9,8(r4)
101	sld	r8,r0,r10
102	addi	r3,r3,-8	/* extra bias: this path stores one more doubleword */
103	blt	cr6,5f	/* fewer than 3 doublewords: only the store at 5 is needed */
104	ld	r0,8(r4)
105	srd	r12,r9,r11
106	sld	r6,r9,r10
107	ldu	r9,16(r4)
108	or	r12,r8,r12
109	srd	r7,r0,r11
110	sld	r8,r0,r10
111	addi	r3,r3,16
112	beq	cr6,3f	/* exactly 3 doublewords: skip the loop */
113
114	# d0=(s0<<|s1>>) in r12, s1<< in r6, s2>> in r7, s2<< in r8, s3 in r9
/* two merged doublewords stored per iteration */
1151:	or	r7,r7,r6
116	ld	r0,8(r4)
117	std	r12,8(r3)
1182:	srd	r12,r9,r11
119	sld	r6,r9,r10
120	ldu	r9,16(r4)
121	or	r12,r8,r12
122	stdu	r7,16(r3)
123	srd	r7,r0,r11
124	sld	r8,r0,r10
125	bdnz	1b
126
/* drain: store the doublewords still held in registers */
1273:	std	r12,8(r3)
128	or	r7,r7,r6
1294:	std	r7,16(r3)
1305:	srd	r12,r9,r11
131	or	r12,r8,r12
132	std	r12,24(r3)
/*
 * cr0 is still from andi. r5,r5,7 above: no tail bytes, so return.  4f binds
 * to the next 4: label in the file, the return sequence that ends
 * .Lshort_copy.
 */
133	beq	4f
134	cmpwi	cr1,r5,8	/* does the tail extend past the doubleword in r9? */
135	addi	r3,r3,32	/* r3 = next destination byte */
136	sld	r9,r9,r10	/* unread bytes of r9 moved to the high-order end */
137	ble	cr1,6f	/* tail + misalignment <= 8: r9 already holds the whole tail */
138	ld	r0,8(r4)	/* otherwise merge in one more source doubleword */
139	srd	r7,r0,r11
140	or	r9,r7,r9
/*
 * Tail bytes sit at the high-order end of r9.  Each rotldi rotates the next
 * piece down into the low-order bits, which is what stw/sth/stb store.
 */
1416:
142	bf	cr7*4+1,1f
143	rotldi	r9,r9,32
144	stw	r9,0(r3)
145	addi	r3,r3,4
1461:	bf	cr7*4+2,2f
147	rotldi	r9,r9,16
148	sth	r9,0(r3)
149	addi	r3,r3,2
1502:	bf	cr7*4+3,3f
151	rotldi	r9,r9,8
152	stb	r9,0(r3)
1533:	ld	r3,48(r1)	/* return dest pointer */
154	blr
155
/*
 * Destination is not 8-byte aligned.  Copy r6 (1..7) bytes as byte, halfword
 * and word pieces chosen by the bits of r6, using r7 as the running offset.
 * Then advance r3 and r4 by r6 and rejoin .Ldst_aligned with r5, cr1 and cr7
 * recomputed for the bytes that remain.
 */
156.Ldst_unaligned:
157	PPC_MTOCRF	0x01,r6		# put #bytes to 8B bdry into cr7
158	subf	r5,r6,r5	/* r5 = count left after the alignment bytes */
159	li	r7,0
160	cmpldi	cr1,r5,16	/* refresh cr1 for the blt cr1,3f after .Ldst_aligned */
161	bf	cr7*4+3,1f
162	lbz	r0,0(r4)
163	stb	r0,0(r3)
164	addi	r7,r7,1
1651:	bf	cr7*4+2,2f
166	lhzx	r0,r7,r4
167	sthx	r0,r7,r3
168	addi	r7,r7,2
1692:	bf	cr7*4+1,3f
170	lwzx	r0,r7,r4
171	stwx	r0,r7,r3
1723:	PPC_MTOCRF	0x01,r5	/* cr7 = low 4 bits of the remaining count */
173	add	r4,r6,r4
174	add	r3,r6,r3
175	b	.Ldst_aligned
176
/*
 * Fewer than 16 bytes.  cr7 bits 0..3 select, in turn, an 8-byte piece
 * (copied as two words), a word, a halfword and a byte.
 */
177.Lshort_copy:
178	bf	cr7*4+0,1f
179	lwz	r0,0(r4)
180	lwz	r9,4(r4)
181	addi	r4,r4,8
182	stw	r0,0(r3)
183	stw	r9,4(r3)
184	addi	r3,r3,8
1851:	bf	cr7*4+1,2f
186	lwz	r0,0(r4)
187	addi	r4,r4,4
188	stw	r0,0(r3)
189	addi	r3,r3,4
1902:	bf	cr7*4+2,3f
191	lhz	r0,0(r4)
192	addi	r4,r4,2
193	sth	r0,0(r3)
194	addi	r3,r3,2
1953:	bf	cr7*4+3,4f
196	lbz	r0,0(r4)
197	stb	r0,0(r3)
/* also the target of the beq 4f in .Lsrc_unaligned */
1984:	ld	r3,48(r1)	/* return dest pointer */
199	blr
200