/* xref: /openbmc/linux/arch/powerpc/lib/memcpy_64.S (revision 74ba9207e1adf1966c57450340534ae9742d00af) */
/*
 * Copyright (C) 2002 Paul Mackerras, IBM Corp.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/asm-compat.h>
#include <asm/feature-fixups.h>
#include <asm/kasan.h>

#ifndef SELFTEST_CASE
/* For big-endian, 0 == most CPUs, 1 == POWER6, 2 == Cell */
#define SELFTEST_CASE	0
#endif

/*
 * memcpy - copy r5 bytes from the buffer at r4 to the buffer at r3.
 *
 * In:    r3 = destination, r4 = source, r5 = byte count
 * Out:   r3 = destination pointer exactly as passed in
 * Uses:  big-endian path:    r0, r6-r12, ctr, cr0, cr1, cr6, cr7
 *        little-endian path: r9, r10, ctr, cr7
 *        r3 (big-endian only), r4 and r5 are modified while copying.
 * Stack: the big-endian path parks the original r3 in the slot
 *        -STACKFRAMESIZE+STK_REG(R31)(r1) and reloads it before every
 *        blr; r1 itself is never adjusted.
 *
 * Big-endian strategy, in the order the code tries it:
 *   - fewer than 16 bytes:        .Lshort_copy (8/4/2/1-byte pieces)
 *   - destination not 8B aligned: .Ldst_unaligned copies 1/2/4 bytes to
 *                                 reach the boundary, then rejoins
 *   - source 8B aligned:          16-bytes-per-iteration ld/std loop,
 *                                 then .Ldo_tail
 *   - source not 8B aligned:      .Lsrc_unaligned builds each destination
 *                                 doubleword from two aligned source
 *                                 doublewords with sld/srd/or
 *
 * Throughout the big-endian path cr7 holds the low four bits of the
 * remaining length: bit 0 <-> 8 bytes, bit 1 <-> 4, bit 2 <-> 2,
 * bit 3 <-> 1.
 *
 * Nothing in this routine checks whether the two buffers overlap.
 */
	.align	7
_GLOBAL_TOC_KASAN(memcpy)
/*
 * NOTE(review): the feature-section macros come from asm/feature-fixups.h.
 * Presumably the first alternative stays in place when CPU_FTR_VMX_COPY is
 * clear and the branch to memcpy_power7 is patched in otherwise - confirm
 * against that header.
 */
BEGIN_FTR_SECTION
#ifdef __LITTLE_ENDIAN__
	cmpdi	cr7,r5,0
#else
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* save destination pointer for return value */
#endif
FTR_SECTION_ELSE
#ifdef CONFIG_PPC_BOOK3S_64
	b	memcpy_power7
#endif
ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
#ifdef __LITTLE_ENDIAN__
	/* dumb little-endian memcpy that will get replaced at runtime */
	/* r9/r4 are pre-decremented so lbzu/stbu can step with a +1 update;
	   r3 is left untouched and is therefore already the return value. */
	addi r9,r3,-1
	addi r4,r4,-1
	beqlr cr7
	mtctr r5
1:	lbzu r10,1(r4)
	stbu r10,1(r9)
	bdnz 1b
	blr
#else
	PPC_MTOCRF(0x01,r5)
	cmpldi	cr1,r5,16
	neg	r6,r3		# LS 3 bits = # bytes to 8-byte dest bdry
	andi.	r6,r6,7
	dcbt	0,r4
	blt	cr1,.Lshort_copy
/* Below we want to nop out the bne if we're on a CPU that has the
   CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
   cleared.
   At the time of writing the only CPU that has this combination of bits
   set is Power6. */
test_feature = (SELFTEST_CASE == 1)
BEGIN_FTR_SECTION
	nop
FTR_SECTION_ELSE
	bne	.Ldst_unaligned
ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
                    CPU_FTR_UNALIGNED_LD_STD)
.Ldst_aligned:
	/* Bias r3 by -16 so the loop's 8(r3) / stdu 16(r3) forms hit the
	   intended destination doublewords. */
	addi	r3,r3,-16
test_feature = (SELFTEST_CASE == 0)
BEGIN_FTR_SECTION
	andi.	r0,r4,7
	bne	.Lsrc_unaligned
END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
	/* ctr = number of 16-byte chunks; r5 becomes the sub-doubleword
	   tail count, and the cr0 result of this andi. is what the
	   "beq 3f" after the loop tests. */
	srdi	r7,r5,4
	ld	r9,0(r4)
	addi	r4,r4,-8
	mtctr	r7
	andi.	r5,r5,7
	/* Length bit 8 clear: even number of doublewords, enter the loop
	   at its second half. */
	bf	cr7*4+0,2f
	addi	r3,r3,8
	addi	r4,r4,8
	mr	r8,r9
	/* cr1 "lt" means fewer than 16 bytes remain: store the single
	   doubleword already loaded and skip the loop. */
	blt	cr1,3f
1:	ld	r9,8(r4)
	std	r8,8(r3)
2:	ldu	r8,16(r4)
	stdu	r9,16(r3)
	bdnz	1b
3:	std	r8,8(r3)
	beq	3f
	addi	r3,r3,16
.Ldo_tail:
	/* r4 trails the next unread source byte by 8 here, hence the 8(r4)
	   displacements. Copy 4, 2 and 1 bytes as selected by cr7. */
	bf	cr7*4+1,1f
	lwz	r9,8(r4)
	addi	r4,r4,4
	stw	r9,0(r3)
	addi	r3,r3,4
1:	bf	cr7*4+2,2f
	lhz	r9,8(r4)
	addi	r4,r4,2
	sth	r9,0(r3)
	addi	r3,r3,2
2:	bf	cr7*4+3,3f
	lbz	r9,8(r4)
	stb	r9,0(r3)
3:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* return dest pointer */
	blr

.Lsrc_unaligned:
	/*
	 * On entry r0 = source address & 7 (non-zero). Set-up below:
	 *   r6  = length / 8, compared against 3 into cr6
	 *   r4  = source rounded down to an 8-byte boundary
	 *   ctr = (length - 16) / 16
	 *   r10 = 8 * r0 (left-shift amount), r11 = 64 - r10 (right-shift)
	 *   r5  = (length & 7) + r0; cr0 from the andi. is kept for the
	 *         "beq 4f" after the main loop
	 */
	srdi	r6,r5,3
	addi	r5,r5,-16
	subf	r4,r0,r4
	srdi	r7,r5,4
	sldi	r10,r0,3
	cmpdi	cr6,r6,3
	andi.	r5,r5,7
	mtctr	r7
	subfic	r11,r10,64
	add	r5,r5,r0

	bt	cr7*4+0,0f

	ld	r9,0(r4)	# 3+2n loads, 2+2n stores
	ld	r0,8(r4)
	sld	r6,r9,r10
	ldu	r9,16(r4)
	srd	r7,r0,r11
	sld	r8,r0,r10
	or	r7,r7,r6
	blt	cr6,4f
	ld	r0,8(r4)
	# s1<< in r8, d0=(s0<<|s1>>) in r7, s3 in r0, s2 in r9, nix in r6 & r12
	b	2f

0:	ld	r0,0(r4)	# 4+2n loads, 3+2n stores
	ldu	r9,8(r4)
	sld	r8,r0,r10
	addi	r3,r3,-8
	blt	cr6,5f
	ld	r0,8(r4)
	srd	r12,r9,r11
	sld	r6,r9,r10
	ldu	r9,16(r4)
	or	r12,r8,r12
	srd	r7,r0,r11
	sld	r8,r0,r10
	addi	r3,r3,16
	beq	cr6,3f

	# d0=(s0<<|s1>>) in r12, s1<< in r6, s2>> in r7, s2<< in r8, s3 in r9
1:	or	r7,r7,r6
	ld	r0,8(r4)
	std	r12,8(r3)
2:	srd	r12,r9,r11
	sld	r6,r9,r10
	ldu	r9,16(r4)
	or	r12,r8,r12
	stdu	r7,16(r3)
	srd	r7,r0,r11
	sld	r8,r0,r10
	bdnz	1b

3:	std	r12,8(r3)
	or	r7,r7,r6
4:	std	r7,16(r3)
5:	srd	r12,r9,r11
	or	r12,r8,r12
	std	r12,24(r3)
	/* No tail bytes: the next "4:" label forward of here is the common
	   return sequence at the end of .Lshort_copy. */
	beq	4f
	/* r5 = (length & 7) + (source & 7). One more source doubleword is
	   loaded and merged into r9 only when that sum exceeds 8. */
	cmpwi	cr1,r5,8
	addi	r3,r3,32
	sld	r9,r9,r10
	ble	cr1,6f
	ld	r0,8(r4)
	srd	r7,r0,r11
	or	r9,r7,r9
6:
	/* Each rotldi rotates the next tail bytes of r9 into the low-order
	   position that stw/sth/stb store. */
	bf	cr7*4+1,1f
	rotldi	r9,r9,32
	stw	r9,0(r3)
	addi	r3,r3,4
1:	bf	cr7*4+2,2f
	rotldi	r9,r9,16
	sth	r9,0(r3)
	addi	r3,r3,2
2:	bf	cr7*4+3,3f
	rotldi	r9,r9,8
	stb	r9,0(r3)
3:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* return dest pointer */
	blr

.Ldst_unaligned:
	/* r6 = bytes needed to reach an 8-byte destination boundary (1..7).
	   Copy them as 1, 2 and 4-byte pieces using r7 as a running offset,
	   then reload cr7 and cr1 from the reduced length and rejoin the
	   aligned path. */
	PPC_MTOCRF(0x01,r6)		# put #bytes to 8B bdry into cr7
	subf	r5,r6,r5
	li	r7,0
	cmpldi	cr1,r5,16
	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	stb	r0,0(r3)
	addi	r7,r7,1
1:	bf	cr7*4+2,2f
	lhzx	r0,r7,r4
	sthx	r0,r7,r3
	addi	r7,r7,2
2:	bf	cr7*4+1,3f
	lwzx	r0,r7,r4
	stwx	r0,r7,r3
3:	PPC_MTOCRF(0x01,r5)
	add	r4,r6,r4
	add	r3,r6,r3
	b	.Ldst_aligned

.Lshort_copy:
	/* Fewer than 16 bytes: copy 8 (as two words), 4, 2 and 1 bytes as
	   selected by cr7 bits 0..3. */
	bf	cr7*4+0,1f
	lwz	r0,0(r4)
	lwz	r9,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r9,4(r3)
	addi	r3,r3,8
1:	bf	cr7*4+1,2f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4
2:	bf	cr7*4+2,3f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2
3:	bf	cr7*4+3,4f
	lbz	r0,0(r4)
	stb	r0,0(r3)
4:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* return dest pointer */
	blr
#endif
EXPORT_SYMBOL(memcpy)
EXPORT_SYMBOL_KASAN(memcpy)
235