xref: /openbmc/linux/tools/testing/selftests/powerpc/copyloops/memcpy_64.S (revision 023e41632e065d49bcbe31b3c4b336217f96a271)
1/*
2 * Copyright (C) 2002 Paul Mackerras, IBM Corp.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 */
9#include <asm/processor.h>
10#include <asm/ppc_asm.h>
11#include <asm/export.h>
12#include <asm/asm-compat.h>
13#include <asm/feature-fixups.h>
14
/*
 * Default for SELFTEST_CASE when the build does not define it.
 * The value is compared against 1 and 0 by the
 * "test_feature = (SELFTEST_CASE == N)" assignments placed in front of the
 * feature sections of the big-endian memcpy path further down.
 * NOTE(review): how test_feature is consumed is decided by the feature-fixup
 * macros, which are defined outside this file - confirm there.
 */
15#ifndef SELFTEST_CASE
16/* For big-endian, 0 == most CPUs, 1 == POWER6, 2 == Cell */
17#define SELFTEST_CASE	0
18#endif
19
/*
 * memcpy - copy r5 bytes from the buffer at r4 to the buffer at r3.
 *
 * Register roles, as used by the code below:
 *   r3 = destination pointer (also the value returned to the caller)
 *   r4 = source pointer
 *   r5 = byte count
 *
 * Little-endian builds use a plain byte-at-a-time loop that never modifies
 * r3.  Big-endian builds save r3 in a stack slot at function entry, work on
 * r3 directly, and reload it from that slot before every return.
 *
 * The entry point is aligned to 2^7 = 128 bytes.
 */
20	.align	7
21_GLOBAL_TOC(memcpy)
/*
 * Runtime-patched alternative keyed on CPU_FTR_VMX_COPY.  Going by the macro
 * name (the definition lives in asm/feature-fixups.h and is not visible
 * here), the first alternative is in effect when the feature bit is clear
 * and the FTR_SECTION_ELSE alternative when it is set:
 *   - first alternative: little-endian builds compare the length with zero
 *     (result in cr7); big-endian builds stash the destination pointer;
 *   - else alternative: on CONFIG_PPC_BOOK3S_64 builds, branch (without
 *     link) to memcpy_power7, so nothing below runs.
 */
22BEGIN_FTR_SECTION
23#ifdef __LITTLE_ENDIAN__
24	cmpdi	cr7,r5,0
25#else
26	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* save destination pointer for return value */
27#endif
28FTR_SECTION_ELSE
29#ifdef CONFIG_PPC_BOOK3S_64
30	b	memcpy_power7
31#endif
32ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
33#ifdef __LITTLE_ENDIAN__
34	/* dumb little-endian memcpy that will get replaced at runtime */
35	addi r9,r3,-1		/* r9 = dest - 1; r3 stays intact as the return value */
36	addi r4,r4,-1		/* bias both pointers by -1 for the update-form accesses */
37	beqlr cr7		/* length == 0 (cr7 from the cmpdi above): return now */
38	mtctr r5		/* CTR = byte count */
391:	lbzu r10,1(r4)		/* advance r4 by one and load that byte */
40	stbu r10,1(r9)		/* advance r9 by one and store the byte there */
41	bdnz 1b			/* decrement CTR, loop while it is non-zero */
42	blr
43#else
	/*
	 * Big-endian prologue: set up the condition-register state that the
	 * copy paths rely on, then dispatch.
	 *   cr7 = low 4 bits of the length; bits cr7*4+0..+3 flag an 8-, 4-,
	 *         2- and 1-byte piece respectively
	 *   cr1 = unsigned comparison of the length against 16
	 *   cr0 = result of the andi. on r6 (eq when dest is 8-byte aligned)
	 */
44	PPC_MTOCRF(0x01,r5)
45	cmpldi	cr1,r5,16
46	neg	r6,r3		# LS 3 bits = # bytes to 8-byte dest bdry
47	andi.	r6,r6,7
48	dcbt	0,r4		/* cache-touch hint for the start of the source */
49	blt	cr1,.Lshort_copy	/* fewer than 16 bytes: flag-driven short path */
50/* Below we want to nop out the bne if we're on a CPU that has the
51   CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
52   cleared.
53   At the time of writing the only CPU that has this combination of bits
54   set is Power6. */
/*
 * test_feature is an assembler symbol assigned from SELFTEST_CASE.
 * NOTE(review): presumably read by the selftest flavour of the FTR macros to
 * pick one alternative at build time - confirm against their definition.
 */
55test_feature = (SELFTEST_CASE == 1)
56BEGIN_FTR_SECTION
57	nop
58FTR_SECTION_ELSE
59	bne	.Ldst_unaligned	/* cr0 from the andi. above: dest is not 8-byte aligned */
60ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
61                    CPU_FTR_UNALIGNED_LD_STD)
/* Execution falls through into .Ldst_aligned. */
/*
 * .Ldst_aligned - main doubleword copy loop (big-endian build only).
 *
 * Reached by fall-through from the prologue or by branch from
 * .Ldst_unaligned.  On entry:
 *   r3/r4 = current dest/source, r5 = bytes still to copy
 *   cr7   = low 4 bits of r5 (8/4/2/1-byte flags)
 *   cr1   = unsigned comparison of r5 against 16
 *
 * The loop moves 16 bytes per iteration with the loads running one
 * doubleword ahead of the stores; leftover 4/2/1-byte pieces are handled by
 * .Ldo_tail.  r3 is biased by -16 so the loop can address 8(r3) and use the
 * update-form store at 16(r3).
 */
62.Ldst_aligned:
63	addi	r3,r3,-16
64test_feature = (SELFTEST_CASE == 0)
/*
 * On CPUs without CPU_FTR_UNALIGNED_LD_STD (going by the IFCLR macro name):
 * r0 = source misalignment within a doubleword; divert to the shift-and-merge
 * path when it is non-zero.
 */
65BEGIN_FTR_SECTION
66	andi.	r0,r4,7
67	bne	.Lsrc_unaligned
68END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
69	srdi	r7,r5,4		/* r7 = number of 16-byte iterations */
70	ld	r9,0(r4)	/* load the first doubleword ahead of the loop */
71	addi	r4,r4,-8	/* bias source for the ldu at label 2 */
72	mtctr	r7
73	andi.	r5,r5,7		/* r5 = sub-doubleword tail length; cr0.eq if none */
74	bf	cr7*4+0,2f	/* no odd doubleword: enter the loop at label 2 */
	/* Odd doubleword count: shift the pipeline by one doubleword. */
75	addi	r3,r3,8
76	addi	r4,r4,8
77	mr	r8,r9
78	blt	cr1,3f		/* fewer than 16 bytes left: only this doubleword to store */
791:	ld	r9,8(r4)
80	std	r8,8(r3)
812:	ldu	r8,16(r4)
82	stdu	r9,16(r3)
83	bdnz	1b
843:	std	r8,8(r3)	/* store the last doubleword that was loaded */
85	beq	3f		/* cr0 from the andi. above: no tail bytes, return */
86	addi	r3,r3,16	/* undo the bias: r3 = first byte not yet written */
/*
 * .Ldo_tail - copy the final 4, 2 and 1 bytes as flagged by cr7 bits 1..3.
 * r4 still points at the last doubleword loaded, so the remaining source
 * bytes start at 8(r4).
 */
87.Ldo_tail:
88	bf	cr7*4+1,1f
89	lwz	r9,8(r4)
90	addi	r4,r4,4
91	stw	r9,0(r3)
92	addi	r3,r3,4
931:	bf	cr7*4+2,2f
94	lhz	r9,8(r4)
95	addi	r4,r4,2
96	sth	r9,0(r3)
97	addi	r3,r3,2
982:	bf	cr7*4+3,3f
99	lbz	r9,8(r4)
100	stb	r9,0(r3)
1013:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* return dest pointer */
102	blr
103
/*
 * .Lsrc_unaligned - dest is 8-byte aligned, source is not (big-endian build).
 *
 * Entered from .Ldst_aligned with r0 = source & 7, r3 = dest - 16,
 * r5 = bytes to copy and cr7 = low 4 bits of r5.
 *
 * The source pointer is rounded down to a doubleword boundary and only
 * aligned doublewords are loaded.  Each destination doubleword is built from
 * two neighbouring source doublewords as
 *     (s[k] << r10) | (s[k+1] >> r11)
 * with r10 = 8 * misalignment (bits) and r11 = 64 - r10.
 *
 * Setup results:
 *   r6  = length / 8, compared against 3 into cr6
 *   CTR = (length - 16) / 16
 *   r5  = (length & 7) + source misalignment; cr0.eq (from the andi.)
 *         records whether length & 7 is zero
 */
104.Lsrc_unaligned:
105	srdi	r6,r5,3
106	addi	r5,r5,-16
107	subf	r4,r0,r4	/* round the source pointer down to 8 bytes */
108	srdi	r7,r5,4
109	sldi	r10,r0,3	/* r10 = left-shift count in bits */
110	cmpdi	cr6,r6,3
111	andi.	r5,r5,7
112	mtctr	r7
113	subfic	r11,r10,64	/* r11 = right-shift count in bits */
114	add	r5,r5,r0
115
	/* cr7 bit 0 set means an odd number of doublewords: use the 0: entry. */
116	bt	cr7*4+0,0f
117
118	ld	r9,0(r4)	# 3+2n loads, 2+2n stores
119	ld	r0,8(r4)
120	sld	r6,r9,r10
121	ldu	r9,16(r4)
122	srd	r7,r0,r11
123	sld	r8,r0,r10
124	or	r7,r7,r6
125	blt	cr6,4f		/* fewer than 3 doublewords: skip the loop */
126	ld	r0,8(r4)
127	# s1<< in r8, d0=(s0<<|s1>>) in r7, s3 in r0, s2 in r9, nix in r6 & r12
128	b	2f
129
1300:	ld	r0,0(r4)	# 4+2n loads, 3+2n stores
131	ldu	r9,8(r4)
132	sld	r8,r0,r10
133	addi	r3,r3,-8
134	blt	cr6,5f		/* fewer than 3 doublewords: one merged store only */
135	ld	r0,8(r4)
136	srd	r12,r9,r11
137	sld	r6,r9,r10
138	ldu	r9,16(r4)
139	or	r12,r8,r12
140	srd	r7,r0,r11
141	sld	r8,r0,r10
142	addi	r3,r3,16
143	beq	cr6,3f		/* exactly 3 doublewords: skip the loop */
144
145	# d0=(s0<<|s1>>) in r12, s1<< in r6, s2>> in r7, s2<< in r8, s3 in r9
	/* Main loop: two merged doublewords are stored per iteration. */
1461:	or	r7,r7,r6
147	ld	r0,8(r4)
148	std	r12,8(r3)
1492:	srd	r12,r9,r11
150	sld	r6,r9,r10
151	ldu	r9,16(r4)
152	or	r12,r8,r12
153	stdu	r7,16(r3)
154	srd	r7,r0,r11
155	sld	r8,r0,r10
156	bdnz	1b
157
	/* Drain: store the merged doublewords still held in registers. */
1583:	std	r12,8(r3)
159	or	r7,r7,r6
1604:	std	r7,16(r3)
1615:	srd	r12,r9,r11
162	or	r12,r8,r12
163	std	r12,24(r3)
	/*
	 * cr0.eq (from the andi. in the setup) means no sub-doubleword tail.
	 * 4f is the next "4:" label further down the file: the reload-r3-and-
	 * return sequence that ends .Lshort_copy.
	 */
164	beq	4f
165	cmpwi	cr1,r5,8
166	addi	r3,r3,32	/* r3 = first byte not yet written */
167	sld	r9,r9,r10	/* left-justify the unused bytes of the last doubleword */
168	ble	cr1,6f		/* tail fits in the bytes already loaded */
	/* The tail reaches into one more source doubleword: load and merge. */
169	ld	r0,8(r4)
170	srd	r7,r0,r11
171	or	r9,r7,r9
	/*
	 * The tail bytes sit left-justified in r9.  Each rotate brings the
	 * next 4, 2 or 1 bytes down to the low end for the store, as flagged
	 * by cr7 bits 1..3.
	 */
1726:
173	bf	cr7*4+1,1f
174	rotldi	r9,r9,32
175	stw	r9,0(r3)
176	addi	r3,r3,4
1771:	bf	cr7*4+2,2f
178	rotldi	r9,r9,16
179	sth	r9,0(r3)
180	addi	r3,r3,2
1812:	bf	cr7*4+3,3f
182	rotldi	r9,r9,8
183	stb	r9,0(r3)
1843:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* return dest pointer */
185	blr
186
/*
 * .Ldst_unaligned - copy the 1..7 leading bytes needed to bring the
 * destination up to an 8-byte boundary, then rejoin .Ldst_aligned.
 *
 * On entry r6 = number of bytes to the boundary (computed in the prologue).
 * Its low bits are moved into cr7 so that bits +3, +2 and +1 select a
 * 1-, 2- and 4-byte copy in turn; r7 tracks the running offset from r3/r4.
 * r5 is reduced by r6 first, and cr1/cr7 are re-derived from the reduced
 * length for the code at .Ldst_aligned.
 */
187.Ldst_unaligned:
188	PPC_MTOCRF(0x01,r6)		# put #bytes to 8B bdry into cr7
189	subf	r5,r6,r5	/* r5 = bytes left after the head copy */
190	li	r7,0		/* r7 = offset of the next head byte */
191	cmpldi	cr1,r5,16	/* refresh cr1 for the remaining length */
192	bf	cr7*4+3,1f
193	lbz	r0,0(r4)
194	stb	r0,0(r3)
195	addi	r7,r7,1
1961:	bf	cr7*4+2,2f
197	lhzx	r0,r7,r4
198	sthx	r0,r7,r3
199	addi	r7,r7,2
2002:	bf	cr7*4+1,3f
201	lwzx	r0,r7,r4
202	stwx	r0,r7,r3
2033:	PPC_MTOCRF(0x01,r5)	/* cr7 = low 4 bits of the remaining length */
204	add	r4,r6,r4	/* step both pointers past the head bytes */
205	add	r3,r6,r3
206	b	.Ldst_aligned
207
/*
 * .Lshort_copy - copies of fewer than 16 bytes (big-endian build).
 *
 * cr7 holds the low 4 bits of the length, set in the prologue.  Bits +0, +1,
 * +2 and +3 select an 8-, 4-, 2- and 1-byte piece, copied in that order with
 * both pointers advanced after each piece.  The 8-byte piece is moved as two
 * word loads and two word stores.
 *
 * The "4:" label at the end is also the target of the "beq 4f" in
 * .Lsrc_unaligned.
 */
208.Lshort_copy:
209	bf	cr7*4+0,1f
210	lwz	r0,0(r4)
211	lwz	r9,4(r4)
212	addi	r4,r4,8
213	stw	r0,0(r3)
214	stw	r9,4(r3)
215	addi	r3,r3,8
2161:	bf	cr7*4+1,2f
217	lwz	r0,0(r4)
218	addi	r4,r4,4
219	stw	r0,0(r3)
220	addi	r3,r3,4
2212:	bf	cr7*4+2,3f
222	lhz	r0,0(r4)
223	addi	r4,r4,2
224	sth	r0,0(r3)
225	addi	r3,r3,2
2263:	bf	cr7*4+3,4f
227	lbz	r0,0(r4)
228	stb	r0,0(r3)
2294:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* return dest pointer */
230	blr
/* End of the #ifdef __LITTLE_ENDIAN__ / #else split of the copy code. */
231#endif
232EXPORT_SYMBOL(memcpy)
233