/*
 * Copyright (C) 2002 Paul Mackerras, IBM Corp.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>

/*
 * void *memcpy(void *dest, const void *src, size_t n)
 *
 * Register use, as read from the code below:
 *   r3 = destination pointer (also the value returned)
 *   r4 = source pointer
 *   r5 = byte count
 *
 * Little-endian builds assemble a plain byte-at-a-time loop.  Big-endian
 * builds assemble a doubleword copy with separate paths for short
 * (< 16 byte) copies, an unaligned destination and an unaligned source.
 *
 * The big-endian paths store r3 at a negative offset from r1 on entry and
 * reload it before every blr, which leaves r3 free to serve as a moving
 * store pointer in between.
 *
 * cr7 holds the low four bits of the (remaining) length throughout the
 * big-endian code: cr7*4+0 selects an 8-byte piece, +1 a 4-byte piece,
 * +2 a 2-byte piece and +3 a single byte.
 *
 * Numeric local labels (1:, 2:, ...) are reused many times; "Nb" / "Nf"
 * always mean the nearest definition of N backward / forward.
 */
	.align	7
_GLOBAL_TOC(memcpy)
/*
 * Alternative feature section keyed on CPU_FTR_VMX_COPY: one variant does
 * the setup for the code in this file, the other branches to memcpy_power7.
 * NOTE(review): the _IFCLR suffix suggests the first variant is the one used
 * when CPU_FTR_VMX_COPY is clear -- confirm against the feature-fixup macros.
 */
BEGIN_FTR_SECTION
#ifdef __LITTLE_ENDIAN__
	cmpdi	cr7,r5,0		/* cr7.eq <- (n == 0), tested by beqlr below */
#else
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* save destination pointer for return value */
#endif
FTR_SECTION_ELSE
#ifndef SELFTEST
	b	memcpy_power7
#endif
ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
#ifdef __LITTLE_ENDIAN__
	/* dumb little-endian memcpy that will get replaced at runtime */
	addi r9,r3,-1			/* r9 = dest - 1 (stbu pre-increments); r3 is left intact as the return value */
	addi r4,r4,-1			/* r4 = src - 1 (lbzu pre-increments) */
	beqlr cr7			/* n == 0: nothing to copy */
	mtctr r5			/* CTR = byte count */
1:	lbzu r10,1(r4)
	stbu r10,1(r9)
	bdnz 1b
	blr
#else
	PPC_MTOCRF(0x01,r5)		/* cr7 <- low 4 bits of n */
	cmpldi	cr1,r5,16		/* cr1 <- n compared with 16 (unsigned) */
	neg	r6,r3		# LS 3 bits = # bytes to 8-byte dest bdry
	andi.	r6,r6,7			/* cr0.eq <- dest is already 8-byte aligned */
	dcbt	0,r4			/* cache-touch hint for the start of the source */
	blt	cr1,.Lshort_copy	/* n < 16 */
/* Below we want to nop out the bne if we're on a CPU that has the
   CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
   cleared.
   At the time of writing the only CPU that has this combination of bits
   set is Power6. */
BEGIN_FTR_SECTION
	nop
FTR_SECTION_ELSE
	bne	.Ldst_unaligned
ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
                    CPU_FTR_UNALIGNED_LD_STD)
/*
 * Main doubleword copy.  Also entered from .Ldst_unaligned once the
 * destination has been brought up to an 8-byte boundary; in that case cr1
 * and cr7 describe the remaining length.
 */
.Ldst_aligned:
	addi	r3,r3,-16		/* bias dest for the 8(r3) / 16(r3) stores below */
BEGIN_FTR_SECTION
	andi.	r0,r4,7			/* r0 = src offset within its doubleword */
	bne	.Lsrc_unaligned
END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
	srdi	r7,r5,4			/* r7 = number of 16-byte pairs */
	ld	r9,0(r4)		/* preload the first doubleword */
	addi	r4,r4,-8		/* bias src for the 16(r4) update-form load */
	mtctr	r7
	andi.	r5,r5,7			/* r5 = n & 7; cr0.eq <- no sub-doubleword tail */
	bf	cr7*4+0,2f		/* no odd doubleword: enter the loop at 2: */
	addi	r3,r3,8			/* odd doubleword: advance both pointers by 8 ... */
	addi	r4,r4,8
	mr	r8,r9			/* ... and treat the preloaded value as "pending store" */
	blt	cr1,3f			/* fewer than 16 bytes left: only this one doubleword */
	/* Two doublewords per iteration; loads run one step ahead of stores. */
1:	ld	r9,8(r4)
	std	r8,8(r3)
2:	ldu	r8,16(r4)
	stdu	r9,16(r3)
	bdnz	1b
3:	std	r8,8(r3)		/* store the last doubleword that was loaded */
	beq	3f			/* cr0 from the andi. above: no tail bytes, return */
	addi	r3,r3,16		/* r3 -> first dest byte not yet written */
/*
 * Copy the remaining 0-7 bytes as 4 + 2 + 1 pieces selected by cr7.
 * r4 still addresses the last doubleword loaded, hence the 8(r4) offsets.
 */
.Ldo_tail:
	bf	cr7*4+1,1f
	lwz	r9,8(r4)
	addi	r4,r4,4
	stw	r9,0(r3)
	addi	r3,r3,4
1:	bf	cr7*4+2,2f
	lhz	r9,8(r4)
	addi	r4,r4,2
	sth	r9,0(r3)
	addi	r3,r3,2
2:	bf	cr7*4+3,3f
	lbz	r9,8(r4)
	stb	r9,0(r3)
3:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* return dest pointer */
	blr

/*
 * Source is not 8-byte aligned (r0 = src & 7, non-zero) while the
 * destination is.  Read aligned doublewords starting at the rounded-down
 * source address and assemble each output doubleword from two neighbours:
 * (earlier << r10) | (later >> r11).
 */
.Lsrc_unaligned:
	srdi	r6,r5,3			/* r6 = number of whole doublewords in n */
	addi	r5,r5,-16
	subf	r4,r0,r4		/* round src down to a doubleword boundary */
	srdi	r7,r5,4			/* r7 = (n - 16) / 16, the loop count */
	sldi	r10,r0,3		/* r10 = left shift in bits = 8 * misalignment */
	cmpdi	cr6,r6,3		/* cr6 <- doubleword count compared with 3 */
	andi.	r5,r5,7			/* cr0.eq <- no sub-doubleword tail (tested at "beq 4f") */
	mtctr	r7
	subfic	r11,r10,64		/* r11 = 64 - r10, the complementary right shift */
	add	r5,r5,r0		/* r5 = tail bytes + misalignment (compared with 8 below) */

	bt	cr7*4+0,0f		/* odd number of doublewords: use the 0: prologue */

	ld	r9,0(r4)	# 3+2n loads, 2+2n stores
	ld	r0,8(r4)
	sld	r6,r9,r10
	ldu	r9,16(r4)
	srd	r7,r0,r11
	sld	r8,r0,r10
	or	r7,r7,r6
	blt	cr6,4f
	ld	r0,8(r4)
	# s1<< in r8, d0=(s0<<|s1>>) in r7, s3 in r0, s2 in r9, nix in r6 & r12
	b	2f

0:	ld	r0,0(r4)	# 4+2n loads, 3+2n stores
	ldu	r9,8(r4)
	sld	r8,r0,r10
	addi	r3,r3,-8
	blt	cr6,5f
	ld	r0,8(r4)
	srd	r12,r9,r11
	sld	r6,r9,r10
	ldu	r9,16(r4)
	or	r12,r8,r12
	srd	r7,r0,r11
	sld	r8,r0,r10
	addi	r3,r3,16
	beq	cr6,3f

	# d0=(s0<<|s1>>) in r12, s1<< in r6, s2>> in r7, s2<< in r8, s3 in r9
1:	or	r7,r7,r6
	ld	r0,8(r4)
	std	r12,8(r3)
2:	srd	r12,r9,r11
	sld	r6,r9,r10
	ldu	r9,16(r4)
	or	r12,r8,r12
	stdu	r7,16(r3)
	srd	r7,r0,r11
	sld	r8,r0,r10
	bdnz	1b

	/* Drain the doublewords still held in registers. */
3:	std	r12,8(r3)
	or	r7,r7,r6
4:	std	r7,16(r3)
5:	srd	r12,r9,r11
	or	r12,r8,r12
	std	r12,24(r3)
	/*
	 * No tail bytes: done.  Note that "4f" resolves to the next "4:"
	 * further down the file, which is the return sequence at the end of
	 * .Lshort_copy (reload r3, blr).
	 */
	beq	4f
	cmpwi	cr1,r5,8
	addi	r3,r3,32		/* r3 -> first dest byte not yet written */
	sld	r9,r9,r10		/* left-justify the bytes left over from the last load */
	ble	cr1,6f			/* tail does not reach into the next source doubleword */
	ld	r0,8(r4)		/* otherwise fetch it and merge its leading bytes */
	srd	r7,r0,r11
	or	r9,r7,r9
6:
	/*
	 * r9 holds the tail bytes left-justified.  Each rotate brings the
	 * next piece down into the low-order bits that stw/sth/stb store.
	 */
	bf	cr7*4+1,1f
	rotldi	r9,r9,32
	stw	r9,0(r3)
	addi	r3,r3,4
1:	bf	cr7*4+2,2f
	rotldi	r9,r9,16
	sth	r9,0(r3)
	addi	r3,r3,2
2:	bf	cr7*4+3,3f
	rotldi	r9,r9,8
	stb	r9,0(r3)
3:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* return dest pointer */
	blr

/*
 * Destination is not 8-byte aligned.  Copy r6 (1-7) leading bytes as
 * 1 + 2 + 4 pieces so that dest reaches an 8-byte boundary, then rejoin the
 * main path with the length, cr1 and cr7 updated for what remains.
 */
.Ldst_unaligned:
	PPC_MTOCRF(0x01,r6)		# put #bytes to 8B bdry into cr7
	subf	r5,r6,r5		/* r5 = bytes left after the alignment copy */
	li	r7,0			/* r7 = running offset for the indexed accesses */
	cmpldi	cr1,r5,16		/* cr1 <- remaining length compared with 16 */
	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	stb	r0,0(r3)
	addi	r7,r7,1
1:	bf	cr7*4+2,2f
	lhzx	r0,r7,r4
	sthx	r0,r7,r3
	addi	r7,r7,2
2:	bf	cr7*4+1,3f
	lwzx	r0,r7,r4
	stwx	r0,r7,r3
3:	PPC_MTOCRF(0x01,r5)		/* cr7 <- low 4 bits of the remaining length */
	add	r4,r6,r4		/* step both pointers past the bytes just copied */
	add	r3,r6,r3
	b	.Ldst_aligned

/*
 * n < 16: copy 8 (as two words), 4, 2 and 1 bytes as selected by cr7.
 * The "4:" return sequence at the end is also the target of the "beq 4f"
 * in .Lsrc_unaligned.
 */
.Lshort_copy:
	bf	cr7*4+0,1f
	lwz	r0,0(r4)
	lwz	r9,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r9,4(r3)
	addi	r3,r3,8
1:	bf	cr7*4+1,2f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4
2:	bf	cr7*4+2,3f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2
3:	bf	cr7*4+3,4f
	lbz	r0,0(r4)
	stb	r0,0(r3)
4:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* return dest pointer */
	blr
#endif
EXPORT_SYMBOL(memcpy)