xref: /openbmc/linux/arch/powerpc/lib/memcpy_64.S (revision b21a207141d83a06abc5f492b80204602e02ca44)
1/*
2 * Copyright (C) 2002 Paul Mackerras, IBM Corp.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 */
9#include <asm/processor.h>
10#include <asm/ppc_asm.h>
11
12	.align	7
#
# void *memcpy(void *dest, const void *src, size_t n)
#
# In:  r3 = dest, r4 = src, r5 = n
# Out: r3 = original dest (saved on the stack at entry, reloaded at exit)
# Scratch: r0, r6-r12, ctr, cr0, cr1, cr7
#
# Convention used throughout: cr7 holds the low four bits of the byte
# count still to be copied, so the tail code can test bt/bf on
# cr7*4+0..3 for the 8-, 4-, 2- and 1-byte remainders respectively.
# cr1 holds the result of comparing the remaining length against 16.
#
13_GLOBAL(memcpy)
14	std	r3,48(r1)	/* save destination pointer for return value */
15	PPC_MTOCRF	0x01,r5		# cr7 = low 4 bits of length
16	cmpldi	cr1,r5,16	# cr1: short (<16 byte) copy?
17	neg	r6,r3		# LS 3 bits = # bytes to 8-byte dest bdry
18	andi.	r6,r6,7
19	dcbt	0,r4		# touch first source cache line
20	blt	cr1,.Lshort_copy
21/* Below we want to nop out the bne if we're on a CPU that has the
22   CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
23   cleared.
24   At the time of writing the only CPU that has this combination of bits
25   set is Power6. */
26BEGIN_FTR_SECTION
27	nop
28FTR_SECTION_ELSE
29	bne	.Ldst_unaligned	# align dest first (patched out per above)
30ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
31                    CPU_FTR_UNALIGNED_LD_STD)
# Destination is 8-byte aligned (or unaligned stores are cheap on this
# CPU).  Main path: software-pipelined loop moving 16 bytes/iteration,
# alternating r8/r9 so each iteration's load overlaps the previous store.
32.Ldst_aligned:
33	addi	r3,r3,-16	# bias dest for the stdu updates below
34BEGIN_FTR_SECTION
35	andi.	r0,r4,7		# r0 = src offset within a doubleword
36	bne	.Lsrc_unaligned	# (patched out if unaligned ld/std is cheap)
37END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
38	srdi	r7,r5,4		# r7 = number of 16-byte chunks
39	ld	r9,0(r4)	# prime the pipeline with the first dword
40	addi	r4,r4,-8	# bias src for the ldu updates below
41	mtctr	r7
42	andi.	r5,r5,7		# r5 = tail bytes (<8); cr0 tested at 'beq 3f'
43	bf	cr7*4+0,2f	# no odd 8-byte chunk? enter loop at 2:
44	addi	r3,r3,8		# odd 8-byte chunk: advance by one dword
45	addi	r4,r4,8
46	mr	r8,r9
47	blt	cr1,3f		# <16 bytes remain: store it, do the tail
481:	ld	r9,8(r4)
49	std	r8,8(r3)
502:	ldu	r8,16(r4)
51	stdu	r9,16(r3)
52	bdnz	1b
533:	std	r8,8(r3)	# drain the last pipelined dword
54	beq	3f		# no tail bytes (cr0 from the andi. above)
55	addi	r3,r3,16
56	ld	r9,8(r4)	# r9 = final (partial) source dword
# Store the last 1-7 bytes, held in r9, using the length bits in cr7.
# Each rotldi brings the next piece of r9 into the low-order end before
# the narrower store (most-significant bytes stored first).
57.Ldo_tail:
58	bf	cr7*4+1,1f
59	rotldi	r9,r9,32
60	stw	r9,0(r3)
61	addi	r3,r3,4
621:	bf	cr7*4+2,2f
63	rotldi	r9,r9,16
64	sth	r9,0(r3)
65	addi	r3,r3,2
662:	bf	cr7*4+3,3f
67	rotldi	r9,r9,8
68	stb	r9,0(r3)
693:	ld	r3,48(r1)	/* return dest pointer */
70	blr
71
# Source not 8-byte aligned (dest is): read aligned doublewords and
# build each output dword by merging two neighbours:
#   out = (s_i << r10) | (s_i+1 >> r11), r10 = 8*offset, r11 = 64-r10.
# Heavily software-pipelined; the in-flight values at each point are
# noted in the comments below.
72.Lsrc_unaligned:
73	srdi	r6,r5,3		# r6 = number of whole source dwords
74	addi	r5,r5,-16
75	subf	r4,r0,r4	# round src down to a dword boundary
76	srdi	r7,r5,4		# r7 = loop count (two dwords/iteration)
77	sldi	r10,r0,3	# r10 = left-shift amount = 8 * src offset
78	cmpdi	cr6,r6,3
79	andi.	r5,r5,7
80	mtctr	r7
81	subfic	r11,r10,64	# r11 = complementary right-shift amount
82	add	r5,r5,r0	# r5 = byte count for the .Ldo_tail code
83
84	bt	cr7*4+0,0f	# odd dword in the length? use 0: prologue
85
86	ld	r9,0(r4)	# 3+2n loads, 2+2n stores
87	ld	r0,8(r4)
88	sld	r6,r9,r10
89	ldu	r9,16(r4)
90	srd	r7,r0,r11
91	sld	r8,r0,r10
92	or	r7,r7,r6
93	blt	cr6,4f
94	ld	r0,8(r4)
95	# s1<< in r8, d0=(s0<<|s1>>) in r7, s3 in r0, s2 in r9, nix in r6 & r12
96	b	2f
97
980:	ld	r0,0(r4)	# 4+2n loads, 3+2n stores
99	ldu	r9,8(r4)
100	sld	r8,r0,r10
101	addi	r3,r3,-8
102	blt	cr6,5f
103	ld	r0,8(r4)
104	srd	r12,r9,r11
105	sld	r6,r9,r10
106	ldu	r9,16(r4)
107	or	r12,r8,r12
108	srd	r7,r0,r11
109	sld	r8,r0,r10
110	addi	r3,r3,16
111	beq	cr6,3f
112
113	# d0=(s0<<|s1>>) in r12, s1<< in r6, s2>> in r7, s2<< in r8, s3 in r9
1141:	or	r7,r7,r6
115	ld	r0,8(r4)
116	std	r12,8(r3)
1172:	srd	r12,r9,r11
118	sld	r6,r9,r10
119	ldu	r9,16(r4)
120	or	r12,r8,r12
121	stdu	r7,16(r3)
122	srd	r7,r0,r11
123	sld	r8,r0,r10
124	bdnz	1b
125
# Drain the pipeline: store the remaining merged dwords, then hand the
# leftover bytes (shifted into position in r9) to the common tail code.
1263:	std	r12,8(r3)
127	or	r7,r7,r6
1284:	std	r7,16(r3)
1295:	srd	r12,r9,r11
130	or	r12,r8,r12
131	std	r12,24(r3)
132	beq	4f		# nothing left: jump to the final return
133	cmpwi	cr1,r5,8	# does the tail span a source dword boundary?
134	addi	r3,r3,32
135	sld	r9,r9,r10	# shift leftover source bytes into place
136	ble	cr1,.Ldo_tail
137	ld	r0,8(r4)	# tail spans a boundary: merge one more dword
138	srd	r7,r0,r11
139	or	r9,r7,r9
140	b	.Ldo_tail
141
# Destination not 8-byte aligned: copy 1, 2 and/or 4 bytes (selected by
# the bits of r6 = bytes to the boundary) to align dest, recompute the
# length classification, then rejoin the aligned path.
142.Ldst_unaligned:
143	PPC_MTOCRF	0x01,r6		# put #bytes to 8B bdry into cr7
144	subf	r5,r6,r5	# r5 = length remaining after alignment
145	li	r7,0		# r7 = running byte offset into src/dest
146	cmpldi	cr1,r5,16	# re-test "short copy" on the new length
147	bf	cr7*4+3,1f
148	lbz	r0,0(r4)
149	stb	r0,0(r3)
150	addi	r7,r7,1
1511:	bf	cr7*4+2,2f
152	lhzx	r0,r7,r4
153	sthx	r0,r7,r3
154	addi	r7,r7,2
1552:	bf	cr7*4+1,3f
156	lwzx	r0,r7,r4
157	stwx	r0,r7,r3
1583:	PPC_MTOCRF	0x01,r5		# reload cr7 from the new length
159	add	r4,r6,r4	# advance both pointers past the fixup
160	add	r3,r6,r3
161	b	.Ldst_aligned
162
# Copy of fewer than 16 bytes: 8/4/2/1-byte pieces chosen by cr7
# (the low four bits of the length, loaded at entry).
163.Lshort_copy:
164	bf	cr7*4+0,1f
165	lwz	r0,0(r4)	# copy 8 bytes as two word transfers
166	lwz	r9,4(r4)
167	addi	r4,r4,8
168	stw	r0,0(r3)
169	stw	r9,4(r3)
170	addi	r3,r3,8
1711:	bf	cr7*4+1,2f
172	lwz	r0,0(r4)	# copy 4 bytes
173	addi	r4,r4,4
174	stw	r0,0(r3)
175	addi	r3,r3,4
1762:	bf	cr7*4+2,3f
177	lhz	r0,0(r4)	# copy 2 bytes
178	addi	r4,r4,2
179	sth	r0,0(r3)
180	addi	r3,r3,2
1813:	bf	cr7*4+3,4f
182	lbz	r0,0(r4)	# copy the final byte
183	stb	r0,0(r3)
1844:	ld	r3,48(r1)	/* return dest pointer */
185	blr
186