xref: /openbmc/linux/arch/arm64/lib/memcpy.S (revision 93df8a1e)
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by Linaro
 * and re-licensed under GPLv2 for the Linux kernel. The original code can
 * be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

/*
 * Copy a buffer from src to dest (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
tmp3	.req	x5
tmp3w	.req	w5
dst	.req	x6

A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

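/*
 * A rough C-level sketch of the strategy used below, for orientation only:
 * it models every copy as a byte loop, whereas the code actually uses
 * 1/2/4/8-byte accesses for the head and tail and four ldp/stp pairs
 * (64 bytes) per iteration of the main loop. The variable names are
 * informal and do not correspond to the registers above.
 *
 *	void *memcpy(void *dest, const void *src, size_t n)
 *	{
 *		unsigned char *d = dest;
 *		const unsigned char *s = src;
 *		size_t head;
 *
 *		if (n < 16)
 *			goto tail;			// .Ltiny15
 *		head = -(uintptr_t)s & 15;		// bytes to align src
 *		n -= head;
 *		while (head--)				// head copy
 *			*d++ = *s++;
 *		while (n >= 64) {			// .Lcpy_body_large
 *			unsigned int i;
 *
 *			for (i = 0; i < 64; i++)
 *				d[i] = s[i];
 *			d += 64;
 *			s += 64;
 *			n -= 64;
 *		}
 *	tail:
 *		while (n--)				// .Ltail63 / .Ltiny15
 *			*d++ = *s++;
 *		return dest;
 *	}
 */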
ENTRY(memcpy)
	mov	dst, dstin
	cmp	count, #16
	/* When the length is less than 16, the accesses are not aligned. */
	b.lo	.Ltiny15

	neg	tmp2, src
	ands	tmp2, tmp2, #15	/* Bytes to reach alignment. */
	b.eq	.LSrcAligned
	sub	count, count, tmp2
	/*
	* Copy the leading data from src to dst in increasing address
	* order. This way there is no risk of overwriting source data
	* when the distance between src and dst is less than 16. The
	* src accesses here are aligned.
	*/
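	/*
	* tmp2 (0..15) is the number of bytes needed to make src 16-byte
	* aligned; its bits 0-3 select the 1, 2, 4 and 8 byte copies below.
	*/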
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src], #1
	strb	tmp1w, [dst], #1
1:
	tbz	tmp2, #1, 2f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
2:
	tbz	tmp2, #2, 3f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr	tmp1, [src],#8
	str	tmp1, [dst],#8

.LSrcAligned:
	cmp	count, #64
	b.ge	.Lcpy_over64
	/*
	* Deal with small copies quickly by dropping straight into the
	* exit block.
	*/
.Ltail63:
	/*
	* Copy up to 48 bytes of data. At this point we only need the
	* bottom 6 bits of count to be accurate.
	*/
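	/*
	* tmp1 = count & 0x30 is 0, 16, 32 or 48. Zero goes straight to
	* .Ltiny15; otherwise the compare with 32 falls through three, two
	* or one of the ldp/stp pairs below to copy 48, 32 or 16 bytes.
	*/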
	ands	tmp1, count, #0x30
	b.eq	.Ltiny15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src], #16
	stp	A_l, A_h, [dst], #16
1:
	ldp	A_l, A_h, [src], #16
	stp	A_l, A_h, [dst], #16
2:
	ldp	A_l, A_h, [src], #16
	stp	A_l, A_h, [dst], #16
.Ltiny15:
	/*
	* Prefer to break the final ldp/stp into several smaller loads and
	* stores in increasing address order, rather than loading/storing 16
	* bytes from (src-16) to (dst-16) after winding src back to an
	* aligned address, as the original cortex-strings memcpy does. With
	* that scheme, memmove could only call memcpy directly when src is
	* at least 16 bytes above dst, otherwise some source data would be
	* overwritten. Copying forward in small steps keeps memmove simple
	* and removes memcpy's precondition on memmove.
	*/
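	/* Bits 3-0 of count select the remaining 8, 4, 2 and 1 byte copies. */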
	tbz	count, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	count, #2, 2f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
2:
	tbz	count, #1, 3f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
3:
	tbz	count, #0, .Lexitfunc
	ldrb	tmp1w, [src]
	strb	tmp1w, [dst]

.Lexitfunc:
	ret

.Lcpy_over64:
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	* Less than 128 bytes to copy, so handle 64 here and then jump
	* to the tail.
	*/
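	/*
	* Copy 64 bytes as four 16-byte ldp/stp pairs; the B and C loads
	* are issued before either of those blocks is stored.
	*/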
	ldp	A_l, A_h, [src],#16
	stp	A_l, A_h, [dst],#16
	ldp	B_l, B_h, [src],#16
	ldp	C_l, C_h, [src],#16
	stp	B_l, B_h, [dst],#16
	stp	C_l, C_h, [dst],#16
	ldp	D_l, D_h, [src],#16
	stp	D_l, D_h, [dst],#16

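	/*
	* count went negative when 128 was subtracted above, but its bottom
	* six bits still give the number of bytes (0-63) left for .Ltail63.
	*/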
	tst	count, #0x3f
	b.ne	.Ltail63
	ret

	/*
	* Critical loop.  Start at a new cache line boundary.  Assuming
	* 64 bytes per line this ensures the entire loop is in one line.
	*/
	.p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
	/* Load the first 64 bytes ahead of the loop. */
	ldp	A_l, A_h, [src],#16
	ldp	B_l, B_h, [src],#16
	ldp	C_l, C_h, [src],#16
	ldp	D_l, D_h, [src],#16
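	/*
	* count was reduced by 128 at .Lcpy_over64; together with the 64
	* bytes loaded above, this keeps every load in the loop inside the
	* source buffer and leaves the 0-63 byte tail in the low bits of
	* count when the loop exits.
	*/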
1:
	/*
	* Interleave the loads of the next 64-byte block with the stores
	* of the previously loaded 64 bytes.
	*/
	stp	A_l, A_h, [dst],#16
	ldp	A_l, A_h, [src],#16
	stp	B_l, B_h, [dst],#16
	ldp	B_l, B_h, [src],#16
	stp	C_l, C_h, [dst],#16
	ldp	C_l, C_h, [src],#16
	stp	D_l, D_h, [dst],#16
	ldp	D_l, D_h, [src],#16
	subs	count, count, #64
	b.ge	1b
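	/* Store the last 64-byte block loaded by the final iteration. */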
	stp	A_l, A_h, [dst],#16
	stp	B_l, B_h, [dst],#16
	stp	C_l, C_h, [dst],#16
	stp	D_l, D_h, [dst],#16

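	/* As above, the bottom six bits of count give the remaining tail. */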
	tst	count, #0x3f
	b.ne	.Ltail63
	ret
ENDPROC(memcpy)