xref: /openbmc/linux/arch/arm64/lib/copy_template.S (revision e5c88e3f)
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by Linaro
 * and re-licensed under GPLv2 for the Linux kernel. The original code can
 * be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */


/*
 * Copy a buffer from src to dest (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */
/*
 * Register aliases for the copy routine.
 *
 * dstin, src and count name the three incoming arguments (x0, x1, x2).
 * dst (x6) is the working destination pointer; tmp1/tmp2 are scratch,
 * with tmp1w/tmp2w being their 32-bit views.
 */
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
dst	.req	x6

/*
 * Four register pairs (low/high halves), 16 bytes each, used to hold up to
 * 64 bytes of data between a paired load and the matching paired store.
 */
A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

54e5c88e3fSFeng Kan	mov	dst, dstin
55e5c88e3fSFeng Kan	cmp	count, #16
56e5c88e3fSFeng Kan	/*When memory length is less than 16, the accessed are not aligned.*/
57e5c88e3fSFeng Kan	b.lo	.Ltiny15
58e5c88e3fSFeng Kan
59e5c88e3fSFeng Kan	neg	tmp2, src
60e5c88e3fSFeng Kan	ands	tmp2, tmp2, #15/* Bytes to reach alignment. */
61e5c88e3fSFeng Kan	b.eq	.LSrcAligned
62e5c88e3fSFeng Kan	sub	count, count, tmp2
63e5c88e3fSFeng Kan	/*
64e5c88e3fSFeng Kan	* Copy the leading memory data from src to dst in an increasing
65e5c88e3fSFeng Kan	* address order.By this way,the risk of overwritting the source
66e5c88e3fSFeng Kan	* memory data is eliminated when the distance between src and
67e5c88e3fSFeng Kan	* dst is less than 16. The memory accesses here are alignment.
68e5c88e3fSFeng Kan	*/
69e5c88e3fSFeng Kan	tbz	tmp2, #0, 1f
70e5c88e3fSFeng Kan	ldrb1	tmp1w, src, #1
71e5c88e3fSFeng Kan	strb1	tmp1w, dst, #1
72e5c88e3fSFeng Kan1:
73e5c88e3fSFeng Kan	tbz	tmp2, #1, 2f
74e5c88e3fSFeng Kan	ldrh1	tmp1w, src, #2
75e5c88e3fSFeng Kan	strh1	tmp1w, dst, #2
76e5c88e3fSFeng Kan2:
77e5c88e3fSFeng Kan	tbz	tmp2, #2, 3f
78e5c88e3fSFeng Kan	ldr1	tmp1w, src, #4
79e5c88e3fSFeng Kan	str1	tmp1w, dst, #4
80e5c88e3fSFeng Kan3:
81e5c88e3fSFeng Kan	tbz	tmp2, #3, .LSrcAligned
82e5c88e3fSFeng Kan	ldr1	tmp1, src, #8
83e5c88e3fSFeng Kan	str1	tmp1, dst, #8
84e5c88e3fSFeng Kan
.LSrcAligned:
	/*
	* 64 bytes or more go to the bulk path; anything smaller falls
	* straight through into the tail code below.
	*
	* NOTE(review): b.ge is a signed condition, so a count with bit 63
	* set would be treated as "less than 64" here.
	*/
	cmp	count, #64
	b.ge	.Lcpy_over64
.Ltail63:
	/*
	* Copy 0, 16, 32 or 48 bytes, selected by bits [5:4] of count.
	* At this point only the bottom 6 bits of count need to be
	* accurate.  The three copies below form a fall-through ladder:
	* entering at the top copies 48 bytes, at 1: 32 bytes, at 2: 16.
	*/
	ands	tmp1, count, #0x30
	b.eq	.Ltiny15		/* No whole 16-byte piece left. */
	cmp	tmp1w, #0x20
	b.eq	1f			/* 0x20: two 16-byte pieces. */
	b.lt	2f			/* 0x10: one 16-byte piece. */
	ldp1	A_l, A_h, src, #16	/* 0x30: three 16-byte pieces. */
	stp1	A_l, A_h, dst, #16
1:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
2:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
.Ltiny15:
	/*
	* Copy the final 0..15 bytes, selected by bits [3:0] of count, as
	* separate 8/4/2/1-byte pieces in increasing address order.
	*
	* The original cortex-strings memcpy instead stepped src back and
	* copied one 16-byte pair from (src - 16) to (dst - 16).  Keeping
	* that scheme would require memmove to guarantee that src is at
	* least 16 bytes above dst before calling memcpy directly, otherwise
	* some source data would be overwritten before being read.  Breaking
	* the final ldp/stp into several smaller loads and stores keeps
	* memmove simpler and removes that dependency between the two.
	*/
	tbz	count, #3, 1f
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8
1:
	tbz	count, #2, 2f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
2:
	tbz	count, #1, 3f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
3:
	tbz	count, #0, .Lexitfunc
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1

	b	.Lexitfunc

.Lcpy_over64:
	/*
	* Reached with count >= 64.  From here on count is biased by -128;
	* subtracting a multiple of 64 leaves its low 6 bits unchanged, so
	* they still give the number of bytes left over after the 64-byte
	* blocks have been copied.
	*/
	subs	count, count, #128
	b.ge	.Lcpy_body_large	/* 128 bytes or more: use the loop. */
	/*
	* Less than 128 bytes to copy, so handle 64 here and then jump
	* to the tail.
	*/
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	ldp1	D_l, D_h, src, #16
	stp1	D_l, D_h, dst, #16

	tst	count, #0x3f		/* Any bytes left beyond this 64-byte block? */
	b.ne	.Ltail63
	b	.Lexitfunc

160e5c88e3fSFeng Kan	/*
161e5c88e3fSFeng Kan	* Critical loop.  Start at a new cache line boundary.  Assuming
162e5c88e3fSFeng Kan	* 64 bytes per line this ensures the entire loop is in one line.
163e5c88e3fSFeng Kan	*/
164e5c88e3fSFeng Kan	.p2align	L1_CACHE_SHIFT
165e5c88e3fSFeng Kan.Lcpy_body_large:
166e5c88e3fSFeng Kan	/* pre-get 64 bytes data. */
167e5c88e3fSFeng Kan	ldp1	A_l, A_h, src, #16
168e5c88e3fSFeng Kan	ldp1	B_l, B_h, src, #16
169e5c88e3fSFeng Kan	ldp1	C_l, C_h, src, #16
170e5c88e3fSFeng Kan	ldp1	D_l, D_h, src, #16
171e5c88e3fSFeng Kan1:
172e5c88e3fSFeng Kan	/*
173e5c88e3fSFeng Kan	* interlace the load of next 64 bytes data block with store of the last
174e5c88e3fSFeng Kan	* loaded 64 bytes data.
175e5c88e3fSFeng Kan	*/
176e5c88e3fSFeng Kan	stp1	A_l, A_h, dst, #16
177e5c88e3fSFeng Kan	ldp1	A_l, A_h, src, #16
178e5c88e3fSFeng Kan	stp1	B_l, B_h, dst, #16
179e5c88e3fSFeng Kan	ldp1	B_l, B_h, src, #16
180e5c88e3fSFeng Kan	stp1	C_l, C_h, dst, #16
181e5c88e3fSFeng Kan	ldp1	C_l, C_h, src, #16
182e5c88e3fSFeng Kan	stp1	D_l, D_h, dst, #16
183e5c88e3fSFeng Kan	ldp1	D_l, D_h, src, #16
184e5c88e3fSFeng Kan	subs	count, count, #64
185e5c88e3fSFeng Kan	b.ge	1b
186e5c88e3fSFeng Kan	stp1	A_l, A_h, dst, #16
187e5c88e3fSFeng Kan	stp1	B_l, B_h, dst, #16
188e5c88e3fSFeng Kan	stp1	C_l, C_h, dst, #16
189e5c88e3fSFeng Kan	stp1	D_l, D_h, dst, #16
190e5c88e3fSFeng Kan
191e5c88e3fSFeng Kan	tst	count, #0x3f
192e5c88e3fSFeng Kan	b.ne	.Ltail63
193e5c88e3fSFeng Kan.Lexitfunc:
194