/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on the glibc cortex-strings work originally authored by
 * Linaro, which can be found at:
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */


/*
 * Copy a buffer from src to dest (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 *
 * Register usage:
 *	x0 (dstin) is never written by the instructions in this file; the
 *	working destination pointer is the copy held in x6 (dst).
 *	x2 (count), x3/x4 (tmp1/tmp2), x7-x14 (A_l..D_h) and the condition
 *	flags are modified.
 *
 * NOTE(review): ldrb1/strb1, ldrh1/strh1, ldr1/str1, ldp1/stp1 and
 * L1_CACHE_SHIFT are not defined in this file; they are presumably provided
 * by whatever includes this template - confirm against the includer.  The
 * code below performs no explicit arithmetic on src or dst, so it relies on
 * each of those operations advancing the pointer operand by the byte count
 * given as the last operand.
 *
 * The template ends at the .Lexitfunc label without a return instruction;
 * presumably the includer supplies whatever follows - confirm.
 */
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
dst	.req	x6

A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

	/* Work on a copy of the destination so that x0 is left untouched. */
	mov	dst, dstin
	cmp	count, #16
	/* When the length is less than 16, the accesses are not aligned. */
	b.lo	.Ltiny15

	/* tmp2 = (-src) & 15: number of bytes until src is 16-byte aligned. */
	neg	tmp2, src
	ands	tmp2, tmp2, #15	/* Bytes to reach alignment. */
	b.eq	.LSrcAligned
	sub	count, count, tmp2
	/*
	 * Copy the leading bytes from src to dst in increasing address
	 * order.  This eliminates the risk of overwriting source data that
	 * has not been read yet when the distance between src and dst is
	 * less than 16.  Each bit of tmp2 selects one 1/2/4/8-byte step, so
	 * every load from src here is aligned to its own size.
	 */
	tbz	tmp2, #0, 1f
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1
1:
	tbz	tmp2, #1, 2f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
2:
	tbz	tmp2, #2, 3f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8

.LSrcAligned:
	cmp	count, #64
	b.ge	.Lcpy_over64
	/*
	 * Deal with small copies quickly by dropping straight into the
	 * exit block.
	 */
.Ltail63:
	/*
	 * Copy up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.
	 *
	 * Bits 5:4 of count select how many 16-byte chunks to copy:
	 * 0x30 falls through all three copies, 0x20 enters at 1:, 0x10
	 * enters at 2:, and 0 skips straight to the sub-16-byte tail.
	 */
	ands	tmp1, count, #0x30
	b.eq	.Ltiny15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
1:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
2:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
.Ltiny15:
	/*
	 * Prefer splitting one ldp/stp into several smaller loads/stores
	 * that access memory in increasing address order, rather than
	 * loading/storing 16 bytes from (src-16) to (dst-16) after stepping
	 * src back to an aligned address, which is what the original cortex
	 * memcpy does.  If the original approach were kept here, memmove
	 * would need to satisfy the precondition that the src address is at
	 * least 16 bytes above the dst address; otherwise some source data
	 * would be overwritten when memmove calls memcpy directly.  To keep
	 * memmove simple and to decouple memcpy from memmove, the original
	 * approach was dropped.
	 *
	 * Bits 3:0 of count select the 8/4/2/1-byte steps below.
	 */
	tbz	count, #3, 1f
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8
1:
	tbz	count, #2, 2f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
2:
	tbz	count, #1, 3f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
3:
	tbz	count, #0, .Lexitfunc
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1

	b	.Lexitfunc

.Lcpy_over64:
	/*
	 * count is biased by -128 from here on.  Because 128 is a multiple
	 * of 64, the bottom 6 bits of count still give the number of tail
	 * bytes left once all whole 64-byte blocks have been copied.
	 */
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	 * Less than 128 bytes to copy, so handle 64 here and then jump
	 * to the tail.
	 */
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	ldp1	D_l, D_h, src, #16
	stp1	D_l, D_h, dst, #16

	/* Any tail bytes (count mod 64) left to copy? */
	tst	count, #0x3f
	b.ne	.Ltail63
	b	.Lexitfunc

	/*
	 * Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
	/* Pre-load the first 64 bytes of data. */
	ldp1	A_l, A_h, src, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	ldp1	D_l, D_h, src, #16
1:
	/*
	 * Interleave the load of the next 64-byte block with the store of
	 * the previously loaded 64 bytes.
	 */
	stp1	A_l, A_h, dst, #16
	ldp1	A_l, A_h, src, #16
	stp1	B_l, B_h, dst, #16
	ldp1	B_l, B_h, src, #16
	stp1	C_l, C_h, dst, #16
	ldp1	C_l, C_h, src, #16
	stp1	D_l, D_h, dst, #16
	ldp1	D_l, D_h, src, #16
	subs	count, count, #64
	b.ge	1b
	/* Store the last 64 bytes that were loaded but not yet written. */
	stp1	A_l, A_h, dst, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	stp1	D_l, D_h, dst, #16

	/* Any tail bytes (count mod 64) left to copy? */
	tst	count, #0x3f
	b.ne	.Ltail63
.Lexitfunc: