xref: /openbmc/linux/arch/arm64/lib/memcpy.S (revision 4f2c0a4acffbec01079c28f839422e64ddeff004)
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2012-2021, Arm Limited.
 *
 * Adapted from the original at:
 * https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/memcpy.S
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#define L(label) .L ## label

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_lw	w10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	x14
#define E_h	x15
#define F_l	x16
#define F_h	x17
#define G_l	count
#define G_h	dst
#define H_l	src
#define H_h	srcend
#define tmp1	x14
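/* G, H and tmp1 reuse registers that are dead wherever they are live:
   G and H are only used for the final 64-byte tail copies, after count,
   dst, src and srcend are no longer needed, and tmp1 shares x14 with E_l,
   which is never live at the same time.  */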

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple, and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The destination pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/

SYM_FUNC_START(__pi_memcpy)
	add	srcend, src, count
	add	dstend, dstin, count
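	/* srcend/dstend point one byte past the end of the source and
	   destination buffers; tail copies are addressed backwards from
	   them.  */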
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
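	/* 16..32 bytes: copy 16 bytes from the start and 16 bytes from the
	   end.  The two regions may overlap, which is harmless.  */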
	ldp	A_l, A_h, [src]
	ldp	D_l, D_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	/* Copy 8-15 bytes.  */
L(copy16):
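	/* count is 0..15 here; bit 3 set means 8..15 bytes, handled with two
	   possibly overlapping 8-byte copies.  */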
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	.p2align 3
	/* Copy 4-7 bytes.  */
L(copy8):
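	/* count is 0..7 here; bit 2 set means 4..7 bytes, handled with two
	   possibly overlapping 4-byte copies.  */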
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence.  */
L(copy4):
	cbz	count, L(copy0)
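	/* 1..3 bytes: tmp1 = count / 2.  The three byte stores below cover
	   every case: count == 1 writes byte 0 three times, count == 2
	   writes bytes 0 and 1, count == 3 writes bytes 0, 1 and 2.  */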
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes.  */
L(copy32_128):
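	/* Load 32 bytes from the start and 32 bytes from the end before any
	   store, so overlapping buffers are handled correctly without an
	   explicit overlap check.  */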
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	ldp	D_l, D_h, [srcend, -16]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy 65..128 bytes.  */
L(copy128):
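	/* A/B and E/F cover the first 64 bytes, C/D the last 32.  For
	   97..128 bytes, G/H additionally cover the 32 bytes ending 32 bytes
	   before dstend.  All loads complete before any store.  */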
	ldp	E_l, E_h, [src, 32]
	ldp	F_l, F_h, [src, 48]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_l, G_h, [srcend, -64]
	ldp	H_l, H_h, [srcend, -48]
	stp	G_l, G_h, [dstend, -64]
	stp	H_l, H_h, [dstend, -48]
L(copy96):
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	E_l, E_h, [dstin, 32]
	stp	F_l, F_h, [dstin, 48]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy more than 128 bytes.  */
L(copy_long):
	/* Use backwards copy if there is an overlap.  */
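	/* dstin - src is below count (unsigned) only when dst starts inside
	   [src, src + count); a forward copy would then overwrite source
	   bytes before they are read.  A zero difference means src == dst,
	   so nothing needs to be copied.  */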
	sub	tmp1, dstin, src
	cbz	tmp1, L(copy0)
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align dst to 16-byte alignment.  */

	ldp	D_l, D_h, [src]
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)

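	/* Software pipelined loop: each iteration stores the four pairs
	   loaded by the previous iteration while loading the next 64 bytes,
	   advancing dst and src via the pre-index writeback.  */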
L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.p2align 4

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align dst to 16-byte alignment.  */
L(copy_long_backwards):
	ldp	D_l, D_h, [srcend, -16]
	and	tmp1, dstend, 15
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

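	/* Mirror of L(loop64): store the four pairs loaded by the previous
	   iteration while loading the next 64 bytes, moving backwards from
	   the end of both buffers.  */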
L(loop64_backwards):
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
	ret
SYM_FUNC_END(__pi_memcpy)

SYM_FUNC_ALIAS(__memcpy, __pi_memcpy)
EXPORT_SYMBOL(__memcpy)
SYM_FUNC_ALIAS_WEAK(memcpy, __memcpy)
EXPORT_SYMBOL(memcpy)

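/* The routine above handles overlapping buffers, so memmove can simply alias
   memcpy.  */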
SYM_FUNC_ALIAS(__pi_memmove, __pi_memcpy)

SYM_FUNC_ALIAS(__memmove, __pi_memmove)
EXPORT_SYMBOL(__memmove)
SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
EXPORT_SYMBOL(memmove)