/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/mcsafe_test.h>
#include <asm/alternative-asm.h>
#include <asm/export.h>

.pushsection .noinstr.text, "ax"

/*
 * We build a jump to memcpy_orig by default, which gets NOPped out on
 * the majority of x86 CPUs, which set REP_GOOD. In addition, on CPUs
 * which have the enhanced REP MOVSB/STOSB feature (ERMS), those NOPs
 * are changed to a jmp to memcpy_erms, which does the whole copy with
 * REP; MOVSB.
 */
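/*
 * Roughly, the three variants amount to (illustrative C-level sketch
 * only, not part of the build):
 *
 *	if (ERMS)		-> memcpy_erms: a single REP MOVSB
 *	else if (REP_GOOD)	-> inline body below: REP MOVSQ plus a byte tail
 *	else			-> memcpy_orig: unrolled move loop
 */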

.weak memcpy

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 * rax original destination
 */
SYM_FUNC_START_ALIAS(__memcpy)
SYM_FUNC_START_LOCAL(memcpy)
	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memcpy_erms", X86_FEATURE_ERMS

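	/*
	 * REP_GOOD path: %rax returns the original destination, then
	 * count/8 qwords are copied with REP MOVSQ and the remaining
	 * count%8 bytes with REP MOVSB.
	 */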
	movq %rdi, %rax
	movq %rdx, %rcx
	shrq $3, %rcx
	andl $7, %edx
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
SYM_FUNC_END(memcpy)
SYM_FUNC_END_ALIAS(__memcpy)
EXPORT_SYMBOL(memcpy)
EXPORT_SYMBOL(__memcpy)

/*
 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 * simpler than memcpy. Use memcpy_erms when possible.
 */
SYM_FUNC_START(memcpy_erms)
	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	ret
SYM_FUNC_END(memcpy_erms)

SYM_FUNC_START(memcpy_orig)
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * Check whether a memory false dependence could occur,
	 * then jump to the corresponding copy mode.
	 */
	cmp  %dil, %sil
	jl .Lcopy_backward
	subq $0x20, %rdx
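	/*
	 * %rdx is biased by an extra 0x20 so that the SUB at the top of
	 * the loop sets CF when no further full 32-byte block follows
	 * the one being copied; the ADD after the loop undoes the bias
	 * and leaves the 0..31 byte tail count for .Lhandle_tail.
	 */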
.Lcopy_forward_loop:
	subq $0x20,	%rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi),	%r8
	movq 1*8(%rsi),	%r9
	movq 2*8(%rsi),	%r10
	movq 3*8(%rsi),	%r11
	leaq 4*8(%rsi),	%rsi

	movq %r8,	0*8(%rdi)
	movq %r9,	1*8(%rdi)
	movq %r10,	2*8(%rdi)
	movq %r11,	3*8(%rdi)
	leaq 4*8(%rdi),	%rdi
	jae  .Lcopy_forward_loop
	addl $0x20,	%edx
	jmp  .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx,	%rsi
	addq %rdx,	%rdi
	subq $0x20,	%rdx
	/*
	 * At most 3 ALU operations in one cycle, so append NOPs
	 * in the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20,	%rdx
	movq -1*8(%rsi),	%r8
	movq -2*8(%rsi),	%r9
	movq -3*8(%rsi),	%r10
	movq -4*8(%rsi),	%r11
	leaq -4*8(%rsi),	%rsi
	movq %r8,		-1*8(%rdi)
	movq %r9,		-2*8(%rdi)
	movq %r10,		-3*8(%rdi)
	movq %r11,		-4*8(%rdi)
	leaq -4*8(%rdi),	%rdi
	jae  .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 */
	addl $0x20,	%edx
	subq %rdx,	%rsi
	subq %rdx,	%rdi
.Lhandle_tail:
	cmpl $16,	%edx
	jb   .Lless_16bytes

	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
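	/*
	 * Copy the first 16 and the last 16 bytes; the two windows
	 * overlap for lengths under 32, which is harmless because all
	 * four loads are done before any store.
	 */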
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi),	%r9
	movq -2*8(%rsi, %rdx),	%r10
	movq -1*8(%rsi, %rdx),	%r11
	movq %r8,	0*8(%rdi)
	movq %r9,	1*8(%rdi)
	movq %r10,	-2*8(%rdi, %rdx)
	movq %r11,	-1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_16bytes:
	cmpl $8,	%edx
	jb   .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi),	%r8
	movq -1*8(%rsi, %rdx),	%r9
	movq %r8,	0*8(%rdi)
	movq %r9,	-1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpl $4,	%edx
	jb   .Lless_3bytes

	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
	/*
	 * Move data from 1 byte to 3 bytes.
	 */
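	/*
	 * %edx now holds count - 1 and the flags still reflect the SUBL
	 * above (MOVZBL does not change them): the JZ below catches
	 * count == 1. For 2 or 3 bytes the second and the last byte are
	 * copied; they simply coincide when count == 2.
	 */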
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	retq
SYM_FUNC_END(memcpy_orig)

.popsection

#ifndef CONFIG_UML

MCSAFE_TEST_CTL

/*
 * __memcpy_mcsafe - memory copy with machine check exception handling
 * Note that we only catch machine checks when reading the source addresses.
 * Writes to target are posted and don't generate machine checks.
 */
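/*
 * Roughly, in C terms (illustrative sketch only, not part of the build):
 *
 *	rem = __memcpy_mcsafe(dst, src, count);
 *	// rem == 0: the whole copy succeeded
 *	// rem  > 0: that many bytes at the end were not copied
 *
 * See the .fixup handlers below for how the remainder is computed on a
 * fault.
 */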
SYM_FUNC_START(__memcpy_mcsafe)
	cmpl $8, %edx
	/* Less than 8 bytes? Go to byte copy loop */
	jb .L_no_whole_words

	/* Check for bad alignment of source */
	testl $7, %esi
	/* Already aligned */
	jz .L_8byte_aligned

	/* Copy one byte at a time until source is 8-byte aligned */
	movl %esi, %ecx
	andl $7, %ecx
	subl $8, %ecx
	negl %ecx
	subl %ecx, %edx
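	/*
	 * %ecx = 8 - (src & 7), the number of leading bytes needed to
	 * reach 8-byte source alignment; %edx is reduced by that amount.
	 */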
.L_read_leading_bytes:
	movb (%rsi), %al
	MCSAFE_TEST_SRC %rsi 1 .E_leading_bytes
	MCSAFE_TEST_DST %rdi 1 .E_leading_bytes
.L_write_leading_bytes:
	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_read_leading_bytes

.L_8byte_aligned:
	movl %edx, %ecx
	andl $7, %edx
	shrl $3, %ecx
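	/* %ecx = number of whole 8-byte words, %edx = 0..7 trailing bytes */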
	jz .L_no_whole_words

.L_read_words:
	movq (%rsi), %r8
	MCSAFE_TEST_SRC %rsi 8 .E_read_words
	MCSAFE_TEST_DST %rdi 8 .E_write_words
.L_write_words:
	movq %r8, (%rdi)
	addq $8, %rsi
	addq $8, %rdi
	decl %ecx
	jnz .L_read_words

	/* Any trailing bytes? */
.L_no_whole_words:
	andl %edx, %edx
	jz .L_done_memcpy_trap

	/* Copy trailing bytes */
	movl %edx, %ecx
.L_read_trailing_bytes:
	movb (%rsi), %al
	MCSAFE_TEST_SRC %rsi 1 .E_trailing_bytes
	MCSAFE_TEST_DST %rdi 1 .E_trailing_bytes
.L_write_trailing_bytes:
	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_read_trailing_bytes

	/* Copy successful. Return zero */
.L_done_memcpy_trap:
	xorl %eax, %eax
.L_done:
	ret
SYM_FUNC_END(__memcpy_mcsafe)
EXPORT_SYMBOL_GPL(__memcpy_mcsafe)

	.section .fixup, "ax"
	/*
	 * Return number of bytes not copied for any failure. Note that
	 * there is no "tail" handling since the source buffer is 8-byte
	 * aligned and poison is cacheline aligned.
	 */
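	/*
	 * The handlers below fall through into each other: .E_read_words
	 * converts the remaining word count in %ecx to bytes,
	 * .E_leading_bytes adds the %edx bytes that were still to come,
	 * and .E_trailing_bytes returns the resulting count in %eax.
	 */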
.E_read_words:
	shll	$3, %ecx
.E_leading_bytes:
	addl	%edx, %ecx
.E_trailing_bytes:
	mov	%ecx, %eax
	jmp	.L_done

	/*
	 * For write fault handling, given the destination is unaligned,
	 * we handle faults on multi-byte writes with a byte-by-byte
	 * copy up to the write-protected page.
	 */
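	/*
	 * Here %edx ends up with the full remaining byte count (the
	 * remaining words converted to bytes plus the trailing bytes),
	 * which mcsafe_handle_tail() then continues byte by byte.
	 */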
.E_write_words:
	shll	$3, %ecx
	addl	%edx, %ecx
	movl	%ecx, %edx
	jmp mcsafe_handle_tail

	.previous

	_ASM_EXTABLE_FAULT(.L_read_leading_bytes, .E_leading_bytes)
	_ASM_EXTABLE_FAULT(.L_read_words, .E_read_words)
	_ASM_EXTABLE_FAULT(.L_read_trailing_bytes, .E_trailing_bytes)
	_ASM_EXTABLE(.L_write_leading_bytes, .E_leading_bytes)
	_ASM_EXTABLE(.L_write_words, .E_write_words)
	_ASM_EXTABLE(.L_write_trailing_bytes, .E_trailing_bytes)
#endif