/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>

/*
 * We build a jump to memcpy_orig by default, which gets NOPped out on
 * the majority of x86 CPUs, which set REP_GOOD. In addition, on CPUs
 * which have the enhanced REP MOVSB/STOSB feature (ERMS), those NOPs
 * are changed to a jmp to memcpy_erms, which does the REP; MOVSB mem
 * copy.
 */

.weak memcpy
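/*
 * Note: memcpy is declared weak here, presumably so that a stronger
 * definition elsewhere (e.g. an instrumented or arch-tuned version)
 * can override it at link time; __memcpy below stays strong.
 */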

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 * rax original destination
 */
ENTRY(__memcpy)
ENTRY(memcpy)
	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memcpy_erms", X86_FEATURE_ERMS

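	/*
	 * Fast path: copy count/8 qwords with REP MOVSQ, then the
	 * remaining count%8 bytes with REP MOVSB. For example, with
	 * count = 20: rcx = 20 >> 3 = 2 qwords (16 bytes), then
	 * edx & 7 = 4 trailing bytes. rax is loaded up front so the
	 * original destination is returned.
	 */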
	movq %rdi, %rax
	movq %rdx, %rcx
	shrq $3, %rcx
	andl $7, %edx
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
ENDPROC(memcpy)
ENDPROC(__memcpy)

/*
 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 * simpler than memcpy. Use memcpy_erms when possible.
 */
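/*
 * With ERMS, the single REP MOVSB below is expected to handle
 * alignment and chunking in microcode, so no manual qword splitting
 * is needed (a general note on the ERMS feature, not specific to
 * this file).
 */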
ENTRY(memcpy_erms)
	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	ret
ENDPROC(memcpy_erms)

ENTRY(memcpy_orig)
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * Check whether a memory false dependence could occur,
	 * then jump to the corresponding copy mode.
	 */
	cmp  %dil, %sil
	jl .Lcopy_backward
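	/*
	 * Note: only the low byte of each pointer is compared (%sil vs
	 * %dil), so this is a cheap heuristic for picking the forward
	 * or backward copy direction, not a full overlap check.
	 */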
	subq $0x20, %rdx
.Lcopy_forward_loop:
	subq $0x20,	%rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi),	%r8
	movq 1*8(%rsi),	%r9
	movq 2*8(%rsi),	%r10
	movq 3*8(%rsi),	%r11
	leaq 4*8(%rsi),	%rsi

	movq %r8,	0*8(%rdi)
	movq %r9,	1*8(%rdi)
	movq %r10,	2*8(%rdi)
	movq %r11,	3*8(%rdi)
	leaq 4*8(%rdi),	%rdi
	jae  .Lcopy_forward_loop
	addl $0x20,	%edx
	jmp  .Lhandle_tail
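	/*
	 * Loop bookkeeping above: rdx is decremented by 0x20 once
	 * before entering the loop and once at the top of each
	 * iteration, so JAE keeps looping while that subtraction did
	 * not borrow; after exit, the ADD $0x20 restores the true
	 * number of remaining tail bytes (< 0x20).
	 */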

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx,	%rsi
	addq %rdx,	%rdi
	subq $0x20,	%rdx
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPs in the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20,	%rdx
	movq -1*8(%rsi),	%r8
	movq -2*8(%rsi),	%r9
	movq -3*8(%rsi),	%r10
	movq -4*8(%rsi),	%r11
	leaq -4*8(%rsi),	%rsi
	movq %r8,		-1*8(%rdi)
	movq %r9,		-2*8(%rdi)
	movq %r10,		-3*8(%rdi)
	movq %r11,		-4*8(%rdi)
	leaq -4*8(%rdi),	%rdi
	jae  .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 */
	addl $0x20,	%edx
	subq %rdx,	%rsi
	subq %rdx,	%rdi
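	/*
	 * rsi/rdi have been rewound to the start of the buffers, so
	 * the common tail code below copies the remaining (< 0x20)
	 * head bytes using the same overlapping loads and stores as
	 * the forward path.
	 */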
.Lhandle_tail:
	cmpl $16,	%edx
	jb   .Lless_16bytes

	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
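	/*
	 * Trick used here and in the 8..15 and 4..7 byte cases below:
	 * load the first and the last words of the range before doing
	 * any stores, so the two windows may overlap for short counts
	 * and no extra branches are needed.
	 */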
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi),	%r9
	movq -2*8(%rsi, %rdx),	%r10
	movq -1*8(%rsi, %rdx),	%r11
	movq %r8,	0*8(%rdi)
	movq %r9,	1*8(%rdi)
	movq %r10,	-2*8(%rdi, %rdx)
	movq %r11,	-1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_16bytes:
	cmpl $8,	%edx
	jb   .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi),	%r8
	movq -1*8(%rsi, %rdx),	%r9
	movq %r8,	0*8(%rdi)
	movq %r9,	-1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpl $4,	%edx
	jb   .Lless_3bytes

	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
	/*
	 * Move data from 1 byte to 3 bytes.
	 */
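	/*
	 * Flag reuse: the SUBL above set the flags and neither JB nor
	 * MOVZBL changes them, so JB catches count == 0 and the JZ
	 * below catches count == 1. For count 2 or 3, the stores at
	 * offset 1 and at (%rdi, %rdx) overlap as needed.
	 */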
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	retq
ENDPROC(memcpy_orig)

#ifndef CONFIG_UML
/*
 * memcpy_mcsafe_unrolled - memory copy with machine check exception handling
 * Note that we only catch machine checks when reading the source addresses.
 * Writes to target are posted and don't generate machine checks.
 */
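/*
 * Returns 0 in %rax on success; on a machine check during a source
 * read, the exception table fixup at the bottom of this file returns
 * -EFAULT instead.
 */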
ENTRY(memcpy_mcsafe_unrolled)
	cmpl $8, %edx
	/* Less than 8 bytes? Go to byte copy loop */
	jb .L_no_whole_words

	/* Check for bad alignment of source */
	testl $7, %esi
	/* Already aligned */
	jz .L_8byte_aligned

	/* Copy one byte at a time until source is 8-byte aligned */
	movl %esi, %ecx
	andl $7, %ecx
	subl $8, %ecx
	negl %ecx
	subl %ecx, %edx
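	/*
	 * At this point ecx = 8 - (src & 7), the number of leading
	 * bytes needed to reach 8-byte alignment, and edx has been
	 * reduced accordingly. E.g. for a source address ending in
	 * ...5: 5 - 8 = -3, negated to 3 leading bytes.
	 */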
.L_copy_leading_bytes:
	movb (%rsi), %al
	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_copy_leading_bytes

.L_8byte_aligned:
	/* Figure out how many whole cache lines (64-bytes) to copy */
	movl %edx, %ecx
	andl $63, %edx
	shrl $6, %ecx
	jz .L_no_whole_cache_lines

	/* Loop copying whole cache lines */
.L_cache_w0: movq (%rsi), %r8
.L_cache_w1: movq 1*8(%rsi), %r9
.L_cache_w2: movq 2*8(%rsi), %r10
.L_cache_w3: movq 3*8(%rsi), %r11
	movq %r8, (%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
.L_cache_w4: movq 4*8(%rsi), %r8
.L_cache_w5: movq 5*8(%rsi), %r9
.L_cache_w6: movq 6*8(%rsi), %r10
.L_cache_w7: movq 7*8(%rsi), %r11
	movq %r8, 4*8(%rdi)
	movq %r9, 5*8(%rdi)
	movq %r10, 6*8(%rdi)
	movq %r11, 7*8(%rdi)
	leaq 64(%rsi), %rsi
	leaq 64(%rdi), %rdi
	decl %ecx
	jnz .L_cache_w0
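	/*
	 * Each 8-byte load above has its own .L_cache_wN label so the
	 * exception table at the bottom of this file can map a machine
	 * check fault on any individual read to the -EFAULT fixup; the
	 * stores need no entries (writes are posted, see above).
	 */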

	/* Are there any trailing 8-byte words? */
.L_no_whole_cache_lines:
	movl %edx, %ecx
	andl $7, %edx
	shrl $3, %ecx
	jz .L_no_whole_words

	/* Copy trailing words */
.L_copy_trailing_words:
	movq (%rsi), %r8
	movq %r8, (%rdi)
	leaq 8(%rsi), %rsi
	leaq 8(%rdi), %rdi
	decl %ecx
	jnz .L_copy_trailing_words

	/* Any trailing bytes? */
.L_no_whole_words:
	andl %edx, %edx
	jz .L_done_memcpy_trap

	/* Copy trailing bytes */
	movl %edx, %ecx
.L_copy_trailing_bytes:
	movb (%rsi), %al
	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_copy_trailing_bytes

	/* Copy successful. Return zero */
.L_done_memcpy_trap:
	xorq %rax, %rax
	ret
ENDPROC(memcpy_mcsafe_unrolled)
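/*
 * Hypothetical C caller sketch (assuming the usual (dst, src, count)
 * prototype returning 0 or -EFAULT; see the matching header for the
 * exact declaration):
 *
 *	if (memcpy_mcsafe_unrolled(dst, src, len))
 *		return -EIO;	// source had an uncorrectable error
 */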

	.section .fixup, "ax"
	/* Return -EFAULT for any failure */
.L_memcpy_mcsafe_fail:
	mov	$-EFAULT, %rax
	ret

	.previous

	_ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w2, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w6, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w7, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_copy_trailing_words, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_copy_trailing_bytes, .L_memcpy_mcsafe_fail)
#endif