xref: /openbmc/linux/arch/x86/lib/memcpy_64.S (revision 99a15348)
1/* SPDX-License-Identifier: GPL-2.0-only */
2/* Copyright 2002 Andi Kleen */
3
4#include <linux/linkage.h>
5#include <asm/errno.h>
6#include <asm/cpufeatures.h>
7#include <asm/alternative.h>
8#include <asm/export.h>
9
10.pushsection .noinstr.text, "ax"
11
/*
 * We build a jump to memcpy_orig by default, which gets NOPped out on
 * the majority of x86 CPUs, which set REP_GOOD. In addition, on CPUs that
 * have the enhanced REP MOVSB/STOSB feature (ERMS), those NOPs are changed
 * to a jmp to memcpy_erms, which does the memory copy with REP MOVSB.
 */
18
/*
 * __memcpy - copy a block of memory (memcpy is a weak alias for it).
 *
 * In:
 *  rdi	destination
 *  rsi	source
 *  rdx	byte count
 *
 * Out:
 *  rax	the destination pointer exactly as it was passed in
 *
 * The first thing in the function is an alternatives slot: by default it
 * holds "jmp memcpy_orig", with X86_FEATURE_REP_GOOD it is empty so that
 * execution falls through to the REP MOVSQ code below, and with
 * X86_FEATURE_ERMS it holds "jmp memcpy_erms".
 */
SYM_FUNC_START(__memcpy)
	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memcpy_erms", X86_FEATURE_ERMS

	/* Fall-through path: count / 8 quadwords, then count % 8 bytes. */
	movq	%rdx, %rcx
	movq	%rdi, %rax		/* return value; rdi is advanced by the copy */
	shrq	$3, %rcx		/* rcx = number of whole quadwords */
	andl	$7, %edx		/* edx = leftover bytes, 0..7 */
	rep movsq
	movl	%edx, %ecx		/* rep count for the byte remainder */
	rep movsb
	RET
SYM_FUNC_END(__memcpy)
EXPORT_SYMBOL(__memcpy)

SYM_FUNC_ALIAS_WEAK(memcpy, __memcpy)
EXPORT_SYMBOL(memcpy)
47
/*
 * memcpy_erms - enhanced fast-string memcpy: the whole copy is a single
 * byte-granular REP MOVSB. This is faster and simpler than memcpy; it is
 * the variant to use when possible.
 *
 * __memcpy jumps here through its alternatives slot when X86_FEATURE_ERMS
 * is set, so the registers are still as __memcpy received them:
 * rdi = destination, rsi = source, rdx = byte count.
 * Returns the original destination in rax.
 */
SYM_FUNC_START_LOCAL(memcpy_erms)
	movq	%rdx, %rcx		/* rep count = byte count */
	movq	%rdi, %rax		/* capture destination before rdi advances */
	rep movsb
	RET
SYM_FUNC_END(memcpy_erms)
58
/*
 * memcpy_orig - memcpy built from plain 8-byte moves, no string
 * instructions. This is the default target of the alternatives slot at
 * the top of __memcpy.
 *
 * In:   rdi = destination, rsi = source, rdx = byte count
 * Out:  rax = original destination
 * Uses: rcx and r8-r11 as scratch; rdx, rsi and rdi are modified
 *
 * Counts of 32 bytes or more go through a loop that moves 32 bytes per
 * iteration, walking either forward or backward. Whatever is left
 * (0..31 bytes) is finished by straight-line code selected by size, using
 * a head move and a tail move that are allowed to overlap.
 */
SYM_FUNC_START_LOCAL(memcpy_orig)
	movq	%rdi, %rax

	cmpq	$32, %rdx
	jb	.Ltail

	/*
	 * We check whether memory false dependence could occur and pick
	 * the copy direction accordingly. Only the low byte of each
	 * address is compared, as signed values: copy backward when the
	 * source's low byte is less than the destination's.
	 */
	cmpb	%dil, %sil
	jl	.Lbackward

	subq	$32, %rdx		/* bias the count: rdx = count - 32 */
.Lforward_loop:
	/*
	 * This subq is the only instruction in the loop body that writes
	 * the flags (movq and leaq leave them alone), so the jae at the
	 * bottom tests its borrow: keep looping while at least 32 more
	 * bytes remain after the block copied in this iteration.
	 */
	subq	$32, %rdx

	/* One block of 4x8 bytes: four loads, then four stores. */
	movq	(%rsi), %r8
	movq	8(%rsi), %r9
	movq	16(%rsi), %r10
	movq	24(%rsi), %r11
	leaq	32(%rsi), %rsi

	movq	%r8, (%rdi)
	movq	%r9, 8(%rdi)
	movq	%r10, 16(%rdi)
	movq	%r11, 24(%rdi)
	leaq	32(%rdi), %rdi
	jae	.Lforward_loop

	addl	$32, %edx		/* edx = bytes still to copy, 0..31 */
	jmp	.Ltail

.Lbackward:
	/* Move rsi/rdi to one past the end of their buffers. */
	addq	%rdx, %rsi
	addq	%rdx, %rdi
	subq	$32, %rdx		/* same count bias as the forward path */
	/*
	 * At most 3 ALU operations issue in one cycle, so pad with NOPs
	 * to keep the loop start on a 16-byte boundary.
	 */
	.p2align 4
.Lbackward_loop:
	subq	$32, %rdx		/* sole flag writer; tested by jae below */
	movq	-8(%rsi), %r8
	movq	-16(%rsi), %r9
	movq	-24(%rsi), %r10
	movq	-32(%rsi), %r11
	leaq	-32(%rsi), %rsi
	movq	%r8, -8(%rdi)
	movq	%r9, -16(%rdi)
	movq	%r10, -24(%rdi)
	movq	%r11, -32(%rdi)
	leaq	-32(%rdi), %rdi
	jae	.Lbackward_loop

	/*
	 * The uncopied remainder is now the head of the buffers: recover
	 * its length in edx and step rsi/rdi back to its first byte.
	 */
	addl	$32, %edx
	subq	%rdx, %rsi
	subq	%rdx, %rdi

.Ltail:
	/* Here rsi/rdi point at the remaining bytes and edx (0..31) counts them. */
	cmpl	$16, %edx
	jb	.Lunder_16

	/* 16..31 bytes: the first 16 and the last 16, possibly overlapping. */
	movq	(%rsi), %r8
	movq	8(%rsi), %r9
	movq	-16(%rsi, %rdx), %r10
	movq	-8(%rsi, %rdx), %r11
	movq	%r8, (%rdi)
	movq	%r9, 8(%rdi)
	movq	%r10, -16(%rdi, %rdx)
	movq	%r11, -8(%rdi, %rdx)
	RET

	.p2align 4
.Lunder_16:
	cmpl	$8, %edx
	jb	.Lunder_8

	/* 8..15 bytes: the first and the last quadword. */
	movq	(%rsi), %r8
	movq	-8(%rsi, %rdx), %r9
	movq	%r8, (%rdi)
	movq	%r9, -8(%rdi, %rdx)
	RET

	.p2align 4
.Lunder_8:
	cmpl	$4, %edx
	jb	.Lunder_4

	/* 4..7 bytes: the first and the last doubleword. */
	movl	(%rsi), %ecx
	movl	-4(%rsi, %rdx), %r8d
	movl	%ecx, (%rdi)
	movl	%r8d, -4(%rdi, %rdx)
	RET

	.p2align 4
.Lunder_4:
	subl	$1, %edx		/* edx = count - 1; borrow means count was 0 */
	jb	.Ldone

	/*
	 * 1..3 bytes. movzbl does not modify the flags, so the jz still
	 * tests the subl above: ZF set means exactly one byte to copy.
	 */
	movzbl	(%rsi), %ecx
	jz	.Lfirst_byte

	/* 2 or 3 bytes: byte 1 and the last byte (the same byte for 2). */
	movzbq	1(%rsi), %r8
	movzbq	(%rsi, %rdx), %r9
	movb	%r8b, 1(%rdi)
	movb	%r9b, (%rdi, %rdx)
.Lfirst_byte:
	movb	%cl, (%rdi)

.Ldone:
	RET
SYM_FUNC_END(memcpy_orig)
185
186.popsection
187