xref: /openbmc/linux/arch/x86/lib/memmove_64.S (revision e847c767)
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * Normally compiler builtins are used, but sometimes the compiler calls out
4 * of line code. Based on asm-i386/string.h.
5 *
6 * This assembly file is re-written from memmove_64.c file.
7 *	- Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
8 */
9#include <linux/linkage.h>
10#include <asm/cpufeatures.h>
11#include <asm/alternative.h>
12#include <asm/export.h>
13
14#undef memmove
15
16.section .noinstr.text, "ax"
17
/*
 * Implement memmove(). This can handle overlap between src and dst.
 *
 * Input:
 * rdi: dest
 * rsi: src
 * rdx: count
 *
 * Output:
 * rax: dest
 *
 * Clobbers:
 * rcx, rdx, rsi, rdi, r8-r11 and the arithmetic flags.
 *
 * Direction flag:
 * The forward string copies are issued without a preceding cld, so DF is
 * expected to be clear on entry. The backward movsq path sets DF with std
 * and clears it again with cld before returning.
 */
SYM_FUNC_START(__memmove)

	/* The return value is the unmodified dest pointer. */
	mov %rdi, %rax

	/*
	 * Decide forward/backward copy mode: copy backward only when dest
	 * starts above src and below src + count, otherwise copy forward.
	 *
	 * NOTE(review): both comparisons are signed, which presumably relies
	 * on src and dest living in the same half of the address space.
	 */
	cmp %rdi, %rsi
	jge .Lmemmove_begin_forward
	mov %rsi, %r8
	add %rdx, %r8
	cmp %rdi, %r8
	jg .Lmemmove_backward

.Lmemmove_begin_forward:
	/*
	 * FSRM: "rep movsb" is used for every length, so go straight to it.
	 *
	 * This deliberately does not rely on ERMS being set as well. The
	 * length check below used to be patched out by FSRM while the jump to
	 * "rep movsb" was patched in by ERMS; with FSRM set and ERMS clear a
	 * copy shorter than 32 bytes then fell into the 32-byte loop with an
	 * underflowed count and overran both buffers.
	 */
	ALTERNATIVE "", "jmp .Lmemmove_rep_movsb", X86_FEATURE_FSRM

	/*
	 * Fewer than 32 bytes: the loops below cannot be used, finish with
	 * the overlap-safe tail code.
	 */
	cmp $0x20, %rdx
	jb .Lmemmove_tail

	/* ERMS: "rep movsb" for everything that is at least 32 bytes long. */
	ALTERNATIVE "", "jmp .Lmemmove_rep_movsb", X86_FEATURE_ERMS

	/*
	 * The movsq instruction has a high startup latency,
	 * so smaller sizes are copied through general purpose registers.
	 */
	cmp $680, %rdx
	jb .Lmemmove_fwd_loop_setup

	/*
	 * The movsq instruction is only good for the aligned case: use it
	 * only when the low bytes of the src and dest addresses are equal.
	 */
	cmpb %dil, %sil
	je .Lmemmove_fwd_movsq

.Lmemmove_fwd_loop_setup:
	/*
	 * Bias the count by -32 so that the borrow of the subtraction at the
	 * top of the loop tells whether another full 32-byte chunk remains
	 * after the current one.
	 */
	sub $0x20, %rdx
	/*
	 * We gobble 32 bytes forward in each loop.
	 */
.Lmemmove_fwd_loop:
	sub $0x20, %rdx
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r10
	movq 2*8(%rsi), %r9
	movq 3*8(%rsi), %r8
	leaq 4*8(%rsi), %rsi

	movq %r11, 0*8(%rdi)
	movq %r10, 1*8(%rdi)
	movq %r9, 2*8(%rdi)
	movq %r8, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	/* movq/leaq leave the flags alone: this tests the sub above. */
	jae .Lmemmove_fwd_loop
	/* Undo the bias: rdx = bytes still to copy (0..31). */
	addq $0x20, %rdx
	jmp .Lmemmove_tail

	/*
	 * Handle data forward by movsq.
	 *
	 * The last 8 source bytes are loaded up front and stored last, which
	 * covers the count % 8 bytes that "rep movsq" does not reach.
	 */
	.p2align 4
.Lmemmove_fwd_movsq:
	movq %rdx, %rcx
	movq -8(%rsi, %rdx), %r11
	lea -8(%rdi, %rdx), %r10
	shrq $3, %rcx
	rep movsq
	movq %r11, (%r10)
	jmp .Lmemmove_done
.Lmemmove_end_forward:

	/*
	 * Handle data backward by movsq.
	 *
	 * The first 8 source bytes are loaded up front and stored last, which
	 * covers the count % 8 bytes that "rep movsq" does not reach. DF is
	 * set only around the string instruction.
	 */
	.p2align 4
.Lmemmove_bwd_movsq:
	movq %rdx, %rcx
	movq (%rsi), %r11
	movq %rdi, %r10
	leaq -8(%rsi, %rdx), %rsi
	leaq -8(%rdi, %rdx), %rdi
	shrq $3, %rcx
	std
	rep movsq
	cld
	movq %r11, (%r10)
	jmp .Lmemmove_done

	/*
	 * Start to prepare for backward copy.
	 */
	.p2align 4
.Lmemmove_backward:
	cmp $0x20, %rdx
	jb .Lmemmove_tail
	/* Same size and alignment criteria for movsq as in the forward case. */
	cmp $680, %rdx
	jb .Lmemmove_bwd_loop_setup
	cmpb %dil, %sil
	je .Lmemmove_bwd_movsq

.Lmemmove_bwd_loop_setup:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	/* Bias the count by -32, see the forward loop. */
	subq $0x20, %rdx
	/*
	 * We gobble 32 bytes backward in each loop.
	 */
.Lmemmove_bwd_loop:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r11
	movq -2*8(%rsi), %r10
	movq -3*8(%rsi), %r9
	movq -4*8(%rsi), %r8
	leaq -4*8(%rsi), %rsi

	movq %r11, -1*8(%rdi)
	movq %r10, -2*8(%rdi)
	movq %r9, -3*8(%rdi)
	movq %r8, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	/* movq/leaq leave the flags alone: this tests the subq above. */
	jae .Lmemmove_bwd_loop
	/*
	 * Calculate copy position to head: undo the bias so that rdx holds
	 * the bytes still to copy (0..31) and step both pointers back to the
	 * start of that remainder.
	 */
	addq $0x20, %rdx
	subq %rdx, %rsi
	subq %rdx, %rdi

	/*
	 * Copy the remaining rdx < 32 bytes starting at rsi/rdi. Every case
	 * below performs all of its loads before its first store, so it is
	 * safe for overlapping buffers in either direction.
	 */
.Lmemmove_tail:
	cmpq $16, %rdx
	jb .Lmemmove_lt16
	/*
	 * Move data from 16 bytes to 31 bytes: the first 16 and the last 16
	 * bytes, which may overlap each other.
	 */
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r10
	movq -2*8(%rsi, %rdx), %r9
	movq -1*8(%rsi, %rdx), %r8
	movq %r11, 0*8(%rdi)
	movq %r10, 1*8(%rdi)
	movq %r9, -2*8(%rdi, %rdx)
	movq %r8, -1*8(%rdi, %rdx)
	jmp .Lmemmove_done
	.p2align 4
.Lmemmove_lt16:
	cmpq $8, %rdx
	jb .Lmemmove_lt8
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi), %r11
	movq -1*8(%rsi, %rdx), %r10
	movq %r11, 0*8(%rdi)
	movq %r10, -1*8(%rdi, %rdx)
	jmp .Lmemmove_done
.Lmemmove_lt8:
	cmpq $4, %rdx
	jb .Lmemmove_lt4
	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %r11d
	movl -4(%rsi, %rdx), %r10d
	movl %r11d, (%rdi)
	movl %r10d, -4(%rdi, %rdx)
	jmp .Lmemmove_done
.Lmemmove_lt4:
	cmp $2, %rdx
	jb .Lmemmove_lt2
	/*
	 * Move data from 2 bytes to 3 bytes.
	 */
	movw (%rsi), %r11w
	movw -2(%rsi, %rdx), %r10w
	movw %r11w, (%rdi)
	movw %r10w, -2(%rdi, %rdx)
	jmp .Lmemmove_done
.Lmemmove_lt2:
	/* Nothing left to do for a zero count. */
	cmp $1, %rdx
	jb .Lmemmove_done
	/*
	 * Move data for 1 byte.
	 */
	movb (%rsi), %r11b
	movb %r11b, (%rdi)
.Lmemmove_done:
	RET

	/*
	 * Forward copy of the whole buffer with "rep movsb"; reached only
	 * through the FSRM/ERMS alternatives above.
	 */
.Lmemmove_rep_movsb:
	movq %rdx, %rcx
	rep movsb
	RET
SYM_FUNC_END(__memmove)
EXPORT_SYMBOL(__memmove)

SYM_FUNC_ALIAS(memmove, __memmove)
EXPORT_SYMBOL(memmove)
220