xref: /openbmc/linux/arch/x86/lib/memmove_64.S (revision ff56535d)
/*
 * Normally compiler builtins are used, but sometimes the compiler calls
 * out-of-line code. Based on asm-i386/string.h.
 *
 * This assembly file is rewritten from the memmove_64.c file.
 *	- Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
 */
#define _STRING_C
#include <linux/linkage.h>
#include <asm/dwarf2.h>

#undef memmove

/*
 * Implement memmove(). This can handle overlap between src and dst.
 *
 * Input:
 * rdi: dest
 * rsi: src
 * rdx: count
 *
 * Output:
 * rax: dest
 */
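/*
 * Rough C-level sketch of the flow below (illustrative only, not taken
 * from the original memmove_64.c).  Here low_byte() stands for the
 * cmpb %dil, %sil alignment check:
 *
 *	if (count < 32)
 *		goto tail;                               ... label 1
 *	if (src >= dest) {                               ... forward copy is safe
 *		if (count >= 680 && low_byte(src) == low_byte(dest))
 *			forward rep movsq, then done;    ... label 4
 *		else
 *			forward 32-byte register loop;   ... labels 3/5, then tail
 *	} else {                                         ... dest > src: copy backward
 *		if (count >= 680 && low_byte(src) == low_byte(dest))
 *			backward rep movsq, then done;   ... label 7
 *		else
 *			backward 32-byte register loop;  ... labels 6/8, then tail
 *	}
 * tail:
 *	copy the remaining 0..31 bytes with overlapping
 *	loads and stores;                                ... labels 1 and 9-13
 */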
ENTRY(memmove)
	CFI_STARTPROC
	/*
	 * Copies of 32 bytes or more are handled by the loops below;
	 * smaller counts go straight to the tail code at label 1.
	 */
	mov %rdi, %rax	/* return value is dest */
	cmp $0x20, %rdx
	jb	1f

	/* Decide forward/backward copy mode */
	cmp %rdi, %rsi
	jb	2f
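	/*
	 * Below this point src >= dest, so a forward copy cannot clobber
	 * source bytes that have not been read yet; label 2 handles the
	 * overlapping src < dest case by copying backward.
	 */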

	/*
	 * The movsq instruction has a high startup latency, so small
	 * copies are handled with general-purpose registers instead.
	 */
	cmp  $680, %rdx
	jb	3f
	/*
	 * rep movsq is only worthwhile when src and dest share the same
	 * alignment.
	 */

	cmpb %dil, %sil
	je 4f
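	/*
	 * Equal low bytes imply the same offset within a quadword for src
	 * and dest, so the rep movsq path at label 4 applies; otherwise
	 * fall through to the register loop at label 3.
	 */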
3:
	sub $0x20, %rdx
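	/*
	 * %rdx was just biased down by 32 so that the subtract at the top
	 * of the loop below allows another 32-byte copy only while at
	 * least 32 bytes actually remain.
	 */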
	/*
	 * We copy 32 bytes forward in each loop iteration.
	 */
5:
	sub $0x20, %rdx
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r10
	movq 2*8(%rsi), %r9
	movq 3*8(%rsi), %r8
	leaq 4*8(%rsi), %rsi

	movq %r11, 0*8(%rdi)
	movq %r10, 1*8(%rdi)
	movq %r9, 2*8(%rdi)
	movq %r8, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae 5b
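	/*
	 * Fewer than 32 bytes remain: undo the bias so %rdx holds the
	 * leftover count (0..31) and finish in the common tail at label 1.
	 */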
	addq $0x20, %rdx
	jmp 1f
	/*
	 * Forward copy using rep movsq (large copies with matching
	 * alignment).
	 */
	.p2align 4
4:
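	/*
	 * %rcx is the quadword count for rep movsq.  The last (possibly
	 * partial) quadword of the source is loaded into %r11 up front and
	 * stored after the string move, so any 1..7 trailing bytes are
	 * covered by one overlapping 8-byte store.
	 */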
	movq %rdx, %rcx
	movq -8(%rsi, %rdx), %r11
	lea -8(%rdi, %rdx), %r10
	shrq $3, %rcx
	rep movsq
	movq %r11, (%r10)
	jmp 13f
	/*
	 * Backward copy using rep movsq (large copies with matching
	 * alignment).
	 */
	.p2align 4
7:
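	/*
	 * Mirror image of the forward movsq path: the first (possibly
	 * partial) quadword of the source is saved in %r11 and the start
	 * of dest in %r10, the string move runs backward from the last
	 * quadword with the direction flag set, and the saved quadword is
	 * stored last to cover any 1..7 leading bytes.
	 */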
	movq %rdx, %rcx
	movq (%rsi), %r11
	movq %rdi, %r10
	leaq -8(%rsi, %rdx), %rsi
	leaq -8(%rdi, %rdx), %rdi
	shrq $3, %rcx
	std
	rep movsq
	cld
	movq %r11, (%r10)
	jmp 13f

	/*
	 * Start of the backward-copy path.
	 */
	.p2align 4
2:
	cmp $680, %rdx
	jb 6f
	cmp %dil, %sil
	je 7b
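	/*
	 * Large copy with matching alignment: use the backward rep movsq
	 * at label 7; otherwise fall through to the 32-byte register loop.
	 */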
6:
	/*
	 * Point %rsi and %rdi at the tail ends of the buffers.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * We copy 32 bytes backward in each loop iteration; %rdx is
	 * pre-biased by 32 here exactly as in the forward loop.
	 */
8:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r11
	movq -2*8(%rsi), %r10
	movq -3*8(%rsi), %r9
	movq -4*8(%rsi), %r8
	leaq -4*8(%rsi), %rsi

	movq %r11, -1*8(%rdi)
	movq %r10, -2*8(%rdi)
	movq %r9, -3*8(%rdi)
	movq %r8, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae 8b
	/*
	 * Undo the bias and step %rsi/%rdi back to the start of the
	 * uncopied head bytes; the common tail code at label 1 finishes
	 * them with a forward copy.
	 */
	addq $0x20, %rdx
	subq %rdx, %rsi
	subq %rdx, %rdi
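	/*
	 * Common tail: at most 31 bytes remain in %rdx.  Each size class
	 * below loads from both ends of the remaining region before it
	 * stores anything, so the two halves may overlap and no
	 * byte-by-byte loop is needed.
	 */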
1:
	cmpq $16, %rdx
	jb 9f
	/*
	 * Copy 16 to 31 bytes.
	 */
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r10
	movq -2*8(%rsi, %rdx), %r9
	movq -1*8(%rsi, %rdx), %r8
	movq %r11, 0*8(%rdi)
	movq %r10, 1*8(%rdi)
	movq %r9, -2*8(%rdi, %rdx)
	movq %r8, -1*8(%rdi, %rdx)
	jmp 13f
	.p2align 4
9:
	cmpq $8, %rdx
	jb 10f
	/*
	 * Copy 8 to 15 bytes.
	 */
	movq 0*8(%rsi), %r11
	movq -1*8(%rsi, %rdx), %r10
	movq %r11, 0*8(%rdi)
	movq %r10, -1*8(%rdi, %rdx)
	jmp 13f
10:
	cmpq $4, %rdx
	jb 11f
	/*
	 * Copy 4 to 7 bytes.
	 */
	movl (%rsi), %r11d
	movl -4(%rsi, %rdx), %r10d
	movl %r11d, (%rdi)
	movl %r10d, -4(%rdi, %rdx)
	jmp 13f
11:
	cmp $2, %rdx
	jb 12f
	/*
	 * Copy 2 to 3 bytes.
	 */
	movw (%rsi), %r11w
	movw -2(%rsi, %rdx), %r10w
	movw %r11w, (%rdi)
	movw %r10w, -2(%rdi, %rdx)
	jmp 13f
12:
	cmp $1, %rdx
	jb 13f
	/*
	 * Copy the final single byte.
	 */
	movb (%rsi), %r11b
	movb %r11b, (%rdi)
13:
	retq
	CFI_ENDPROC
ENDPROC(memmove)