xref: /openbmc/linux/arch/x86/lib/memcpy_64.S (revision 4f3db074)
/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/cpufeature.h>
#include <asm/dwarf2.h>
#include <asm/alternative-asm.h>

/*
 * We build a jump to memcpy_orig by default, which gets NOPped out on
 * the majority of x86 CPUs (those that set REP_GOOD). On CPUs that
 * also have the enhanced REP MOVSB/STOSB feature (ERMS), those NOPs
 * are instead turned into a jmp to memcpy_erms, which does the copy
 * with a single REP; MOVSB.
 */

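/*
 * memcpy is weak so that another definition (e.g. an instrumented
 * one such as KASAN's) can override it at link time.
 */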
.weak memcpy

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 * rax original destination
 */
ENTRY(__memcpy)
ENTRY(memcpy)
	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memcpy_erms", X86_FEATURE_ERMS

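	/*
	 * Return the original destination in %rax.  Copy %rdx / 8
	 * qwords with REP MOVSQ, then the remaining %rdx % 8 bytes
	 * with REP MOVSB.
	 */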
	movq %rdi, %rax
	movq %rdx, %rcx
	shrq $3, %rcx
	andl $7, %edx
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
ENDPROC(memcpy)
ENDPROC(__memcpy)

/*
 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 * simpler than memcpy. Use memcpy_erms when possible.
 */
ENTRY(memcpy_erms)
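	/*
	 * With ERMS a single REP MOVSB does the whole copy:
	 * %rax carries the return value (the original destination),
	 * %rcx the byte count.
	 */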
	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	ret
ENDPROC(memcpy_erms)

ENTRY(memcpy_orig)
	CFI_STARTPROC
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * Check whether a memory false dependence could occur,
	 * then jump to the corresponding copy mode.
	 */
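	/*
	 * Only the low bytes of the two addresses are compared, as a
	 * cheap heuristic: when the source's low byte is below the
	 * destination's, a forward copy may suffer false store-to-load
	 * dependences on the partial address bits, so the copy is done
	 * backward instead.
	 */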
	cmp  %dil, %sil
	jl .Lcopy_backward
	subq $0x20, %rdx
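	/*
	 * Together with the SUBQ above, %rdx is kept 32 below the
	 * number of bytes still to be copied.  MOV and LEA do not
	 * modify the flags, so the JAE at the bottom still sees the
	 * borrow from the SUBQ at the top of the loop: the loop stops
	 * after the last full 32-byte block, and the ADDL below
	 * restores the 0..31 byte remainder for the tail code.
	 */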
.Lcopy_forward_loop:
	subq $0x20,	%rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi),	%r8
	movq 1*8(%rsi),	%r9
	movq 2*8(%rsi),	%r10
	movq 3*8(%rsi),	%r11
	leaq 4*8(%rsi),	%rsi

	movq %r8,	0*8(%rdi)
	movq %r9,	1*8(%rdi)
	movq %r10,	2*8(%rdi)
	movq %r11,	3*8(%rdi)
	leaq 4*8(%rdi),	%rdi
	jae  .Lcopy_forward_loop
	addl $0x20,	%edx
	jmp  .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate the copy position at the tail.
	 */
	addq %rdx,	%rsi
	addq %rdx,	%rdi
	subq $0x20,	%rdx
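	/*
	 * %rsi and %rdi now point one past the end of their buffers
	 * and %rdx is again 32 below the remaining byte count, so the
	 * loop below is the same borrow-terminated 32-byte scheme as
	 * the forward path, just with descending addresses.
	 */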
	/*
	 * At most 3 ALU operations execute in one cycle, so append
	 * NOPs within the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20,	%rdx
	movq -1*8(%rsi),	%r8
	movq -2*8(%rsi),	%r9
	movq -3*8(%rsi),	%r10
	movq -4*8(%rsi),	%r11
	leaq -4*8(%rsi),	%rsi
	movq %r8,		-1*8(%rdi)
	movq %r9,		-2*8(%rdi)
	movq %r10,		-3*8(%rdi)
	movq %r11,		-4*8(%rdi)
	leaq -4*8(%rdi),	%rdi
	jae  .Lcopy_backward_loop

	/*
	 * Calculate the copy position back at the head.
	 */
	addl $0x20,	%edx
	subq %rdx,	%rsi
	subq %rdx,	%rdi
.Lhandle_tail:
	cmpl $16,	%edx
	jb   .Lless_16bytes

	/*
	 * Copy from 16 up to 31 bytes.
	 */
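	/*
	 * Two 8-byte loads from the head and two from the tail; for
	 * counts below 32 the ranges may overlap, which is harmless
	 * because every load happens before any store.  The 8..15 and
	 * 4..7 byte cases below use the same head/tail trick.
	 */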
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi),	%r9
	movq -2*8(%rsi, %rdx),	%r10
	movq -1*8(%rsi, %rdx),	%r11
	movq %r8,	0*8(%rdi)
	movq %r9,	1*8(%rdi)
	movq %r10,	-2*8(%rdi, %rdx)
	movq %r11,	-1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_16bytes:
	cmpl $8,	%edx
	jb   .Lless_8bytes
	/*
	 * Copy from 8 up to 15 bytes.
	 */
	movq 0*8(%rsi),	%r8
	movq -1*8(%rsi, %rdx),	%r9
	movq %r8,	0*8(%rdi)
	movq %r9,	-1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpl $4,	%edx
	jb   .Lless_3bytes

	/*
	 * Copy from 4 up to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
	/*
	 * Copy from 1 up to 3 bytes.
	 */
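	/*
	 * %edx now holds count - 1 and the flags from the SUBL above
	 * survive the MOVZBL (which leaves flags untouched): ZF means
	 * the count was exactly 1, so only the single byte is stored.
	 * For 2 or 3 bytes the first, second and last bytes are
	 * written (for a count of 2 the second and last are the same
	 * byte, which is harmless).
	 */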
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	retq
	CFI_ENDPROC
ENDPROC(memcpy_orig)