xref: /openbmc/linux/arch/x86/lib/memcpy_64.S (revision 7dd65feb)
/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>

#include <asm/cpufeature.h>
#include <asm/dwarf2.h>

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 * rax original destination
 */

/*
 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
 *
 * Calls to this get patched into the kernel image via the
 * alternative instructions framework:
 */
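/*
 * The byte count in %rdx is split into count/8 quadwords (copied with
 * REP MOVSQ) and count%8 trailing bytes (copied with REP MOVSB); %rax
 * keeps the original destination, since the string ops advance %rdi.
 * For example, count = 27 copies three quadwords and then three bytes.
 */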
	ALIGN
memcpy_c:
	CFI_STARTPROC
	movq %rdi, %rax

	movl %edx, %ecx
	shrl $3, %ecx
	andl $7, %edx
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
	CFI_ENDPROC
ENDPROC(memcpy_c)

ENTRY(__memcpy)
ENTRY(memcpy)
	CFI_STARTPROC

	/*
	 * Put the number of full 64-byte blocks into %ecx.
	 * Tail portion is handled at the end:
	 */
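	/*
	 * Example: for count = 200, %ecx = 200 >> 6 = 3, so the loop
	 * below moves 3 * 64 = 192 bytes; the tail code then copies the
	 * remaining 8 bytes as one quadword and zero single bytes.
	 */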
	movq %rdi, %rax
	movl %edx, %ecx
	shrl   $6, %ecx
	jz .Lhandle_tail

	.p2align 4
.Lloop_64:
	/*
	 * We decrement the loop index here - and the zero-flag is
	 * checked at the end of the loop (the instructions in between
	 * do not change the zero flag; the pointer increments use leaq
	 * precisely because it leaves the flags untouched):
	 */
	decl %ecx

	/*
	 * Move in blocks of 4x16 bytes:
	 */
	movq 0*8(%rsi),		%r11
	movq 1*8(%rsi),		%r8
	movq %r11,		0*8(%rdi)
	movq %r8,		1*8(%rdi)

	movq 2*8(%rsi),		%r9
	movq 3*8(%rsi),		%r10
	movq %r9,		2*8(%rdi)
	movq %r10,		3*8(%rdi)

	movq 4*8(%rsi),		%r11
	movq 5*8(%rsi),		%r8
	movq %r11,		4*8(%rdi)
	movq %r8,		5*8(%rdi)

	movq 6*8(%rsi),		%r9
	movq 7*8(%rsi),		%r10
	movq %r9,		6*8(%rdi)
	movq %r10,		7*8(%rdi)

	leaq 64(%rsi), %rsi
	leaq 64(%rdi), %rdi

	jnz  .Lloop_64

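	/*
	 * Copy the remaining full quadwords of the tail,
	 * (count & 63) >> 3 of them, one quadword per iteration:
	 */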
.Lhandle_tail:
	movl %edx, %ecx
	andl  $63, %ecx
	shrl   $3, %ecx
	jz   .Lhandle_7

	.p2align 4
.Lloop_8:
	decl %ecx
	movq (%rsi),		%r8
	movq %r8,		(%rdi)
	leaq 8(%rdi),		%rdi
	leaq 8(%rsi),		%rsi
	jnz  .Lloop_8

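	/*
	 * Copy the last count & 7 bytes one at a time:
	 */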
.Lhandle_7:
	movl %edx, %ecx
	andl $7, %ecx
	jz .Lend

	.p2align 4
.Lloop_1:
	movb (%rsi), %r8b
	movb %r8b, (%rdi)
	incq %rdi
	incq %rsi
	decl %ecx
	jnz .Lloop_1

.Lend:
	ret
	CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)

	/*
	 * Some CPUs run faster using the string copy instructions.
	 * It is also a lot simpler. Use this when possible:
	 */
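	/*
	 * The replacement below is a two-byte short jump from memcpy to
	 * memcpy_c.  Its displacement is relative to the end of the jmp
	 * instruction, hence the (2f - 1b) correction:
	 */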

	.section .altinstr_replacement, "ax"
1:	.byte 0xeb				/* jmp <disp8> */
	.byte (memcpy_c - memcpy) - (2f - 1b)	/* offset */
2:
	.previous

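	/*
	 * One struct alt_instr record, consumed by apply_alternatives():
	 * address of the original code, address of the replacement above,
	 * the CPU feature bit that enables the patch, and the original
	 * and replacement lengths:
	 */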
	.section .altinstructions, "a"
	.align 8
	.quad memcpy				/* original instruction */
	.quad 1b				/* replacement */
	.byte X86_FEATURE_REP_GOOD		/* feature bit required */

	/*
	 * Replace only the beginning: memcpy is used to apply the
	 * alternatives themselves, so it would be silly to overwrite
	 * its own body with NOPs - a reboot would be the only outcome...
	 */
	.byte 2b - 1b				/* length of area to patch */
	.byte 2b - 1b				/* replacement length */
	.previous