xref: /openbmc/linux/tools/arch/x86/lib/memcpy_64.S (revision 1ac731c529cd4d6adbce134754b51ff7d822b145)
/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
#include <asm/export.h>

/* Code below is emitted into .noinstr.text (flags: allocatable, executable). */
.section .noinstr.text, "ax"

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 *
 * The FSRM alternative should be done inline (avoiding the call and
 * the disgusting return handling), but that would require some help
 * from the compiler for better calling conventions.
 *
 * The 'rep movsb' itself is small enough to replace the call, but the
 * two register moves blow up the code. And one of them is "needed"
 * only for the return value, which is the same as the destination
 * input and which the compiler could/should do much better anyway.
 */
SYM_TYPED_FUNC_START(__memcpy)
	/*
	 * Default code: tail-jump to the open-coded copy in memcpy_orig.
	 * NOTE(review): with X86_FEATURE_FSRM the alternatives mechanism is
	 * expected to patch the jmp out (replacement is the empty string),
	 * so execution falls through to the 'rep movsb' below - confirm
	 * against <asm/alternative.h>.
	 */
	ALTERNATIVE "jmp memcpy_orig", "", X86_FEATURE_FSRM

	movq %rdi, %rax		/* return value: the original destination */
	movq %rdx, %rcx		/* 'rep' takes its byte count from rcx */
	rep movsb		/* copy rcx bytes from (%rsi) to (%rdi) */
	RET
SYM_FUNC_END(__memcpy)
EXPORT_SYMBOL(__memcpy)

/* memcpy is provided as an alias of __memcpy and exported under that name too. */
SYM_FUNC_ALIAS(memcpy, __memcpy)
EXPORT_SYMBOL(memcpy)

/*
 * memcpy_orig - open-coded copy, entered from __memcpy via "jmp memcpy_orig".
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 *
 * Writes rcx and r8-r11 as scratch, and modifies rsi, rdi, rdx and the flags.
 *
 * Counts of 32 bytes or more are copied in 32-byte blocks (forward or
 * backward); the remaining 0..31 bytes are handled at .Lhandle_tail.
 */
SYM_FUNC_START_LOCAL(memcpy_orig)
	movq %rdi, %rax		/* return value: the original destination */

	/* Fewer than 32 bytes: no block loop, go straight to the tail code. */
	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * We check whether memory false dependence could occur,
	 * then jump to corresponding copy mode.
	 *
	 * Only the low bytes of the two pointers are compared (signed);
	 * the backward copy is chosen when the source's low byte is the
	 * smaller one.
	 */
	cmp  %dil, %sil
	jl .Lcopy_backward
	/*
	 * Pre-bias the count by one block so that the 'subq' at the top of
	 * the loop sets the carry flag once fewer than 32 bytes will remain
	 * after the current block.
	 */
	subq $0x20, %rdx
.Lcopy_forward_loop:
	/*
	 * This 'subq' produces the flags consumed by the 'jae' at the bottom
	 * of the loop; the movq/leaq instructions in between leave the flags
	 * untouched.
	 */
	subq $0x20,	%rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi),	%r8
	movq 1*8(%rsi),	%r9
	movq 2*8(%rsi),	%r10
	movq 3*8(%rsi),	%r11
	leaq 4*8(%rsi),	%rsi

	movq %r8,	0*8(%rdi)
	movq %r9,	1*8(%rdi)
	movq %r10,	2*8(%rdi)
	movq %r11,	3*8(%rdi)
	leaq 4*8(%rdi),	%rdi
	jae  .Lcopy_forward_loop
	/*
	 * Undo the bias: edx becomes the number of bytes still to copy
	 * (0..31). The 32-bit add also clears the upper half of rdx.
	 */
	addl $0x20,	%edx
	jmp  .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate copy position to tail: advance both pointers to one
	 * past the end of their buffers, then pre-bias the count as in the
	 * forward case.
	 */
	addq %rdx,	%rsi
	addq %rdx,	%rdi
	subq $0x20,	%rdx
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPS in the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	/* Sets the flags for the 'jae' below; movq/leaq do not change them. */
	subq $0x20,	%rdx
	movq -1*8(%rsi),	%r8
	movq -2*8(%rsi),	%r9
	movq -3*8(%rsi),	%r10
	movq -4*8(%rsi),	%r11
	leaq -4*8(%rsi),	%rsi
	movq %r8,		-1*8(%rdi)
	movq %r9,		-2*8(%rdi)
	movq %r10,		-3*8(%rdi)
	movq %r11,		-4*8(%rdi)
	leaq -4*8(%rdi),	%rdi
	jae  .Lcopy_backward_loop

	/*
	 * Calculate copy position to head: undo the bias so that edx holds
	 * the 0..31 bytes still to copy, then move both pointers back to the
	 * start of that not-yet-copied head.
	 */
	addl $0x20,	%edx
	subq %rdx,	%rsi
	subq %rdx,	%rdi
.Lhandle_tail:
	/* Here rdx holds the remaining byte count, 0..31. */
	cmpl $16,	%edx
	jb   .Lless_16bytes

	/*
	 * Move 16 to 31 bytes: copy the first 16 and the last 16 bytes of
	 * the range (the two halves may overlap). All loads are issued
	 * before any store.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi),	%r9
	movq -2*8(%rsi, %rdx),	%r10
	movq -1*8(%rsi, %rdx),	%r11
	movq %r8,	0*8(%rdi)
	movq %r9,	1*8(%rdi)
	movq %r10,	-2*8(%rdi, %rdx)
	movq %r11,	-1*8(%rdi, %rdx)
	RET
	.p2align 4
.Lless_16bytes:
	cmpl $8,	%edx
	jb   .Lless_8bytes
	/*
	 * Move 8 to 15 bytes: first 8 and last 8 bytes of the range.
	 */
	movq 0*8(%rsi),	%r8
	movq -1*8(%rsi, %rdx),	%r9
	movq %r8,	0*8(%rdi)
	movq %r9,	-1*8(%rdi, %rdx)
	RET
	.p2align 4
.Lless_8bytes:
	cmpl $4,	%edx
	jb   .Lless_3bytes

	/*
	 * Move 4 to 7 bytes: first 4 and last 4 bytes of the range.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	RET
	.p2align 4
.Lless_3bytes:
	/*
	 * 0 to 3 bytes remain. After the decrement edx is count - 1; the
	 * subtraction borrows only when the count was zero, in which case
	 * there is nothing to copy.
	 */
	subl $1, %edx
	jb .Lend
	/*
	 * Move 1 to 3 bytes. The load below does not change the flags, so
	 * 'jz' still tests the result of the 'subl': zero means exactly one
	 * byte is left.
	 */
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	/* 2 or 3 bytes: copy the byte at offset 1 and the last byte (offset count - 1). */
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)	/* first byte, loaded above into ecx */

.Lend:
	RET
SYM_FUNC_END(memcpy_orig)
