/*
 * Normally the compiler builtins are used, but sometimes the compiler calls
 * out-of-line code. Based on asm-i386/string.h.
 *
 * This assembly file is rewritten from the memmove_64.c file.
 *	- Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
 */
#define _STRING_C
#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>

#undef memmove

/*
 * Implement memmove(). This can handle overlap between src and dst.
 *
 * Input:
 * rdi: dest
 * rsi: src
 * rdx: count
 *
 * Output:
 * rax: dest
 */
.weak memmove

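/*
 * memmove is declared weak so that an instrumented or otherwise
 * specialized implementation (a KASAN interceptor, for instance) can
 * override it at link time; __memmove remains the real copy routine
 * that such wrappers fall back to.
 */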
ENTRY(memmove)
ENTRY(__memmove)
	CFI_STARTPROC

	/* Handle sizes of 32 bytes and more in a loop */
	mov %rdi, %rax
	cmp $0x20, %rdx
	jb	1f

	/* Decide forward/backward copy mode */
	cmp %rdi, %rsi
	jge .Lmemmove_begin_forward
	mov %rsi, %r8
	add %rdx, %r8
	cmp %rdi, %r8
	jg 2f
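	/*
	 * src < dest at this point.  The jg above sends the
	 * overlapping case (src + count > dest), where a forward copy
	 * would clobber not-yet-read source bytes, to the backward
	 * copy at 2; non-overlapping buffers fall through to the
	 * forward copy.
	 */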

.Lmemmove_begin_forward:
	/*
	 * The movsq instruction has a high startup latency,
	 * so small sizes are handled with general-purpose registers.
	 */
	cmp  $680, %rdx
	jb	3f
	/*
	 * movsq is only a win when source and destination are
	 * mutually aligned.
	 */

	cmpb %dil, %sil
	je 4f
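	/*
	 * Equal low bytes mean the two pointers differ by a multiple
	 * of 256, so src and dest share the same offset within an
	 * 8-byte word and the movsq path at 4 stays aligned.
	 */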
3:
	sub $0x20, %rdx
	/*
	 * We gobble 32 bytes forward in each loop iteration.
	 */
5:
	sub $0x20, %rdx
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r10
	movq 2*8(%rsi), %r9
	movq 3*8(%rsi), %r8
	leaq 4*8(%rsi), %rsi

	movq %r11, 0*8(%rdi)
	movq %r10, 1*8(%rdi)
	movq %r9, 2*8(%rdi)
	movq %r8, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae 5b
	addq $0x20, %rdx
	jmp 1f
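	/*
	 * The count was biased down by 0x20 at 3, so the borrow from
	 * the in-loop sub drives jae: the loop keeps copying 32-byte
	 * chunks while at least 32 bytes would remain afterwards.  The
	 * addq undoes the bias, leaving 0..31 tail bytes in %rdx for
	 * the common code at 1.
	 */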
	/*
	 * Handle data forward by movsq.
	 */
	.p2align 4
4:
	movq %rdx, %rcx
	movq -8(%rsi, %rdx), %r11
	lea -8(%rdi, %rdx), %r10
	shrq $3, %rcx
	rep movsq
	movq %r11, (%r10)
	jmp 13f
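	/*
	 * rep movsq copies only whole quadwords, so count/8 words are
	 * moved and the final 8 bytes of the source, loaded into %r11
	 * before the string move, are stored last at dest+count-8 to
	 * cover the 0..7 remainder bytes with one overlapping store.
	 */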
.Lmemmove_end_forward:

	/*
	 * Handle data backward by movsq.
	 */
	.p2align 4
7:
	movq %rdx, %rcx
	movq (%rsi), %r11
	movq %rdi, %r10
	leaq -8(%rsi, %rdx), %rsi
	leaq -8(%rdi, %rdx), %rdi
	shrq $3, %rcx
	std
	rep movsq
	cld
	movq %r11, (%r10)
	jmp 13f
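	/*
	 * std makes rep movsq walk downwards from the last quadword,
	 * and cld restores the direction flag, which the kernel
	 * requires to be clear.  The first quadword of the source is
	 * saved in %r11 up front and stored last, covering the
	 * count%8 head bytes with one overlapping store.
	 */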

	/*
	 * Prepare for a backward copy.
	 */
	.p2align 4
2:
	cmp $680, %rdx
	jb 6f
	cmp %dil, %sil
	je 7b
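	/*
	 * As in the forward case, the backward movsq path at 7 is
	 * taken only for mutually aligned pointers and sizes of 680
	 * bytes and up; the 680-byte cutoff looks like an empirically
	 * tuned crossover for the string-instruction startup cost.
	 */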
6:
	/*
	 * Advance both pointers to the tails of the buffers.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * We gobble 32 bytes backward in each loop iteration.
	 */
8:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r11
	movq -2*8(%rsi), %r10
	movq -3*8(%rsi), %r9
	movq -4*8(%rsi), %r8
	leaq -4*8(%rsi), %rsi

	movq %r11, -1*8(%rdi)
	movq %r10, -2*8(%rdi)
	movq %r9, -3*8(%rdi)
	movq %r8, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae 8b
	/*
	 * Rewind both pointers back to the heads of the buffers.
	 */
	addq $0x20, %rdx
	subq %rdx, %rsi
	subq %rdx, %rdi
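	/*
	 * All paths converge here with %rdx holding the 0..31
	 * remaining bytes; the tail cases below are shared by the
	 * forward, backward, and small-size paths.
	 */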
1:
	cmpq $16, %rdx
	jb 9f
	/*
	 * Move 16 to 31 bytes.
	 */
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r10
	movq -2*8(%rsi, %rdx), %r9
	movq -1*8(%rsi, %rdx), %r8
	movq %r11, 0*8(%rdi)
	movq %r10, 1*8(%rdi)
	movq %r9, -2*8(%rdi, %rdx)
	movq %r8, -1*8(%rdi, %rdx)
	jmp 13f
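	/*
	 * This case, and the smaller cases below, load from both ends
	 * of the remaining region before storing anything, so the two
	 * halves may overlap in the middle and overlap between src
	 * and dest is handled for free.
	 */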
	.p2align 4
9:
	cmpq $8, %rdx
	jb 10f
	/*
	 * Move 8 to 15 bytes.
	 */
	movq 0*8(%rsi), %r11
	movq -1*8(%rsi, %rdx), %r10
	movq %r11, 0*8(%rdi)
	movq %r10, -1*8(%rdi, %rdx)
	jmp 13f
10:
	cmpq $4, %rdx
	jb 11f
	/*
	 * Move 4 to 7 bytes.
	 */
	movl (%rsi), %r11d
	movl -4(%rsi, %rdx), %r10d
	movl %r11d, (%rdi)
	movl %r10d, -4(%rdi, %rdx)
	jmp 13f
11:
	cmp $2, %rdx
	jb 12f
	/*
	 * Move 2 or 3 bytes.
	 */
	movw (%rsi), %r11w
	movw -2(%rsi, %rdx), %r10w
	movw %r11w, (%rdi)
	movw %r10w, -2(%rdi, %rdx)
	jmp 13f
12:
	cmp $1, %rdx
	jb 13f
	/*
	 * Move a single byte.
	 */
	movb (%rsi), %r11b
	movb %r11b, (%rdi)
13:
	retq
	CFI_ENDPROC

	.section .altinstr_replacement,"ax"
.Lmemmove_begin_forward_efs:
	/* Forward copy with a single rep movsb. */
	movq %rdx, %rcx
	rep movsb
	retq
.Lmemmove_end_forward_efs:
	.previous
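	/*
	 * On CPUs with enhanced rep movsb (X86_FEATURE_ERMS), the
	 * alternatives entry below patches the entire forward path,
	 * .Lmemmove_begin_forward .. .Lmemmove_end_forward, with the
	 * rep movsb stub above at boot.  The backward path is left
	 * untouched: rep movsb only copies in ascending order.
	 */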

	.section .altinstructions,"a"
	altinstruction_entry .Lmemmove_begin_forward,		\
		.Lmemmove_begin_forward_efs,X86_FEATURE_ERMS,	\
		.Lmemmove_end_forward-.Lmemmove_begin_forward,	\
		.Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
	.previous
ENDPROC(__memmove)
ENDPROC(memmove)