/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
#include <asm/export.h>

.section .noinstr.text, "ax"

/*
 * __memcpy / memcpy - copy a block of memory.
 *
 * In:
 *	rdi	destination
 *	rsi	source
 *	rdx	number of bytes to copy
 *
 * Out:
 *	rax	the destination pointer exactly as it was passed in
 *
 * Ideally the FSRM alternative would be emitted inline, which would
 * avoid both the call and the disgusting return handling, but doing so
 * needs help from the compiler in the form of better calling
 * conventions.
 *
 * The 'rep movsb' on its own is small enough to take the place of the
 * call; it is the two register moves that blow up the code. One of the
 * two is "needed" only to produce the return value, which is simply the
 * destination input again - something the compiler could/should handle
 * much better anyway.
 */
SYM_TYPED_FUNC_START(__memcpy)
	/*
	 * The default instruction is a jump to memcpy_orig; the replacement
	 * selected by X86_FEATURE_FSRM is empty, which leaves the
	 * 'rep movsb' sequence below as the path taken.
	 */
	ALTERNATIVE "jmp memcpy_orig", "", X86_FEATURE_FSRM

	movq	%rdi, %rax		/* return value: destination */
	movq	%rdx, %rcx		/* rep count: byte count */
	rep movsb
	RET
SYM_FUNC_END(__memcpy)
EXPORT_SYMBOL(__memcpy)

SYM_FUNC_ALIAS(memcpy, __memcpy)
EXPORT_SYMBOL(memcpy)

/*
 * memcpy_orig - copy using 8-byte register moves, 32 bytes per loop pass.
 *
 * Same register interface as __memcpy above (rdi, rsi, rdx in; rax out).
 * Scratch registers written here: rcx, r8, r9, r10, r11, plus the flags;
 * rsi, rdi and rdx are modified as the copy advances.
 */
SYM_FUNC_START_LOCAL(memcpy_orig)
	movq	%rdi, %rax		/* return value: destination */

	cmpq	$0x20, %rdx
	jb	.Lcopy_tail		/* under 32 bytes: skip the block loops */

	/*
	 * Choose the copy direction so that a possible false memory
	 * dependence is avoided. Only the low byte of each address takes
	 * part in the (signed) comparison: go backward when the low byte
	 * of the source is less than the low byte of the destination.
	 */
	cmpb	%dil, %sil
	jl	.Lbackward

	/*
	 * Forward copy. rdx is biased by -32 up front so that the borrow
	 * out of the subq at the top of the loop tells whether another
	 * full 32-byte block remains after the current one.
	 */
	subq	$0x20, %rdx
.Lforward_32:
	subq	$0x20, %rdx		/* CF consumed by the jae below */

	/*
	 * One block = 4 x 8 bytes. movq and leaq leave the flags alone,
	 * so the result of the subq survives until the jae.
	 */
	movq	0*8(%rsi), %r8
	movq	1*8(%rsi), %r9
	movq	2*8(%rsi), %r10
	movq	3*8(%rsi), %r11
	leaq	4*8(%rsi), %rsi

	movq	%r8, 0*8(%rdi)
	movq	%r9, 1*8(%rdi)
	movq	%r10, 2*8(%rdi)
	movq	%r11, 3*8(%rdi)
	leaq	4*8(%rdi), %rdi
	jae	.Lforward_32

	/* Remove the bias: edx (zero-extended) = bytes left, 0..31. */
	addl	$0x20, %edx
	jmp	.Lcopy_tail

.Lbackward:
	/*
	 * Backward copy: point rsi/rdi just past the end of each buffer
	 * and apply the same -32 bias to rdx as the forward path does.
	 */
	addq	%rdx, %rsi
	addq	%rdx, %rdi
	subq	$0x20, %rdx
	/*
	 * No more than 3 ALU operations go in one cycle, so the padding
	 * NOPs are placed in the same 16-byte chunk as the ones above.
	 */
	.p2align 4
.Lbackward_32:
	subq	$0x20, %rdx		/* CF consumed by the jae below */
	movq	-1*8(%rsi), %r8
	movq	-2*8(%rsi), %r9
	movq	-3*8(%rsi), %r10
	movq	-4*8(%rsi), %r11
	leaq	-4*8(%rsi), %rsi
	movq	%r8, -1*8(%rdi)
	movq	%r9, -2*8(%rdi)
	movq	%r10, -3*8(%rdi)
	movq	%r11, -4*8(%rdi)
	leaq	-4*8(%rdi), %rdi
	jae	.Lbackward_32

	/*
	 * Remove the bias so that edx holds the bytes still to copy, then
	 * step rsi/rdi back to the start of that uncopied head region.
	 */
	addl	$0x20, %edx
	subq	%rdx, %rsi
	subq	%rdx, %rdi

	/*
	 * Tail: rdx is below 32 here. Each size class reads the first and
	 * the last chunk of the region (the two may overlap) into
	 * registers before any store is issued.
	 */
.Lcopy_tail:
	cmpl	$16, %edx
	jb	.Ltail_under_16

	/*
	 * 16..31 bytes: first 16 and last 16.
	 */
	movq	0*8(%rsi), %r8
	movq	1*8(%rsi), %r9
	movq	-2*8(%rsi, %rdx), %r10
	movq	-1*8(%rsi, %rdx), %r11
	movq	%r8, 0*8(%rdi)
	movq	%r9, 1*8(%rdi)
	movq	%r10, -2*8(%rdi, %rdx)
	movq	%r11, -1*8(%rdi, %rdx)
	RET
	.p2align 4
.Ltail_under_16:
	cmpl	$8, %edx
	jb	.Ltail_under_8
	/*
	 * 8..15 bytes: first 8 and last 8.
	 */
	movq	0*8(%rsi), %r8
	movq	-1*8(%rsi, %rdx), %r9
	movq	%r8, 0*8(%rdi)
	movq	%r9, -1*8(%rdi, %rdx)
	RET
	.p2align 4
.Ltail_under_8:
	cmpl	$4, %edx
	jb	.Ltail_under_4

	/*
	 * 4..7 bytes: first 4 and last 4.
	 */
	movl	(%rsi), %ecx
	movl	-4(%rsi, %rdx), %r8d
	movl	%ecx, (%rdi)
	movl	%r8d, -4(%rdi, %rdx)
	RET
	.p2align 4
.Ltail_under_4:
	subl	$1, %edx		/* edx = count - 1 */
	jb	.Ldone			/* borrow: count was 0 */
	/*
	 * 1..3 bytes. The movzbl does not touch the flags, so the jz
	 * still sees the subl result: ZF set means exactly one byte.
	 */
	movzbl	(%rsi), %ecx
	jz	.Lstore_first_byte
	/* 2 or 3 bytes: also move byte 1 and the last byte (index edx). */
	movzbq	1(%rsi), %r8
	movzbq	(%rsi, %rdx), %r9
	movb	%r8b, 1(%rdi)
	movb	%r9b, (%rdi, %rdx)
.Lstore_first_byte:
	movb	%cl, (%rdi)

.Ldone:
	RET
SYM_FUNC_END(memcpy_orig)
