/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
#include <asm/export.h>

.section .noinstr.text, "ax"

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 * rax original destination
 *
 * The FSRM alternative should be done inline (avoiding the call and
 * the disgusting return handling), but that would require some help
 * from the compiler for better calling conventions.
 *
 * The 'rep movsb' itself is small enough to replace the call, but the
 * two register moves blow up the code. And one of them is "needed"
 * only for the return value that is the same as the source input,
 * which the compiler could/should do much better anyway.
 */
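/*
 * Purely illustrative sketch of the inlined FSRM call site the note
 * above asks for, assuming the compiler could be taught the required
 * register constraints; the two moves are the ones that blow up the
 * code, and the first exists only to produce the return value:
 *
 *	movq %rdi, %rax		# return the original destination
 *	movq %rdx, %rcx		# 'rep movsb' takes its count in %rcx
 *	rep movsb
 */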
SYM_TYPED_FUNC_START(__memcpy)
	ALTERNATIVE "jmp memcpy_orig", "", X86_FEATURE_FSRM

	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	RET
SYM_FUNC_END(__memcpy)
EXPORT_SYMBOL(__memcpy)

SYM_FUNC_ALIAS(memcpy, __memcpy)
EXPORT_SYMBOL(memcpy)

SYM_FUNC_START_LOCAL(memcpy_orig)
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * We check whether a memory false dependence could occur, then
	 * jump to the corresponding copy mode. Only the low bytes of the
	 * two pointers are compared; this is a cheap heuristic, not an
	 * exact overlap test.
	 */
	cmp %dil, %sil
	jl .Lcopy_backward
	subq $0x20, %rdx
.Lcopy_forward_loop:
	subq $0x20, %rdx

	/*
	 * Move in blocks of 4x8 bytes. The mov/lea instructions do not
	 * touch the flags, so the jae below still tests the subq at the
	 * top of the loop.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae .Lcopy_forward_loop
	addl $0x20, %edx
	jmp .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPs in the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 */
	addl $0x20, %edx
	subq %rdx, %rsi
	subq %rdx, %rdi
.Lhandle_tail:
	cmpl $16, %edx
	jb .Lless_16bytes

	/*
	 * Move data from 16 bytes to 31 bytes: copy the first and the
	 * last 16 bytes; for counts below 32 the two halves overlap and
	 * the shared bytes are simply written twice.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	RET
	.p2align 4
.Lless_16bytes:
	cmpl $8, %edx
	jb .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	RET
	.p2align 4
.Lless_8bytes:
	cmpl $4, %edx
	jb .Lless_3bytes

	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	RET
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
	/*
	 * Move data from 1 byte to 3 bytes. The subl above doubles as the
	 * length check: a zero count has already left via 'jb', a count of
	 * one leaves ZF set (movzbl does not touch the flags) so 'jz'
	 * stores the single byte, and for two or three bytes the loads of
	 * byte 1 and of the last byte overlap as needed before byte 0 is
	 * stored on the fall-through path.
	 */
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	RET
SYM_FUNC_END(memcpy_orig)