/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>
#include <asm/export.h>

.pushsection .noinstr.text, "ax"

/*
 * We build a jump to memcpy_orig by default which gets NOPped out on
 * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
 * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
 * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
 *
 * Syntax: AT&T (source operand first, destination operand last).
 * The file is run through the C preprocessor (see the #include lines).
 */

.weak memcpy

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 * rax original destination
 *
 * The body below is what runs when the ALTERNATIVE_2 slot has been
 * NOPped out: count / 8 quadwords are copied with REP MOVSQ and the
 * remaining count % 8 bytes with REP MOVSB.
 *
 * Registers written here: rax, rcx, rdx, rsi, rdi and the flags.
 *
 * NOTE(review): nothing in this file sets or clears the direction flag,
 * so the REP MOVS instructions presumably rely on DF being clear on
 * entry - confirm against the calling convention in use.
 */
SYM_FUNC_START_ALIAS(__memcpy)
SYM_FUNC_START_LOCAL(memcpy)
	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memcpy_erms", X86_FEATURE_ERMS

	/* rax = return value (destination as passed in). */
	movq %rdi, %rax
	/* rcx = count / 8: number of whole quadwords to copy. */
	movq %rdx, %rcx
	shrq $3, %rcx
	/* edx = count % 8: bytes left over after the quadword copy. */
	andl $7, %edx
	rep movsq
	/* rcx = leftover byte count (0..7) for the byte copy. */
	movl %edx, %ecx
	rep movsb
	ret
SYM_FUNC_END(memcpy)
SYM_FUNC_END_ALIAS(__memcpy)
EXPORT_SYMBOL(memcpy)
EXPORT_SYMBOL(__memcpy)

/*
 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 * simpler than memcpy. Use memcpy_erms when possible.
 *
 * Same register interface as memcpy: rdi destination, rsi source,
 * rdx count; returns the original destination in rax.
 * The whole copy is a single REP MOVSB with rcx = count.
 */
SYM_FUNC_START_LOCAL(memcpy_erms)
	/* rax = return value (destination as passed in). */
	movq %rdi, %rax
	/* rcx = byte count for REP MOVSB. */
	movq %rdx, %rcx
	rep movsb
	ret
SYM_FUNC_END(memcpy_erms)

/*
 * memcpy_orig() - copy using plain 64-bit loads and stores.
 *
 * Same register interface as memcpy: rdi destination, rsi source,
 * rdx count; returns the original destination in rax.
 * This is the target of the default "jmp memcpy_orig" alternative in
 * memcpy above.
 *
 * Structure:
 *  - count < 32: go straight to .Lhandle_tail.
 *  - count >= 32: copy 32 bytes per iteration, either ascending
 *    (.Lcopy_forward_loop) or descending (.Lcopy_backward_loop), then
 *    fall into .Lhandle_tail with the 0..31 bytes that are left.
 *  - .Lhandle_tail copies 0..31 bytes by size class (16-31, 8-15, 4-7,
 *    1-3, 0). For the 16-31, 8-15 and 4-7 classes the first and the last
 *    N bytes of the range are copied with two (possibly overlapping)
 *    accesses of fixed width, so no byte loop is needed.
 *
 * Registers written here: rax, rcx, rdx, rsi, rdi, r8-r11 and the flags.
 *
 * NOTE(review): returns are spelled "retq" here but "ret" in memcpy and
 * memcpy_erms; consider using one spelling throughout the file.
 */
SYM_FUNC_START_LOCAL(memcpy_orig)
	/* rax = return value (destination as passed in). */
	movq %rdi, %rax

	/* Fewer than 32 bytes: no block loop, only the tail code. */
	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * We check whether memory false dependence could occur,
	 * then jump to corresponding copy mode.
	 *
	 * Only the low 8 bits of the two addresses are compared, and jl is
	 * a signed condition: the backward path is taken when the low byte
	 * of the source is less than the low byte of the destination as
	 * signed 8-bit values.
	 * NOTE(review): a signed condition on address bits looks unusual;
	 * presumably acceptable because this only selects the copy
	 * direction - confirm that the signed form is intended.
	 */
	cmp %dil, %sil
	jl .Lcopy_backward
	/*
	 * Bias the count by -32 so that the subq at the top of the loop
	 * borrows (sets CF) on the iteration that copies the last full
	 * 32-byte block.
	 */
	subq $0x20, %rdx
.Lcopy_forward_loop:
	/* Sets CF for the jae at the bottom of the loop. */
	subq $0x20, %rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	/* lea advances the pointer without changing the flags. */
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	/*
	 * Still tests the flags of the subq above (mov and lea leave them
	 * untouched): loop again while that subtraction did not borrow.
	 */
	jae .Lcopy_forward_loop
	/*
	 * Undo the bias: edx = bytes still to copy (0..31). The 32-bit
	 * write also clears the upper half of rdx.
	 */
	addl $0x20, %edx
	jmp .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 * (rsi and rdi now point one past the end of each buffer.)
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	/* Same -32 bias as on the forward path. */
	subq $0x20, %rdx
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPS in the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	/* Sets CF for the jae at the bottom of the loop. */
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	/* Flags are still those of the subq at the top of the loop. */
	jae .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 * edx = bytes still to copy (0..31); stepping both pointers back
	 * by that amount leaves them at the start of the part that has
	 * not been copied yet.
	 */
	addl $0x20, %edx
	subq %rdx, %rsi
	subq %rdx, %rdi
.Lhandle_tail:
	/* Here rdx = remaining byte count, 0..31. */
	cmpl $16, %edx
	jb .Lless_16bytes

	/*
	 * Move data from 16 bytes to 31 bytes.
	 * First 16 and last 16 bytes of the range; all four loads are
	 * issued before the first store.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_16bytes:
	cmpl $8, %edx
	jb .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes.
	 * First 8 and last 8 bytes of the range.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpl $4, %edx
	jb .Lless_3bytes

	/*
	 * Move data from 4 bytes to 7 bytes.
	 * First 4 and last 4 bytes of the range.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
/* Despite the label name, this path is reached with 0..3 bytes left. */
.Lless_3bytes:
	/*
	 * edx = count - 1. A borrow (CF) means the count was 0, so there
	 * is nothing to copy.
	 */
	subl $1, %edx
	jb .Lend
	/*
	 * Move data from 1 byte to 3 bytes.
	 */
	movzbl (%rsi), %ecx
	/*
	 * ZF is still the result of the subl above (the load does not
	 * change the flags): zero means the count was exactly 1.
	 */
	jz .Lstore_1byte
	/*
	 * Count is 2 or 3: copy byte 1 and the last byte (index rdx, which
	 * is count - 1); for a count of 2 these are the same byte.
	 */
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	/* Byte 0 was loaded into cl before any store was done. */
	movb %cl, (%rdi)

.Lend:
	retq
SYM_FUNC_END(memcpy_orig)

.popsection