/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>
#include <asm/export.h>

/*
 * We build a jump to memcpy_orig by default which gets NOPped out on
 * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
 * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
 * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
 */

.weak memcpy

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 */
ENTRY(__memcpy)
ENTRY(memcpy)
	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memcpy_erms", X86_FEATURE_ERMS

	movq %rdi, %rax
	movq %rdx, %rcx
	shrq $3, %rcx
	andl $7, %edx
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
ENDPROC(memcpy)
ENDPROC(__memcpy)
EXPORT_SYMBOL(memcpy)
EXPORT_SYMBOL(__memcpy)

/*
 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 * simpler than memcpy. Use memcpy_erms when possible.
 */
ENTRY(memcpy_erms)
	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	ret
ENDPROC(memcpy_erms)

ENTRY(memcpy_orig)
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * Check whether a memory false dependence could occur,
	 * then jump to the corresponding copy mode.
	 */
	cmp %dil, %sil
	jl .Lcopy_backward
	subq $0x20, %rdx
.Lcopy_forward_loop:
	subq $0x20, %rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae .Lcopy_forward_loop
	addl $0x20, %edx
	jmp .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPs in the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 */
	addl $0x20, %edx
	subq %rdx, %rsi
	subq %rdx, %rdi
.Lhandle_tail:
	cmpl $16, %edx
	jb .Lless_16bytes

	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_16bytes:
	cmpl $8, %edx
	jb .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpl $4, %edx
	jb .Lless_3bytes
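	/*
	 * As in the 16..31 and 8..15 byte cases above, the 4..7 byte case
	 * below copies the first and the last chunk of the region with a
	 * pair of loads and stores that may overlap in the middle, instead
	 * of looping over single bytes.
	 */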
	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
	/*
	 * Move data from 1 byte to 3 bytes.
	 */
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	retq
ENDPROC(memcpy_orig)

#ifndef CONFIG_UML
/*
 * memcpy_mcsafe_unrolled - memory copy with machine check exception handling
 * Note that we only catch machine checks when reading the source addresses.
 * Writes to target are posted and don't generate machine checks.
 */
ENTRY(memcpy_mcsafe_unrolled)
	cmpl $8, %edx
	/* Less than 8 bytes? Go to byte copy loop */
	jb .L_no_whole_words

	/* Check for bad alignment of source */
	testl $7, %esi
	/* Already aligned */
	jz .L_8byte_aligned

	/* Copy one byte at a time until source is 8-byte aligned */
	movl %esi, %ecx
	andl $7, %ecx
	subl $8, %ecx
	negl %ecx
	subl %ecx, %edx
.L_copy_leading_bytes:
	movb (%rsi), %al
	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_copy_leading_bytes

.L_8byte_aligned:
	/* Figure out how many whole cache lines (64 bytes) to copy */
	movl %edx, %ecx
	andl $63, %edx
	shrl $6, %ecx
	jz .L_no_whole_cache_lines

	/* Loop copying whole cache lines */
.L_cache_w0: movq (%rsi), %r8
.L_cache_w1: movq 1*8(%rsi), %r9
.L_cache_w2: movq 2*8(%rsi), %r10
.L_cache_w3: movq 3*8(%rsi), %r11
	movq %r8, (%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
.L_cache_w4: movq 4*8(%rsi), %r8
.L_cache_w5: movq 5*8(%rsi), %r9
.L_cache_w6: movq 6*8(%rsi), %r10
.L_cache_w7: movq 7*8(%rsi), %r11
	movq %r8, 4*8(%rdi)
	movq %r9, 5*8(%rdi)
	movq %r10, 6*8(%rdi)
	movq %r11, 7*8(%rdi)
	leaq 64(%rsi), %rsi
	leaq 64(%rdi), %rdi
	decl %ecx
	jnz .L_cache_w0

	/* Are there any trailing 8-byte words? */
.L_no_whole_cache_lines:
	movl %edx, %ecx
	andl $7, %edx
	shrl $3, %ecx
	jz .L_no_whole_words

	/* Copy trailing words */
.L_copy_trailing_words:
	movq (%rsi), %r8
	mov %r8, (%rdi)
	leaq 8(%rsi), %rsi
	leaq 8(%rdi), %rdi
	decl %ecx
	jnz .L_copy_trailing_words

	/* Any trailing bytes? */
.L_no_whole_words:
	andl %edx, %edx
	jz .L_done_memcpy_trap

	/* Copy trailing bytes */
	movl %edx, %ecx
.L_copy_trailing_bytes:
	movb (%rsi), %al
	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_copy_trailing_bytes

	/* Copy successful. Return zero */
.L_done_memcpy_trap:
	xorq %rax, %rax
	ret
ENDPROC(memcpy_mcsafe_unrolled)
EXPORT_SYMBOL_GPL(memcpy_mcsafe_unrolled)
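	/*
	 * Every load from the source above carries an exception table
	 * entry (listed below); if one of them hits a machine check, the
	 * .fixup code runs and the caller gets -EFAULT instead of a
	 * fatal error.
	 */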

	.section .fixup, "ax"
	/* Return -EFAULT for any failure */
.L_memcpy_mcsafe_fail:
	mov $-EFAULT, %rax
	ret

	.previous

	_ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w2, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w6, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w7, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_copy_trailing_words, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_copy_trailing_bytes, .L_memcpy_mcsafe_fail)
#endif
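
/*
 * Usage sketch, assuming the usual C prototype
 * int memcpy_mcsafe_unrolled(void *dst, const void *src, size_t cnt):
 *
 *	if (memcpy_mcsafe_unrolled(dst, src, len))
 *		...fall back or report the poisoned source...
 *
 * A return of 0 means the whole copy completed; -EFAULT means a machine
 * check was consumed while reading the source.
 */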