/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>

#include <asm/cpufeature.h>
#include <asm/dwarf2.h>
#include <asm/alternative-asm.h>

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 */

/*
 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
 *
 * This gets patched over the unrolled variant (below) via the
 * alternative instructions framework:
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c:
	movq %rdi, %rax

	movl %edx, %ecx
	shrl $3, %ecx		/* ecx = count / 8 qwords */
	andl $7, %edx		/* edx = trailing byte count */
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
.Lmemcpy_e:
	.previous

/*
 * memcpy_c_e() - enhanced fast string memcpy (ERMS). This is faster and
 * simpler than memcpy_c(). Use memcpy_c_e() when possible.
 *
 * This gets patched over the unrolled variant (below) via the
 * alternative instructions framework:
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c_e:
	movq %rdi, %rax

	movl %edx, %ecx
	rep movsb
	ret
.Lmemcpy_e_e:
	.previous

ENTRY(__memcpy)
ENTRY(memcpy)
	CFI_STARTPROC
	movq %rdi, %rax

	/*
	 * Use a 32-bit CMP here to avoid long NOP padding.
	 */
	cmp $0x20, %edx
	jb .Lhandle_tail

	/*
	 * Check whether a memory false dependence could occur (a cheap
	 * heuristic comparing only the low bytes of the two pointers),
	 * then jump to the corresponding copy mode.
	 */
	cmp %dil, %sil
	jl .Lcopy_backward
	subl $0x20, %edx	/* bias the count so CF flags the last block */
.Lcopy_forward_loop:
	subq $0x20, %rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae .Lcopy_forward_loop
	addq $0x20, %rdx	/* undo the bias */
	jmp .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate the copy position at the tail.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * At most 3 ALU operations can execute in one cycle,
	 * so keep the padding NOPs within the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae .Lcopy_backward_loop

	/*
	 * Calculate the copy position at the head.
	 */
	addq $0x20, %rdx
	subq %rdx, %rsi
	subq %rdx, %rdi
.Lhandle_tail:
	cmpq $16, %rdx
	jb .Lless_16bytes

	/*
	 * Move 16 to 31 bytes of data using overlapping 8-byte moves.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_16bytes:
	cmpq $8, %rdx
	jb .Lless_8bytes
	/*
	 * Move 8 to 15 bytes of data.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpq $4, %rdx
	jb .Lless_3bytes

	/*
	 * Move 4 to 7 bytes of data.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
.Lless_3bytes:
	cmpl $0, %edx
	je .Lend
	/*
	 * Move 1 to 3 bytes of data, one byte at a time.
	 */
.Lloop_1:
	movb (%rsi), %r8b
	movb %r8b, (%rdi)
	incq %rdi
	incq %rsi
	decl %edx
	jnz .Lloop_1

.Lend:
	retq
	CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)

	/*
	 * Some CPUs provide the enhanced REP MOVSB/STOSB (ERMS) feature.
	 * If the feature is supported, memcpy_c_e() is the first choice.
	 * If enhanced REP MOVSB copy is not available, use the fast string
	 * copy memcpy_c() when possible: it is faster and the code is
	 * simpler than the original memcpy().
	 * Otherwise, the original memcpy() is used.
	 * In the .altinstructions section, the ERMS entry is placed after
	 * the REP_GOOD entry to get the right patch order.
	 *
	 * Replace only the beginning: memcpy itself is used while applying
	 * the alternatives, so it would be silly to overwrite it with NOPs -
	 * a reboot would be the only outcome...
	 */
	.section .altinstructions, "a"
	altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
			     .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
	altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
			     .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
	.previous
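
/*
 * Illustration only - not part of the kernel build. Below is a minimal C
 * sketch of the strategy the unrolled memcpy above implements: a 4x8-byte
 * main loop plus overlapping loads and stores, so every tail of 4..31
 * bytes is handled without a byte loop. The name memcpy_sketch() and the
 * load/store helpers are invented for this comment; the backward
 * (false-dependence-avoiding) path and the REP MOVS variants are omitted,
 * and the sketch assumes non-overlapping buffers, just like memcpy().
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	// Unaligned-safe scalar accesses; compilers lower these
 *	// fixed-size memcpy calls to single mov instructions.
 *	static uint64_t load64(const unsigned char *p)
 *	{
 *		uint64_t v; memcpy(&v, p, 8); return v;
 *	}
 *	static void store64(unsigned char *p, uint64_t v)
 *	{
 *		memcpy(p, &v, 8);
 *	}
 *	static uint32_t load32(const unsigned char *p)
 *	{
 *		uint32_t v; memcpy(&v, p, 4); return v;
 *	}
 *	static void store32(unsigned char *p, uint32_t v)
 *	{
 *		memcpy(p, &v, 4);
 *	}
 *
 *	void *memcpy_sketch(void *dst, const void *src, size_t n)
 *	{
 *		unsigned char *d = dst;
 *		const unsigned char *s = src;
 *
 *		// 4x8-byte blocks, as in .Lcopy_forward_loop
 *		while (n >= 0x20) {
 *			uint64_t a = load64(s +  0), b = load64(s +  8);
 *			uint64_t c = load64(s + 16), e = load64(s + 24);
 *			store64(d +  0, a); store64(d +  8, b);
 *			store64(d + 16, c); store64(d + 24, e);
 *			s += 0x20; d += 0x20; n -= 0x20;
 *		}
 *		if (n >= 16) {		// 16..31 bytes, as in .Lhandle_tail
 *			uint64_t a = load64(s), b = load64(s + 8);
 *			uint64_t c = load64(s + n - 16), e = load64(s + n - 8);
 *			store64(d, a); store64(d + 8, b);
 *			store64(d + n - 16, c); store64(d + n - 8, e);
 *		} else if (n >= 8) {	// 8..15 bytes, as in .Lless_16bytes
 *			uint64_t a = load64(s), b = load64(s + n - 8);
 *			store64(d, a); store64(d + n - 8, b);
 *		} else if (n >= 4) {	// 4..7 bytes, as in .Lless_8bytes
 *			uint32_t a = load32(s), b = load32(s + n - 4);
 *			store32(d, a); store32(d + n - 4, b);
 *		} else {		// 0..3 bytes, as in .Lless_3bytes
 *			while (n) {
 *				*d++ = *s++;
 *				n--;
 *			}
 *		}
 *		return dst;
 *	}
 *
 * The overlapping head/tail accesses trade a few redundant bytes of work
 * for the absence of data-dependent branches in each size class.
 */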