/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>

#include <asm/cpufeature.h>
#include <asm/dwarf2.h>
#include <asm/alternative-asm.h>

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 */

/*
 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
 *
 * This gets patched over the unrolled variant (below) via the
 * alternative instructions framework:
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c:
	movq %rdi, %rax
	movq %rdx, %rcx
	shrq $3, %rcx
	andl $7, %edx
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
.Lmemcpy_e:
	.previous

/*
 * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler
 * than memcpy_c. Use memcpy_c_e when possible.
 *
 * This gets patched over the unrolled variant (below) via the
 * alternative instructions framework:
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c_e:
	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	ret
.Lmemcpy_e_e:
	.previous

ENTRY(__memcpy)
ENTRY(memcpy)
	CFI_STARTPROC
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * Compare the low address bytes to check whether a memory false
	 * dependence could occur, then jump to the corresponding copy
	 * direction.
	 */
	cmp %dil, %sil
	jl .Lcopy_backward
	subq $0x20, %rdx
.Lcopy_forward_loop:
	subq $0x20, %rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae .Lcopy_forward_loop		/* flags are still from the subq above */
	addl $0x20, %edx
	jmp .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * At most 3 ALU operations are issued in one cycle,
	 * so the NOPs appended here stay in the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 */
	addl $0x20, %edx
	subq %rdx, %rsi
	subq %rdx, %rdi
.Lhandle_tail:
	cmpl $16, %edx
	jb .Lless_16bytes

	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_16bytes:
	cmpl $8, %edx
	jb .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpl $4, %edx
	jb .Lless_3bytes

	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
	/*
	 * Move data from 1 byte to 3 bytes.
	 */
	movzbl (%rsi), %ecx
	jz .Lstore_1byte		/* ZF is still from the subl above: count was 1 */
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	retq
	CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)
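/*
 * Illustrative sketch, not assembled into the kernel: a rough user-space C
 * equivalent of the 8-to-15 byte tail case above. The helper name
 * copy_tail_8_15 is made up for this example; only the idea mirrors the
 * assembly: load one 8-byte chunk from each end of the range and store
 * both, letting the two accesses overlap when len is not exactly 8.
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void copy_tail_8_15(char *dst, const char *src, size_t len)
 *	{
 *		uint64_t head, tail;
 *
 *		memcpy(&head, src, 8);
 *		memcpy(&tail, src + len - 8, 8);
 *		memcpy(dst, &head, 8);
 *		memcpy(dst + len - 8, &tail, 8);
 *	}
 */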
	/*
	 * Some CPUs provide the enhanced REP MOVSB/STOSB (ERMS) feature.
	 * If that feature is supported, memcpy_c_e() is the first choice.
	 * If enhanced REP MOVSB copy is not available, use the fast string
	 * copy memcpy_c() when possible; it is faster and its code is
	 * simpler than the original memcpy().
	 * Otherwise, the original memcpy() is used.
	 * In the .altinstructions section, the ERMS entry is placed after
	 * the REP_GOOD entry to get the right patch order.
	 *
	 * Replace only the beginning of memcpy: memcpy itself is used while
	 * applying alternatives, so it would be silly to overwrite its tail
	 * with NOPs - a reboot would be the only outcome...
	 */
	.section .altinstructions, "a"
	altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
			     .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
	altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
			     .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
	.previous
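/*
 * Rough sketch of the boot-time selection implied by the two entries above,
 * not the actual alternatives code: boot_cpu_has() is the real kernel
 * feature test, while patch_over() is a made-up stand-in for what
 * apply_alternatives() does. Entries are applied in order, so when both
 * features are present the ERMS variant is the one left in place.
 *
 *	if (boot_cpu_has(X86_FEATURE_REP_GOOD))
 *		patch_over(memcpy, Lmemcpy_c);		(REP MOVSQ variant)
 *	if (boot_cpu_has(X86_FEATURE_ERMS))
 *		patch_over(memcpy, Lmemcpy_c_e);	(REP MOVSB variant)
 */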