/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>

#include <asm/cpufeature.h>
#include <asm/dwarf2.h>

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *	rdi destination
 *	rsi source
 *	rdx count
 *
 * Output:
 *	rax original destination
 */

/*
 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
 *
 * Calls to this get patched into the kernel image via the
 * alternative instructions framework:
 */
	ALIGN
memcpy_c:
	CFI_STARTPROC
	movq %rdi, %rax		/* return the original destination */

	movl %edx, %ecx
	shrl $3, %ecx		/* number of quadwords */
	andl $7, %edx		/* trailing bytes */
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
	CFI_ENDPROC
ENDPROC(memcpy_c)

ENTRY(__memcpy)
ENTRY(memcpy)
	CFI_STARTPROC

	/*
	 * Put the number of full 64-byte blocks into %ecx.
	 * The tail portion is handled at the end:
	 */
	movq %rdi, %rax		/* return the original destination */
	movl %edx, %ecx
	shrl $6, %ecx
	jz .Lhandle_tail

	.p2align 4
.Lloop_64:
	/*
	 * We decrement the loop index here - the zero flag is
	 * checked at the end of the loop (the instructions in between
	 * do not change the zero flag):
	 */
	decl %ecx

	/*
	 * Move in blocks of 4x16 bytes:
	 */
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r8
	movq %r11, 0*8(%rdi)
	movq %r8, 1*8(%rdi)

	movq 2*8(%rsi), %r9
	movq 3*8(%rsi), %r10
	movq %r9, 2*8(%rdi)
	movq %r10, 3*8(%rdi)

	movq 4*8(%rsi), %r11
	movq 5*8(%rsi), %r8
	movq %r11, 4*8(%rdi)
	movq %r8, 5*8(%rdi)

	movq 6*8(%rsi), %r9
	movq 7*8(%rsi), %r10
	movq %r9, 6*8(%rdi)
	movq %r10, 7*8(%rdi)

	leaq 64(%rsi), %rsi
	leaq 64(%rdi), %rdi

	jnz .Lloop_64

.Lhandle_tail:
	movl %edx, %ecx
	andl $63, %ecx		/* bytes left after the 64-byte blocks */
	shrl $3, %ecx		/* number of remaining quadwords */
	jz .Lhandle_7

	.p2align 4
.Lloop_8:
	decl %ecx
	movq (%rsi), %r8
	movq %r8, (%rdi)
	leaq 8(%rdi), %rdi
	leaq 8(%rsi), %rsi
	jnz .Lloop_8

.Lhandle_7:
	movl %edx, %ecx
	andl $7, %ecx		/* remaining single bytes */
	jz .Lend

	.p2align 4
.Lloop_1:
	movb (%rsi), %r8b
	movb %r8b, (%rdi)
	incq %rdi
	incq %rsi
	decl %ecx
	jnz .Lloop_1

.Lend:
	ret
	CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)

	/*
	 * Some CPUs run faster using the string copy instructions.
	 * The string-ops variant is also a lot simpler. Use it when possible:
	 */

	.section .altinstr_replacement, "ax"
1:	.byte 0xeb				/* jmp <disp8> */
	.byte (memcpy_c - memcpy) - (2f - 1b)	/* rel8 offset to memcpy_c */
2:
	.previous

	.section .altinstructions, "a"
	.align 8
	.quad memcpy
	.quad 1b
	.byte X86_FEATURE_REP_GOOD

	/*
	 * Replace only the beginning; memcpy is used to apply alternatives,
	 * so it would be silly to overwrite itself with NOPs - a reboot
	 * would be the only outcome...
	 */
	.byte 2b - 1b
	.byte 2b - 1b
	.previous
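
/*
 * For reference, a rough C-level sketch of how the .altinstructions
 * record above is consumed at boot. The field names and the patching
 * logic are approximations of the era's <asm/alternative.h> and
 * apply_alternatives(), not a verbatim copy; apply_alternatives_sketch()
 * is a hypothetical helper for illustration only:
 *
 *	struct alt_instr {
 *		u8 *instr;		// original site to patch (memcpy)
 *		u8 *replacement;	// replacement bytes (label 1b above)
 *		u8  cpuid;		// feature bit (X86_FEATURE_REP_GOOD)
 *		u8  instrlen;		// bytes to patch at the site (2b - 1b)
 *		u8  replacementlen;	// replacement length (2b - 1b)
 *	};
 *
 *	static void apply_alternatives_sketch(struct alt_instr *a)
 *	{
 *		// If the CPU has the feature, copy the 2-byte
 *		// "jmp memcpy_c" over the start of memcpy; any
 *		// leftover bytes up to instrlen would be NOP-padded
 *		// (here instrlen == replacementlen, so none are).
 *		if (boot_cpu_has(a->cpuid)) {
 *			memcpy(a->instr, a->replacement, a->replacementlen);
 *			add_nops(a->instr + a->replacementlen,
 *				 a->instrlen - a->replacementlen);
 *		}
 *	}
 */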