/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */

#include <linux/linkage.h>
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>

/*
 * Some CPUs run faster using the string copy instructions (sane microcode).
 * It is also a lot simpler. Use this when possible. But, don't use streaming
 * copy unless the CPU indicates X86_FEATURE_REP_GOOD. Could vary the
 * prefetch distance based on SMP/UP.
 */
        ALIGN
ENTRY(copy_page)
        ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD
        movl    $4096/8, %ecx
        rep     movsq
        ret
ENDPROC(copy_page)

ENTRY(copy_page_regs)
        /* Save the callee-saved registers clobbered by the unrolled loop. */
        subq    $2*8, %rsp
        movq    %rbx, (%rsp)
        movq    %r12, 1*8(%rsp)

        /* Copy all but the last 5 cache lines, prefetching ahead. */
        movl    $(4096/64)-5, %ecx
        .p2align 4
.Loop64:
        dec     %rcx
        movq    0x8*0(%rsi), %rax
        movq    0x8*1(%rsi), %rbx
        movq    0x8*2(%rsi), %rdx
        movq    0x8*3(%rsi), %r8
        movq    0x8*4(%rsi), %r9
        movq    0x8*5(%rsi), %r10
        movq    0x8*6(%rsi), %r11
        movq    0x8*7(%rsi), %r12

        prefetcht0 5*64(%rsi)

        movq    %rax, 0x8*0(%rdi)
        movq    %rbx, 0x8*1(%rdi)
        movq    %rdx, 0x8*2(%rdi)
        movq    %r8,  0x8*3(%rdi)
        movq    %r9,  0x8*4(%rdi)
        movq    %r10, 0x8*5(%rdi)
        movq    %r11, 0x8*6(%rdi)
        movq    %r12, 0x8*7(%rdi)

        leaq    64(%rsi), %rsi
        leaq    64(%rdi), %rdi

        jnz     .Loop64

        /* Copy the last 5 cache lines without prefetching past the page. */
        movl    $5, %ecx
        .p2align 4
.Loop2:
        decl    %ecx

        movq    0x8*0(%rsi), %rax
        movq    0x8*1(%rsi), %rbx
        movq    0x8*2(%rsi), %rdx
        movq    0x8*3(%rsi), %r8
        movq    0x8*4(%rsi), %r9
        movq    0x8*5(%rsi), %r10
        movq    0x8*6(%rsi), %r11
        movq    0x8*7(%rsi), %r12

        movq    %rax, 0x8*0(%rdi)
        movq    %rbx, 0x8*1(%rdi)
        movq    %rdx, 0x8*2(%rdi)
        movq    %r8,  0x8*3(%rdi)
        movq    %r9,  0x8*4(%rdi)
        movq    %r10, 0x8*5(%rdi)
        movq    %r11, 0x8*6(%rdi)
        movq    %r12, 0x8*7(%rdi)

        leaq    64(%rdi), %rdi
        leaq    64(%rsi), %rsi
        jnz     .Loop2

        /* Restore the saved registers and return. */
        movq    (%rsp), %rbx
        movq    1*8(%rsp), %r12
        addq    $2*8, %rsp
        ret
ENDPROC(copy_page_regs)
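
/*
 * Reference sketch: a minimal C equivalent of the copy performed by either
 * path above, assuming the usual x86-64 prototype
 *
 *	void copy_page(void *to, void *from);
 *
 * so that, per the System V AMD64 calling convention, %rdi holds the
 * destination page and %rsi the source page.  copy_page_ref is a
 * hypothetical helper used only for illustration; the rep-movsq path moves
 * the same 4096/8 quadwords in a single string instruction, and the
 * register path does it 64 bytes per loop iteration.
 *
 *	static void copy_page_ref(void *to, void *from)
 *	{
 *		unsigned long *d = to;
 *		unsigned long *s = from;
 *		unsigned int i;
 *
 *		for (i = 0; i < 4096 / sizeof(unsigned long); i++)
 *			d[i] = s[i];
 *	}
 */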