/* Copyright 2002 Andi Kleen, SuSE Labs */

#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>

/*
 * ISO C memset - set a memory block to a byte value. This function uses fast
 * string instructions to get better performance than the original function.
 * The code is simpler and shorter than the original function as well.
 *
 * rdi   destination
 * rsi   value (char)
 * rdx   count (bytes)
 *
 * rax   original destination
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemset_c:
	movq %rdi,%r9
	movq %rdx,%rcx
	andl $7,%edx
	shrq $3,%rcx
	/* expand byte value */
	movzbl %sil,%esi
	movabs $0x0101010101010101,%rax
	imulq %rsi,%rax
	rep stosq
	movl %edx,%ecx
	rep stosb
	movq %r9,%rax
	ret
.Lmemset_e:
	.previous

/*
 * ISO C memset - set a memory block to a byte value. This function uses
 * enhanced REP STOSB to override the fast string function.
 * The code is simpler and shorter than the fast string function as well.
 *
 * rdi   destination
 * rsi   value (char)
 * rdx   count (bytes)
 *
 * rax   original destination
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemset_c_e:
	movq %rdi,%r9
	movb %sil,%al
	movq %rdx,%rcx
	rep stosb
	movq %r9,%rax
	ret
.Lmemset_e_e:
	.previous

ENTRY(memset)
ENTRY(__memset)
	CFI_STARTPROC
	movq %rdi,%r10

	/* expand byte value */
	movzbl %sil,%ecx
	movabs $0x0101010101010101,%rax
	imulq %rcx,%rax

	/* align dst */
	movl %edi,%r9d
	andl $7,%r9d
	jnz .Lbad_alignment
	CFI_REMEMBER_STATE
.Lafter_bad_alignment:

	movq %rdx,%rcx
	shrq $6,%rcx
	jz .Lhandle_tail

	.p2align 4
.Lloop_64:
	decq %rcx
	movq %rax,(%rdi)
	movq %rax,8(%rdi)
	movq %rax,16(%rdi)
	movq %rax,24(%rdi)
	movq %rax,32(%rdi)
	movq %rax,40(%rdi)
	movq %rax,48(%rdi)
	movq %rax,56(%rdi)
	leaq 64(%rdi),%rdi
	jnz .Lloop_64

	/* Handle tail in loops. The loops should be faster than
	   hard-to-predict jump tables. */
	.p2align 4
.Lhandle_tail:
	movl %edx,%ecx
	andl $63&(~7),%ecx
	jz .Lhandle_7
	shrl $3,%ecx
	.p2align 4
.Lloop_8:
	decl %ecx
	movq %rax,(%rdi)
	leaq 8(%rdi),%rdi
	jnz .Lloop_8

.Lhandle_7:
	andl $7,%edx
	jz .Lende
	.p2align 4
.Lloop_1:
	decl %edx
	movb %al,(%rdi)
	leaq 1(%rdi),%rdi
	jnz .Lloop_1

.Lende:
	movq %r10,%rax
	ret

	CFI_RESTORE_STATE
.Lbad_alignment:
	cmpq $7,%rdx
	jbe .Lhandle_7
	movq %rax,(%rdi)	/* unaligned store */
	movq $8,%r8
	subq %r9,%r8
	addq %r8,%rdi
	subq %r8,%rdx
	jmp .Lafter_bad_alignment
.Lfinal:
	CFI_ENDPROC
ENDPROC(memset)
ENDPROC(__memset)

	/* Some CPUs support the enhanced REP MOVSB/STOSB feature.
	 * It is recommended to use this when possible.
	 *
	 * If the enhanced REP MOVSB/STOSB feature is not available, use the
	 * fast string instructions.
	 *
	 * If neither is available, use the original memset function.
	 *
	 * In the .altinstructions section, the ERMS entry is placed after the
	 * REP_GOOD entry to implement the right patch order.
	 */
	.section .altinstructions,"a"
	altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
			     .Lfinal-memset,.Lmemset_e-.Lmemset_c
	altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
			     .Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e
	.previous
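
/*
 * Illustrative only, kept inside this comment so it is never assembled:
 * a minimal C sketch of the generic path above, using a hypothetical
 * helper name memset_sketch. It shows how the byte value is replicated
 * across a 64-bit word with the 0x0101010101010101 multiply, stored
 * eight bytes at a time, and finished byte by byte; the unrolled
 * 64-byte loop and the alignment fixup are omitted from the sketch.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void *memset_sketch(void *dst, int c, size_t n)
 *	{
 *		// movzbl %sil,%ecx ; movabs $0x0101010101010101,%rax ; imulq
 *		uint64_t pattern = (uint8_t)c * 0x0101010101010101ULL;
 *		unsigned char *p = dst;
 *
 *		while (n >= 8) {			// .Lloop_8
 *			memcpy(p, &pattern, 8);		// movq %rax,(%rdi)
 *			p += 8;
 *			n -= 8;
 *		}
 *		while (n--)				// .Lloop_1
 *			*p++ = (unsigned char)c;	// movb %al,(%rdi)
 *
 *		return dst;				// rax = original destination
 *	}
 */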