/* Copyright 2002 Andi Kleen, SuSE Labs */

#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>

/*
 * ISO C memset - set a memory block to a byte value. This function uses
 * fast string instructions to get better performance than the original
 * function. The code is simpler and shorter than the original function
 * as well.
 *
 * rdi	destination
 * rsi	value (char)
 * rdx	count (bytes)
 *
 * rax	original destination
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemset_c:
	movq %rdi,%r9
	movl %edx,%r8d
	andl $7,%r8d		/* r8 = count mod 8 (trailing bytes) */
	movl %edx,%ecx
	shrl $3,%ecx		/* ecx = count / 8 (qwords) */
	/* expand byte value */
	movzbl %sil,%esi
	movabs $0x0101010101010101,%rax
	mulq %rsi		/* with rax, clobbers rdx */
	rep stosq
	movl %r8d,%ecx
	rep stosb
	movq %r9,%rax
	ret
.Lmemset_e:
	.previous

/*
 * ISO C memset - set a memory block to a byte value. This function uses
 * enhanced rep stosb to override the fast string function.
 * The code is simpler and shorter than the fast string function as well.
 *
 * rdi	destination
 * rsi	value (char)
 * rdx	count (bytes)
 *
 * rax	original destination
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemset_c_e:
	movq %rdi,%r9
	movb %sil,%al
	movl %edx,%ecx
	rep stosb
	movq %r9,%rax
	ret
.Lmemset_e_e:
	.previous

ENTRY(memset)
ENTRY(__memset)
	CFI_STARTPROC
	movq %rdi,%r10
	movq %rdx,%r11

	/* expand byte value */
	movzbl %sil,%ecx
	movabs $0x0101010101010101,%rax
	mul %rcx		/* with rax, clobbers rdx */

	/* align dst */
	movl %edi,%r9d
	andl $7,%r9d
	jnz .Lbad_alignment
	CFI_REMEMBER_STATE
.Lafter_bad_alignment:

	movl %r11d,%ecx
	shrl $6,%ecx		/* ecx = number of 64-byte chunks */
	jz .Lhandle_tail

	.p2align 4
.Lloop_64:			/* store 64 bytes per iteration */
	decl %ecx
	movq %rax,(%rdi)
	movq %rax,8(%rdi)
	movq %rax,16(%rdi)
	movq %rax,24(%rdi)
	movq %rax,32(%rdi)
	movq %rax,40(%rdi)
	movq %rax,48(%rdi)
	movq %rax,56(%rdi)
	leaq 64(%rdi),%rdi
	jnz .Lloop_64

	/* Handle tail in loops. The loops should be faster than
	   hard-to-predict jump tables. */
	.p2align 4
.Lhandle_tail:
	movl %r11d,%ecx
	andl $63&(~7),%ecx	/* tail bytes, rounded down to a multiple of 8 */
	jz .Lhandle_7
	shrl $3,%ecx
	.p2align 4
.Lloop_8:
	decl %ecx
	movq %rax,(%rdi)
	leaq 8(%rdi),%rdi
	jnz .Lloop_8

.Lhandle_7:			/* set the last count mod 8 bytes */
	movl %r11d,%ecx
	andl $7,%ecx
	jz .Lende
	.p2align 4
.Lloop_1:
	decl %ecx
	movb %al,(%rdi)
	leaq 1(%rdi),%rdi
	jnz .Lloop_1

.Lende:
	movq %r10,%rax
	ret

	CFI_RESTORE_STATE
.Lbad_alignment:
	cmpq $7,%r11
	jbe .Lhandle_7		/* short enough for the byte loop alone */
	movq %rax,(%rdi)	/* unaligned store */
	movq $8,%r8
	subq %r9,%r8		/* r8 = bytes needed to reach alignment */
	addq %r8,%rdi
	subq %r8,%r11
	jmp .Lafter_bad_alignment
.Lfinal:
	CFI_ENDPROC
ENDPROC(memset)
ENDPROC(__memset)

	/* Some CPUs support enhanced REP MOVSB/STOSB feature.
	 * It is recommended to use this when possible.
	 *
	 * If the enhanced REP MOVSB/STOSB feature is not available, use the
	 * fast string instructions.
	 *
	 * Otherwise, use the original memset function.
	 *
	 * In the .altinstructions section, the ERMS feature is placed after
	 * the REP_GOOD feature to implement the right patch order.
	 */
	.section .altinstructions,"a"
	altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
			     .Lfinal-memset,.Lmemset_e-.Lmemset_c
	altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
			     .Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e
	.previous
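
/*
 * Worked example of the byte-expansion multiply used by both variants
 * above. This is an illustration inside a comment only; 0xAB is an
 * arbitrary example value, not anything the kernel assembles or runs:
 *
 *	movzbl %sil,%esi	# esi = 0x000000AB
 *	movabs $0x0101010101010101,%rax
 *	mulq   %rsi		# rax = 0xAB * 0x0101010101010101
 *				#     = 0xABABABABABABABAB, rdx = 0
 *
 * The multiplier has a 0x01 in every byte lane, so the product is the
 * sum of eight non-overlapping shifted copies of the value byte: the
 * byte is broadcast into all eight lanes of %rax, and each rep stosq or
 * movq then stores eight copies of it at once.
 */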