/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright 2002 Andi Kleen, SuSE Labs */

#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>
#include <asm/export.h>

.weak memset

/*
 * ISO C memset - set a memory block to a byte value. This function uses fast
 * string operations to get better performance than the original function. The
 * code is simpler and shorter than the original function as well.
 *
 * rdi	destination
 * rsi	value (char)
 * rdx	count (bytes)
 *
 * rax	original destination
 */
ENTRY(memset)
ENTRY(__memset)
	/*
	 * Some CPUs support the enhanced REP MOVSB/STOSB (ERMS) feature and
	 * it is preferred when available. On CPUs without ERMS but with
	 * X86_FEATURE_REP_GOOD, fall through to the fast-string REP STOSQ
	 * code below.
	 *
	 * Otherwise, jump to the original memset function.
	 */
	ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memset_erms", X86_FEATURE_ERMS

	movq %rdi,%r9
	movq %rdx,%rcx
	andl $7,%edx
	shrq $3,%rcx
	/* expand byte value */
	movzbl %sil,%esi
	movabs $0x0101010101010101,%rax
	imulq %rsi,%rax
	rep stosq			/* store count/8 qwords */
	movl %edx,%ecx
	rep stosb			/* store the remaining 0..7 bytes */
	movq %r9,%rax
	ret
ENDPROC(memset)
ENDPROC(__memset)
EXPORT_SYMBOL(memset)
EXPORT_SYMBOL(__memset)

/*
 * ISO C memset - set a memory block to a byte value. This function uses
 * enhanced rep stosb to override the fast string function.
 * The code is simpler and shorter than the fast string function as well.
 *
 * rdi	destination
 * rsi	value (char)
 * rdx	count (bytes)
 *
 * rax	original destination
 */
ENTRY(memset_erms)
	movq %rdi,%r9
	movb %sil,%al
	movq %rdx,%rcx
	rep stosb
	movq %r9,%rax
	ret
ENDPROC(memset_erms)

ENTRY(memset_orig)
	movq %rdi,%r10

	/* expand byte value */
	movzbl %sil,%ecx
	movabs $0x0101010101010101,%rax
	imulq %rcx,%rax

	/* align dst */
	movl %edi,%r9d
	andl $7,%r9d
	jnz .Lbad_alignment
.Lafter_bad_alignment:

	movq %rdx,%rcx
	shrq $6,%rcx
	jz .Lhandle_tail

	.p2align 4
.Lloop_64:
	decq %rcx
	movq %rax,(%rdi)
	movq %rax,8(%rdi)
	movq %rax,16(%rdi)
	movq %rax,24(%rdi)
	movq %rax,32(%rdi)
	movq %rax,40(%rdi)
	movq %rax,48(%rdi)
	movq %rax,56(%rdi)
	leaq 64(%rdi),%rdi
	jnz .Lloop_64

	/*
	 * Handle the tail in loops. The loops should be faster than
	 * hard-to-predict jump tables.
	 */
	.p2align 4
.Lhandle_tail:
	movl %edx,%ecx
	andl $63&(~7),%ecx
	jz .Lhandle_7
	shrl $3,%ecx
	.p2align 4
.Lloop_8:
	decl %ecx
	movq %rax,(%rdi)
	leaq 8(%rdi),%rdi
	jnz .Lloop_8

.Lhandle_7:
	andl $7,%edx
	jz .Lende
	.p2align 4
.Lloop_1:
	decl %edx
	movb %al,(%rdi)
	leaq 1(%rdi),%rdi
	jnz .Lloop_1

.Lende:
	movq %r10,%rax
	ret

.Lbad_alignment:
	cmpq $7,%rdx
	jbe .Lhandle_7
	movq %rax,(%rdi)	/* unaligned store */
	movq $8,%r8
	subq %r9,%r8
	addq %r8,%rdi
	subq %r8,%rdx
	jmp .Lafter_bad_alignment
.Lfinal:
ENDPROC(memset_orig)
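
/*
 * Illustrative sketch only, kept as a comment so nothing new is assembled.
 *
 * Byte-expansion trick used by both paths above: multiplying the
 * zero-extended fill byte by 0x0101010101010101 replicates it into every
 * byte lane of %rax, so each qword store (REP STOSQ or MOVQ) writes eight
 * copies of the byte at once. For example, with a fill value of 0xab:
 *
 *	0x00000000000000ab * 0x0101010101010101 = 0xabababababababab
 *
 * The remaining count & 7 bytes are then stored one at a time (REP STOSB in
 * the fast-string path, .Lloop_1 in memset_orig).
 *
 * Example caller setup, assuming the standard SysV AMD64 calling convention
 * and a hypothetical local buffer symbol "buf"; this is not part of this
 * file's code:
 *
 *	leaq	buf(%rip), %rdi		# rdi = destination
 *	xorl	%esi, %esi		# rsi = fill value (0)
 *	movl	$64, %edx		# rdx = byte count
 *	call	memset			# rax = original destination on return
 */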