/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by Linaro
 * and can be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

/*
 * Fill in the buffer with character c (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - buf
 *	x1 - c
 *	x2 - n
 * Returns:
 *	x0 - buf
 */

dstin		.req	x0
val		.req	w1
count		.req	x2
tmp1		.req	x3
tmp1w		.req	w3
tmp2		.req	x4
tmp2w		.req	w4
zva_len_x	.req	x5
zva_len		.req	w5
zva_bits_x	.req	x6

A_l		.req	x7
A_lw		.req	w7
dst		.req	x8
tmp3w		.req	w9
tmp3		.req	x9

SYM_FUNC_START(__pi_memset)
	mov	dst, dstin	/* Preserve return value. */
	and	A_lw, val, #255
	orr	A_lw, A_lw, A_lw, lsl #8
	orr	A_lw, A_lw, A_lw, lsl #16
	orr	A_l, A_l, A_l, lsl #32

	cmp	count, #15
	b.hi	.Lover16_proc
	/* All of these stores may be unaligned. */
	tbz	count, #3, 1f
	str	A_l, [dst], #8
1:
	tbz	count, #2, 2f
	str	A_lw, [dst], #4
2:
	tbz	count, #1, 3f
	strh	A_lw, [dst], #2
3:
	tbz	count, #0, 4f
	strb	A_lw, [dst]
4:
	ret

.Lover16_proc:
	/* Check whether the start address is 16-byte aligned. */
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	.Laligned
/*
 * The count is at least 16, so we can use stp to store the first 16 bytes
 * and then advance dst to the next 16-byte boundary, leaving the current
 * memory address on an alignment boundary.
 */
	stp	A_l, A_l, [dst]		/* Unaligned store. */
	/* Make dst 16-byte aligned. */
	sub	count, count, tmp2
	add	dst, dst, tmp2

.Laligned:
	cbz	A_l, .Lzero_mem

.Ltail_maybe_long:
	cmp	count, #64
	b.ge	.Lnot_short
.Ltail63:
	ands	tmp1, count, #0x30
	b.eq	3f
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	stp	A_l, A_l, [dst], #16
1:
	stp	A_l, A_l, [dst], #16
2:
	stp	A_l, A_l, [dst], #16
/*
 * The remaining length is less than 16; use stp to write the last 16 bytes.
 * Some bytes are written twice and the access may be unaligned.
 */
3:
	ands	count, count, #15
	cbz	count, 4f
	add	dst, dst, count
	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */
4:
	ret

	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line, this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lnot_short:
	sub	dst, dst, #16	/* Pre-bias. */
	sub	count, count, #64
1:
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	stp	A_l, A_l, [dst, #48]
	stp	A_l, A_l, [dst, #64]!
	subs	count, count, #64
	b.ge	1b
	tst	count, #0x3f
	add	dst, dst, #16
	b.ne	.Ltail63
.Lexitfunc:
	ret

	/*
	 * For zeroing memory, check to see if we can use the ZVA feature to
	 * zero entire 'cache' lines.
	 */
.Lzero_mem:
	cmp	count, #63
	b.le	.Ltail63
	/*
	 * For zeroing small amounts of memory, it's not worth setting up
	 * the line-clear code.
	 */
	cmp	count, #128
	b.lt	.Lnot_short	/* Below this point, count is at least 128 bytes. */

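	/*
	 * DCZID_EL0 (per the Arm ARM): bit 4 (DZP) set means DC ZVA is
	 * prohibited, and bits [3:0] (BS) hold log2 of the ZVA block size in
	 * 4-byte words, so the block size in bytes is 4 << BS, which is what
	 * the shift below computes into zva_len.
	 */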
	mrs	tmp1, dczid_el0
	tbnz	tmp1, #4, .Lnot_short
	mov	tmp3w, #4
	and	zva_len, tmp1w, #15	/* Safety: other bits reserved. */
	lsl	zva_len, tmp3w, zva_len

	ands	tmp3w, zva_len, #63
	/*
	 * Ensure zva_len is not less than 64: using ZVA is not worthwhile
	 * if the block size is less than 64 bytes.
	 */
	b.ne	.Lnot_short
.Lzero_by_line:
	/*
	 * Compute how far we need to go to become suitably aligned. We're
	 * already at quad-word alignment.
	 */
	cmp	count, zva_len_x
	b.lt	.Lnot_short		/* Not enough to reach alignment. */
	sub	zva_bits_x, zva_len_x, #1
	neg	tmp2, dst
	ands	tmp2, tmp2, zva_bits_x
	b.eq	2f			/* Already aligned. */
	/* Not aligned, check that there's enough to copy after alignment. */
	sub	tmp1, count, tmp2
	/*
	 * Guarantee that the remaining length is at least 64 bytes and at
	 * least one ZVA block, so that the alignment stores before 2f cannot
	 * run past the end of the buffer.
	 */
	cmp	tmp1, #64
	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 */
	b.lt	.Lnot_short
	/*
	 * We know that there's at least 64 bytes to zero and that it's safe
	 * to overrun by 64 bytes.
	 */
	mov	count, tmp1
1:
	stp	A_l, A_l, [dst]
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	subs	tmp2, tmp2, #64
	stp	A_l, A_l, [dst, #48]
	add	dst, dst, #64
	b.ge	1b
	/* We've overrun a bit, so adjust dst downwards. */
	add	dst, dst, tmp2
2:
	sub	count, count, zva_len_x
3:
	dc	zva, dst
	add	dst, dst, zva_len_x
	subs	count, count, zva_len_x
	b.ge	3b
	ands	count, count, zva_bits_x
	b.ne	.Ltail_maybe_long
	ret
SYM_FUNC_END(__pi_memset)

SYM_FUNC_ALIAS(__memset, __pi_memset)
EXPORT_SYMBOL(__memset)

SYM_FUNC_ALIAS_WEAK(memset, __pi_memset)
EXPORT_SYMBOL(memset)
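
/*
 * Note: the __pi_ prefix marks the position-independent entry point, callable
 * from code that runs before the kernel has been relocated; __memset is the
 * plain, uninstrumented alias, and memset is deliberately a weak alias so
 * that an instrumented C implementation (e.g. the KASAN wrappers) can
 * override it.
 */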