/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by Linaro
 * and re-licensed under GPLv2 for the Linux kernel. The original code can
 * be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

/*
 * Fill in the buffer with character c (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - buf
 *	x1 - c
 *	x2 - n
 * Returns:
 *	x0 - buf
 */

dstin		.req	x0
val		.req	w1
count		.req	x2
tmp1		.req	x3
tmp1w		.req	w3
tmp2		.req	x4
tmp2w		.req	w4
zva_len_x	.req	x5
zva_len		.req	w5
zva_bits_x	.req	x6

A_l		.req	x7
A_lw		.req	w7
dst		.req	x8
tmp3w		.req	w9
tmp3		.req	x9

ENTRY(memset)
	mov	dst, dstin	/* Preserve return value. */
	and	A_lw, val, #255
	orr	A_lw, A_lw, A_lw, lsl #8
	orr	A_lw, A_lw, A_lw, lsl #16
	orr	A_l, A_l, A_l, lsl #32

	cmp	count, #15
	b.hi	.Lover16_proc
	/* All of the following stores may be unaligned. */
	tbz	count, #3, 1f
	str	A_l, [dst], #8
1:
	tbz	count, #2, 2f
	str	A_lw, [dst], #4
2:
	tbz	count, #1, 3f
	strh	A_lw, [dst], #2
3:
	tbz	count, #0, 4f
	strb	A_lw, [dst]
4:
	ret

.Lover16_proc:
	/* Check whether the start address is 16-byte aligned. */
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	.Laligned
/*
 * The count is at least 16, so we can use stp to store the first 16 bytes,
 * then advance dst to the next 16-byte boundary.
 */
	stp	A_l, A_l, [dst]		/* Unaligned store. */
	/* Make dst 16-byte aligned. */
	sub	count, count, tmp2
	add	dst, dst, tmp2

.Laligned:
	cbz	A_l, .Lzero_mem

.Ltail_maybe_long:
	cmp	count, #64
	b.ge	.Lnot_short
.Ltail63:
	ands	tmp1, count, #0x30
	b.eq	3f
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	stp	A_l, A_l, [dst], #16
1:
	stp	A_l, A_l, [dst], #16
2:
	stp	A_l, A_l, [dst], #16
/*
 * Fewer than 16 bytes remain: use an unaligned stp to write the last
 * 16 bytes, rewriting some bytes that were already set.
 */
3:
	ands	count, count, #15
	cbz	count, 4f
	add	dst, dst, count
	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */
4:
	ret

	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line, this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lnot_short:
	sub	dst, dst, #16	/* Pre-bias. */
	sub	count, count, #64
1:
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	stp	A_l, A_l, [dst, #48]
	stp	A_l, A_l, [dst, #64]!
	subs	count, count, #64
	b.ge	1b
	tst	count, #0x3f
	add	dst, dst, #16
	b.ne	.Ltail63
.Lexitfunc:
	ret

	/*
	 * For zeroing memory, check to see if we can use the ZVA feature to
	 * zero entire 'cache' lines.
	 */
.Lzero_mem:
	cmp	count, #63
	b.le	.Ltail63
	/*
	 * For zeroing small amounts of memory, it's not worth setting up
	 * the line-clear code.
	 */
	cmp	count, #128
	b.lt	.Lnot_short	/* Fall through only when count >= 128 bytes. */

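	/*
	 * DCZID_EL0 (architectural, noted here for clarity): bit 4 (DZP) is
	 * set when DC ZVA is prohibited, and bits [3:0] (BS) give log2 of
	 * the ZVA block size in words, so the block size in bytes is
	 * 4 << BS, as computed below.
	 */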
	mrs	tmp1, dczid_el0
	tbnz	tmp1, #4, .Lnot_short
	mov	tmp3w, #4
	and	zva_len, tmp1w, #15	/* Safety: other bits reserved. */
	lsl	zva_len, tmp3w, zva_len

	ands	tmp3w, zva_len, #63
	/*
	 * Ensure the ZVA block size is at least 64 bytes; it is not
	 * worthwhile to use DC ZVA for smaller blocks.
	 */
	b.ne	.Lnot_short
.Lzero_by_line:
	/*
	 * Compute how far we need to go to become suitably aligned. We're
	 * already at quad-word alignment.
	 */
	cmp	count, zva_len_x
	b.lt	.Lnot_short		/* Not enough to reach alignment. */
	sub	zva_bits_x, zva_len_x, #1
	neg	tmp2, dst
	ands	tmp2, tmp2, zva_bits_x
	b.eq	2f			/* Already aligned. */
	/* Not aligned, check that there's enough to copy after alignment. */
	sub	tmp1, count, tmp2
	/*
	 * Guarantee that the length remaining after alignment is at least
	 * 64 bytes and at least one ZVA block, so the DC ZVA loop at 2f
	 * cannot run past the end of the buffer.
	 */
	cmp	tmp1, #64
	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 */
	b.lt	.Lnot_short
	/*
	 * We know that there's at least 64 bytes to zero and that it's safe
	 * to overrun by 64 bytes.
	 */
	mov	count, tmp1
1:
	stp	A_l, A_l, [dst]
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	subs	tmp2, tmp2, #64
	stp	A_l, A_l, [dst, #48]
	add	dst, dst, #64
	b.ge	1b
	/* We've overrun a bit, so adjust dst downwards. */
	add	dst, dst, tmp2
2:
	sub	count, count, zva_len_x
3:
	dc	zva, dst
	add	dst, dst, zva_len_x
	subs	count, count, zva_len_x
	b.ge	3b
	ands	count, count, zva_bits_x
	b.ne	.Ltail_maybe_long
	ret
ENDPROC(memset)