/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2013-2021, Arm Limited.
 *
 * Adapted from the original at:
 * https://github.com/ARM-software/optimized-routines/blob/98e4d6a5c13c8e54/string/aarch64/strlen.S
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
 */

#define L(label) .L ## label

/* Arguments and results.  */
#define srcin		x0
#define len		x0

/* Locals and temporaries.  */
#define src		x1
#define data1		x2
#define data2		x3
#define has_nul1	x4
#define has_nul2	x5
#define tmp1		x4
#define tmp2		x5
#define tmp3		x6
#define tmp4		x7
#define zeroones	x8

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word.  A faster check
	   (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
	   false hits for characters 129..255.  */

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080

#define MIN_PAGE_SIZE 4096

	/* Since strings are short on average, we check the first 16 bytes
	   of the string for a NUL character.  In order to do an unaligned
	   ldp safely we have to do a page cross check first.  If there is
	   a NUL byte, we calculate the length from the two 8-byte words
	   using conditional select to reduce branch mispredictions (it is
	   unlikely strlen will be repeatedly called on strings with the
	   same length).

	   If the string is longer than 16 bytes, we align src so we don't
	   need further page cross checks, and process 32 bytes per
	   iteration using the fast NUL check.  If we encounter non-ASCII
	   characters, fall back to a second loop using the full NUL check.

	   If the page cross check fails, we read 16 bytes from an aligned
	   address, remove any characters before the string, and continue
	   in the main loop using aligned loads.  Since strings crossing a
	   page in the first 16 bytes are rare (probability of
	   16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be
	   optimized.

	   AArch64 systems have a minimum page size of 4k.  We don't bother
	   checking for larger page sizes - the cost of setting up the
	   correct page size is just not worth the extra gain from a small
	   reduction in the cases taking the slow path.  Note that we only
	   care about whether the first fetch, which may be misaligned,
	   crosses a page boundary.  */

SYM_FUNC_START_WEAK_PI(strlen)
	and	tmp1, srcin, MIN_PAGE_SIZE - 1
	mov	zeroones, REP8_01
	cmp	tmp1, MIN_PAGE_SIZE - 16
	b.gt	L(page_cross)
	ldp	data1, data2, [srcin]
#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul1/2 directly.
	   Since we expect strings to be small and early-exit,
	   byte-swap the data now so has_nul1/2 will be correct.  */
	rev	data1, data1
	rev	data2, data2
#endif
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	beq	L(main_loop_entry)

	/* Enter with C = has_nul1 == 0.  */
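	/* A NUL was found in the first 16 bytes.  The lowest 0x80 marker
	   in the chosen syndrome sits at the first NUL byte: select the
	   word that holds it (data1 when C is clear), byte-reverse the
	   syndrome and count leading zeros, so clz >> 3 is the byte index
	   within that word, plus 8 if the NUL was in data2.  */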
	csel	has_nul1, has_nul1, has_nul2, cc
	mov	len, 8
	rev	has_nul1, has_nul1
	clz	tmp1, has_nul1
	csel	len, xzr, len, cc
	add	len, len, tmp1, lsr 3
	ret

	/* The inner loop processes 32 bytes per iteration and uses the
	   fast NUL check.  If we encounter non-ASCII characters, use a
	   second loop with the accurate NUL check.  */
	.p2align 4
L(main_loop_entry):
	bic	src, srcin, 15
	sub	src, src, 16
L(main_loop):
	ldp	data1, data2, [src, 32]!
L(page_cross_entry):
	sub	tmp1, data1, zeroones
	sub	tmp3, data2, zeroones
	orr	tmp2, tmp1, tmp3
	tst	tmp2, zeroones, lsl 7
	bne	1f
	ldp	data1, data2, [src, 16]
	sub	tmp1, data1, zeroones
	sub	tmp3, data2, zeroones
	orr	tmp2, tmp1, tmp3
	tst	tmp2, zeroones, lsl 7
	beq	L(main_loop)
	add	src, src, 16
1:
	/* The fast check failed, so do the slower, accurate NUL check.  */
	orr	tmp2, data1, REP8_7f
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	beq	L(nonascii_loop)

	/* Enter with C = has_nul1 == 0.  */
L(tail):
#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul1/2 directly.  The
	   easiest way to get the correct byte is to byte-swap the data
	   and calculate the syndrome a second time.  */
	csel	data1, data1, data2, cc
	rev	data1, data1
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	bic	has_nul1, tmp1, tmp2
#else
	csel	has_nul1, has_nul1, has_nul2, cc
#endif
	sub	len, src, srcin
	rev	has_nul1, has_nul1
	add	tmp2, len, 8
	clz	tmp1, has_nul1
	csel	len, len, tmp2, cc
	add	len, len, tmp1, lsr 3
	ret

L(nonascii_loop):
	ldp	data1, data2, [src, 16]!
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	bne	L(tail)
	ldp	data1, data2, [src, 16]!
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	beq	L(nonascii_loop)
	b	L(tail)

	/* Load 16 bytes from [srcin & ~15] and force the bytes that
	   precede srcin to 0x7f, so we ignore any NUL bytes before the
	   string.  Then continue in the aligned loop.  */
L(page_cross):
	bic	src, srcin, 15
	ldp	data1, data2, [src]
	lsl	tmp1, srcin, 3
	mov	tmp4, -1
#ifdef __AARCH64EB__
	/* Big-endian.  Early bytes are at MSB.  */
	lsr	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
#else
	/* Little-endian.  Early bytes are at LSB.  */
	lsl	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
#endif
	orr	tmp1, tmp1, REP8_80
	orn	data1, data1, tmp1
	orn	tmp2, data2, tmp1
	tst	srcin, 8
	csel	data1, data1, tmp4, eq
	csel	data2, data2, tmp2, eq
	b	L(page_cross_entry)

SYM_FUNC_END_PI(strlen)
EXPORT_SYMBOL_NOKASAN(strlen)
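
/* Reference only, not part of the build: a rough C sketch of the two
   byte-wise NUL checks used above, written against the REP8_* values
   defined at the top of this file.  The helper names are illustrative
   and do not exist elsewhere in the kernel.

	#include <stdint.h>

	// Non-zero iff the 64-bit word x contains a 0x00 byte; the
	// lowest 0x80 marker in the result marks the first NUL byte.
	static inline uint64_t has_nul_accurate(uint64_t x)
	{
		return (x - 0x0101010101010101ULL) &
		       ~(x | 0x7f7f7f7f7f7f7f7fULL);
	}

	// Cheaper check used in the main loop: zero when every byte is
	// ASCII 0x01..0x7f, but it may also fire for bytes 0x81..0xff,
	// so a hit is re-checked with the accurate form before the
	// length is computed.
	static inline uint64_t has_nul_fast(uint64_t x)
	{
		return (x - 0x0101010101010101ULL) & 0x8080808080808080ULL;
	}
 */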