/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by Linaro
 * and re-licensed under GPLv2 for the Linux kernel. The original code can
 * be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 * Copy a buffer from src to dest (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
dst	.req	x6

A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

	mov	dst, dstin
	cmp	count, #16
	/* When the length is less than 16, the accesses are not aligned. */
	b.lo	.Ltiny15

	neg	tmp2, src
	ands	tmp2, tmp2, #15		/* Bytes to reach alignment. */
	b.eq	.LSrcAligned
	sub	count, count, tmp2
	/*
	 * Copy the leading bytes from src to dst in increasing address
	 * order. This eliminates the risk of overwriting source data when
	 * the distance between src and dst is less than 16. The memory
	 * accesses here are aligned.
	 */
	tbz	tmp2, #0, 1f
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1
1:
	tbz	tmp2, #1, 2f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
2:
	tbz	tmp2, #2, 3f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8

.LSrcAligned:
	cmp	count, #64
	b.ge	.Lcpy_over64
	/*
	 * Deal with small copies quickly by dropping straight into the
	 * exit block.
	 */
.Ltail63:
	/*
	 * Copy up to 48 bytes of data. At this point we only need the
	 * bottom 6 bits of count to be accurate.
	 */
	ands	tmp1, count, #0x30
	b.eq	.Ltiny15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
1:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
2:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
.Ltiny15:
	/*
	 * Prefer to break one ldp/stp into several loads/stores so that
	 * memory is accessed in increasing address order, rather than
	 * loading/storing 16 bytes from (src-16) to (dst-16) and winding
	 * src back to an aligned address, as the original cortex-strings
	 * memcpy does. If that scheme were kept here, memmove would have
	 * to guarantee that src is at least 16 bytes above dst, otherwise
	 * some source data would be overwritten when memmove calls memcpy
	 * directly. To keep memmove simple and to decouple it from
	 * memcpy's internals, the original scheme was dropped.
	 */
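	/*
	 * In other words, each set bit in the low four bits of count
	 * selects one naturally sized copy below: bit 3 copies 8 bytes,
	 * bit 2 copies 4 bytes, bit 1 copies 2 bytes and bit 0 copies the
	 * final byte. The ldr1/str1-style accessors are expected to
	 * advance src and dst by the access size after each copy.
	 */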
	tbz	count, #3, 1f
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8
1:
	tbz	count, #2, 2f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
2:
	tbz	count, #1, 3f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
3:
	tbz	count, #0, .Lexitfunc
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1

	b	.Lexitfunc

.Lcpy_over64:
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	 * Less than 128 bytes to copy, so handle 64 bytes here and then
	 * jump to the tail.
	 */
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	ldp1	D_l, D_h, src, #16
	stp1	D_l, D_h, dst, #16

	tst	count, #0x3f
	b.ne	.Ltail63
	b	.Lexitfunc

	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
	/* Preload the first 64 bytes of data into registers. */
	ldp1	A_l, A_h, src, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	ldp1	D_l, D_h, src, #16
1:
	/*
	 * Interleave loads of the next 64-byte block with stores of the
	 * previously loaded 64 bytes.
	 */
	stp1	A_l, A_h, dst, #16
	ldp1	A_l, A_h, src, #16
	stp1	B_l, B_h, dst, #16
	ldp1	B_l, B_h, src, #16
	stp1	C_l, C_h, dst, #16
	ldp1	C_l, C_h, src, #16
	stp1	D_l, D_h, dst, #16
	ldp1	D_l, D_h, src, #16
	subs	count, count, #64
	b.ge	1b
	stp1	A_l, A_h, dst, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	stp1	D_l, D_h, dst, #16

	tst	count, #0x3f
	b.ne	.Ltail63
.Lexitfunc:
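
/*
 * Note on the accessor macros: ldrb1/strb1, ldrh1/strh1, ldr1/str1 and
 * ldp1/stp1 are not defined in this file and are assumed to be supplied
 * by the file that includes this copy body. A plain memcpy would likely
 * define them as simple post-indexed loads/stores, for example (an
 * illustrative sketch, not the actual definitions):
 *
 *	.macro ldp1 reg1, reg2, ptr, val
 *	ldp	\reg1, \reg2, [\ptr], \val
 *	.endm
 *
 *	.macro stp1 reg1, reg2, ptr, val
 *	stp	\reg1, \reg2, [\ptr], \val
 *	.endm
 *
 * User-space copy routines would instead expand these macros to
 * fault-handling accessors so that faults on the user address can be
 * caught and reported.
 */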