/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by Linaro
 * and re-licensed under GPLv2 for the Linux kernel. The original code can
 * be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */


/*
 * Copy a buffer from src to dest (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */

/*
 * Syntax: GNU as, AArch64.
 *
 * This text is a template: it has no entry label and no return
 * instruction. Execution starts at the first instruction below and
 * leaves by falling through (or branching to) .Lexitfunc, which is the
 * very last thing in the template.
 *
 * Names used here but not defined here (they must be provided before
 * this text is assembled):
 *	ldrb1/strb1, ldrh1/strh1, ldr1/str1 - invoked as "op reg, ptr, #size"
 *	ldp1/stp1                           - invoked as "op regA, regB, ptr, #16"
 *	L1_CACHE_SHIFT                      - operand of .p2align for the main loop
 * Nothing in this template adds to src or dst itself, so every one of
 * the macros above has to advance its pointer operand by the immediate
 * it is given (presumably via post-indexed addressing - confirm against
 * the macro definitions).
 *
 * Registers written by the instructions below: x3, x4, x6-x14, count (x2)
 * and the condition flags. src (x1) and dst (x6) are only changed through
 * the macros. dstin (x0) is read once and never written.
 */
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
dst	.req	x6

A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

	/* Work on a copy of the destination pointer; dstin stays intact. */
	mov	dst, dstin
	cmp	count, #16
	/*
	 * When the length is less than 16 bytes, skip the alignment step;
	 * the accesses in .Ltiny15 are then not necessarily aligned.
	 */
	b.lo	.Ltiny15

	neg	tmp2, src
	ands	tmp2, tmp2, #15		/* Bytes to reach alignment. */
	b.eq	.LSrcAligned
	sub	count, count, tmp2
	/*
	 * Copy the leading bytes from src to dst in increasing address
	 * order. This way the risk of overwriting the source data is
	 * eliminated when the distance between src and dst is less than
	 * 16. Each bit of tmp2 selects one 1/2/4/8-byte step, so the
	 * source-side accesses here are naturally aligned.
	 */
	tbz	tmp2, #0, 1f
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1
1:
	tbz	tmp2, #1, 2f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
2:
	tbz	tmp2, #2, 3f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8

.LSrcAligned:
	cmp	count, #64
	b.ge	.Lcpy_over64
	/*
	 * Deal with small copies quickly by dropping straight into the
	 * exit block.
	 */
.Ltail63:
	/*
	 * Copy up to 48 bytes of data. At this point we only need the
	 * bottom 6 bits of count to be accurate.
	 *
	 * Bits 4-5 of count select how many 16-byte pairs are copied:
	 * 0x30 falls through all three copies, 0x20 enters at 1: (two
	 * copies), 0x10 enters at 2: (one copy), and 0 skips to .Ltiny15.
	 */
	ands	tmp1, count, #0x30
	b.eq	.Ltiny15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
1:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
2:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
.Ltiny15:
	/*
	 * Copy the last 0-15 bytes, selected by bits 3..0 of count.
	 *
	 * Prefer to break one ldp/stp into several loads/stores that
	 * access memory in increasing address order, rather than loading
	 * and storing 16 bytes at (src-16) and (dst-16) after stepping src
	 * back to an aligned address, which is what the original cortex
	 * memcpy does. If the original memcpy process were kept here,
	 * memmove would need to satisfy the precondition that the src
	 * address is at least 16 bytes bigger than the dst address,
	 * otherwise some source data would be overwritten when memmove
	 * calls memcpy directly. To make memmove simpler and to decouple
	 * memcpy from memmove, the original process was withdrawn.
	 */
	tbz	count, #3, 1f
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8
1:
	tbz	count, #2, 2f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
2:
	tbz	count, #1, 3f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
3:
	tbz	count, #0, .Lexitfunc
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1

	b	.Lexitfunc

.Lcpy_over64:
	/*
	 * Bias count by -128. From here on count may be negative; only
	 * its sign (loop control) and its bottom 6 bits (tail length) are
	 * used, and subtracting 128 or 64 leaves the bottom 6 bits intact.
	 */
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	 * Less than 128 bytes to copy, so handle 64 here and then jump
	 * to the tail.
	 */
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	ldp1	D_l, D_h, src, #16
	stp1	D_l, D_h, dst, #16

	tst	count, #0x3f
	b.ne	.Ltail63
	b	.Lexitfunc

	/*
	 * Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
	/* Pre-load the first 64 bytes of data. */
	ldp1	A_l, A_h, src, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	ldp1	D_l, D_h, src, #16
1:
	/*
	 * Interleave the load of the next 64-byte block with the store of
	 * the previously loaded 64 bytes.
	 */
	stp1	A_l, A_h, dst, #16
	ldp1	A_l, A_h, src, #16
	stp1	B_l, B_h, dst, #16
	ldp1	B_l, B_h, src, #16
	stp1	C_l, C_h, dst, #16
	ldp1	C_l, C_h, src, #16
	stp1	D_l, D_h, dst, #16
	ldp1	D_l, D_h, src, #16
	subs	count, count, #64
	b.ge	1b
	/* Drain: store the final 64 bytes that were loaded in the loop. */
	stp1	A_l, A_h, dst, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	stp1	D_l, D_h, dst, #16

	/* Anything left in the bottom 6 bits of count goes to the tail. */
	tst	count, #0x3f
	b.ne	.Ltail63
.Lexitfunc: