/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on the glibc cortex-strings work originally authored
 * by Linaro, which can be found at:
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */


/*
 * Copy a buffer from src to dest (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest (x0 is never written; all stores go through the dst alias)
 *
 * Clobbers (as visible in this template): x2-x4, x6-x14 and the condition
 * flags. x1 is passed to every load macro and is presumably advanced by it.
 *
 * NOTE(review): ldrb1/strb1, ldrh1/strh1, ldr1/str1, ldp1/stp1 and
 * L1_CACHE_SHIFT are not defined here; they are expected to be provided by
 * whatever includes this template. The comments below assume each macro
 * accesses memory at its pointer operand and then advances that pointer by
 * the immediate (post-increment) - confirm against the including file.
 */
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
dst	.req	x6

A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

	/* Work on a copy of the destination pointer so x0 stays intact. */
	mov	dst, dstin
	cmp	count, #16
	/* When the length is less than 16, the accesses are not aligned. */
	b.lo	.Ltiny15

	neg	tmp2, src
	ands	tmp2, tmp2, #15		/* Bytes to reach alignment. */
	b.eq	.LSrcAligned
	sub	count, count, tmp2
	/*
	 * Copy the leading memory data from src to dst in increasing
	 * address order. This way, the risk of overwriting the source
	 * memory data is eliminated when the distance between src and
	 * dst is less than 16. Each bit of tmp2 selects one 1/2/4/8-byte
	 * step, smallest first, so the source accesses here are naturally
	 * aligned for their size.
	 */
	tbz	tmp2, #0, 1f
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1
1:
	tbz	tmp2, #1, 2f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
2:
	tbz	tmp2, #2, 3f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8

.LSrcAligned:
	cmp	count, #64
	b.ge	.Lcpy_over64
	/*
	 * Deal with small copies quickly by dropping straight into the
	 * exit block.
	 */
.Ltail63:
	/*
	 * Copy up to 48 bytes of data. At this point we only need the
	 * bottom 6 bits of count to be accurate (count may have been
	 * biased by a multiple of 64 on the paths that branch back here).
	 *
	 * Bits 5:4 of count pick how many 16-byte chunks to copy:
	 *   0x00 -> none (go straight to the sub-16-byte tail)
	 *   0x10 -> one   (enter at 2:)
	 *   0x20 -> two   (enter at 1:)
	 *   0x30 -> three (fall through)
	 */
	ands	tmp1, count, #0x30
	b.eq	.Ltiny15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
1:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
2:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
.Ltiny15:
	/*
	 * Prefer to break one ldp/stp into several loads/stores that access
	 * memory in increasing address order, rather than loading/storing 16
	 * bytes from (src-16) to (dst-16) and moving src back to an aligned
	 * address, which is the approach used in the original cortex memcpy.
	 * If the original memcpy process were kept here, memmove would need
	 * to satisfy the precondition that the src address is at least 16
	 * bytes bigger than the dst address; otherwise some source data would
	 * be overwritten when memmove calls memcpy directly. To make memmove
	 * simpler and to decouple memcpy's dependency on memmove, the
	 * original process was withdrawn.
	 *
	 * Bits 3:0 of count select an 8-, 4-, 2- and 1-byte step in turn.
	 */
	tbz	count, #3, 1f
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8
1:
	tbz	count, #2, 2f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
2:
	tbz	count, #1, 3f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
3:
	tbz	count, #0, .Lexitfunc
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1

	b	.Lexitfunc

.Lcpy_over64:
	/*
	 * Bias count by -128. A non-negative result means at least 128
	 * bytes remain, which is what the large loop below requires.
	 */
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	 * Less than 128 bytes to copy, so handle 64 here and then jump
	 * to the tail.
	 */
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	ldp1	D_l, D_h, src, #16
	stp1	D_l, D_h, dst, #16

	/* The bias is a multiple of 64, so the low 6 bits are still valid. */
	tst	count, #0x3f
	b.ne	.Ltail63
	b	.Lexitfunc

	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
	/* Pre-load the first 64 bytes of data. */
	ldp1	A_l, A_h, src, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	ldp1	D_l, D_h, src, #16
1:
	/*
	 * Interleave the load of the next 64-byte block with the store of
	 * the previously loaded 64 bytes.
	 */
	stp1	A_l, A_h, dst, #16
	ldp1	A_l, A_h, src, #16
	stp1	B_l, B_h, dst, #16
	ldp1	B_l, B_h, src, #16
	stp1	C_l, C_h, dst, #16
	ldp1	C_l, C_h, src, #16
	stp1	D_l, D_h, dst, #16
	ldp1	D_l, D_h, src, #16
	subs	count, count, #64
	b.ge	1b
	/* Drain: store the final 64 bytes that were loaded in the loop. */
	stp1	A_l, A_h, dst, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	stp1	D_l, D_h, dst, #16

	tst	count, #0x3f
	b.ne	.Ltail63
.Lexitfunc: