/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2022 Michael T. Kloos <michael@michaelkloos.com>
 */

#include <linux/linkage.h>
#include <asm/asm.h>

SYM_FUNC_START(__memmove)
SYM_FUNC_START_WEAK(memmove)
	/*
	 * Returns
	 *   a0 - dest
	 *
	 * Parameters
	 *   a0 - Inclusive first byte of dest
	 *   a1 - Inclusive first byte of src
	 *   a2 - Length of copy n
	 *
	 * Because the return matches the parameter register a0,
	 * we will not clobber or modify that register.
	 *
	 * Note: This currently only works on little-endian.
	 * To port to big-endian, reverse the direction of shifts
	 * in the 2 misaligned fixup copy loops.
	 */

	/* Return if nothing to do */
	beq	a0, a1, return_from_memmove
	beqz	a2, return_from_memmove

	/*
	 * Register Uses
	 *   Forward Copy: a1 - Index counter of src
	 *   Reverse Copy: a4 - Index counter of src
	 *   Forward Copy: t3 - Index counter of dest
	 *   Reverse Copy: t4 - Index counter of dest
	 *   Both Copy Modes: t5 - Inclusive first multibyte/aligned of dest
	 *   Both Copy Modes: t6 - Non-Inclusive last multibyte/aligned of dest
	 *   Both Copy Modes: t0 - Link / Temporary for load-store
	 *   Both Copy Modes: t1 - Temporary for load-store
	 *   Both Copy Modes: t2 - Temporary for load-store
	 *   Both Copy Modes: a5 - dest to src alignment offset
	 *   Both Copy Modes: a6 - Shift amount
	 *   Both Copy Modes: a7 - Inverse shift amount
	 *   Both Copy Modes: a2 - Alternate breakpoint for unrolled loops
	 */

	/*
	 * Solve for some register values now.
	 * Byte copy does not need t5 or t6.
	 */
	mv	t3, a0
	add	t4, a0, a2
	add	a4, a1, a2

	/*
	 * Byte copy if copying less than (2 * SZREG) bytes. This can
	 * cause problems with the bulk copy implementation and is
	 * small enough not to bother.
	 */
	andi	t0, a2, -(2 * SZREG)
	beqz	t0, byte_copy
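
	/*
	 * Rough C equivalent of the size check above (an illustrative
	 * sketch added for clarity, not part of the original code): the
	 * andi masks off every bit below 2 * SZREG, so t0 is zero
	 * exactly when the length n is smaller than two machine words.
	 *
	 *	if (n < 2 * SZREG)
	 *		goto byte_copy;
	 */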

	/*
	 * Now solve for t5 and t6.
	 */
	andi	t5, t3, -SZREG
	andi	t6, t4, -SZREG
	/*
	 * If dest (register t3) rounded down to the nearest naturally
	 * aligned SZREG address does not equal dest, then add SZREG
	 * to find the low-bound of SZREG alignment in the dest memory
	 * region.  Note that this could overshoot the dest memory
	 * region if n is less than SZREG.  This is one reason why
	 * we always byte copy if n is less than SZREG.
	 * Otherwise, dest is already naturally aligned to SZREG.
	 */
	beq	t5, t3, 1f
		addi	t5, t5, SZREG
	1:

	/*
	 * If the dest and src are co-aligned to SZREG, then there is
	 * no need for the full rigmarole of a misaligned fixup copy.
	 * Instead, do a simpler co-aligned copy.
	 */
	xor	t0, a0, a1
	andi	t1, t0, (SZREG - 1)
	beqz	t1, coaligned_copy
	/* Fall through to misaligned fixup copy */

misaligned_fixup_copy:
	bltu	a1, a0, misaligned_fixup_copy_reverse

misaligned_fixup_copy_forward:
	jal	t0, byte_copy_until_aligned_forward

	andi	a5, a1, (SZREG - 1) /* Find the alignment offset of src (a1) */
	slli	a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
	sub	a5, a1, t3 /* Find the difference between src and dest */
	andi	a1, a1, -SZREG /* Align the src pointer */
	addi	a2, t6, SZREG /* The other breakpoint for the unrolled loop */

	/*
	 * Compute The Inverse Shift
	 *   a7 = XLEN - a6 = XLEN + -a6
	 *   2s complement negation to find the negative: -a6 = ~a6 + 1
	 *   Add that to XLEN.  XLEN = SZREG * 8.
	 */
	not	a7, a6
	addi	a7, a7, (SZREG * 8 + 1)
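
	/*
	 * Worked example of the shift pair (illustrative only; assumes
	 * RV64, where SZREG = 8 and XLEN = 64): if src sits 3 bytes past
	 * an aligned address, then a5 = 3, a6 = 3 * 8 = 24, and the two
	 * instructions above leave a7 = ~24 + 65 = 64 - 24 = 40.  Each
	 * word stored by the loop below is then assembled as
	 *
	 *	(load_val0 >> 24) | (load_val1 << 40)
	 *
	 * which, on little-endian, drops the 3 bytes of load_val0 that
	 * precede the current src position and pulls the next 3 bytes
	 * in from load_val1.
	 */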

	/*
	 * Fix Misalignment Copy Loop - Forward
	 * load_val0 = load_ptr[0];
	 * do {
	 *	load_val1 = load_ptr[1];
	 *	store_ptr += 2;
	 *	store_ptr[0 - 2] = (load_val0 >> {a6}) | (load_val1 << {a7});
	 *
	 *	if (store_ptr == {a2})
	 *		break;
	 *
	 *	load_val0 = load_ptr[2];
	 *	load_ptr += 2;
	 *	store_ptr[1 - 2] = (load_val1 >> {a6}) | (load_val0 << {a7});
	 *
	 * } while (store_ptr != store_ptr_end);
	 * store_ptr = store_ptr_end;
	 */
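
	/*
	 * Note on the two exit points of the loop below (an explanatory
	 * note added here, not part of the original comments): the loop
	 * is unrolled to two stores per iteration, so it finishes either
	 * at {t6}, when an even number of SZREG words remained, or at
	 * {a2} = {t6} + SZREG, when an odd number remained and it breaks
	 * out after the first store of an iteration.  The "mv t3, t6"
	 * after the loop folds the overshoot back to the true end of the
	 * aligned dest region, roughly:
	 *
	 *	if (store_ptr == {a2})
	 *		store_ptr = store_ptr_end;
	 */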

	REG_L	t0, (0 * SZREG)(a1)
	1:
	REG_L	t1, (1 * SZREG)(a1)
	addi	t3, t3, (2 * SZREG)
	srl	t0, t0, a6
	sll	t2, t1, a7
	or	t2, t0, t2
	REG_S	t2, ((0 * SZREG) - (2 * SZREG))(t3)

	beq	t3, a2, 2f

	REG_L	t0, (2 * SZREG)(a1)
	addi	a1, a1, (2 * SZREG)
	srl	t1, t1, a6
	sll	t2, t0, a7
	or	t2, t1, t2
	REG_S	t2, ((1 * SZREG) - (2 * SZREG))(t3)

	bne	t3, t6, 1b
	2:
	mv	t3, t6 /* Fix the dest pointer in case the loop was broken */

	add	a1, t3, a5 /* Restore the src pointer */
	j	byte_copy_forward /* Copy any remaining bytes */

misaligned_fixup_copy_reverse:
	jal	t0, byte_copy_until_aligned_reverse

	andi	a5, a4, (SZREG - 1) /* Find the alignment offset of src (a4) */
	slli	a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
	sub	a5, a4, t4 /* Find the difference between src and dest */
	andi	a4, a4, -SZREG /* Align the src pointer */
	addi	a2, t5, -SZREG /* The other breakpoint for the unrolled loop */

	/*
	 * Compute The Inverse Shift
	 *   a7 = XLEN - a6 = XLEN + -a6
	 *   2s complement negation to find the negative: -a6 = ~a6 + 1
	 *   Add that to XLEN.  XLEN = SZREG * 8.
	 */
	not	a7, a6
	addi	a7, a7, (SZREG * 8 + 1)

	/*
	 * Fix Misalignment Copy Loop - Reverse
	 * load_val1 = load_ptr[0];
	 * do {
	 *	load_val0 = load_ptr[-1];
	 *	store_ptr -= 2;
	 *	store_ptr[1] = (load_val0 >> {a6}) | (load_val1 << {a7});
	 *
	 *	if (store_ptr == {a2})
	 *		break;
	 *
	 *	load_val1 = load_ptr[-2];
	 *	load_ptr -= 2;
	 *	store_ptr[0] = (load_val1 >> {a6}) | (load_val0 << {a7});
	 *
	 * } while (store_ptr != store_ptr_end);
	 * store_ptr = store_ptr_end;
	 */

	REG_L	t1, ( 0 * SZREG)(a4)
	1:
	REG_L	t0, (-1 * SZREG)(a4)
	addi	t4, t4, (-2 * SZREG)
	sll	t1, t1, a7
	srl	t2, t0, a6
	or	t2, t1, t2
	REG_S	t2, ( 1 * SZREG)(t4)

	beq	t4, a2, 2f

	REG_L	t1, (-2 * SZREG)(a4)
	addi	a4, a4, (-2 * SZREG)
	sll	t0, t0, a7
	srl	t2, t1, a6
	or	t2, t0, t2
	REG_S	t2, ( 0 * SZREG)(t4)

	bne	t4, t5, 1b
	2:
	mv	t4, t5 /* Fix the dest pointer in case the loop was broken */

	add	a4, t4, a5 /* Restore the src pointer */
	j	byte_copy_reverse /* Copy any remaining bytes */

/*
 * Simple copy loops for SZREG co-aligned memory locations.
 * These also make calls to do byte copies for any unaligned
 * data at their terminations.
 */
coaligned_copy:
	bltu	a1, a0, coaligned_copy_reverse

coaligned_copy_forward:
	jal	t0, byte_copy_until_aligned_forward

	1:
	REG_L	t1, ( 0 * SZREG)(a1)
	addi	a1, a1, SZREG
	addi	t3, t3, SZREG
	REG_S	t1, (-1 * SZREG)(t3)
	bne	t3, t6, 1b

	j	byte_copy_forward /* Copy any remaining bytes */

coaligned_copy_reverse:
	jal	t0, byte_copy_until_aligned_reverse

	1:
	REG_L	t1, (-1 * SZREG)(a4)
	addi	a4, a4, -SZREG
	addi	t4, t4, -SZREG
	REG_S	t1, ( 0 * SZREG)(t4)
	bne	t4, t5, 1b

	j	byte_copy_reverse /* Copy any remaining bytes */

/*
 * These are basically sub-functions within the function.  They
 * are used to byte copy until the dest pointer is in alignment,
 * at which point a bulk copy method can be used by the calling
 * code.  They work on the same registers as the bulk copy loops,
 * so the register values can be picked up from where they were
 * left and we avoid code duplication without any overhead except
 * the call-in and return jumps.
 */
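/*
 * Illustrative note on how these helpers are reached (added for
 * clarity, not part of the original comments): the bulk copy paths
 * call them with
 *
 *	jal	t0, byte_copy_until_aligned_forward
 *
 * which records the return address in t0 rather than ra, and the
 * helpers return with
 *
 *	jalr	zero, 0x0(t0)
 *
 * so memmove's own return address in ra stays intact and no stack
 * frame is needed.
 */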
byte_copy_until_aligned_forward:
	beq	t3, t5, 2f
	1:
	lb	t1,  0(a1)
	addi	a1, a1, 1
	addi	t3, t3, 1
	sb	t1, -1(t3)
	bne	t3, t5, 1b
	2:
	jalr	zero, 0x0(t0) /* Return to multibyte copy loop */

byte_copy_until_aligned_reverse:
	beq	t4, t6, 2f
	1:
	lb	t1, -1(a4)
	addi	a4, a4, -1
	addi	t4, t4, -1
	sb	t1,  0(t4)
	bne	t4, t6, 1b
	2:
	jalr	zero, 0x0(t0) /* Return to multibyte copy loop */

/*
 * Simple byte copy loops.
 * These will byte copy until they reach the end of data to copy.
 * At that point, they will call to return from memmove.
 */
byte_copy:
	bltu	a1, a0, byte_copy_reverse

byte_copy_forward:
	beq	t3, t4, 2f
	1:
	lb	t1,  0(a1)
	addi	a1, a1, 1
	addi	t3, t3, 1
	sb	t1, -1(t3)
	bne	t3, t4, 1b
	2:
	ret

byte_copy_reverse:
	beq	t4, t3, 2f
	1:
	lb	t1, -1(a4)
	addi	a4, a4, -1
	addi	t4, t4, -1
	sb	t1,  0(t4)
	bne	t4, t3, 1b
	2:

return_from_memmove:
	ret

SYM_FUNC_END(memmove)
SYM_FUNC_END(__memmove)
SYM_FUNC_ALIAS(__pi_memmove, __memmove)
SYM_FUNC_ALIAS(__pi___memmove, __memmove)