/*
 * arch/xtensa/lib/hal/memcopy.S -- Core HAL library functions
 * xthal_memcpy and xthal_bcopy
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Copyright (C) 2002 - 2012 Tensilica Inc.
 */

#include <linux/linkage.h>
#include <asm/asmmacro.h>
#include <asm/core.h>

/*
 * void *memcpy(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memcpy() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If destination is unaligned, align it by conditionally
 *     copying 1 and 2 bytes.
 *   If source is aligned,
 *     do 16 bytes with a loop, and then finish up with
 *     8, 4, 2, and 1 byte copies conditional on the length;
 *   else (if source is unaligned),
 *     do the same, but use SRC to align the source data.
 *   This code tries to use fall-through branches for the common
 *     case of aligned source and destination and multiple
 *     of 4 (or 8) length.
 *
 * Register use:
 *	a0/ return address
 *	a1/ stack pointer
 *	a2/ return value
 *	a3/ src
 *	a4/ length
 *	a5/ dst
 *	a6/ tmp
 *	a7/ tmp
 *	a8/ tmp
 *	a9/ tmp
 *	a10/ tmp
 *	a11/ tmp
 */

	.text

/*
 * Byte by byte copy
 */
	.align	4
	.byte	0		# 1 mod 4 alignment for LOOPNEZ
				# (0 mod 4 alignment for LBEG)
.Lbytecopy:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a4, .Lbytecopydone
	add	a7, a3, a4	# a7 = end address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lnextbyte:
	l8ui	a6, a3, 0
	addi	a3, a3, 1
	s8i	a6, a5, 0
	addi	a5, a5, 1
#if !XCHAL_HAVE_LOOPS
	bne	a3, a7, .Lnextbyte # continue loop if $a3:src != $a7:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Lbytecopydone:
	abi_ret_default

/*
 * Destination is unaligned
 */

	.align	4
.Ldst1mod2:	# dst is only byte aligned
	_bltui	a4, 7, .Lbytecopy	# do short copies byte by byte

	# copy 1 byte
	l8ui	a6, a3,  0
	addi	a3, a3,  1
	addi	a4, a4, -1
	s8i	a6, a5,  0
	addi	a5, a5,  1
	_bbci.l	a5, 1, .Ldstaligned	# if dst is now aligned, then
					# return to main algorithm
.Ldst2mod4:	# dst 16-bit aligned
	# copy 2 bytes
	_bltui	a4, 6, .Lbytecopy	# do short copies byte by byte
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a3, a3,  2
	addi	a4, a4, -2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a5, a5,  2
	j	.Ldstaligned	# dst is now aligned, return to main algorithm

ENTRY(__memcpy)
WEAK(memcpy)

	abi_entry_default
	# a2/ dst, a3/ src, a4/ len
	mov	a5, a2		# copy dst so that a2 is return value
.Lcommon:
	_bbsi.l	a2, 0, .Ldst1mod2	# if dst is 1 mod 2
	_bbsi.l	a2, 1, .Ldst2mod4	# if dst is 2 mod 4
.Ldstaligned:	# return here from .Ldst?mod? once dst is aligned
	srli	a7, a4, 4	# number of loop iterations with 16B
				# per iteration
	movi	a8, 3		# if source is not aligned,
	_bany	a3, a8, .Lsrcunaligned	# then use shifting copy
	/*
	 * Destination and source are word-aligned, use word copy.
	 */
	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop1done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop1done
	slli	a8, a7, 4
	add	a8, a8, a3	# a8 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1:
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	s32i	a6, a5,  0
	l32i	a6, a3,  8
	s32i	a7, a5,  4
	l32i	a7, a3, 12
	s32i	a6, a5,  8
	addi	a3, a3, 16
	s32i	a7, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bne	a3, a8, .Loop1  # continue loop if a3:src != a8:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1done:
	bbci.l	a4, 3, .L2
	# copy 8 bytes
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	addi	a3, a3,  8
	s32i	a6, a5,  0
	s32i	a7, a5,  4
	addi	a5, a5,  8
.L2:
	bbsi.l	a4, 2, .L3
	bbsi.l	a4, 1, .L4
	bbsi.l	a4, 0, .L5
	abi_ret_default
.L3:
	# copy 4 bytes
	l32i	a6, a3,  0
	addi	a3, a3,  4
	s32i	a6, a5,  0
	addi	a5, a5,  4
	bbsi.l	a4, 1, .L4
	bbsi.l	a4, 0, .L5
	abi_ret_default
.L4:
	# copy 2 bytes
	l16ui	a6, a3,  0
	addi	a3, a3,  2
	s16i	a6, a5,  0
	addi	a5, a5,  2
	bbsi.l	a4, 0, .L5
	abi_ret_default
.L5:
	# copy 1 byte
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	abi_ret_default

/*
 * Destination is aligned, Source is unaligned
 */

	.align	4
.Lsrcunaligned:
	_beqz	a4, .Ldone	# avoid loading anything for zero-length copies
	# copy 16 bytes per iteration for word-aligned dst and unaligned src
	__ssa8	a3		# set shift amount from byte offset

/* set to 1 when running on ISS (simulator) with the
   lint or ferret client, or 0 to save a few cycles */
#define SIM_CHECKS_ALIGNMENT	1
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	and	a11, a3, a8	# save unalignment offset for below
	sub	a3, a3, a11	# align a3
#endif
	l32i	a6, a3, 0	# load first word
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop2done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop2done
	slli	a10, a7, 4
	add	a10, a10, a3	# a10 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2:
	l32i	a7, a3,  4
	l32i	a8, a3,  8
	__src_b	a6, a6, a7
	s32i	a6, a5,  0
	l32i	a9, a3, 12
	__src_b	a7, a7, a8
	s32i	a7, a5,  4
	l32i	a6, a3, 16
	__src_b	a8, a8, a9
	s32i	a8, a5,  8
	addi	a3, a3, 16
	__src_b	a9, a9, a6
	s32i	a9, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bne	a3, a10, .Loop2 # continue loop if a3:src != a10:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2done:
	bbci.l	a4, 3, .L12
	# copy 8 bytes
	l32i	a7, a3,  4
	l32i	a8, a3,  8
	__src_b	a6, a6, a7
	s32i	a6, a5,  0
	addi	a3, a3,  8
	__src_b	a7, a7, a8
	s32i	a7, a5,  4
	addi	a5, a5,  8
	mov	a6, a8
.L12:
	bbci.l	a4, 2, .L13
	# copy 4 bytes
	l32i	a7, a3,  4
	addi	a3, a3,  4
	__src_b	a6, a6, a7
	s32i	a6, a5,  0
	addi	a5, a5,  4
	mov	a6, a7
.L13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	add	a3, a3, a11	# readjust a3 with correct misalignment
#endif
	bbsi.l	a4, 1, .L14
	bbsi.l	a4, 0, .L15
.Ldone:	abi_ret_default
.L14:
	# copy 2 bytes
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a3, a3,  2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a5, a5,  2
	bbsi.l	a4, 0, .L15
	abi_ret_default
.L15:
	# copy 1 byte
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	abi_ret_default

ENDPROC(__memcpy)
EXPORT_SYMBOL(__memcpy)
EXPORT_SYMBOL(memcpy)

/*
 * void *memmove(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memmove() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If end of source doesn't overlap destination then use memcpy.
 *   Otherwise do memcpy backwards.
 *
 * Register use:
 *	a0/ return address
 *	a1/ stack pointer
 *	a2/ return value
 *	a3/ src
 *	a4/ length
 *	a5/ dst
 *	a6/ tmp
 *	a7/ tmp
 *	a8/ tmp
 *	a9/ tmp
 *	a10/ tmp
 *	a11/ tmp
 */

/*
 * Byte by byte copy
 */
	.align	4
	.byte	0		# 1 mod 4 alignment for LOOPNEZ
				# (0 mod 4 alignment for LBEG)
.Lbackbytecopy:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lbackbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a4, .Lbackbytecopydone
	sub	a7, a3, a4	# a7 = start address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lbacknextbyte:
	addi	a3, a3, -1
	l8ui	a6, a3, 0
	addi	a5, a5, -1
	s8i	a6, a5, 0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a7, .Lbacknextbyte # continue loop if
				       # $a3:src != $a7:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.Lbackbytecopydone:
	abi_ret_default

/*
 * Destination is unaligned
 */

	.align	4
.Lbackdst1mod2:	# dst is only byte aligned
	_bltui	a4, 7, .Lbackbytecopy	# do short copies byte by byte

	# copy 1 byte
	addi	a3, a3, -1
	l8ui	a6, a3,  0
	addi	a5, a5, -1
	s8i	a6, a5,  0
	addi	a4, a4, -1
	_bbci.l	a5, 1, .Lbackdstaligned	# if dst is now aligned, then
					# return to main algorithm
.Lbackdst2mod4:	# dst 16-bit aligned
	# copy 2 bytes
	_bltui	a4, 6, .Lbackbytecopy	# do short copies byte by byte
	addi	a3, a3, -2
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a5, a5, -2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a4, a4, -2
	j	.Lbackdstaligned	# dst is now aligned,
					# return to main algorithm

ENTRY(__memmove)
WEAK(memmove)

	abi_entry_default
	# a2/ dst, a3/ src, a4/ len
	mov	a5, a2		# copy dst so that a2 is return value
.Lmovecommon:
	sub	a6, a5, a3
	bgeu	a6, a4, .Lcommon

	add	a5, a5, a4
	add	a3, a3, a4

	_bbsi.l	a5, 0, .Lbackdst1mod2	# if dst is 1 mod 2
	_bbsi.l	a5, 1, .Lbackdst2mod4	# if dst is 2 mod 4
.Lbackdstaligned:	# return here from .Lbackdst?mod? once dst is aligned
	srli	a7, a4, 4	# number of loop iterations with 16B
				# per iteration
	movi	a8, 3		# if source is not aligned,
	_bany	a3, a8, .Lbacksrcunaligned	# then use shifting copy
	/*
	 * Destination and source are word-aligned, use word copy.
	 */
	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .LbackLoop1done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .LbackLoop1done
	slli	a8, a7, 4
	sub	a8, a3, a8	# a8 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.LbackLoop1:
	addi	a3, a3, -16
	l32i	a7, a3, 12
	l32i	a6, a3,  8
	addi	a5, a5, -16
	s32i	a7, a5, 12
	l32i	a7, a3,  4
	s32i	a6, a5,  8
	l32i	a6, a3,  0
	s32i	a7, a5,  4
	s32i	a6, a5,  0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a8, .LbackLoop1  # continue loop if a3:src != a8:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.LbackLoop1done:
	bbci.l	a4, 3, .Lback2
	# copy 8 bytes
	addi	a3, a3, -8
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	addi	a5, a5, -8
	s32i	a6, a5,  0
	s32i	a7, a5,  4
.Lback2:
	bbsi.l	a4, 2, .Lback3
	bbsi.l	a4, 1, .Lback4
	bbsi.l	a4, 0, .Lback5
	abi_ret_default
.Lback3:
	# copy 4 bytes
	addi	a3, a3, -4
	l32i	a6, a3,  0
	addi	a5, a5, -4
	s32i	a6, a5,  0
	bbsi.l	a4, 1, .Lback4
	bbsi.l	a4, 0, .Lback5
	abi_ret_default
.Lback4:
	# copy 2 bytes
	addi	a3, a3, -2
	l16ui	a6, a3,  0
	addi	a5, a5, -2
	s16i	a6, a5,  0
	bbsi.l	a4, 0, .Lback5
	abi_ret_default
.Lback5:
	# copy 1 byte
	addi	a3, a3, -1
	l8ui	a6, a3,  0
	addi	a5, a5, -1
	s8i	a6, a5,  0
	abi_ret_default

/*
 * Destination is aligned, Source is unaligned
 */

	.align	4
.Lbacksrcunaligned:
	_beqz	a4, .Lbackdone	# avoid loading anything for zero-length copies
	# copy 16 bytes per iteration for word-aligned dst and unaligned src
	__ssa8	a3		# set shift amount from byte offset
#define SIM_CHECKS_ALIGNMENT	1	/* set to 1 when running on ISS with
					 * the lint or ferret client, or 0
					 * to save a few cycles */
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	and	a11, a3, a8	# save unalignment offset for below
	sub	a3, a3, a11	# align a3
#endif
	l32i	a6, a3, 0	# load first word
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .LbackLoop2done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .LbackLoop2done
	slli	a10, a7, 4
	sub	a10, a3, a10	# a10 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.LbackLoop2:
	addi	a3, a3, -16
	l32i	a7, a3, 12
	l32i	a8, a3,  8
	addi	a5, a5, -16
	__src_b	a6, a7, a6
	s32i	a6, a5, 12
	l32i	a9, a3,  4
	__src_b	a7, a8, a7
	s32i	a7, a5,  8
	l32i	a6, a3,  0
	__src_b	a8, a9, a8
	s32i	a8, a5,  4
	__src_b	a9, a6, a9
	s32i	a9, a5,  0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a10, .LbackLoop2 # continue loop if a3:src != a10:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.LbackLoop2done:
	bbci.l	a4, 3, .Lback12
	# copy 8 bytes
	addi	a3, a3, -8
	l32i	a7, a3,  4
	l32i	a8, a3,  0
	addi	a5, a5, -8
	__src_b	a6, a7, a6
	s32i	a6, a5,  4
	__src_b	a7, a8, a7
	s32i	a7, a5,  0
	mov	a6, a8
.Lback12:
	bbci.l	a4, 2, .Lback13
	# copy 4 bytes
	addi	a3, a3, -4
	l32i	a7, a3,  0
	addi	a5, a5, -4
	__src_b	a6, a7, a6
	s32i	a6, a5,  0
	mov	a6, a7
.Lback13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	add	a3, a3, a11	# readjust a3 with correct misalignment
#endif
	bbsi.l	a4, 1, .Lback14
	bbsi.l	a4, 0, .Lback15
.Lbackdone:
	abi_ret_default
.Lback14:
	# copy 2 bytes
	addi	a3, a3, -2
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a5, a5, -2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	bbsi.l	a4, 0, .Lback15
	abi_ret_default
.Lback15:
	# copy 1 byte
	addi	a3, a3, -1
	addi	a5, a5, -1
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	abi_ret_default

ENDPROC(__memmove)
EXPORT_SYMBOL(__memmove)
EXPORT_SYMBOL(memmove)