/*
 * arch/xtensa/lib/hal/memcopy.S -- Core HAL library functions
 * xthal_memcpy and xthal_bcopy
 *
 * NOTE: despite the names above, the entry points declared in this file
 * through ENTRY()/WEAK() are __memcpy/memcpy and __memmove/memmove.
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Copyright (C) 2002 - 2012 Tensilica Inc.
 */

#include <linux/linkage.h>
#include <asm/asmmacro.h>
#include <asm/core.h>

/*
 * void *memcpy(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memcpy() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If destination is unaligned, align it by conditionally
 *     copying 1 and 2 bytes.
 *   If source is aligned,
 *     do 16 bytes with a loop, and then finish up with
 *     8, 4, 2, and 1 byte copies conditional on the length;
 *   else (if source is unaligned),
 *     do the same, but use SRC to align the source data.
 *   This code tries to use fall-through branches for the common
 *     case of aligned source and destination and multiple
 *     of 4 (or 8) length.
 *
 * Once the destination is aligned, the length register a4 is no longer
 * decremented: the 16-byte loop count is a4 >> 4 and the 8/4/2/1 byte
 * tail copies are selected by testing bits 3..0 of a4.
 *
 * Register use:
 *	a0/ return address
 *	a1/ stack pointer
 *	a2/ return value (the original dst; never written by the copy code)
 *	a3/ src
 *	a4/ length
 *	a5/ dst (working copy of a2)
 *	a6/ tmp
 *	a7/ tmp
 *	a8/ tmp
 *	a9/ tmp
 *	a10/ tmp
 *	a11/ tmp
 *
 * abi_entry_default, abi_ret_default, __ssa8 and __src_b are not defined
 * in this file; presumably they come from <asm/asmmacro.h>.
 */

	.text

/*
 * Byte by byte copy
 *
 * Copies a4 bytes from a3 to a5 in ascending address order and returns.
 * Reached from .Ldst1mod2/.Ldst2mod4 for short copies.
 */
	.align	4
	.byte	0		# 1 mod 4 alignment for LOOPNEZ
				# (0 mod 4 alignment for LBEG)
.Lbytecopy:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a4, .Lbytecopydone
	add	a7, a3, a4	# a7 = end address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lnextbyte:
	l8ui	a6, a3, 0
	addi	a3, a3, 1
	s8i	a6, a5, 0
	addi	a5, a5, 1
#if !XCHAL_HAVE_LOOPS
	bne	a3, a7, .Lnextbyte # continue loop if $a3:src != $a7:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Lbytecopydone:
	abi_ret_default

/*
 * Destination is unaligned
 *
 * Copies 1 and/or 2 leading bytes so that dst (a5) becomes word aligned,
 * adjusting a3/a4/a5 accordingly, then rejoins the main path at
 * .Ldstaligned.  The length thresholds (7 and 6) presumably guarantee
 * that at least 4 bytes remain after the alignment copies -- TODO confirm.
 */

	.align	4
.Ldst1mod2:	# dst is only byte aligned
	_bltui	a4, 7, .Lbytecopy	# do short copies byte by byte

	# copy 1 byte
	l8ui	a6, a3, 0
	addi	a3, a3, 1
	addi	a4, a4, -1
	s8i	a6, a5, 0
	addi	a5, a5, 1
	_bbci.l	a5, 1, .Ldstaligned	# if dst is now aligned, then
					# return to main algorithm
.Ldst2mod4:	# dst 16-bit aligned
	# copy 2 bytes
	_bltui	a4, 6, .Lbytecopy	# do short copies byte by byte
	l8ui	a6, a3, 0
	l8ui	a7, a3, 1
	addi	a3, a3, 2
	addi	a4, a4, -2
	s8i	a6, a5, 0
	s8i	a7, a5, 1
	addi	a5, a5, 2
	j	.Ldstaligned	# dst is now aligned, return to main algorithm

ENTRY(__memcpy)
WEAK(memcpy)

	abi_entry_default
	# a2/ dst, a3/ src, a4/ len
	mov	a5, a2		# copy dst so that a2 is return value
/*
 * .Lcommon is also the target of the forward-copy branch in __memmove
 * below, which arrives here with a5 == a2.
 */
.Lcommon:
	_bbsi.l	a2, 0, .Ldst1mod2	# if dst is 1 mod 2
	_bbsi.l	a2, 1, .Ldst2mod4	# if dst is 2 mod 4
.Ldstaligned:	# return here from .Ldst?mod? once dst is aligned
	srli	a7, a4, 4	# number of loop iterations with 16B
				# per iteration
	movi	a8, 3		# if source is not aligned,
	_bany	a3, a8, .Lsrcunaligned	# then use shifting copy
	/*
	 * Destination and source are word-aligned, use word copy.
	 */
	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop1done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop1done
	slli	a8, a7, 4
	add	a8, a8, a3	# a8 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1:
	/* loads and stores are interleaved, alternating between a6 and a7 */
	l32i	a6, a3, 0
	l32i	a7, a3, 4
	s32i	a6, a5, 0
	l32i	a6, a3, 8
	s32i	a7, a5, 4
	l32i	a7, a3, 12
	s32i	a6, a5, 8
	addi	a3, a3, 16
	s32i	a7, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bne	a3, a8, .Loop1  # continue loop if a3:src != a8:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1done:
	/* tail: bits 3..0 of a4 select the 8, 4, 2 and 1 byte copies */
	bbci.l	a4, 3, .L2
	# copy 8 bytes
	l32i	a6, a3, 0
	l32i	a7, a3, 4
	addi	a3, a3, 8
	s32i	a6, a5, 0
	s32i	a7, a5, 4
	addi	a5, a5, 8
.L2:
	bbsi.l	a4, 2, .L3
	bbsi.l	a4, 1, .L4
	bbsi.l	a4, 0, .L5
	abi_ret_default
.L3:
	# copy 4 bytes
	l32i	a6, a3, 0
	addi	a3, a3, 4
	s32i	a6, a5, 0
	addi	a5, a5, 4
	bbsi.l	a4, 1, .L4
	bbsi.l	a4, 0, .L5
	abi_ret_default
.L4:
	# copy 2 bytes
	l16ui	a6, a3, 0
	addi	a3, a3, 2
	s16i	a6, a5, 0
	addi	a5, a5, 2
	bbsi.l	a4, 0, .L5
	abi_ret_default
.L5:
	# copy 1 byte
	l8ui	a6, a3, 0
	s8i	a6, a5, 0
	abi_ret_default

/*
 * Destination is aligned, Source is unaligned
 *
 * On entry a7 = a4 >> 4 and a8 = 3 (both set at .Ldstaligned).
 * The source is read as whole words starting from the word that holds
 * the first source byte.  a6 always holds the most recently loaded word
 * that has not been fully consumed yet; each output word is produced by
 * __src_b from two adjacent source words (presumably a funnel shift by
 * the amount that __ssa8 derived from the low bits of a3 -- the macros
 * are defined outside this file).
 */

	.align	4
.Lsrcunaligned:
	_beqz	a4, .Ldone	# avoid loading anything for zero-length copies
	# copy 16 bytes per iteration for word-aligned dst and unaligned src
	__ssa8	a3		# set shift amount from byte offset

/* set to 1 when running on ISS (simulator) with the
   lint or ferret client, or 0 to save a few cycles */
#define SIM_CHECKS_ALIGNMENT	1
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	/* a8 still holds the mask 3 loaded at .Ldstaligned */
	and	a11, a3, a8	# save unalignment offset for below
	sub	a3, a3, a11	# align a3
#endif
	l32i	a6, a3, 0	# load first word
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop2done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop2done
	slli	a10, a7, 4
	add	a10, a10, a3	# a10 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2:
	l32i	a7, a3, 4
	l32i	a8, a3, 8
	__src_b	a6, a6, a7
	s32i	a6, a5, 0
	l32i	a9, a3, 12
	__src_b	a7, a7, a8
	s32i	a7, a5, 4
	l32i	a6, a3, 16	# becomes the carry word of the next step
	__src_b	a8, a8, a9
	s32i	a8, a5, 8
	addi	a3, a3, 16
	__src_b	a9, a9, a6
	s32i	a9, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bne	a3, a10, .Loop2 # continue loop if a3:src != a10:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2done:
	bbci.l	a4, 3, .L12
	# copy 8 bytes
	l32i	a7, a3, 4
	l32i	a8, a3, 8
	__src_b	a6, a6, a7
	s32i	a6, a5, 0
	addi	a3, a3, 8
	__src_b	a7, a7, a8
	s32i	a7, a5, 4
	addi	a5, a5, 8
	mov	a6, a8		# keep the last loaded word as carry word
.L12:
	bbci.l	a4, 2, .L13
	# copy 4 bytes
	l32i	a7, a3, 4
	addi	a3, a3, 4
	__src_b	a6, a6, a7
	s32i	a6, a5, 0
	addi	a5, a5, 4
	mov	a6, a7		# keep the last loaded word as carry word
.L13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	add	a3, a3, a11	# readjust a3 with correct misalignment
#endif
	/* the remaining 2 and 1 byte copies use byte loads from the real src */
	bbsi.l	a4, 1, .L14
	bbsi.l	a4, 0, .L15
.Ldone:	abi_ret_default
.L14:
	# copy 2 bytes
	l8ui	a6, a3, 0
	l8ui	a7, a3, 1
	addi	a3, a3, 2
	s8i	a6, a5, 0
	s8i	a7, a5, 1
	addi	a5, a5, 2
	bbsi.l	a4, 0, .L15
	abi_ret_default
.L15:
	# copy 1 byte
	l8ui	a6, a3, 0
	s8i	a6, a5, 0
	abi_ret_default

ENDPROC(__memcpy)
EXPORT_SYMBOL(__memcpy)
EXPORT_SYMBOL(memcpy)

/*
 * void *memmove(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memmove() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If end of source doesn't overlap destination then use memcpy.
 *   Otherwise do memcpy backwards.
 *
 * The backward copy mirrors the memcpy code above: a3 and a5 are first
 * advanced to one past the end of src and dst, and every step
 * decrements the pointers before (or while) copying.
 *
 * Register use:
 *	a0/ return address
 *	a1/ stack pointer
 *	a2/ return value (the original dst; never written by the copy code)
 *	a3/ src
 *	a4/ length
 *	a5/ dst (working copy of a2)
 *	a6/ tmp
 *	a7/ tmp
 *	a8/ tmp
 *	a9/ tmp
 *	a10/ tmp
 *	a11/ tmp
 */

/*
 * Byte by byte copy
 *
 * Copies a4 bytes in descending address order; a3 and a5 point one past
 * the last byte of src and dst on entry.
 */
	.align	4
	.byte	0		# 1 mod 4 alignment for LOOPNEZ
				# (0 mod 4 alignment for LBEG)
.Lbackbytecopy:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lbackbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a4, .Lbackbytecopydone
	sub	a7, a3, a4	# a7 = start address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lbacknextbyte:
	addi	a3, a3, -1
	l8ui	a6, a3, 0
	addi	a5, a5, -1
	s8i	a6, a5, 0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a7, .Lbacknextbyte # continue loop if
				       # $a3:src != $a7:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.Lbackbytecopydone:
	abi_ret_default

/*
 * Destination is unaligned
 *
 * Copies 1 and/or 2 trailing bytes so that the end of dst (a5) becomes
 * word aligned, then rejoins the main path at .Lbackdstaligned.
 */

	.align	4
.Lbackdst1mod2:	# dst is only byte aligned
	_bltui	a4, 7, .Lbackbytecopy	# do short copies byte by byte

	# copy 1 byte
	addi	a3, a3, -1
	l8ui	a6, a3, 0
	addi	a5, a5, -1
	s8i	a6, a5, 0
	addi	a4, a4, -1
	_bbci.l	a5, 1, .Lbackdstaligned	# if dst is now aligned, then
					# return to main algorithm
.Lbackdst2mod4:	# dst 16-bit aligned
	# copy 2 bytes
	_bltui	a4, 6, .Lbackbytecopy	# do short copies byte by byte
	addi	a3, a3, -2
	l8ui	a6, a3, 0
	l8ui	a7, a3, 1
	addi	a5, a5, -2
	s8i	a6, a5, 0
	s8i	a7, a5, 1
	addi	a4, a4, -2
	j	.Lbackdstaligned	# dst is now aligned,
					# return to main algorithm

ENTRY(__memmove)
WEAK(memmove)

	abi_entry_default
	# a2/ dst, a3/ src, a4/ len
	mov	a5, a2		# copy dst so that a2 is return value
.Lmovecommon:
	/*
	 * a6 = dst - src.  When that difference, compared as unsigned, is
	 * at least len, the ascending copy at .Lcommon (in __memcpy) is
	 * used.  This also covers len == 0.  .Lcommon tests a2, which
	 * equals a5 here.
	 */
	sub	a6, a5, a3
	bgeu	a6, a4, .Lcommon

	/* otherwise copy backwards: point a5/a3 one past the end */
	add	a5, a5, a4
	add	a3, a3, a4

	_bbsi.l	a5, 0, .Lbackdst1mod2	# if dst is 1 mod 2
	_bbsi.l	a5, 1, .Lbackdst2mod4	# if dst is 2 mod 4
.Lbackdstaligned:	# return here from .Lbackdst?mod? once dst is aligned
	srli	a7, a4, 4	# number of loop iterations with 16B
				# per iteration
	movi	a8, 3		# if source is not aligned,
	_bany	a3, a8, .Lbacksrcunaligned	# then use shifting copy
	/*
	 * Destination and source are word-aligned, use word copy.
	 */
	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .LbackLoop1done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .LbackLoop1done
	slli	a8, a7, 4
	sub	a8, a3, a8	# a8 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.LbackLoop1:
	addi	a3, a3, -16
	l32i	a7, a3, 12
	l32i	a6, a3, 8
	addi	a5, a5, -16
	s32i	a7, a5, 12
	l32i	a7, a3, 4
	s32i	a6, a5, 8
	l32i	a6, a3, 0
	s32i	a7, a5, 4
	s32i	a6, a5, 0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a8, .LbackLoop1  # continue loop if a3:src != a8:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.LbackLoop1done:
	/* tail: bits 3..0 of a4 select the 8, 4, 2 and 1 byte copies */
	bbci.l	a4, 3, .Lback2
	# copy 8 bytes
	addi	a3, a3, -8
	l32i	a6, a3, 0
	l32i	a7, a3, 4
	addi	a5, a5, -8
	s32i	a6, a5, 0
	s32i	a7, a5, 4
.Lback2:
	bbsi.l	a4, 2, .Lback3
	bbsi.l	a4, 1, .Lback4
	bbsi.l	a4, 0, .Lback5
	abi_ret_default
.Lback3:
	# copy 4 bytes
	addi	a3, a3, -4
	l32i	a6, a3, 0
	addi	a5, a5, -4
	s32i	a6, a5, 0
	bbsi.l	a4, 1, .Lback4
	bbsi.l	a4, 0, .Lback5
	abi_ret_default
.Lback4:
	# copy 2 bytes
	addi	a3, a3, -2
	l16ui	a6, a3, 0
	addi	a5, a5, -2
	s16i	a6, a5, 0
	bbsi.l	a4, 0, .Lback5
	abi_ret_default
.Lback5:
	# copy 1 byte
	addi	a3, a3, -1
	l8ui	a6, a3, 0
	addi	a5, a5, -1
	s8i	a6, a5, 0
	abi_ret_default

/*
 * Destination is aligned, Source is unaligned
 *
 * Backward counterpart of .Lsrcunaligned: on entry a7 = a4 >> 4 and
 * a8 = 3 (both set at .Lbackdstaligned).  a6 holds the most recently
 * loaded source word; note that the __src_b operand order is swapped
 * relative to the forward copy because the newly loaded word is the
 * lower-addressed one.
 */

	.align	4
.Lbacksrcunaligned:
	_beqz	a4, .Lbackdone	# avoid loading anything for zero-length copies
	# copy 16 bytes per iteration for word-aligned dst and unaligned src
	__ssa8	a3		# set shift amount from byte offset
/* same value as the definition in the memcpy code above */
#define SIM_CHECKS_ALIGNMENT	1	/* set to 1 when running on ISS with
					 * the lint or ferret client, or 0
					 * to save a few cycles */
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	/* a8 still holds the mask 3 loaded at .Lbackdstaligned */
	and	a11, a3, a8	# save unalignment offset for below
	sub	a3, a3, a11	# align a3
#endif
	l32i	a6, a3, 0	# load first word
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .LbackLoop2done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .LbackLoop2done
	slli	a10, a7, 4
	sub	a10, a3, a10	# a10 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.LbackLoop2:
	addi	a3, a3, -16
	l32i	a7, a3, 12
	l32i	a8, a3, 8
	addi	a5, a5, -16
	__src_b	a6, a7, a6
	s32i	a6, a5, 12
	l32i	a9, a3, 4
	__src_b	a7, a8, a7
	s32i	a7, a5, 8
	l32i	a6, a3, 0	# becomes the carry word of the next step
	__src_b	a8, a9, a8
	s32i	a8, a5, 4
	__src_b	a9, a6, a9
	s32i	a9, a5, 0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a10, .LbackLoop2 # continue loop if a3:src != a10:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.LbackLoop2done:
	bbci.l	a4, 3, .Lback12
	# copy 8 bytes
	addi	a3, a3, -8
	l32i	a7, a3, 4
	l32i	a8, a3, 0
	addi	a5, a5, -8
	__src_b	a6, a7, a6
	s32i	a6, a5, 4
	__src_b	a7, a8, a7
	s32i	a7, a5, 0
	mov	a6, a8		# keep the last loaded word as carry word
.Lback12:
	bbci.l	a4, 2, .Lback13
	# copy 4 bytes
	addi	a3, a3, -4
	l32i	a7, a3, 0
	addi	a5, a5, -4
	__src_b	a6, a7, a6
	s32i	a6, a5, 0
	mov	a6, a7		# keep the last loaded word as carry word
.Lback13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	add	a3, a3, a11	# readjust a3 with correct misalignment
#endif
	/* the remaining 2 and 1 byte copies use byte loads from the real src */
	bbsi.l	a4, 1, .Lback14
	bbsi.l	a4, 0, .Lback15
.Lbackdone:
	abi_ret_default
.Lback14:
	# copy 2 bytes
	addi	a3, a3, -2
	l8ui	a6, a3, 0
	l8ui	a7, a3, 1
	addi	a5, a5, -2
	s8i	a6, a5, 0
	s8i	a7, a5, 1
	bbsi.l	a4, 0, .Lback15
	abi_ret_default
.Lback15:
	# copy 1 byte
	addi	a3, a3, -1
	addi	a5, a5, -1
	l8ui	a6, a3, 0
	s8i	a6, a5, 0
	abi_ret_default

ENDPROC(__memmove)
EXPORT_SYMBOL(__memmove)
EXPORT_SYMBOL(memmove)