/*
 * arch/xtensa/lib/hal/memcopy.S -- Core HAL library functions
 * xthal_memcpy and xthal_bcopy
 *
 * Entry points defined in this file: __memcpy (with weak alias memcpy),
 * bcopy, and __memmove (with weak alias memmove).
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Copyright (C) 2002 - 2012 Tensilica Inc.
 */

#include <linux/linkage.h>
#include <asm/asmmacro.h>
#include <asm/core.h>

/*
 * SIM_CHECKS_ALIGNMENT: set to 1 when running on ISS (simulator) with the
 * lint or ferret client, or 0 to save a few cycles.
 *
 * When this or XCHAL_UNALIGNED_LOAD_EXCEPTION is nonzero, the
 * unaligned-source paths below round the source pointer down to a word
 * boundary before the l32i loads and add the byte offset back before the
 * final 2/1-byte copies.
 *
 * It is defined exactly once, here, so that the memcpy and memmove paths
 * always use the same setting.  The #ifndef lets a build override the
 * default without editing this file.
 */
#ifndef SIM_CHECKS_ALIGNMENT
#define SIM_CHECKS_ALIGNMENT	1
#endif

/*
 * void *memcpy(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memcpy() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If destination is unaligned, align it by conditionally
 *     copying 1 and 2 bytes.
 *   If source is aligned,
 *     do 16 bytes with a loop, and then finish up with
 *     8, 4, 2, and 1 byte copies conditional on the length;
 *   else (if source is unaligned),
 *     do the same, but use SRC to align the source data
 *     (through the __ssa8 and __src_b macros).
 *   This code tries to use fall-through branches for the common
 *     case of aligned source and destination and multiple
 *     of 4 (or 8) length.
 *
 * The helper fragments .Lbytecopy, .Ldst1mod2 and .Ldst2mod4 are placed
 * before the entry point.  They are reached only by branches from the
 * main body, which memmove and bcopy also enter through .Lcommon.
 *
 * The return value is dst: a2 is never written after entry, and a5 is
 * used as the working destination pointer instead.
 *
 * Register use:
 *	a0/ return address
 *	a1/ stack pointer
 *	a2/ return value
 *	a3/ src
 *	a4/ length
 *	a5/ dst
 *	a6/ tmp
 *	a7/ tmp
 *	a8/ tmp
 *	a9/ tmp
 *	a10/ tmp
 *	a11/ tmp
 */

	.text

/*
 * Byte by byte copy
 *
 * In: a3 = src, a4 = number of bytes (may be zero), a5 = dst.
 * Copies forward one byte at a time and returns straight to the caller
 * of the entry point.
 */
	.align	4
	.byte	0		# 1 mod 4 alignment for LOOPNEZ
				# (0 mod 4 alignment for LBEG)
.Lbytecopy:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a4, .Lbytecopydone
	add	a7, a3, a4	# a7 = end address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lnextbyte:
	l8ui	a6, a3, 0
	addi	a3, a3, 1
	s8i	a6, a5, 0
	addi	a5, a5, 1
#if !XCHAL_HAVE_LOOPS
	bne	a3, a7, .Lnextbyte # continue loop if $a3:src != $a7:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Lbytecopydone:
	retw

/*
 * Destination is unaligned
 *
 * Copy 1 and/or 2 leading bytes so that a5 becomes word-aligned, keeping
 * a3 and a4 in step, then rejoin the main algorithm at .Ldstaligned.
 * Copies shorter than the thresholds below (7 and 6 bytes) are handed to
 * .Lbytecopy instead.
 * NOTE(review): the thresholds look chosen so that at least one full word
 * remains after the alignment bytes are consumed (7 - 1 - 2 = 4,
 * 6 - 2 = 4) -- confirm.
 */

	.align	4
.Ldst1mod2:	# dst is only byte aligned
	_bltui	a4, 7, .Lbytecopy	# do short copies byte by byte

	# copy 1 byte
	l8ui	a6, a3,  0
	addi	a3, a3,  1
	addi	a4, a4, -1
	s8i	a6, a5,  0
	addi	a5, a5,  1
	_bbci.l	a5, 1, .Ldstaligned	# if dst is now aligned, then
					# return to main algorithm
.Ldst2mod4:	# dst 16-bit aligned
	# copy 2 bytes
	_bltui	a4, 6, .Lbytecopy	# do short copies byte by byte
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a3, a3,  2
	addi	a4, a4, -2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a5, a5,  2
	j	.Ldstaligned	# dst is now aligned, return to main algorithm

ENTRY(__memcpy)
WEAK(memcpy)

	entry	sp, 16		# minimal stack frame
	# a2/ dst, a3/ src, a4/ len
	mov	a5, a2		# copy dst so that a2 is return value
	/*
	 * .Lcommon is also the forward-copy entry used by memmove and bcopy.
	 * On arrival a2 == a5 == dst, a3 == src and a4 == len.
	 */
.Lcommon:
	_bbsi.l	a2, 0, .Ldst1mod2	# if dst is 1 mod 2
	_bbsi.l	a2, 1, .Ldst2mod4	# if dst is 2 mod 4
.Ldstaligned:	# return here from .Ldst?mod? once dst is aligned
	srli	a7, a4, 4	# number of loop iterations with 16B
				# per iteration
	movi	a8, 3		# if source is not aligned,
	_bany	a3, a8, .Lsrcunaligned	# then use shifting copy
	/*
	 * Destination and source are word-aligned, use word copy.
	 */
	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop1done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop1done
	slli	a8, a7, 4
	add	a8, a8, a3	# a8 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
	/*
	 * a6 and a7 alternate as the data register so that each store is
	 * separated from the load that produced its data.
	 */
.Loop1:
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	s32i	a6, a5,  0
	l32i	a6, a3,  8
	s32i	a7, a5,  4
	l32i	a7, a3, 12
	s32i	a6, a5,  8
	addi	a3, a3, 16
	s32i	a7, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bne	a3, a8, .Loop1  # continue loop if a3:src != a8:src_end
#endif /* !XCHAL_HAVE_LOOPS */
	/*
	 * Fewer than 16 bytes remain.  Bits 3..0 of the length in a4 select
	 * the 8, 4, 2 and 1 byte copies that finish the job.
	 */
.Loop1done:
	bbci.l	a4, 3, .L2
	# copy 8 bytes
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	addi	a3, a3,  8
	s32i	a6, a5,  0
	s32i	a7, a5,  4
	addi	a5, a5,  8
.L2:
	bbsi.l	a4, 2, .L3
	bbsi.l	a4, 1, .L4
	bbsi.l	a4, 0, .L5
	retw
.L3:
	# copy 4 bytes
	l32i	a6, a3,  0
	addi	a3, a3,  4
	s32i	a6, a5,  0
	addi	a5, a5,  4
	bbsi.l	a4, 1, .L4
	bbsi.l	a4, 0, .L5
	retw
.L4:
	# copy 2 bytes
	l16ui	a6, a3,  0
	addi	a3, a3,  2
	s16i	a6, a5,  0
	addi	a5, a5,  2
	bbsi.l	a4, 0, .L5
	retw
.L5:
	# copy 1 byte
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	retw

/*
 * Destination is aligned, Source is unaligned
 *
 * On arrival a7 = len / 16 and a8 = 3 (both set at .Ldstaligned).
 * Throughout the word-sized part, a6 holds the most recently loaded
 * source word that has not been fully stored yet.  Each destination word
 * is built from two neighbouring source words with __src_b, using the
 * shift amount set up by __ssa8.
 */

	.align	4
.Lsrcunaligned:
	_beqz	a4, .Ldone	# avoid loading anything for zero-length copies
	# copy 16 bytes per iteration for word-aligned dst and unaligned src
	__ssa8	a3		# set shift amount from byte offset

#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	and	a11, a3, a8	# save unalignment offset for below
	sub	a3, a3, a11	# align a3
#endif
	l32i	a6, a3, 0	# load first word
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop2done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop2done
	slli	a10, a7, 4
	add	a10, a10, a3	# a10 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2:
	l32i	a7, a3,  4
	l32i	a8, a3,  8
	__src_b	a6, a6, a7
	s32i	a6, a5,  0
	l32i	a9, a3, 12
	__src_b	a7, a7, a8
	s32i	a7, a5,  4
	l32i	a6, a3, 16
	__src_b	a8, a8, a9
	s32i	a8, a5,  8
	addi	a3, a3, 16
	__src_b	a9, a9, a6
	s32i	a9, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bne	a3, a10, .Loop2 # continue loop if a3:src != a10:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2done:
	bbci.l	a4, 3, .L12
	# copy 8 bytes
	l32i	a7, a3,  4
	l32i	a8, a3,  8
	__src_b	a6, a6, a7
	s32i	a6, a5,  0
	addi	a3, a3,  8
	__src_b	a7, a7, a8
	s32i	a7, a5,  4
	addi	a5, a5,  8
	mov	a6, a8		# keep the pending source word in a6
.L12:
	bbci.l	a4, 2, .L13
	# copy 4 bytes
	l32i	a7, a3,  4
	addi	a3, a3,  4
	__src_b	a6, a6, a7
	s32i	a6, a5,  0
	addi	a5, a5,  4
	mov	a6, a7		# keep the pending source word in a6
	/*
	 * The last 2 and 1 bytes are copied with byte loads from the true
	 * source address, so the offset removed above is added back first.
	 */
.L13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	add	a3, a3, a11	# readjust a3 with correct misalignment
#endif
	bbsi.l	a4, 1, .L14
	bbsi.l	a4, 0, .L15
.Ldone:	retw
.L14:
	# copy 2 bytes
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a3, a3,  2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a5, a5,  2
	bbsi.l	a4, 0, .L15
	retw
.L15:
	# copy 1 byte
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	retw

ENDPROC(__memcpy)

/*
 * void bcopy(const void *src, void *dest, size_t n);
 *
 * Same operation as memmove with the first two arguments exchanged.
 * The arguments are rearranged into the register layout expected at
 * .Lmovecommon (a2 = a5 = dst, a3 = src, a4 = len) before jumping there.
 */

ENTRY(bcopy)

	entry	sp, 16		# minimal stack frame
	# a2=src, a3=dst, a4=len
	mov	a5, a3
	mov	a3, a2
	mov	a2, a5
	j	.Lmovecommon	# go to common code for memmove+bcopy

ENDPROC(bcopy)

/*
 * void *memmove(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memmove() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If end of source doesn't overlap destination then use memcpy.
 *   Otherwise do memcpy backwards.
 *
 * The backward copy mirrors the forward one.  a3 and a5 are first moved
 * to one past the end of the source and destination, and every step
 * decrements the pointers before accessing memory.
 *
 * Register use:
 *	a0/ return address
 *	a1/ stack pointer
 *	a2/ return value
 *	a3/ src
 *	a4/ length
 *	a5/ dst
 *	a6/ tmp
 *	a7/ tmp
 *	a8/ tmp
 *	a9/ tmp
 *	a10/ tmp
 *	a11/ tmp
 */

/*
 * Byte by byte copy
 *
 * In: a3 = one past the last source byte, a4 = number of bytes (may be
 * zero), a5 = one past the last destination byte.
 * Copies backward one byte at a time and returns straight to the caller
 * of the entry point.
 */
	.align	4
	.byte	0		# 1 mod 4 alignment for LOOPNEZ
				# (0 mod 4 alignment for LBEG)
.Lbackbytecopy:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lbackbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a4, .Lbackbytecopydone
	sub	a7, a3, a4	# a7 = start address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lbacknextbyte:
	addi	a3, a3, -1
	l8ui	a6, a3, 0
	addi	a5, a5, -1
	s8i	a6, a5, 0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a7, .Lbacknextbyte # continue loop if
				       # $a3:src != $a7:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.Lbackbytecopydone:
	retw

/*
 * Destination is unaligned
 *
 * Copy 1 and/or 2 trailing bytes so that the end-of-destination pointer
 * in a5 becomes word-aligned, then rejoin the main algorithm at
 * .Lbackdstaligned.  Short copies go to .Lbackbytecopy instead.
 */

	.align	4
.Lbackdst1mod2:	# dst is only byte aligned
	_bltui	a4, 7, .Lbackbytecopy	# do short copies byte by byte

	# copy 1 byte
	addi	a3, a3, -1
	l8ui	a6, a3,  0
	addi	a5, a5, -1
	s8i	a6, a5,  0
	addi	a4, a4, -1
	_bbci.l	a5, 1, .Lbackdstaligned	# if dst is now aligned, then
					# return to main algorithm
.Lbackdst2mod4:	# dst 16-bit aligned
	# copy 2 bytes
	_bltui	a4, 6, .Lbackbytecopy	# do short copies byte by byte
	addi	a3, a3, -2
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a5, a5, -2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a4, a4, -2
	j	.Lbackdstaligned	# dst is now aligned,
					# return to main algorithm

ENTRY(__memmove)
WEAK(memmove)

	entry	sp, 16		# minimal stack frame
	# a2/ dst, a3/ src, a4/ len
	mov	a5, a2		# copy dst so that a2 is return value
	/*
	 * a6 = dst - src as an unsigned value.  If that distance is at least
	 * len, a forward copy never overwrites source bytes it has yet to
	 * read, so the memcpy code at .Lcommon is used.  This includes every
	 * case where dst is below src and the case len == 0.
	 */
.Lmovecommon:
	sub	a6, a5, a3
	bgeu	a6, a4, .Lcommon

	/* Overlap with dst above src: copy backward from the ends. */
	add	a5, a5, a4
	add	a3, a3, a4

	_bbsi.l	a5, 0, .Lbackdst1mod2	# if dst is 1 mod 2
	_bbsi.l	a5, 1, .Lbackdst2mod4	# if dst is 2 mod 4
.Lbackdstaligned:	# return here from .Lbackdst?mod? once dst is aligned
	srli	a7, a4, 4	# number of loop iterations with 16B
				# per iteration
	movi	a8, 3		# if source is not aligned,
	_bany	a3, a8, .Lbacksrcunaligned	# then use shifting copy
	/*
	 * Destination and source are word-aligned, use word copy.
	 */
	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .backLoop1done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .backLoop1done
	slli	a8, a7, 4
	sub	a8, a3, a8	# a8 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop1:
	addi	a3, a3, -16
	l32i	a7, a3, 12
	l32i	a6, a3,  8
	addi	a5, a5, -16
	s32i	a7, a5, 12
	l32i	a7, a3,  4
	s32i	a6, a5,  8
	l32i	a6, a3,  0
	s32i	a7, a5,  4
	s32i	a6, a5,  0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a8, .backLoop1  # continue loop if a3:src != a8:src_start
#endif /* !XCHAL_HAVE_LOOPS */
	/*
	 * Fewer than 16 bytes remain.  Bits 3..0 of the length in a4 select
	 * the 8, 4, 2 and 1 byte copies that finish the job.
	 */
.backLoop1done:
	bbci.l	a4, 3, .Lback2
	# copy 8 bytes
	addi	a3, a3, -8
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	addi	a5, a5, -8
	s32i	a6, a5,  0
	s32i	a7, a5,  4
.Lback2:
	bbsi.l	a4, 2, .Lback3
	bbsi.l	a4, 1, .Lback4
	bbsi.l	a4, 0, .Lback5
	retw
.Lback3:
	# copy 4 bytes
	addi	a3, a3, -4
	l32i	a6, a3,  0
	addi	a5, a5, -4
	s32i	a6, a5,  0
	bbsi.l	a4, 1, .Lback4
	bbsi.l	a4, 0, .Lback5
	retw
.Lback4:
	# copy 2 bytes
	addi	a3, a3, -2
	l16ui	a6, a3,  0
	addi	a5, a5, -2
	s16i	a6, a5,  0
	bbsi.l	a4, 0, .Lback5
	retw
.Lback5:
	# copy 1 byte
	addi	a3, a3, -1
	l8ui	a6, a3,  0
	addi	a5, a5, -1
	s8i	a6, a5,  0
	retw

/*
 * Destination is aligned, Source is unaligned
 *
 * On arrival a7 = len / 16 and a8 = 3 (both set at .Lbackdstaligned).
 * Throughout the word-sized part, a6 holds the most recently loaded
 * source word that has not been fully stored yet.  Each destination word
 * is built from two neighbouring source words with __src_b, using the
 * shift amount set up by __ssa8.
 */

	.align	4
.Lbacksrcunaligned:
	_beqz	a4, .Lbackdone	# avoid loading anything for zero-length copies
	# copy 16 bytes per iteration for word-aligned dst and unaligned src
	__ssa8	a3		# set shift amount from byte offset
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	and	a11, a3, a8	# save unalignment offset for below
	sub	a3, a3, a11	# align a3
#endif
	l32i	a6, a3, 0	# load first word
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .backLoop2done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .backLoop2done
	slli	a10, a7, 4
	sub	a10, a3, a10	# a10 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop2:
	addi	a3, a3, -16
	l32i	a7, a3, 12
	l32i	a8, a3,  8
	addi	a5, a5, -16
	__src_b	a6, a7, a6
	s32i	a6, a5, 12
	l32i	a9, a3,  4
	__src_b	a7, a8, a7
	s32i	a7, a5,  8
	l32i	a6, a3,  0
	__src_b	a8, a9, a8
	s32i	a8, a5,  4
	__src_b	a9, a6, a9
	s32i	a9, a5,  0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a10, .backLoop2 # continue loop if a3:src != a10:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop2done:
	bbci.l	a4, 3, .Lback12
	# copy 8 bytes
	addi	a3, a3, -8
	l32i	a7, a3,  4
	l32i	a8, a3,  0
	addi	a5, a5, -8
	__src_b	a6, a7, a6
	s32i	a6, a5,  4
	__src_b	a7, a8, a7
	s32i	a7, a5,  0
	mov	a6, a8		# keep the pending source word in a6
.Lback12:
	bbci.l	a4, 2, .Lback13
	# copy 4 bytes
	addi	a3, a3, -4
	l32i	a7, a3,  0
	addi	a5, a5, -4
	__src_b	a6, a7, a6
	s32i	a6, a5,  0
	mov	a6, a7		# keep the pending source word in a6
	/*
	 * The last 2 and 1 bytes are copied with byte loads from the true
	 * source address, so the offset removed above is added back first.
	 */
.Lback13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	add	a3, a3, a11	# readjust a3 with correct misalignment
#endif
	bbsi.l	a4, 1, .Lback14
	bbsi.l	a4, 0, .Lback15
.Lbackdone:
	retw
.Lback14:
	# copy 2 bytes
	addi	a3, a3, -2
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a5, a5, -2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	bbsi.l	a4, 0, .Lback15
	retw
.Lback15:
	# copy 1 byte
	addi	a3, a3, -1
	addi	a5, a5, -1
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	retw

ENDPROC(__memmove)