/*
 * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
 * Copyright (C) 2008-2009 PetaLogix
 * Copyright (C) 2008 Jim Law - Iris LP  All rights reserved.
 *
 * This file is subject to the terms and conditions of the GNU General
 * Public License. See the file COPYING in the main directory of this
 * archive for more details.
 *
 * Written by Jim Law <jlaw@irispower.com>
 *
 * intended to replace:
 *	memcpy in memcpy.c and
 *	memmove in memmove.c
 * ... in arch/microblaze/lib
 *
 *
 * assly_fastcopy.S
 *
 * Attempt at quicker memcpy and memmove for MicroBlaze
 *	Input : Operand1 in Reg r5 - destination address
 *		Operand2 in Reg r6 - source address
 *		Operand3 in Reg r7 - number of bytes to transfer
 *	Output: Result in Reg r3 - starting destination address
 *
 *
 * Explanation:
 *	Perform (possibly unaligned) copy of a block of memory
 *	between mem locations with size of xfer spec'd in bytes
 */

#ifdef __MICROBLAZEEL__
#error MicroBlaze LE does not support the ASM-optimized lib functions. Disable OPT_LIB_ASM.
#endif

#include <linux/linkage.h>
	.text
	.globl	memcpy
	.type	memcpy, @function
	.ent	memcpy

memcpy:
fast_memcpy_ascending:
	/* move d to return register as value of function */
	addi	r3, r5, 0

	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n (unsigned) */
	blti	r4, a_xfer_end	/* if n < 0, less than one word to transfer */

	/* transfer first 0~3 bytes to get aligned dest address */
	andi	r4, r5, 3	/* n = d & 3 */
	/* if zero, destination already aligned */
	beqi	r4, a_dalign_done
	/* n = 4 - n (yields 3, 2, 1 transfers for 1, 2, 3 addr offset) */
	rsubi	r4, r4, 4
	rsub	r7, r4, r7	/* c = c - n adjust c */

a_xfer_first_loop:
	/* if no bytes left to transfer, transfer the bulk */
	beqi	r4, a_dalign_done
	lbui	r11, r6, 0		/* h = *s */
	sbi	r11, r5, 0		/* *d = h */
	addi	r6, r6, 1		/* s++ */
	addi	r5, r5, 1		/* d++ */
	brid	a_xfer_first_loop	/* loop */
	addi	r4, r4, -1		/* n-- (IN DELAY SLOT) */

a_dalign_done:
	addi	r4, r0, 32	/* n = 32 */
	cmpu	r4, r4, r7	/* n = c - n (unsigned) */
	/* if n < 0, less than one block to transfer */
	blti	r4, a_block_done

a_block_xfer:
	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, a_block_unaligned

a_block_aligned:
	lwi	r9, r6, 0	/* t1 = *(s + 0) */
	lwi	r10, r6, 4	/* t2 = *(s + 4) */
	lwi	r11, r6, 8	/* t3 = *(s + 8) */
	lwi	r12, r6, 12	/* t4 = *(s + 12) */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	swi	r10, r5, 4	/* *(d + 4) = t2 */
	swi	r11, r5, 8	/* *(d + 8) = t3 */
	swi	r12, r5, 12	/* *(d + 12) = t4 */
	lwi	r9, r6, 16	/* t1 = *(s + 16) */
	lwi	r10, r6, 20	/* t2 = *(s + 20) */
	lwi	r11, r6, 24	/* t3 = *(s + 24) */
	lwi	r12, r6, 28	/* t4 = *(s + 28) */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	swi	r10, r5, 20	/* *(d + 20) = t2 */
	swi	r11, r5, 24	/* *(d + 24) = t3 */
	swi	r12, r5, 28	/* *(d + 28) = t4 */
	addi	r6, r6, 32	/* s = s + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_block_aligned	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

a_block_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	add	r6, r6, r4		/* s = s + n */
	lwi	r11, r8, 0		/* h = *(as + 0) */

	addi	r9, r9, -1
	beqi	r9, a_block_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, a_block_u2		/* t1 was 2 => 2 byte offset */
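/*
 * The three unaligned block loops below (a_block_u3/u1/u2) share one
 * technique: read the source word-aligned, keep the leftover bytes of
 * the previous word in h, and OR them onto the top of the next word.
 * One a_bu3_loop step looks roughly like this (big-endian; an
 * illustrative C sketch only, not part of the build):
 *
 *	h = as[0] << 24;	// carry the low byte of the prior word
 *	v = as[1];		// next aligned source word
 *	*d = h | (v >> 8);	// splice 1 carried byte + 3 new bytes
 *	h = v << 24;		// carried into the next store
 *
 * a_block_u1 and a_block_u2 do the same with shift pairs 8/24 and
 * 16/16 for source offsets of 1 and 2 bytes.
 */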
a_block_u3:
	bslli	r11, r11, 24	/* h = h << 24 */
a_bu3_loop:
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 32	/* v = *(as + 32) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	addi	r8, r8, 32	/* as = as + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_bu3_loop	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

a_block_u1:
	bslli	r11, r11, 8	/* h = h << 8 */
a_bu1_loop:
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 32	/* v = *(as + 32) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	addi	r8, r8, 32	/* as = as + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_bu1_loop	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done
a_block_u2:
	bslli	r11, r11, 16	/* h = h << 16 */
a_bu2_loop:
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 32	/* v = *(as + 32) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	addi	r8, r8, 32	/* as = as + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_bu2_loop	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */

a_block_done:
	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n (unsigned) */
	blti	r4, a_xfer_end	/* if n < 0, less than one word to transfer */

a_word_xfer:
	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
	addi	r10, r0, 0		/* offset = 0 */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, a_word_unaligned

a_word_aligned:
	lw	r9, r6, r10		/* t1 = *(s + offset) */
	sw	r9, r5, r10		/* *(d + offset) = t1 */
	addi	r4, r4, -4		/* n = n - 4 */
	bneid	r4, a_word_aligned	/* loop */
	addi	r10, r10, 4		/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	lwi	r11, r8, 0		/* h = *(as + 0) */
	addi	r8, r8, 4		/* as = as + 4 */

	addi	r9, r9, -1
	beqi	r9, a_word_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, a_word_u2		/* t1 was 2 => 2 byte offset */

a_word_u3:
	bslli	r11, r11, 24	/* h = h << 24 */
a_wu3_loop:
	lw	r12, r8, r10	/* v = *(as + offset) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r10	/* *(d + offset) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	addi	r4, r4, -4	/* n = n - 4 */
	bneid	r4, a_wu3_loop	/* while (n) loop */
	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_u1:
	bslli	r11, r11, 8	/* h = h << 8 */
a_wu1_loop:
	lw	r12, r8, r10	/* v = *(as + offset) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r10	/* *(d + offset) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	addi	r4, r4, -4	/* n = n - 4 */
	bneid	r4, a_wu1_loop	/* while (n) loop */
	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_u2:
	bslli	r11, r11, 16	/* h = h << 16 */
a_wu2_loop:
	lw	r12, r8, r10	/* v = *(as + offset) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r10	/* *(d + offset) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	addi	r4, r4, -4	/* n = n - 4 */
	bneid	r4, a_wu2_loop	/* while (n) loop */
	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */

a_word_done:
	add	r5, r5, r10	/* d = d + offset */
	add	r6, r6, r10	/* s = s + offset */
	rsub	r7, r10, r7	/* c = c - offset */

a_xfer_end:
a_xfer_end_loop:
	beqi	r7, a_done	/* while (c) */
	lbui	r9, r6, 0	/* t1 = *s */
	addi	r6, r6, 1	/* s++ */
	sbi	r9, r5, 0	/* *d = t1 */
	addi	r7, r7, -1	/* c-- */
	brid	a_xfer_end_loop	/* loop */
	addi	r5, r5, 1	/* d++ (IN DELAY SLOT) */

a_done:
	rtsd	r15, 8
	nop

.size memcpy, . - memcpy
.end memcpy
/*----------------------------------------------------------------------------*/
	.globl	memmove
	.type	memmove, @function
	.ent	memmove

memmove:
	cmpu	r4, r5, r6	/* n = s - d */
	bgei	r4, fast_memcpy_ascending
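/*
 * The dispatch above chooses the copy direction. An illustrative C
 * sketch (not part of the build):
 *
 *	void *memmove(void *d, const void *s, size_t c)
 *	{
 *		if (s >= d)			// ascending copy is safe
 *			return memcpy(d, s, c);
 *		// otherwise copy descending, last byte first, so each
 *		// source byte is read before it can be overwritten
 *	}
 */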
fast_memcpy_descending:
	/* move d to return register as value of function */
	addi	r3, r5, 0

	add	r5, r5, r7	/* d = d + c */
	add	r6, r6, r7	/* s = s + c */

	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n (unsigned) */
	blti	r4, d_xfer_end	/* if n < 0, less than one word to transfer */

	/* transfer first 0~3 bytes to get aligned dest address */
	andi	r4, r5, 3	/* n = d & 3 */
	/* if zero, destination already aligned */
	beqi	r4, d_dalign_done
	rsub	r7, r4, r7	/* c = c - n adjust c */

d_xfer_first_loop:
	/* if no bytes left to transfer, transfer the bulk */
	beqi	r4, d_dalign_done
	addi	r6, r6, -1		/* s-- */
	addi	r5, r5, -1		/* d-- */
	lbui	r11, r6, 0		/* h = *s */
	sbi	r11, r5, 0		/* *d = h */
	brid	d_xfer_first_loop	/* loop */
	addi	r4, r4, -1		/* n-- (IN DELAY SLOT) */

d_dalign_done:
	addi	r4, r0, 32	/* n = 32 */
	cmpu	r4, r4, r7	/* n = c - n (unsigned) */
	/* if n < 0, less than one block to transfer */
	blti	r4, d_block_done

d_block_xfer:
	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, d_block_unaligned

d_block_aligned:
	addi	r6, r6, -32	/* s = s - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r9, r6, 28	/* t1 = *(s + 28) */
	lwi	r10, r6, 24	/* t2 = *(s + 24) */
	lwi	r11, r6, 20	/* t3 = *(s + 20) */
	lwi	r12, r6, 16	/* t4 = *(s + 16) */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	swi	r10, r5, 24	/* *(d + 24) = t2 */
	swi	r11, r5, 20	/* *(d + 20) = t3 */
	swi	r12, r5, 16	/* *(d + 16) = t4 */
	lwi	r9, r6, 12	/* t1 = *(s + 12) */
	lwi	r10, r6, 8	/* t2 = *(s + 8) */
	lwi	r11, r6, 4	/* t3 = *(s + 4) */
	lwi	r12, r6, 0	/* t4 = *(s + 0) */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	swi	r10, r5, 8	/* *(d + 8) = t2 */
	swi	r11, r5, 4	/* *(d + 4) = t3 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_block_aligned	/* while (n) loop */
	swi	r12, r5, 0	/* *(d + 0) = t4 (IN DELAY SLOT) */
	bri	d_block_done
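/*
 * The descending unaligned loops below mirror the ascending ones:
 * h now keeps the leftover high bytes of the word just consumed and
 * is ORed onto the low end of the next lower-addressed word. One
 * d_bu3_loop step, roughly (big-endian; an illustrative C sketch
 * only, not part of the build):
 *
 *	h = as[0] >> 8;		// carry the high 3 bytes
 *	v = as[-1];		// next lower aligned source word
 *	*--d = (v << 24) | h;	// 1 new byte + 3 carried bytes
 *	h = v >> 8;		// carried into the next store
 */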
d_block_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	rsub	r6, r4, r6		/* s = s - n */
	lwi	r11, r8, 0		/* h = *(as + 0) */

	addi	r9, r9, -1
	beqi	r9, d_block_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, d_block_u2		/* t1 was 2 => 2 byte offset */

d_block_u3:
	bsrli	r11, r11, 8	/* h = h >> 8 */
d_bu3_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu3_loop	/* while (n) loop */
	bsrli	r11, r12, 8	/* h = v >> 8 (IN DELAY SLOT) */
	bri	d_block_done

d_block_u1:
	bsrli	r11, r11, 24	/* h = h >> 24 */
d_bu1_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu1_loop	/* while (n) loop */
	bsrli	r11, r12, 24	/* h = v >> 24 (IN DELAY SLOT) */
	bri	d_block_done
d_block_u2:
	bsrli	r11, r11, 16	/* h = h >> 16 */
d_bu2_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu2_loop	/* while (n) loop */
	bsrli	r11, r12, 16	/* h = v >> 16 (IN DELAY SLOT) */

d_block_done:
	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n (unsigned) */
	blti	r4, d_xfer_end	/* if n < 0, less than one word to transfer */

d_word_xfer:
	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
	rsub	r5, r4, r5		/* d = d - n */
	rsub	r6, r4, r6		/* s = s - n */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, d_word_unaligned

d_word_aligned:
	addi	r4, r4, -4		/* n = n - 4 */
	lw	r9, r6, r4		/* t1 = *(s + n) */
	bneid	r4, d_word_aligned	/* loop */
	sw	r9, r5, r4		/* *(d + n) = t1 (IN DELAY SLOT) */

	bri	d_word_done

d_word_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	lw	r11, r8, r4		/* h = *(as + n) */

	addi	r9, r9, -1
	beqi	r9, d_word_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, d_word_u2		/* t1 was 2 => 2 byte offset */

d_word_u3:
	bsrli	r11, r11, 8	/* h = h >> 8 */
d_wu3_loop:
	addi	r4, r4, -4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu3_loop	/* while (n) loop */
	bsrli	r11, r12, 8	/* h = v >> 8 (IN DELAY SLOT) */

	bri	d_word_done
d_word_u1:
	bsrli	r11, r11, 24	/* h = h >> 24 */
d_wu1_loop:
	addi	r4, r4, -4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu1_loop	/* while (n) loop */
	bsrli	r11, r12, 24	/* h = v >> 24 (IN DELAY SLOT) */

	bri	d_word_done

d_word_u2:
	bsrli	r11, r11, 16	/* h = h >> 16 */
d_wu2_loop:
	addi	r4, r4, -4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu2_loop	/* while (n) loop */
	bsrli	r11, r12, 16	/* h = v >> 16 (IN DELAY SLOT) */

d_word_done:

d_xfer_end:
d_xfer_end_loop:
	beqi	r7, d_done	/* while (c) */
	addi	r6, r6, -1	/* s-- */
	lbui	r9, r6, 0	/* t1 = *s */
	addi	r5, r5, -1	/* d-- */
	sbi	r9, r5, 0	/* *d = t1 */
	brid	d_xfer_end_loop	/* loop */
	addi	r7, r7, -1	/* c-- (IN DELAY SLOT) */

d_done:
	rtsd	r15, 8
	nop

.size memmove, . - memmove
.end memmove
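/*
 * For reference, both routines stage the work the same way; an
 * illustrative sketch of the ascending direction (pseudo-code, not
 * part of the build):
 *
 *	copy single bytes until d is word-aligned;
 *	while (c >= 32)
 *		copy a 32-byte block with word loads/stores,
 *		shift-merging when s is misaligned;
 *	while (c >= 4)
 *		copy one word;
 *	while (c)
 *		copy one byte;
 */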