1/* 2 * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu> 3 * Copyright (C) 2008-2009 PetaLogix 4 * Copyright (C) 2008 Jim Law - Iris LP All rights reserved. 5 * 6 * This file is subject to the terms and conditions of the GNU General 7 * Public License. See the file COPYING in the main directory of this 8 * archive for more details. 9 * 10 * Written by Jim Law <jlaw@irispower.com> 11 * 12 * intended to replace: 13 * memcpy in memcpy.c and 14 * memmove in memmove.c 15 * ... in arch/microblaze/lib 16 * 17 * 18 * assly_fastcopy.S 19 * 20 * Attempt at quicker memcpy and memmove for MicroBlaze 21 * Input : Operand1 in Reg r5 - destination address 22 * Operand2 in Reg r6 - source address 23 * Operand3 in Reg r7 - number of bytes to transfer 24 * Output: Result in Reg r3 - starting destinaition address 25 * 26 * 27 * Explanation: 28 * Perform (possibly unaligned) copy of a block of memory 29 * between mem locations with size of xfer spec'd in bytes 30 */ 31 32#include <linux/linkage.h> 33 34 .globl memcpy 35 .ent memcpy 36 37memcpy: 38fast_memcpy_ascending: 39 /* move d to return register as value of function */ 40 addi r3, r5, 0 41 42 addi r4, r0, 4 /* n = 4 */ 43 cmpu r4, r4, r7 /* n = c - n (unsigned) */ 44 blti r4, a_xfer_end /* if n < 0, less than one word to transfer */ 45 46 /* transfer first 0~3 bytes to get aligned dest address */ 47 andi r4, r5, 3 /* n = d & 3 */ 48 /* if zero, destination already aligned */ 49 beqi r4, a_dalign_done 50 /* n = 4 - n (yields 3, 2, 1 transfers for 1, 2, 3 addr offset) */ 51 rsubi r4, r4, 4 52 rsub r7, r4, r7 /* c = c - n adjust c */ 53 54a_xfer_first_loop: 55 /* if no bytes left to transfer, transfer the bulk */ 56 beqi r4, a_dalign_done 57 lbui r11, r6, 0 /* h = *s */ 58 sbi r11, r5, 0 /* *d = h */ 59 addi r6, r6, 1 /* s++ */ 60 addi r5, r5, 1 /* d++ */ 61 brid a_xfer_first_loop /* loop */ 62 addi r4, r4, -1 /* n-- (IN DELAY SLOT) */ 63 64a_dalign_done: 65 addi r4, r0, 32 /* n = 32 */ 66 cmpu r4, r4, r7 /* n = c - n (unsigned) */ 67 /* if n < 0, less than one block to transfer */ 68 blti r4, a_block_done 69 70a_block_xfer: 71 andi r4, r7, 0xffffffe0 /* n = c & ~31 */ 72 rsub r7, r4, r7 /* c = c - n */ 73 74 andi r9, r6, 3 /* t1 = s & 3 */ 75 /* if temp != 0, unaligned transfers needed */ 76 bnei r9, a_block_unaligned 77 78a_block_aligned: 79 lwi r9, r6, 0 /* t1 = *(s + 0) */ 80 lwi r10, r6, 4 /* t2 = *(s + 4) */ 81 lwi r11, r6, 8 /* t3 = *(s + 8) */ 82 lwi r12, r6, 12 /* t4 = *(s + 12) */ 83 swi r9, r5, 0 /* *(d + 0) = t1 */ 84 swi r10, r5, 4 /* *(d + 4) = t2 */ 85 swi r11, r5, 8 /* *(d + 8) = t3 */ 86 swi r12, r5, 12 /* *(d + 12) = t4 */ 87 lwi r9, r6, 16 /* t1 = *(s + 16) */ 88 lwi r10, r6, 20 /* t2 = *(s + 20) */ 89 lwi r11, r6, 24 /* t3 = *(s + 24) */ 90 lwi r12, r6, 28 /* t4 = *(s + 28) */ 91 swi r9, r5, 16 /* *(d + 16) = t1 */ 92 swi r10, r5, 20 /* *(d + 20) = t2 */ 93 swi r11, r5, 24 /* *(d + 24) = t3 */ 94 swi r12, r5, 28 /* *(d + 28) = t4 */ 95 addi r6, r6, 32 /* s = s + 32 */ 96 addi r4, r4, -32 /* n = n - 32 */ 97 bneid r4, a_block_aligned /* while (n) loop */ 98 addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */ 99 bri a_block_done 100 101a_block_unaligned: 102 andi r8, r6, 0xfffffffc /* as = s & ~3 */ 103 add r6, r6, r4 /* s = s + n */ 104 lwi r11, r8, 0 /* h = *(as + 0) */ 105 106 addi r9, r9, -1 107 beqi r9, a_block_u1 /* t1 was 1 => 1 byte offset */ 108 addi r9, r9, -1 109 beqi r9, a_block_u2 /* t1 was 2 => 2 byte offset */ 110 111a_block_u3: 112 bslli r11, r11, 24 /* h = h << 24 */ 113a_bu3_loop: 114 lwi r12, r8, 4 /* v = *(as + 4) */ 115 bsrli r9, r12, 8 /* t1 = v >> 8 */ 116 or r9, r11, r9 /* t1 = h | t1 */ 117 swi r9, r5, 0 /* *(d + 0) = t1 */ 118 bslli r11, r12, 24 /* h = v << 24 */ 119 lwi r12, r8, 8 /* v = *(as + 8) */ 120 bsrli r9, r12, 8 /* t1 = v >> 8 */ 121 or r9, r11, r9 /* t1 = h | t1 */ 122 swi r9, r5, 4 /* *(d + 4) = t1 */ 123 bslli r11, r12, 24 /* h = v << 24 */ 124 lwi r12, r8, 12 /* v = *(as + 12) */ 125 bsrli r9, r12, 8 /* t1 = v >> 8 */ 126 or r9, r11, r9 /* t1 = h | t1 */ 127 swi r9, r5, 8 /* *(d + 8) = t1 */ 128 bslli r11, r12, 24 /* h = v << 24 */ 129 lwi r12, r8, 16 /* v = *(as + 16) */ 130 bsrli r9, r12, 8 /* t1 = v >> 8 */ 131 or r9, r11, r9 /* t1 = h | t1 */ 132 swi r9, r5, 12 /* *(d + 12) = t1 */ 133 bslli r11, r12, 24 /* h = v << 24 */ 134 lwi r12, r8, 20 /* v = *(as + 20) */ 135 bsrli r9, r12, 8 /* t1 = v >> 8 */ 136 or r9, r11, r9 /* t1 = h | t1 */ 137 swi r9, r5, 16 /* *(d + 16) = t1 */ 138 bslli r11, r12, 24 /* h = v << 24 */ 139 lwi r12, r8, 24 /* v = *(as + 24) */ 140 bsrli r9, r12, 8 /* t1 = v >> 8 */ 141 or r9, r11, r9 /* t1 = h | t1 */ 142 swi r9, r5, 20 /* *(d + 20) = t1 */ 143 bslli r11, r12, 24 /* h = v << 24 */ 144 lwi r12, r8, 28 /* v = *(as + 28) */ 145 bsrli r9, r12, 8 /* t1 = v >> 8 */ 146 or r9, r11, r9 /* t1 = h | t1 */ 147 swi r9, r5, 24 /* *(d + 24) = t1 */ 148 bslli r11, r12, 24 /* h = v << 24 */ 149 lwi r12, r8, 32 /* v = *(as + 32) */ 150 bsrli r9, r12, 8 /* t1 = v >> 8 */ 151 or r9, r11, r9 /* t1 = h | t1 */ 152 swi r9, r5, 28 /* *(d + 28) = t1 */ 153 bslli r11, r12, 24 /* h = v << 24 */ 154 addi r8, r8, 32 /* as = as + 32 */ 155 addi r4, r4, -32 /* n = n - 32 */ 156 bneid r4, a_bu3_loop /* while (n) loop */ 157 addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */ 158 bri a_block_done 159 160a_block_u1: 161 bslli r11, r11, 8 /* h = h << 8 */ 162a_bu1_loop: 163 lwi r12, r8, 4 /* v = *(as + 4) */ 164 bsrli r9, r12, 24 /* t1 = v >> 24 */ 165 or r9, r11, r9 /* t1 = h | t1 */ 166 swi r9, r5, 0 /* *(d + 0) = t1 */ 167 bslli r11, r12, 8 /* h = v << 8 */ 168 lwi r12, r8, 8 /* v = *(as + 8) */ 169 bsrli r9, r12, 24 /* t1 = v >> 24 */ 170 or r9, r11, r9 /* t1 = h | t1 */ 171 swi r9, r5, 4 /* *(d + 4) = t1 */ 172 bslli r11, r12, 8 /* h = v << 8 */ 173 lwi r12, r8, 12 /* v = *(as + 12) */ 174 bsrli r9, r12, 24 /* t1 = v >> 24 */ 175 or r9, r11, r9 /* t1 = h | t1 */ 176 swi r9, r5, 8 /* *(d + 8) = t1 */ 177 bslli r11, r12, 8 /* h = v << 8 */ 178 lwi r12, r8, 16 /* v = *(as + 16) */ 179 bsrli r9, r12, 24 /* t1 = v >> 24 */ 180 or r9, r11, r9 /* t1 = h | t1 */ 181 swi r9, r5, 12 /* *(d + 12) = t1 */ 182 bslli r11, r12, 8 /* h = v << 8 */ 183 lwi r12, r8, 20 /* v = *(as + 20) */ 184 bsrli r9, r12, 24 /* t1 = v >> 24 */ 185 or r9, r11, r9 /* t1 = h | t1 */ 186 swi r9, r5, 16 /* *(d + 16) = t1 */ 187 bslli r11, r12, 8 /* h = v << 8 */ 188 lwi r12, r8, 24 /* v = *(as + 24) */ 189 bsrli r9, r12, 24 /* t1 = v >> 24 */ 190 or r9, r11, r9 /* t1 = h | t1 */ 191 swi r9, r5, 20 /* *(d + 20) = t1 */ 192 bslli r11, r12, 8 /* h = v << 8 */ 193 lwi r12, r8, 28 /* v = *(as + 28) */ 194 bsrli r9, r12, 24 /* t1 = v >> 24 */ 195 or r9, r11, r9 /* t1 = h | t1 */ 196 swi r9, r5, 24 /* *(d + 24) = t1 */ 197 bslli r11, r12, 8 /* h = v << 8 */ 198 lwi r12, r8, 32 /* v = *(as + 32) */ 199 bsrli r9, r12, 24 /* t1 = v >> 24 */ 200 or r9, r11, r9 /* t1 = h | t1 */ 201 swi r9, r5, 28 /* *(d + 28) = t1 */ 202 bslli r11, r12, 8 /* h = v << 8 */ 203 addi r8, r8, 32 /* as = as + 32 */ 204 addi r4, r4, -32 /* n = n - 32 */ 205 bneid r4, a_bu1_loop /* while (n) loop */ 206 addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */ 207 bri a_block_done 208 209a_block_u2: 210 bslli r11, r11, 16 /* h = h << 16 */ 211a_bu2_loop: 212 lwi r12, r8, 4 /* v = *(as + 4) */ 213 bsrli r9, r12, 16 /* t1 = v >> 16 */ 214 or r9, r11, r9 /* t1 = h | t1 */ 215 swi r9, r5, 0 /* *(d + 0) = t1 */ 216 bslli r11, r12, 16 /* h = v << 16 */ 217 lwi r12, r8, 8 /* v = *(as + 8) */ 218 bsrli r9, r12, 16 /* t1 = v >> 16 */ 219 or r9, r11, r9 /* t1 = h | t1 */ 220 swi r9, r5, 4 /* *(d + 4) = t1 */ 221 bslli r11, r12, 16 /* h = v << 16 */ 222 lwi r12, r8, 12 /* v = *(as + 12) */ 223 bsrli r9, r12, 16 /* t1 = v >> 16 */ 224 or r9, r11, r9 /* t1 = h | t1 */ 225 swi r9, r5, 8 /* *(d + 8) = t1 */ 226 bslli r11, r12, 16 /* h = v << 16 */ 227 lwi r12, r8, 16 /* v = *(as + 16) */ 228 bsrli r9, r12, 16 /* t1 = v >> 16 */ 229 or r9, r11, r9 /* t1 = h | t1 */ 230 swi r9, r5, 12 /* *(d + 12) = t1 */ 231 bslli r11, r12, 16 /* h = v << 16 */ 232 lwi r12, r8, 20 /* v = *(as + 20) */ 233 bsrli r9, r12, 16 /* t1 = v >> 16 */ 234 or r9, r11, r9 /* t1 = h | t1 */ 235 swi r9, r5, 16 /* *(d + 16) = t1 */ 236 bslli r11, r12, 16 /* h = v << 16 */ 237 lwi r12, r8, 24 /* v = *(as + 24) */ 238 bsrli r9, r12, 16 /* t1 = v >> 16 */ 239 or r9, r11, r9 /* t1 = h | t1 */ 240 swi r9, r5, 20 /* *(d + 20) = t1 */ 241 bslli r11, r12, 16 /* h = v << 16 */ 242 lwi r12, r8, 28 /* v = *(as + 28) */ 243 bsrli r9, r12, 16 /* t1 = v >> 16 */ 244 or r9, r11, r9 /* t1 = h | t1 */ 245 swi r9, r5, 24 /* *(d + 24) = t1 */ 246 bslli r11, r12, 16 /* h = v << 16 */ 247 lwi r12, r8, 32 /* v = *(as + 32) */ 248 bsrli r9, r12, 16 /* t1 = v >> 16 */ 249 or r9, r11, r9 /* t1 = h | t1 */ 250 swi r9, r5, 28 /* *(d + 28) = t1 */ 251 bslli r11, r12, 16 /* h = v << 16 */ 252 addi r8, r8, 32 /* as = as + 32 */ 253 addi r4, r4, -32 /* n = n - 32 */ 254 bneid r4, a_bu2_loop /* while (n) loop */ 255 addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */ 256 257a_block_done: 258 addi r4, r0, 4 /* n = 4 */ 259 cmpu r4, r4, r7 /* n = c - n (unsigned) */ 260 blti r4, a_xfer_end /* if n < 0, less than one word to transfer */ 261 262a_word_xfer: 263 andi r4, r7, 0xfffffffc /* n = c & ~3 */ 264 addi r10, r0, 0 /* offset = 0 */ 265 266 andi r9, r6, 3 /* t1 = s & 3 */ 267 /* if temp != 0, unaligned transfers needed */ 268 bnei r9, a_word_unaligned 269 270a_word_aligned: 271 lw r9, r6, r10 /* t1 = *(s+offset) */ 272 sw r9, r5, r10 /* *(d+offset) = t1 */ 273 addi r4, r4,-4 /* n-- */ 274 bneid r4, a_word_aligned /* loop */ 275 addi r10, r10, 4 /* offset++ (IN DELAY SLOT) */ 276 277 bri a_word_done 278 279a_word_unaligned: 280 andi r8, r6, 0xfffffffc /* as = s & ~3 */ 281 lwi r11, r8, 0 /* h = *(as + 0) */ 282 addi r8, r8, 4 /* as = as + 4 */ 283 284 addi r9, r9, -1 285 beqi r9, a_word_u1 /* t1 was 1 => 1 byte offset */ 286 addi r9, r9, -1 287 beqi r9, a_word_u2 /* t1 was 2 => 2 byte offset */ 288 289a_word_u3: 290 bslli r11, r11, 24 /* h = h << 24 */ 291a_wu3_loop: 292 lw r12, r8, r10 /* v = *(as + offset) */ 293 bsrli r9, r12, 8 /* t1 = v >> 8 */ 294 or r9, r11, r9 /* t1 = h | t1 */ 295 sw r9, r5, r10 /* *(d + offset) = t1 */ 296 bslli r11, r12, 24 /* h = v << 24 */ 297 addi r4, r4,-4 /* n = n - 4 */ 298 bneid r4, a_wu3_loop /* while (n) loop */ 299 addi r10, r10, 4 /* offset = ofset + 4 (IN DELAY SLOT) */ 300 301 bri a_word_done 302 303a_word_u1: 304 bslli r11, r11, 8 /* h = h << 8 */ 305a_wu1_loop: 306 lw r12, r8, r10 /* v = *(as + offset) */ 307 bsrli r9, r12, 24 /* t1 = v >> 24 */ 308 or r9, r11, r9 /* t1 = h | t1 */ 309 sw r9, r5, r10 /* *(d + offset) = t1 */ 310 bslli r11, r12, 8 /* h = v << 8 */ 311 addi r4, r4,-4 /* n = n - 4 */ 312 bneid r4, a_wu1_loop /* while (n) loop */ 313 addi r10, r10, 4 /* offset = ofset + 4 (IN DELAY SLOT) */ 314 315 bri a_word_done 316 317a_word_u2: 318 bslli r11, r11, 16 /* h = h << 16 */ 319a_wu2_loop: 320 lw r12, r8, r10 /* v = *(as + offset) */ 321 bsrli r9, r12, 16 /* t1 = v >> 16 */ 322 or r9, r11, r9 /* t1 = h | t1 */ 323 sw r9, r5, r10 /* *(d + offset) = t1 */ 324 bslli r11, r12, 16 /* h = v << 16 */ 325 addi r4, r4,-4 /* n = n - 4 */ 326 bneid r4, a_wu2_loop /* while (n) loop */ 327 addi r10, r10, 4 /* offset = ofset + 4 (IN DELAY SLOT) */ 328 329a_word_done: 330 add r5, r5, r10 /* d = d + offset */ 331 add r6, r6, r10 /* s = s + offset */ 332 rsub r7, r10, r7 /* c = c - offset */ 333 334a_xfer_end: 335a_xfer_end_loop: 336 beqi r7, a_done /* while (c) */ 337 lbui r9, r6, 0 /* t1 = *s */ 338 addi r6, r6, 1 /* s++ */ 339 sbi r9, r5, 0 /* *d = t1 */ 340 addi r7, r7, -1 /* c-- */ 341 brid a_xfer_end_loop /* loop */ 342 addi r5, r5, 1 /* d++ (IN DELAY SLOT) */ 343 344a_done: 345 rtsd r15, 8 346 nop 347 348.end memcpy 349/*----------------------------------------------------------------------------*/ 350 .globl memmove 351 .ent memmove 352 353memmove: 354 cmpu r4, r5, r6 /* n = s - d */ 355 bgei r4,fast_memcpy_ascending 356 357fast_memcpy_descending: 358 /* move d to return register as value of function */ 359 addi r3, r5, 0 360 361 add r5, r5, r7 /* d = d + c */ 362 add r6, r6, r7 /* s = s + c */ 363 364 addi r4, r0, 4 /* n = 4 */ 365 cmpu r4, r4, r7 /* n = c - n (unsigned) */ 366 blti r4,d_xfer_end /* if n < 0, less than one word to transfer */ 367 368 /* transfer first 0~3 bytes to get aligned dest address */ 369 andi r4, r5, 3 /* n = d & 3 */ 370 /* if zero, destination already aligned */ 371 beqi r4,d_dalign_done 372 rsub r7, r4, r7 /* c = c - n adjust c */ 373 374d_xfer_first_loop: 375 /* if no bytes left to transfer, transfer the bulk */ 376 beqi r4,d_dalign_done 377 addi r6, r6, -1 /* s-- */ 378 addi r5, r5, -1 /* d-- */ 379 lbui r11, r6, 0 /* h = *s */ 380 sbi r11, r5, 0 /* *d = h */ 381 brid d_xfer_first_loop /* loop */ 382 addi r4, r4, -1 /* n-- (IN DELAY SLOT) */ 383 384d_dalign_done: 385 addi r4, r0, 32 /* n = 32 */ 386 cmpu r4, r4, r7 /* n = c - n (unsigned) */ 387 /* if n < 0, less than one block to transfer */ 388 blti r4, d_block_done 389 390d_block_xfer: 391 andi r4, r7, 0xffffffe0 /* n = c & ~31 */ 392 rsub r7, r4, r7 /* c = c - n */ 393 394 andi r9, r6, 3 /* t1 = s & 3 */ 395 /* if temp != 0, unaligned transfers needed */ 396 bnei r9, d_block_unaligned 397 398d_block_aligned: 399 addi r6, r6, -32 /* s = s - 32 */ 400 addi r5, r5, -32 /* d = d - 32 */ 401 lwi r9, r6, 28 /* t1 = *(s + 28) */ 402 lwi r10, r6, 24 /* t2 = *(s + 24) */ 403 lwi r11, r6, 20 /* t3 = *(s + 20) */ 404 lwi r12, r6, 16 /* t4 = *(s + 16) */ 405 swi r9, r5, 28 /* *(d + 28) = t1 */ 406 swi r10, r5, 24 /* *(d + 24) = t2 */ 407 swi r11, r5, 20 /* *(d + 20) = t3 */ 408 swi r12, r5, 16 /* *(d + 16) = t4 */ 409 lwi r9, r6, 12 /* t1 = *(s + 12) */ 410 lwi r10, r6, 8 /* t2 = *(s + 8) */ 411 lwi r11, r6, 4 /* t3 = *(s + 4) */ 412 lwi r12, r6, 0 /* t4 = *(s + 0) */ 413 swi r9, r5, 12 /* *(d + 12) = t1 */ 414 swi r10, r5, 8 /* *(d + 8) = t2 */ 415 swi r11, r5, 4 /* *(d + 4) = t3 */ 416 addi r4, r4, -32 /* n = n - 32 */ 417 bneid r4, d_block_aligned /* while (n) loop */ 418 swi r12, r5, 0 /* *(d + 0) = t4 (IN DELAY SLOT) */ 419 bri d_block_done 420 421d_block_unaligned: 422 andi r8, r6, 0xfffffffc /* as = s & ~3 */ 423 rsub r6, r4, r6 /* s = s - n */ 424 lwi r11, r8, 0 /* h = *(as + 0) */ 425 426 addi r9, r9, -1 427 beqi r9,d_block_u1 /* t1 was 1 => 1 byte offset */ 428 addi r9, r9, -1 429 beqi r9,d_block_u2 /* t1 was 2 => 2 byte offset */ 430 431d_block_u3: 432 bsrli r11, r11, 8 /* h = h >> 8 */ 433d_bu3_loop: 434 addi r8, r8, -32 /* as = as - 32 */ 435 addi r5, r5, -32 /* d = d - 32 */ 436 lwi r12, r8, 28 /* v = *(as + 28) */ 437 bslli r9, r12, 24 /* t1 = v << 24 */ 438 or r9, r11, r9 /* t1 = h | t1 */ 439 swi r9, r5, 28 /* *(d + 28) = t1 */ 440 bsrli r11, r12, 8 /* h = v >> 8 */ 441 lwi r12, r8, 24 /* v = *(as + 24) */ 442 bslli r9, r12, 24 /* t1 = v << 24 */ 443 or r9, r11, r9 /* t1 = h | t1 */ 444 swi r9, r5, 24 /* *(d + 24) = t1 */ 445 bsrli r11, r12, 8 /* h = v >> 8 */ 446 lwi r12, r8, 20 /* v = *(as + 20) */ 447 bslli r9, r12, 24 /* t1 = v << 24 */ 448 or r9, r11, r9 /* t1 = h | t1 */ 449 swi r9, r5, 20 /* *(d + 20) = t1 */ 450 bsrli r11, r12, 8 /* h = v >> 8 */ 451 lwi r12, r8, 16 /* v = *(as + 16) */ 452 bslli r9, r12, 24 /* t1 = v << 24 */ 453 or r9, r11, r9 /* t1 = h | t1 */ 454 swi r9, r5, 16 /* *(d + 16) = t1 */ 455 bsrli r11, r12, 8 /* h = v >> 8 */ 456 lwi r12, r8, 12 /* v = *(as + 12) */ 457 bslli r9, r12, 24 /* t1 = v << 24 */ 458 or r9, r11, r9 /* t1 = h | t1 */ 459 swi r9, r5, 12 /* *(d + 112) = t1 */ 460 bsrli r11, r12, 8 /* h = v >> 8 */ 461 lwi r12, r8, 8 /* v = *(as + 8) */ 462 bslli r9, r12, 24 /* t1 = v << 24 */ 463 or r9, r11, r9 /* t1 = h | t1 */ 464 swi r9, r5, 8 /* *(d + 8) = t1 */ 465 bsrli r11, r12, 8 /* h = v >> 8 */ 466 lwi r12, r8, 4 /* v = *(as + 4) */ 467 bslli r9, r12, 24 /* t1 = v << 24 */ 468 or r9, r11, r9 /* t1 = h | t1 */ 469 swi r9, r5, 4 /* *(d + 4) = t1 */ 470 bsrli r11, r12, 8 /* h = v >> 8 */ 471 lwi r12, r8, 0 /* v = *(as + 0) */ 472 bslli r9, r12, 24 /* t1 = v << 24 */ 473 or r9, r11, r9 /* t1 = h | t1 */ 474 swi r9, r5, 0 /* *(d + 0) = t1 */ 475 addi r4, r4, -32 /* n = n - 32 */ 476 bneid r4, d_bu3_loop /* while (n) loop */ 477 bsrli r11, r12, 8 /* h = v >> 8 (IN DELAY SLOT) */ 478 bri d_block_done 479 480d_block_u1: 481 bsrli r11, r11, 24 /* h = h >> 24 */ 482d_bu1_loop: 483 addi r8, r8, -32 /* as = as - 32 */ 484 addi r5, r5, -32 /* d = d - 32 */ 485 lwi r12, r8, 28 /* v = *(as + 28) */ 486 bslli r9, r12, 8 /* t1 = v << 8 */ 487 or r9, r11, r9 /* t1 = h | t1 */ 488 swi r9, r5, 28 /* *(d + 28) = t1 */ 489 bsrli r11, r12, 24 /* h = v >> 24 */ 490 lwi r12, r8, 24 /* v = *(as + 24) */ 491 bslli r9, r12, 8 /* t1 = v << 8 */ 492 or r9, r11, r9 /* t1 = h | t1 */ 493 swi r9, r5, 24 /* *(d + 24) = t1 */ 494 bsrli r11, r12, 24 /* h = v >> 24 */ 495 lwi r12, r8, 20 /* v = *(as + 20) */ 496 bslli r9, r12, 8 /* t1 = v << 8 */ 497 or r9, r11, r9 /* t1 = h | t1 */ 498 swi r9, r5, 20 /* *(d + 20) = t1 */ 499 bsrli r11, r12, 24 /* h = v >> 24 */ 500 lwi r12, r8, 16 /* v = *(as + 16) */ 501 bslli r9, r12, 8 /* t1 = v << 8 */ 502 or r9, r11, r9 /* t1 = h | t1 */ 503 swi r9, r5, 16 /* *(d + 16) = t1 */ 504 bsrli r11, r12, 24 /* h = v >> 24 */ 505 lwi r12, r8, 12 /* v = *(as + 12) */ 506 bslli r9, r12, 8 /* t1 = v << 8 */ 507 or r9, r11, r9 /* t1 = h | t1 */ 508 swi r9, r5, 12 /* *(d + 112) = t1 */ 509 bsrli r11, r12, 24 /* h = v >> 24 */ 510 lwi r12, r8, 8 /* v = *(as + 8) */ 511 bslli r9, r12, 8 /* t1 = v << 8 */ 512 or r9, r11, r9 /* t1 = h | t1 */ 513 swi r9, r5, 8 /* *(d + 8) = t1 */ 514 bsrli r11, r12, 24 /* h = v >> 24 */ 515 lwi r12, r8, 4 /* v = *(as + 4) */ 516 bslli r9, r12, 8 /* t1 = v << 8 */ 517 or r9, r11, r9 /* t1 = h | t1 */ 518 swi r9, r5, 4 /* *(d + 4) = t1 */ 519 bsrli r11, r12, 24 /* h = v >> 24 */ 520 lwi r12, r8, 0 /* v = *(as + 0) */ 521 bslli r9, r12, 8 /* t1 = v << 8 */ 522 or r9, r11, r9 /* t1 = h | t1 */ 523 swi r9, r5, 0 /* *(d + 0) = t1 */ 524 addi r4, r4, -32 /* n = n - 32 */ 525 bneid r4, d_bu1_loop /* while (n) loop */ 526 bsrli r11, r12, 24 /* h = v >> 24 (IN DELAY SLOT) */ 527 bri d_block_done 528 529d_block_u2: 530 bsrli r11, r11, 16 /* h = h >> 16 */ 531d_bu2_loop: 532 addi r8, r8, -32 /* as = as - 32 */ 533 addi r5, r5, -32 /* d = d - 32 */ 534 lwi r12, r8, 28 /* v = *(as + 28) */ 535 bslli r9, r12, 16 /* t1 = v << 16 */ 536 or r9, r11, r9 /* t1 = h | t1 */ 537 swi r9, r5, 28 /* *(d + 28) = t1 */ 538 bsrli r11, r12, 16 /* h = v >> 16 */ 539 lwi r12, r8, 24 /* v = *(as + 24) */ 540 bslli r9, r12, 16 /* t1 = v << 16 */ 541 or r9, r11, r9 /* t1 = h | t1 */ 542 swi r9, r5, 24 /* *(d + 24) = t1 */ 543 bsrli r11, r12, 16 /* h = v >> 16 */ 544 lwi r12, r8, 20 /* v = *(as + 20) */ 545 bslli r9, r12, 16 /* t1 = v << 16 */ 546 or r9, r11, r9 /* t1 = h | t1 */ 547 swi r9, r5, 20 /* *(d + 20) = t1 */ 548 bsrli r11, r12, 16 /* h = v >> 16 */ 549 lwi r12, r8, 16 /* v = *(as + 16) */ 550 bslli r9, r12, 16 /* t1 = v << 16 */ 551 or r9, r11, r9 /* t1 = h | t1 */ 552 swi r9, r5, 16 /* *(d + 16) = t1 */ 553 bsrli r11, r12, 16 /* h = v >> 16 */ 554 lwi r12, r8, 12 /* v = *(as + 12) */ 555 bslli r9, r12, 16 /* t1 = v << 16 */ 556 or r9, r11, r9 /* t1 = h | t1 */ 557 swi r9, r5, 12 /* *(d + 112) = t1 */ 558 bsrli r11, r12, 16 /* h = v >> 16 */ 559 lwi r12, r8, 8 /* v = *(as + 8) */ 560 bslli r9, r12, 16 /* t1 = v << 16 */ 561 or r9, r11, r9 /* t1 = h | t1 */ 562 swi r9, r5, 8 /* *(d + 8) = t1 */ 563 bsrli r11, r12, 16 /* h = v >> 16 */ 564 lwi r12, r8, 4 /* v = *(as + 4) */ 565 bslli r9, r12, 16 /* t1 = v << 16 */ 566 or r9, r11, r9 /* t1 = h | t1 */ 567 swi r9, r5, 4 /* *(d + 4) = t1 */ 568 bsrli r11, r12, 16 /* h = v >> 16 */ 569 lwi r12, r8, 0 /* v = *(as + 0) */ 570 bslli r9, r12, 16 /* t1 = v << 16 */ 571 or r9, r11, r9 /* t1 = h | t1 */ 572 swi r9, r5, 0 /* *(d + 0) = t1 */ 573 addi r4, r4, -32 /* n = n - 32 */ 574 bneid r4, d_bu2_loop /* while (n) loop */ 575 bsrli r11, r12, 16 /* h = v >> 16 (IN DELAY SLOT) */ 576 577d_block_done: 578 addi r4, r0, 4 /* n = 4 */ 579 cmpu r4, r4, r7 /* n = c - n (unsigned) */ 580 blti r4,d_xfer_end /* if n < 0, less than one word to transfer */ 581 582d_word_xfer: 583 andi r4, r7, 0xfffffffc /* n = c & ~3 */ 584 rsub r5, r4, r5 /* d = d - n */ 585 rsub r6, r4, r6 /* s = s - n */ 586 rsub r7, r4, r7 /* c = c - n */ 587 588 andi r9, r6, 3 /* t1 = s & 3 */ 589 /* if temp != 0, unaligned transfers needed */ 590 bnei r9, d_word_unaligned 591 592d_word_aligned: 593 addi r4, r4,-4 /* n-- */ 594 lw r9, r6, r4 /* t1 = *(s+n) */ 595 bneid r4, d_word_aligned /* loop */ 596 sw r9, r5, r4 /* *(d+n) = t1 (IN DELAY SLOT) */ 597 598 bri d_word_done 599 600d_word_unaligned: 601 andi r8, r6, 0xfffffffc /* as = s & ~3 */ 602 lw r11, r8, r4 /* h = *(as + n) */ 603 604 addi r9, r9, -1 605 beqi r9,d_word_u1 /* t1 was 1 => 1 byte offset */ 606 addi r9, r9, -1 607 beqi r9,d_word_u2 /* t1 was 2 => 2 byte offset */ 608 609d_word_u3: 610 bsrli r11, r11, 8 /* h = h >> 8 */ 611d_wu3_loop: 612 addi r4, r4,-4 /* n = n - 4 */ 613 lw r12, r8, r4 /* v = *(as + n) */ 614 bslli r9, r12, 24 /* t1 = v << 24 */ 615 or r9, r11, r9 /* t1 = h | t1 */ 616 sw r9, r5, r4 /* *(d + n) = t1 */ 617 bneid r4, d_wu3_loop /* while (n) loop */ 618 bsrli r11, r12, 8 /* h = v >> 8 (IN DELAY SLOT) */ 619 620 bri d_word_done 621 622d_word_u1: 623 bsrli r11, r11, 24 /* h = h >> 24 */ 624d_wu1_loop: 625 addi r4, r4,-4 /* n = n - 4 */ 626 lw r12, r8, r4 /* v = *(as + n) */ 627 bslli r9, r12, 8 /* t1 = v << 8 */ 628 or r9, r11, r9 /* t1 = h | t1 */ 629 sw r9, r5, r4 /* *(d + n) = t1 */ 630 bneid r4, d_wu1_loop /* while (n) loop */ 631 bsrli r11, r12, 24 /* h = v >> 24 (IN DELAY SLOT) */ 632 633 bri d_word_done 634 635d_word_u2: 636 bsrli r11, r11, 16 /* h = h >> 16 */ 637d_wu2_loop: 638 addi r4, r4,-4 /* n = n - 4 */ 639 lw r12, r8, r4 /* v = *(as + n) */ 640 bslli r9, r12, 16 /* t1 = v << 16 */ 641 or r9, r11, r9 /* t1 = h | t1 */ 642 sw r9, r5, r4 /* *(d + n) = t1 */ 643 bneid r4, d_wu2_loop /* while (n) loop */ 644 bsrli r11, r12, 16 /* h = v >> 16 (IN DELAY SLOT) */ 645 646d_word_done: 647 648d_xfer_end: 649d_xfer_end_loop: 650 beqi r7, a_done /* while (c) */ 651 addi r6, r6, -1 /* s-- */ 652 lbui r9, r6, 0 /* t1 = *s */ 653 addi r5, r5, -1 /* d-- */ 654 sbi r9, r5, 0 /* *d = t1 */ 655 brid d_xfer_end_loop /* loop */ 656 addi r7, r7, -1 /* c-- (IN DELAY SLOT) */ 657 658d_done: 659 rtsd r15, 8 660 nop 661 662.end memmove 663