/*
 * arch/alpha/lib/ev6-memset.S
 *
 * This is an efficient (and relatively small) implementation of the C library
 * "memset()" function for the 21264 implementation of Alpha.
 *
 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * The algorithm for the leading and trailing quadwords remains the same,
 * however the loop has been unrolled to enable better memory throughput,
 * and the code has been replicated for each of the entry points: __memset
 * and __memsetw to permit better scheduling to eliminate the stalling
 * encountered during the mask replication.
 * A future enhancement might be to put in a byte store loop for really
 * small (say < 32 bytes) memset()s.  Whether or not that change would be
 * a win in the kernel would depend upon the contextual usage.
 * WARNING: Maintaining this is going to be more work than the above version,
 * as fixes will need to be made in multiple places.  The performance gain
 * is worth it.
 */

	.set noat
	.set noreorder
.text
	.globl __memset
	.globl __memsetw
	.globl __constant_c_memset
	.globl memset

/*
 * __memset (also exported as memset): fill a byte range with one byte value.
 *
 * Register use, as read from the code below:
 *	In:	$16	destination address
 *		$17	fill value; only the low byte is used
 *		$18	byte count, tested as a signed value (<= 0 stores nothing)
 *		$26	return address
 *	Out:	$0	the destination address as passed in
 *	Scratch: $1-$7; $16, $17 and $18 are also overwritten.
 *
 * Shape of the routine: replicate the byte across a quadword, handle a
 * range that lies inside one quadword separately, otherwise write a
 * partial leading quad, then whole quads (unrolled by 8 with wh64 when
 * at least 16 quads remain), then a partial trailing quad.
 */
	.ent __memset
.align 5
__memset:
	.frame $30,0,$26,0
	.prologue 0

	/*
	 * Serious stalling happens.  The only way to mitigate this is to
	 * undertake a major re-write to interleave the constant materialization
	 * with other parts of the fall-through code.  This is important, even
	 * though it makes maintenance tougher.
	 * Do this later.
	 */
	and $17,255,$1		# E : 00000000000000ch
	insbl $17,1,$2		# U : 000000000000ch00
	bis $16,$16,$0		# E : return value
	ble $18,end_b		# U : zero (or negative) length requested?

	addq $18,$16,$6		# E : end address = dest + count
	bis $1,$2,$17		# E : 000000000000chch
	insbl $1,2,$3		# U : 0000000000ch0000
	insbl $1,3,$4		# U : 00000000ch000000

	or $3,$4,$3		# E : 00000000chch0000
	inswl $17,4,$5		# U : 0000chch00000000
	xor $16,$6,$1		# E : will complete write be within one quadword?
	inswl $17,6,$2		# U : chch000000000000

	or $17,$3,$17		# E : 00000000chchchch
	or $2,$5,$2		# E : chchchch00000000
	bic $1,7,$1		# E : fit within a single quadword?
	and $16,7,$3		# E : Target addr misalignment

	or $17,$2,$17		# E : chchchchchchchch
	beq $1,within_quad_b	# U : start and end share one quadword
	nop			# E :
	beq $3,aligned_b	# U : target is 0mod8

	/*
	 * Target address is misaligned, and won't fit within a quadword.
	 * Merge the pattern into the first (partial) quad and advance
	 * $16/$18 past it so the rest of the code sees an aligned start.
	 */
	ldq_u $4,0($16)		# L : Fetch first partial
	bis $16,$16,$5		# E : Save the address
	insql $17,$16,$2	# U : Insert new bytes
	subq $3,8,$3		# E : Invert (for addressing uses): misalign - 8

	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql $4,$16,$4		# U : clear relevant parts of the quad
	subq $16,$3,$16		# E : $16 is new aligned destination
	bis $2,$4,$1		# E : Final bytes

	nop
	stq_u $1,0($5)		# L : Store result
	nop
	nop

.align 4
aligned_b:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra $18,3,$3		# U : Number of remaining quads to write
	and $18,7,$18		# E : Number of trailing bytes to write
	bis $16,$16,$5		# E : Save dest address
	beq $3,no_quad_b	# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 * $16	Current destination address
	 * $5	A copy of $16
	 * $6	End address (dest + count as passed in); only its low
	 *	three bits are used later, to merge the trailing bytes
	 * $18	Number trailer bytes
	 * $3	Number quads to write
	 */

	and $16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq $3, 16, $4		# E : Only try to unroll if >= 128 bytes (16 quads)
	subq $2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt $4, loop_b		# U : fewer than 16 quads: simple loop

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 *
	 * NOTE(review): $1 = ($16 & 0x3f) - 0x40 lies in [-64, -1], so it is
	 * never zero and the beq below looks like it can never be taken.  The
	 * alignment loop therefore always runs at least once (eight times for
	 * an address that is already 0mod64), and it is that loop which loads
	 * $4 with an address for the first wh64.  If this branch were ever
	 * made live, $4 would still hold ($3 - 16) when wh64 executes.
	 */

	nop			# E :
	nop			# E :
	nop			# E :
	beq $1, $bigalign_b	# U :

$alignmod64_b:
	stq $17, 0($5)		# L :
	subq $3, 1, $3		# E : For consistency later
	addq $1, 8, $1		# E : Increment towards zero for alignment
	addq $5, 8, $4		# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq $5, 8, $5		# E : Inc address
	blt $1, $alignmod64_b	# U : loop until $1 reaches zero

$bigalign_b:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * We know that we'll be taking a minimum of one trip through the loop.
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future.
	 * Each trip computes the address used by the wh64 at the top of the
	 * next trip: $5 + 128 when at least 24 quads remain at the start of
	 * the trip, otherwise $5 + 64, which is the block the next trip
	 * itself stores to.
	 */

$do_wh64_b:
	wh64 ($4)		# L1 : memory subsystem write hint
	subq $3, 24, $2		# E : For determining future wh64 addresses
	stq $17, 0($5)		# L :
	nop			# E :

	addq $5, 128, $4	# E : speculative target of next wh64
	stq $17, 8($5)		# L :
	stq $17, 16($5)		# L :
	addq $5, 64, $7		# E : Fallback address for wh64 (== next trip addr)

	stq $17, 24($5)		# L :
	stq $17, 32($5)		# L :
	cmovlt $2, $7, $4	# E : Latency 2, extra mapping cycle
	nop

	stq $17, 40($5)		# L :
	stq $17, 48($5)		# L :
	subq $3, 16, $2		# E : Repeat the loop at least once more?
	nop

	stq $17, 56($5)		# L :
	addq $5, 64, $5		# E : advance past the 64 bytes just stored
	subq $3, 8, $3		# E : eight quads done
	bge $2, $do_wh64_b	# U : at least 8 quads still remain

	nop
	nop
	nop
	beq $3, no_quad_b	# U : Might have finished already

.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop_b:
	stq $17,0($5)		# L :
	subq $3,1,$3		# E : Decrement number quads left
	addq $5,8,$5		# E : Inc address
	bne $3,loop_b		# U : more?

no_quad_b:
	/*
	 * Write 0..7 trailing bytes.
	 */
	nop			# E :
	beq $18,end_b		# U : All done?
	ldq $7,0($5)		# L : current contents of the final quad
	mskqh $7,$6,$2		# U : Mask final quad (keep old bytes from the end offset up)

	insqh $17,$6,$4		# U : New bits (pattern below the end offset)
	bis $2,$4,$1		# E : Put it all together
	stq $1,0($5)		# L : And back to memory
	ret $31,($26),1		# L0 :

within_quad_b:
	/*
	 * The whole range lies inside one quadword: build the merged
	 * quad from the old contents and the pattern, then store it once.
	 */
	ldq_u $1,0($16)		# L : old contents of the quad
	insql $17,$16,$2	# U : New bits
	mskql $1,$16,$4		# U : Clear old
	bis $2,$4,$2		# E : New result

	mskql $2,$6,$4		# U : keep that result below the end offset
	mskqh $1,$6,$2		# U : keep the old bytes from the end offset up
	bis $2,$4,$1		# E : merged quad
	stq_u $1,0($16)		# L : write it back

end_b:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :
	.end __memset

	/*
	 * This is the original body of code, prior to replication and
	 * rescheduling.  Leave it here, as there may be calls to this
	 * entry point.
*/
/*
 * __constant_c_memset: fill a byte range from a ready-made quadword pattern.
 *
 * Register use, as read from the code below:
 *	In:	$16	destination address
 *		$17	64-bit pattern; it is stored as-is, with no replication
 *			done here (presumably the caller has already replicated
 *			the fill byte across the quadword -- confirm at callers)
 *		$18	byte count, tested as a signed value (<= 0 stores nothing)
 *		$26	return address
 *	Out:	$0	the destination address as passed in
 *	Scratch: $1-$7; $16 and $18 are also overwritten.
 */
.align 4
	.ent __constant_c_memset
__constant_c_memset:
	.frame $30,0,$26,0
	.prologue 0

	addq $18,$16,$6		# E : end address = dest + count
	bis $16,$16,$0		# E : return value
	xor $16,$6,$1		# E : will complete write be within one quadword?
	ble $18,end		# U : zero (or negative) length requested?

	bic $1,7,$1		# E : fit within a single quadword
	beq $1,within_one_quad	# U : start and end share one quadword
	and $16,7,$3		# E : Target addr misalignment
	beq $3,aligned		# U : target is 0mod8

	/*
	 * Target address is misaligned, and won't fit within a quadword.
	 * Merge the pattern into the first (partial) quad and advance
	 * $16/$18 past it so the rest of the code sees an aligned start.
	 */
	ldq_u $4,0($16)		# L : Fetch first partial
	bis $16,$16,$5		# E : Save the address
	insql $17,$16,$2	# U : Insert new bytes
	subq $3,8,$3		# E : Invert (for addressing uses): misalign - 8

	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql $4,$16,$4		# U : clear relevant parts of the quad
	subq $16,$3,$16		# E : $16 is new aligned destination
	bis $2,$4,$1		# E : Final bytes

	nop
	stq_u $1,0($5)		# L : Store result
	nop
	nop

.align 4
aligned:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra $18,3,$3		# U : Number of remaining quads to write
	and $18,7,$18		# E : Number of trailing bytes to write
	bis $16,$16,$5		# E : Save dest address
	beq $3,no_quad		# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 * $16	Current destination address
	 * $5	A copy of $16
	 * $6	End address (dest + count as passed in); only its low
	 *	three bits are used later, to merge the trailing bytes
	 * $18	Number trailer bytes
	 * $3	Number quads to write
	 */

	and $16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq $3, 16, $4		# E : Only try to unroll if >= 128 bytes (16 quads)
	subq $2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt $4, loop		# U : fewer than 16 quads: simple loop

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 *
	 * NOTE(review): $1 = ($16 & 0x3f) - 0x40 lies in [-64, -1], so it is
	 * never zero and the beq below looks like it can never be taken.  The
	 * alignment loop therefore always runs at least once (eight times for
	 * an address that is already 0mod64), and it is that loop which loads
	 * $4 with an address for the first wh64.  If this branch were ever
	 * made live, $4 would still hold ($3 - 16) when wh64 executes.
	 */

	nop			# E :
	nop			# E :
	nop			# E :
	beq $1, $bigalign	# U :

$alignmod64:
	stq $17, 0($5)		# L :
	subq $3, 1, $3		# E : For consistency later
	addq $1, 8, $1		# E : Increment towards zero for alignment
	addq $5, 8, $4		# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq $5, 8, $5		# E : Inc address
	blt $1, $alignmod64	# U : loop until $1 reaches zero

$bigalign:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * We know that we'll be taking a minimum of one trip through the loop.
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future.
	 * Each trip computes the address used by the wh64 at the top of the
	 * next trip: $5 + 128 when at least 24 quads remain at the start of
	 * the trip, otherwise $5 + 64, which is the block the next trip
	 * itself stores to.
	 */

$do_wh64:
	wh64 ($4)		# L1 : memory subsystem write hint
	subq $3, 24, $2		# E : For determining future wh64 addresses
	stq $17, 0($5)		# L :
	nop			# E :

	addq $5, 128, $4	# E : speculative target of next wh64
	stq $17, 8($5)		# L :
	stq $17, 16($5)		# L :
	addq $5, 64, $7		# E : Fallback address for wh64 (== next trip addr)

	stq $17, 24($5)		# L :
	stq $17, 32($5)		# L :
	cmovlt $2, $7, $4	# E : Latency 2, extra mapping cycle
	nop

	stq $17, 40($5)		# L :
	stq $17, 48($5)		# L :
	subq $3, 16, $2		# E : Repeat the loop at least once more?
	nop

	stq $17, 56($5)		# L :
	addq $5, 64, $5		# E : advance past the 64 bytes just stored
	subq $3, 8, $3		# E : eight quads done
	bge $2, $do_wh64	# U : at least 8 quads still remain

	nop
	nop
	nop
	beq $3, no_quad		# U : Might have finished already

.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop:
	stq $17,0($5)		# L :
	subq $3,1,$3		# E : Decrement number quads left
	addq $5,8,$5		# E : Inc address
	bne $3,loop		# U : more?

no_quad:
	/*
	 * Write 0..7 trailing bytes.
	 */
	nop			# E :
	beq $18,end		# U : All done?
	ldq $7,0($5)		# L : current contents of the final quad
	mskqh $7,$6,$2		# U : Mask final quad (keep old bytes from the end offset up)

	insqh $17,$6,$4		# U : New bits (pattern below the end offset)
	bis $2,$4,$1		# E : Put it all together
	stq $1,0($5)		# L : And back to memory
	ret $31,($26),1		# L0 :

within_one_quad:
	/*
	 * The whole range lies inside one quadword: build the merged
	 * quad from the old contents and the pattern, then store it once.
	 */
	ldq_u $1,0($16)		# L : old contents of the quad
	insql $17,$16,$2	# U : New bits
	mskql $1,$16,$4		# U : Clear old
	bis $2,$4,$2		# E : New result

	mskql $2,$6,$4		# U : keep that result below the end offset
	mskqh $1,$6,$2		# U : keep the old bytes from the end offset up
	bis $2,$4,$1		# E : merged quad
	stq_u $1,0($16)		# L : write it back

end:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :
	.end __constant_c_memset

	/*
	 * This is a replicant of the __constant_c_memset code, rescheduled
	 * to mask stalls.
Note that entry point names also had to change
	 */
	/*
	 * __memsetw: fill a byte range with a repeated 16-bit value.
	 *
	 * Register use, as read from the code below:
	 *	In:	$16	destination address
	 *		$17	fill value; the low 16 bits are replicated into
	 *			all four word lanes of a quadword
	 *		$18	count in bytes (it is added to $16 to form the end
	 *			address and shifted right by 3 to count quads);
	 *			tested as a signed value (<= 0 stores nothing)
	 *		$26	return address
	 *	Out:	$0	the destination address as passed in
	 *	Scratch: $1-$7; $16, $17 and $18 are also overwritten.
	 *
	 * NOTE(review): the head and tail merges shift the pattern by the
	 * byte offset of the start and end addresses, so the 16-bit lanes
	 * only line up for even addresses and counts -- presumably callers
	 * guarantee that; confirm.
	 */
	.align 5
	.ent __memsetw

__memsetw:
	.frame $30,0,$26,0
	.prologue 0

	inswl $17,0,$5		# U : 000000000000c1c2
	inswl $17,2,$2		# U : 00000000c1c20000
	bis $16,$16,$0		# E : return value
	addq $18,$16,$6		# E : end address = dest + count

	ble $18, end_w		# U : zero (or negative) length requested?
	inswl $17,4,$3		# U : 0000c1c200000000
	inswl $17,6,$4		# U : c1c2000000000000
	xor $16,$6,$1		# E : will complete write be within one quadword?

	or $2,$5,$2		# E : 00000000c1c2c1c2
	or $3,$4,$17		# E : c1c2c1c200000000
	bic $1,7,$1		# E : fit within a single quadword
	and $16,7,$3		# E : Target addr misalignment

	or $17,$2,$17		# E : c1c2c1c2c1c2c1c2
	beq $1,within_quad_w	# U : start and end share one quadword
	nop
	beq $3,aligned_w	# U : target is 0mod8

	/*
	 * Target address is misaligned, and won't fit within a quadword.
	 * Merge the pattern into the first (partial) quad and advance
	 * $16/$18 past it so the rest of the code sees an aligned start.
	 */
	ldq_u $4,0($16)		# L : Fetch first partial
	bis $16,$16,$5		# E : Save the address
	insql $17,$16,$2	# U : Insert new bytes
	subq $3,8,$3		# E : Invert (for addressing uses): misalign - 8

	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql $4,$16,$4		# U : clear relevant parts of the quad
	subq $16,$3,$16		# E : $16 is new aligned destination
	bis $2,$4,$1		# E : Final bytes

	nop
	stq_u $1,0($5)		# L : Store result
	nop
	nop

.align 4
aligned_w:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra $18,3,$3		# U : Number of remaining quads to write
	and $18,7,$18		# E : Number of trailing bytes to write
	bis $16,$16,$5		# E : Save dest address
	beq $3,no_quad_w	# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 * $16	Current destination address
	 * $5	A copy of $16
	 * $6	End address (dest + count as passed in); only its low
	 *	three bits are used later, to merge the trailing bytes
	 * $18	Number trailer bytes
	 * $3	Number quads to write
	 */

	and $16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq $3, 16, $4		# E : Only try to unroll if >= 128 bytes (16 quads)
	subq $2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt $4, loop_w		# U : fewer than 16 quads: simple loop

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 *
	 * NOTE(review): $1 = ($16 & 0x3f) - 0x40 lies in [-64, -1], so it is
	 * never zero and the beq below looks like it can never be taken.  The
	 * alignment loop therefore always runs at least once (eight times for
	 * an address that is already 0mod64), and it is that loop which loads
	 * $4 with an address for the first wh64.  If this branch were ever
	 * made live, $4 would still hold ($3 - 16) when wh64 executes.
	 */

	nop			# E :
	nop			# E :
	nop			# E :
	beq $1, $bigalign_w	# U :

$alignmod64_w:
	stq $17, 0($5)		# L :
	subq $3, 1, $3		# E : For consistency later
	addq $1, 8, $1		# E : Increment towards zero for alignment
	addq $5, 8, $4		# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq $5, 8, $5		# E : Inc address
	blt $1, $alignmod64_w	# U : loop until $1 reaches zero

$bigalign_w:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * We know that we'll be taking a minimum of one trip through the loop.
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future.
	 * Each trip computes the address used by the wh64 at the top of the
	 * next trip: $5 + 128 when at least 24 quads remain at the start of
	 * the trip, otherwise $5 + 64, which is the block the next trip
	 * itself stores to.
	 */

$do_wh64_w:
	wh64 ($4)		# L1 : memory subsystem write hint
	subq $3, 24, $2		# E : For determining future wh64 addresses
	stq $17, 0($5)		# L :
	nop			# E :

	addq $5, 128, $4	# E : speculative target of next wh64
	stq $17, 8($5)		# L :
	stq $17, 16($5)		# L :
	addq $5, 64, $7		# E : Fallback address for wh64 (== next trip addr)

	stq $17, 24($5)		# L :
	stq $17, 32($5)		# L :
	cmovlt $2, $7, $4	# E : Latency 2, extra mapping cycle
	nop

	stq $17, 40($5)		# L :
	stq $17, 48($5)		# L :
	subq $3, 16, $2		# E : Repeat the loop at least once more?
	nop

	stq $17, 56($5)		# L :
	addq $5, 64, $5		# E : advance past the 64 bytes just stored
	subq $3, 8, $3		# E : eight quads done
	bge $2, $do_wh64_w	# U : at least 8 quads still remain

	nop
	nop
	nop
	beq $3, no_quad_w	# U : Might have finished already

.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop_w:
	stq $17,0($5)		# L :
	subq $3,1,$3		# E : Decrement number quads left
	addq $5,8,$5		# E : Inc address
	bne $3,loop_w		# U : more?

no_quad_w:
	/*
	 * Write 0..7 trailing bytes.
	 */
	nop			# E :
	beq $18,end_w		# U : All done?
	ldq $7,0($5)		# L : current contents of the final quad
	mskqh $7,$6,$2		# U : Mask final quad (keep old bytes from the end offset up)

	insqh $17,$6,$4		# U : New bits (pattern below the end offset)
	bis $2,$4,$1		# E : Put it all together
	stq $1,0($5)		# L : And back to memory
	ret $31,($26),1		# L0 :

within_quad_w:
	/*
	 * The whole range lies inside one quadword: build the merged
	 * quad from the old contents and the pattern, then store it once.
	 */
	ldq_u $1,0($16)		# L : old contents of the quad
	insql $17,$16,$2	# U : New bits
	mskql $1,$16,$4		# U : Clear old
	bis $2,$4,$2		# E : New result

	mskql $2,$6,$4		# U : keep that result below the end offset
	mskqh $1,$6,$2		# U : keep the old bytes from the end offset up
	bis $2,$4,$1		# E : merged quad
	stq_u $1,0($16)		# L : write it back

end_w:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :

	.end __memsetw

/* memset is an alias for the byte-fill entry point __memset. */
memset = __memset