/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-memset.S
 *
 * This is an efficient (and relatively small) implementation of the C library
 * "memset()" function for the 21264 implementation of Alpha.
 *
 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * The algorithm for the leading and trailing quadwords remains the same,
 * however the loop has been unrolled to enable better memory throughput,
 * and the code has been replicated for each of the entry points: __memset
 * and __memset16 to permit better scheduling to eliminate the stalling
 * encountered during the mask replication.
 * A future enhancement might be to put in a byte store loop for really
 * small (say < 32 bytes) memset()s.  Whether or not that change would be
 * a win in the kernel would depend upon the contextual usage.
 * WARNING: Maintaining this is going to be more work than the above version,
 * as fixes will need to be made in multiple places.  The performance gain
 * is worth it.
 */
#include <asm/export.h>
	.set noat
	.set noreorder
.text
	.globl memset
	.globl __memset
	.globl ___memset
	.globl __memset16
	.globl __constant_c_memset

/*
 * ___memset - fill a memory range with one byte value.
 *
 * In:   $16 = destination address
 *       $17 = fill value; only the low byte is used
 *       $18 = number of bytes to write (nothing is written if <= 0)
 *       $26 = return address
 * Out:  $0  = the destination address as passed in
 * Uses: $1-$7 as scratch; $16, $17 and $18 are overwritten.
 *
 * The instructions are laid out in groups of four to match the
 * scheduling notes on each line; keep that grouping (and the nop
 * fillers) intact when editing.
 */
	.ent ___memset
.align 5
___memset:
	.frame $30,0,$26,0
	.prologue 0

	/*
	 * Serious stalling happens.  The only way to mitigate this is to
	 * undertake a major re-write to interleave the constant materialization
	 * with other parts of the fall-through code.  This is important, even
	 * though it makes maintenance tougher.
	 * Do this later.
	 */
	and $17,255,$1		# E : 00000000000000ch
	insbl $17,1,$2		# U : 000000000000ch00
	bis $16,$16,$0		# E : return value
	ble $18,end_b		# U : zero length requested?

	addq $18,$16,$6		# E : max address to write to
	bis $1,$2,$17		# E : 000000000000chch
	insbl $1,2,$3		# U : 0000000000ch0000
	insbl $1,3,$4		# U : 00000000ch000000

	or $3,$4,$3		# E : 00000000chch0000
	inswl $17,4,$5		# U : 0000chch00000000
	xor $16,$6,$1		# E : will complete write be within one quadword?
	inswl $17,6,$2		# U : chch000000000000

	or $17,$3,$17		# E : 00000000chchchch
	or $2,$5,$2		# E : chchchch00000000
	bic $1,7,$1		# E : fit within a single quadword?
	and $16,7,$3		# E : Target addr misalignment

	or $17,$2,$17		# E : chchchchchchchch
	beq $1,within_quad_b	# U :
	nop			# E :
	beq $3,aligned_b	# U : target is 0mod8

	/*
	 * Target address is misaligned, and won't fit within a quadword.
	 * Merge the new bytes into the first (partial) quadword, then
	 * advance $16 to the next quadword boundary and shrink the count
	 * by the number of bytes just written.
	 */
	ldq_u $4,0($16)		# L : Fetch first partial
	bis $16,$16,$5		# E : Save the address
	insql $17,$16,$2	# U : Insert new bytes
	subq $3,8,$3		# E : Invert (for addressing uses)

	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql $4,$16,$4		# U : clear relevant parts of the quad
	subq $16,$3,$16		# E : $16 is new aligned destination
	bis $2,$4,$1		# E : Final bytes

	nop
	stq_u $1,0($5)		# L : Store result
	nop
	nop

.align 4
aligned_b:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra $18,3,$3		# U : Number of remaining quads to write
	and $18,7,$18		# E : Number of trailing bytes to write
	bis $16,$16,$5		# E : Save dest address
	beq $3,no_quad_b	# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 * $16	Current destination address
	 * $5	A copy of $16
	 * $6	The max quadword address to write to
	 * $18	Number trailer bytes
	 * $3	Number quads to write
	 */

	and $16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq $3, 16, $4		# E : Only try to unroll if >= 16 quads (128 bytes)
	subq $2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt $4, loop_b		# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 *
	 * The skip test uses $2 (dest & 0x3f).  The biased counter $1 is
	 * $2 - 0x40, which is always negative at this point, so it cannot
	 * tell us that the destination is already 0mod64.  $4 is pointed
	 * at the destination first because the unrolled loop issues its
	 * wh64 on $4 before doing anything else.
	 */

	bis $5,$5,$4		# E : wh64 address if we go straight to the loop
	nop			# E :
	nop			# E :
	beq $2, $bigalign_b	# U : already 0mod64, no alignment stores needed

$alignmod64_b:
	stq $17, 0($5)		# L :
	subq $3, 1, $3		# E : For consistency later
	addq $1, 8, $1		# E : Increment towards zero for alignment
	addq $5, 8, $4		# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq $5, 8, $5		# E : Inc address
	blt $1, $alignmod64_b	# U :

$bigalign_b:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $4 - address for the first wh64 (equal to $5 on entry)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * We know that we'll be taking a minimum of one trip through.
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
	 * The wh64 is issued for the starting destination address for trip +2
	 * through the loop, and if there are less than two trips left, the target
	 * address will be for the current trip.
	 */

$do_wh64_b:
	wh64 ($4)		# L1 : memory subsystem write hint
	subq $3, 24, $2		# E : For determining future wh64 addresses
	stq $17, 0($5)		# L :
	nop			# E :

	addq $5, 128, $4	# E : speculative target of next wh64
	stq $17, 8($5)		# L :
	stq $17, 16($5)		# L :
	addq $5, 64, $7		# E : Fallback address for wh64 (== next trip addr)

	stq $17, 24($5)		# L :
	stq $17, 32($5)		# L :
	cmovlt $2, $7, $4	# E : Latency 2, extra mapping cycle
	nop

	stq $17, 40($5)		# L :
	stq $17, 48($5)		# L :
	subq $3, 16, $2		# E : Repeat the loop at least once more?
	nop

	stq $17, 56($5)		# L :
	addq $5, 64, $5		# E :
	subq $3, 8, $3		# E :
	bge $2, $do_wh64_b	# U :

	nop
	nop
	nop
	beq $3, no_quad_b	# U : Might have finished already

.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop_b:
	stq $17,0($5)		# L :
	subq $3,1,$3		# E : Decrement number quads left
	addq $5,8,$5		# E : Inc address
	bne $3,loop_b		# U : more?

no_quad_b:
	/*
	 * Write 0..7 trailing bytes.
	 */
	nop			# E :
	beq $18,end_b		# U : All done?
	ldq $7,0($5)		# L :
	mskqh $7,$6,$2		# U : Mask final quad

	insqh $17,$6,$4		# U : New bits
	bis $2,$4,$1		# E : Put it all together
	stq $1,0($5)		# L : And back to memory
	ret $31,($26),1		# L0 :

within_quad_b:
	/*
	 * The whole range lies inside one quadword: merge the new bytes
	 * between the start offset ($16) and the end offset ($6) into it.
	 */
	ldq_u $1,0($16)		# L :
	insql $17,$16,$2	# U : New bits
	mskql $1,$16,$4		# U : Clear old
	bis $2,$4,$2		# E : New result

	mskql $2,$6,$4		# U :
	mskqh $1,$6,$2		# U :
	bis $2,$4,$1		# E :
	stq_u $1,0($16)		# L :

end_b:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :
	.end ___memset
	EXPORT_SYMBOL(___memset)

	/*
	 * This is the original body of code, prior to replication and
	 * rescheduling.  Leave it here, as there may be calls to this
	 * entry point.
*/

/*
 * __constant_c_memset - fill a memory range from a ready-made 64-bit pattern.
 *
 * In:   $16 = destination address
 *       $17 = 64-bit fill pattern; it is stored as-is, no byte
 *             replication is done in this routine
 *       $18 = number of bytes to write (nothing is written if <= 0)
 *       $26 = return address
 * Out:  $0  = the destination address as passed in
 * Uses: $1-$7 as scratch; $16 and $18 are overwritten.
 */
.align 4
	.ent __constant_c_memset
__constant_c_memset:
	.frame $30,0,$26,0
	.prologue 0

	addq $18,$16,$6		# E : max address to write to
	bis $16,$16,$0		# E : return value
	xor $16,$6,$1		# E : will complete write be within one quadword?
	ble $18,end		# U : zero length requested?

	bic $1,7,$1		# E : fit within a single quadword
	beq $1,within_one_quad	# U :
	and $16,7,$3		# E : Target addr misalignment
	beq $3,aligned		# U : target is 0mod8

	/*
	 * Target address is misaligned, and won't fit within a quadword.
	 * Merge the new bytes into the first (partial) quadword, then
	 * advance $16 to the next quadword boundary and shrink the count
	 * by the number of bytes just written.
	 */
	ldq_u $4,0($16)		# L : Fetch first partial
	bis $16,$16,$5		# E : Save the address
	insql $17,$16,$2	# U : Insert new bytes
	subq $3,8,$3		# E : Invert (for addressing uses)

	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql $4,$16,$4		# U : clear relevant parts of the quad
	subq $16,$3,$16		# E : $16 is new aligned destination
	bis $2,$4,$1		# E : Final bytes

	nop
	stq_u $1,0($5)		# L : Store result
	nop
	nop

.align 4
aligned:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra $18,3,$3		# U : Number of remaining quads to write
	and $18,7,$18		# E : Number of trailing bytes to write
	bis $16,$16,$5		# E : Save dest address
	beq $3,no_quad		# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 * $16	Current destination address
	 * $5	A copy of $16
	 * $6	The max quadword address to write to
	 * $18	Number trailer bytes
	 * $3	Number quads to write
	 */

	and $16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq $3, 16, $4		# E : Only try to unroll if >= 16 quads (128 bytes)
	subq $2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt $4, loop		# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 *
	 * The skip test uses $2 (dest & 0x3f).  The biased counter $1 is
	 * $2 - 0x40, which is always negative at this point, so it cannot
	 * tell us that the destination is already 0mod64.  $4 is pointed
	 * at the destination first because the unrolled loop issues its
	 * wh64 on $4 before doing anything else.
	 */

	bis $5,$5,$4		# E : wh64 address if we go straight to the loop
	nop			# E :
	nop			# E :
	beq $2, $bigalign	# U : already 0mod64, no alignment stores needed

$alignmod64:
	stq $17, 0($5)		# L :
	subq $3, 1, $3		# E : For consistency later
	addq $1, 8, $1		# E : Increment towards zero for alignment
	addq $5, 8, $4		# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq $5, 8, $5		# E : Inc address
	blt $1, $alignmod64	# U :

$bigalign:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $4 - address for the first wh64 (equal to $5 on entry)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * We know that we'll be taking a minimum of one trip through.
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
	 * The wh64 is issued for the starting destination address for trip +2
	 * through the loop, and if there are less than two trips left, the target
	 * address will be for the current trip.
	 */

$do_wh64:
	wh64 ($4)		# L1 : memory subsystem write hint
	subq $3, 24, $2		# E : For determining future wh64 addresses
	stq $17, 0($5)		# L :
	nop			# E :

	addq $5, 128, $4	# E : speculative target of next wh64
	stq $17, 8($5)		# L :
	stq $17, 16($5)		# L :
	addq $5, 64, $7		# E : Fallback address for wh64 (== next trip addr)

	stq $17, 24($5)		# L :
	stq $17, 32($5)		# L :
	cmovlt $2, $7, $4	# E : Latency 2, extra mapping cycle
	nop

	stq $17, 40($5)		# L :
	stq $17, 48($5)		# L :
	subq $3, 16, $2		# E : Repeat the loop at least once more?
	nop

	stq $17, 56($5)		# L :
	addq $5, 64, $5		# E :
	subq $3, 8, $3		# E :
	bge $2, $do_wh64	# U :

	nop
	nop
	nop
	beq $3, no_quad		# U : Might have finished already

.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop:
	stq $17,0($5)		# L :
	subq $3,1,$3		# E : Decrement number quads left
	addq $5,8,$5		# E : Inc address
	bne $3,loop		# U : more?

no_quad:
	/*
	 * Write 0..7 trailing bytes.
	 */
	nop			# E :
	beq $18,end		# U : All done?
	ldq $7,0($5)		# L :
	mskqh $7,$6,$2		# U : Mask final quad

	insqh $17,$6,$4		# U : New bits
	bis $2,$4,$1		# E : Put it all together
	stq $1,0($5)		# L : And back to memory
	ret $31,($26),1		# L0 :

within_one_quad:
	/*
	 * The whole range lies inside one quadword: merge the new bytes
	 * between the start offset ($16) and the end offset ($6) into it.
	 */
	ldq_u $1,0($16)		# L :
	insql $17,$16,$2	# U : New bits
	mskql $1,$16,$4		# U : Clear old
	bis $2,$4,$2		# E : New result

	mskql $2,$6,$4		# U :
	mskqh $1,$6,$2		# U :
	bis $2,$4,$1		# E :
	stq_u $1,0($16)		# L :

end:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :
	.end __constant_c_memset
	EXPORT_SYMBOL(__constant_c_memset)

	/*
	 * This is a replicant of the __constant_c_memset code, rescheduled
	 * to mask stalls.
Note that entry point names also had to change
	 */

/*
 * __memset16 - fill a memory range with a 16-bit value.
 *
 * In:   $16 = destination address
 *       $17 = fill value; only the low 16 bits are used
 *       $18 = length in bytes (nothing is written if <= 0)
 *       $26 = return address
 * Out:  $0  = the destination address as passed in
 * Uses: $1-$7 as scratch; $16, $17 and $18 are overwritten.
 *
 * NOTE(review): the replicated pattern is laid out relative to quadword
 * boundaries, so this presumably expects a 2-byte-aligned destination
 * and an even length - confirm against the callers.
 */
	.align 5
	.ent __memset16

__memset16:
	.frame $30,0,$26,0
	.prologue 0

	inswl $17,0,$5		# U : 000000000000c1c2
	inswl $17,2,$2		# U : 00000000c1c20000
	bis $16,$16,$0		# E : return value
	addq $18,$16,$6		# E : max address to write to

	ble $18, end_w		# U : zero length requested?
	inswl $17,4,$3		# U : 0000c1c200000000
	inswl $17,6,$4		# U : c1c2000000000000
	xor $16,$6,$1		# E : will complete write be within one quadword?

	or $2,$5,$2		# E : 00000000c1c2c1c2
	or $3,$4,$17		# E : c1c2c1c200000000
	bic $1,7,$1		# E : fit within a single quadword
	and $16,7,$3		# E : Target addr misalignment

	or $17,$2,$17		# E : c1c2c1c2c1c2c1c2
	beq $1,within_quad_w	# U :
	nop
	beq $3,aligned_w	# U : target is 0mod8

	/*
	 * Target address is misaligned, and won't fit within a quadword.
	 * Merge the new bytes into the first (partial) quadword, then
	 * advance $16 to the next quadword boundary and shrink the count
	 * by the number of bytes just written.
	 */
	ldq_u $4,0($16)		# L : Fetch first partial
	bis $16,$16,$5		# E : Save the address
	insql $17,$16,$2	# U : Insert new bytes
	subq $3,8,$3		# E : Invert (for addressing uses)

	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql $4,$16,$4		# U : clear relevant parts of the quad
	subq $16,$3,$16		# E : $16 is new aligned destination
	bis $2,$4,$1		# E : Final bytes

	nop
	stq_u $1,0($5)		# L : Store result
	nop
	nop

.align 4
aligned_w:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra $18,3,$3		# U : Number of remaining quads to write
	and $18,7,$18		# E : Number of trailing bytes to write
	bis $16,$16,$5		# E : Save dest address
	beq $3,no_quad_w	# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 * $16	Current destination address
	 * $5	A copy of $16
	 * $6	The max quadword address to write to
	 * $18	Number trailer bytes
	 * $3	Number quads to write
	 */

	and $16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq $3, 16, $4		# E : Only try to unroll if >= 16 quads (128 bytes)
	subq $2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt $4, loop_w		# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 *
	 * The skip test uses $2 (dest & 0x3f).  The biased counter $1 is
	 * $2 - 0x40, which is always negative at this point, so it cannot
	 * tell us that the destination is already 0mod64.  $4 is pointed
	 * at the destination first because the unrolled loop issues its
	 * wh64 on $4 before doing anything else.
	 */

	bis $5,$5,$4		# E : wh64 address if we go straight to the loop
	nop			# E :
	nop			# E :
	beq $2, $bigalign_w	# U : already 0mod64, no alignment stores needed

$alignmod64_w:
	stq $17, 0($5)		# L :
	subq $3, 1, $3		# E : For consistency later
	addq $1, 8, $1		# E : Increment towards zero for alignment
	addq $5, 8, $4		# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq $5, 8, $5		# E : Inc address
	blt $1, $alignmod64_w	# U :

$bigalign_w:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $4 - address for the first wh64 (equal to $5 on entry)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * We know that we'll be taking a minimum of one trip through.
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
	 * The wh64 is issued for the starting destination address for trip +2
	 * through the loop, and if there are less than two trips left, the target
	 * address will be for the current trip.
	 */

$do_wh64_w:
	wh64 ($4)		# L1 : memory subsystem write hint
	subq $3, 24, $2		# E : For determining future wh64 addresses
	stq $17, 0($5)		# L :
	nop			# E :

	addq $5, 128, $4	# E : speculative target of next wh64
	stq $17, 8($5)		# L :
	stq $17, 16($5)		# L :
	addq $5, 64, $7		# E : Fallback address for wh64 (== next trip addr)

	stq $17, 24($5)		# L :
	stq $17, 32($5)		# L :
	cmovlt $2, $7, $4	# E : Latency 2, extra mapping cycle
	nop

	stq $17, 40($5)		# L :
	stq $17, 48($5)		# L :
	subq $3, 16, $2		# E : Repeat the loop at least once more?
	nop

	stq $17, 56($5)		# L :
	addq $5, 64, $5		# E :
	subq $3, 8, $3		# E :
	bge $2, $do_wh64_w	# U :

	nop
	nop
	nop
	beq $3, no_quad_w	# U : Might have finished already

.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop_w:
	stq $17,0($5)		# L :
	subq $3,1,$3		# E : Decrement number quads left
	addq $5,8,$5		# E : Inc address
	bne $3,loop_w		# U : more?

no_quad_w:
	/*
	 * Write 0..7 trailing bytes.
	 */
	nop			# E :
	beq $18,end_w		# U : All done?
	ldq $7,0($5)		# L :
	mskqh $7,$6,$2		# U : Mask final quad

	insqh $17,$6,$4		# U : New bits
	bis $2,$4,$1		# E : Put it all together
	stq $1,0($5)		# L : And back to memory
	ret $31,($26),1		# L0 :

within_quad_w:
	/*
	 * The whole range lies inside one quadword: merge the new bytes
	 * between the start offset ($16) and the end offset ($6) into it.
	 */
	ldq_u $1,0($16)		# L :
	insql $17,$16,$2	# U : New bits
	mskql $1,$16,$4		# U : Clear old
	bis $2,$4,$2		# E : New result

	mskql $2,$6,$4		# U :
	mskqh $1,$6,$2		# U :
	bis $2,$4,$1		# E :
	stq_u $1,0($16)		# L :

end_w:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :

	.end __memset16
	EXPORT_SYMBOL(__memset16)

/* memset and __memset are aliases for the byte-fill routine ___memset. */
memset = ___memset
__memset = ___memset
	EXPORT_SYMBOL(memset)
	EXPORT_SYMBOL(__memset)