/* SPDX-License-Identifier: GPL-2.0-or-later */
#
# Accelerated chacha20 implementation for ppc64le.
#
# Copyright 2023- IBM Corp. All rights reserved
#
#===================================================================================
# Written by Danny Tsen <dtsen@us.ibm.com>
#
# chacha_p10le_8x(u32 *state, byte *dst, const byte *src,
#                 size_t len, int nrounds);
#
# do rounds, 8 quarter rounds
# 1. a += b; d ^= a; d <<<= 16;
# 2. c += d; b ^= c; b <<<= 12;
# 3. a += b; d ^= a; d <<<= 8;
# 4. c += d; b ^= c; b <<<= 7;
#
# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 16
# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 12
# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 8
# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 7
#
# 4 blocks (a b c d)
#
# a0 b0 c0 d0
# a1 b1 c1 d1
# ...
# a4 b4 c4 d4
# ...
# a8 b8 c8 d8
# ...
# a12 b12 c12 d12
# a13 ...
# a14 ...
# a15 b15 c15 d15
#
# Column round (v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
# Diagonal round (v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
#

#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>
#include <asm/asm-compat.h>
#include <linux/linkage.h>

.machine "any"
.text

.macro SAVE_GPR GPR OFFSET FRAME
        std \GPR,\OFFSET(\FRAME)
.endm

.macro SAVE_VRS VRS OFFSET FRAME
        li 16, \OFFSET
        stvx \VRS, 16, \FRAME
.endm

.macro SAVE_VSX VSX OFFSET FRAME
        li 16, \OFFSET
        stxvx \VSX, 16, \FRAME
.endm

.macro RESTORE_GPR GPR OFFSET FRAME
        ld \GPR,\OFFSET(\FRAME)
.endm

.macro RESTORE_VRS VRS OFFSET FRAME
        li 16, \OFFSET
        lvx \VRS, 16, \FRAME
.endm

.macro RESTORE_VSX VSX OFFSET FRAME
        li 16, \OFFSET
        lxvx \VSX, 16, \FRAME
.endm

.macro SAVE_REGS
        mflr 0
        std 0, 16(1)
        stdu 1,-752(1)

        SAVE_GPR 14, 112, 1
        SAVE_GPR 15, 120, 1
        SAVE_GPR 16, 128, 1
        SAVE_GPR 17, 136, 1
        SAVE_GPR 18, 144, 1
        SAVE_GPR 19, 152, 1
        SAVE_GPR 20, 160, 1
        SAVE_GPR 21, 168, 1
        SAVE_GPR 22, 176, 1
        SAVE_GPR 23, 184, 1
        SAVE_GPR 24, 192, 1
        SAVE_GPR 25, 200, 1
        SAVE_GPR 26, 208, 1
        SAVE_GPR 27, 216, 1
        SAVE_GPR 28, 224, 1
        SAVE_GPR 29, 232, 1
        SAVE_GPR 30, 240, 1
        SAVE_GPR 31, 248, 1

        addi 9, 1, 256
        SAVE_VRS 20, 0, 9
        SAVE_VRS 21, 16, 9
        SAVE_VRS 22, 32, 9
        SAVE_VRS 23, 48, 9
        SAVE_VRS 24, 64, 9
        SAVE_VRS 25, 80, 9
        SAVE_VRS 26, 96, 9
        SAVE_VRS 27, 112, 9
        SAVE_VRS 28, 128, 9
        SAVE_VRS 29, 144, 9
        SAVE_VRS 30, 160, 9
        SAVE_VRS 31, 176, 9

        SAVE_VSX 14, 192, 9
        SAVE_VSX 15, 208, 9
        SAVE_VSX 16, 224, 9
        SAVE_VSX 17, 240, 9
        SAVE_VSX 18, 256, 9
        SAVE_VSX 19, 272, 9
        SAVE_VSX 20, 288, 9
        SAVE_VSX 21, 304, 9
        SAVE_VSX 22, 320, 9
        SAVE_VSX 23, 336, 9
        SAVE_VSX 24, 352, 9
        SAVE_VSX 25, 368, 9
        SAVE_VSX 26, 384, 9
        SAVE_VSX 27, 400, 9
        SAVE_VSX 28, 416, 9
        SAVE_VSX 29, 432, 9
        SAVE_VSX 30, 448, 9
        SAVE_VSX 31, 464, 9
.endm # SAVE_REGS

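#
# Rough stack frame map implied by the offsets in SAVE_REGS/RESTORE_REGS
# (descriptive note only; 752 bytes are allocated by the stdu above, and LR
# is saved into the caller's frame at 16(1) before the stdu):
#   112(1) - 248(1)   non-volatile GPRs r14 - r31
#   256(1) - 432(1)   non-volatile VRs v20 - v31  (via r9 = r1 + 256, offsets 0 - 176)
#   448(1) - 720(1)   VSX vs14 - vs31             (offsets 192 - 464 from r9)
#
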
.macro RESTORE_REGS
        addi 9, 1, 256
        RESTORE_VRS 20, 0, 9
        RESTORE_VRS 21, 16, 9
        RESTORE_VRS 22, 32, 9
        RESTORE_VRS 23, 48, 9
        RESTORE_VRS 24, 64, 9
        RESTORE_VRS 25, 80, 9
        RESTORE_VRS 26, 96, 9
        RESTORE_VRS 27, 112, 9
        RESTORE_VRS 28, 128, 9
        RESTORE_VRS 29, 144, 9
        RESTORE_VRS 30, 160, 9
        RESTORE_VRS 31, 176, 9

        RESTORE_VSX 14, 192, 9
        RESTORE_VSX 15, 208, 9
        RESTORE_VSX 16, 224, 9
        RESTORE_VSX 17, 240, 9
        RESTORE_VSX 18, 256, 9
        RESTORE_VSX 19, 272, 9
        RESTORE_VSX 20, 288, 9
        RESTORE_VSX 21, 304, 9
        RESTORE_VSX 22, 320, 9
        RESTORE_VSX 23, 336, 9
        RESTORE_VSX 24, 352, 9
        RESTORE_VSX 25, 368, 9
        RESTORE_VSX 26, 384, 9
        RESTORE_VSX 27, 400, 9
        RESTORE_VSX 28, 416, 9
        RESTORE_VSX 29, 432, 9
        RESTORE_VSX 30, 448, 9
        RESTORE_VSX 31, 464, 9

        RESTORE_GPR 14, 112, 1
        RESTORE_GPR 15, 120, 1
        RESTORE_GPR 16, 128, 1
        RESTORE_GPR 17, 136, 1
        RESTORE_GPR 18, 144, 1
        RESTORE_GPR 19, 152, 1
        RESTORE_GPR 20, 160, 1
        RESTORE_GPR 21, 168, 1
        RESTORE_GPR 22, 176, 1
        RESTORE_GPR 23, 184, 1
        RESTORE_GPR 24, 192, 1
        RESTORE_GPR 25, 200, 1
        RESTORE_GPR 26, 208, 1
        RESTORE_GPR 27, 216, 1
        RESTORE_GPR 28, 224, 1
        RESTORE_GPR 29, 232, 1
        RESTORE_GPR 30, 240, 1
        RESTORE_GPR 31, 248, 1

        addi 1, 1, 752
        ld 0, 16(1)
        mtlr 0
.endm # RESTORE_REGS

.macro QT_loop_8x
        # QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
        xxlor 0, 32+25, 32+25
        xxlor 32+25, 20, 20
        vadduwm 0, 0, 4
        vadduwm 1, 1, 5
        vadduwm 2, 2, 6
        vadduwm 3, 3, 7
        vadduwm 16, 16, 20
        vadduwm 17, 17, 21
        vadduwm 18, 18, 22
        vadduwm 19, 19, 23

        vpermxor 12, 12, 0, 25
        vpermxor 13, 13, 1, 25
        vpermxor 14, 14, 2, 25
        vpermxor 15, 15, 3, 25
        vpermxor 28, 28, 16, 25
        vpermxor 29, 29, 17, 25
        vpermxor 30, 30, 18, 25
        vpermxor 31, 31, 19, 25
        xxlor 32+25, 0, 0
        vadduwm 8, 8, 12
        vadduwm 9, 9, 13
        vadduwm 10, 10, 14
        vadduwm 11, 11, 15
        vadduwm 24, 24, 28
        vadduwm 25, 25, 29
        vadduwm 26, 26, 30
        vadduwm 27, 27, 31
        vxor 4, 4, 8
        vxor 5, 5, 9
        vxor 6, 6, 10
        vxor 7, 7, 11
        vxor 20, 20, 24
        vxor 21, 21, 25
        vxor 22, 22, 26
        vxor 23, 23, 27

        xxlor 0, 32+25, 32+25
        xxlor 32+25, 21, 21
        vrlw 4, 4, 25           # rotate by 12
        vrlw 5, 5, 25
        vrlw 6, 6, 25
        vrlw 7, 7, 25
        vrlw 20, 20, 25         # rotate by 12
        vrlw 21, 21, 25
        vrlw 22, 22, 25
        vrlw 23, 23, 25
        xxlor 32+25, 0, 0
        vadduwm 0, 0, 4
        vadduwm 1, 1, 5
        vadduwm 2, 2, 6
        vadduwm 3, 3, 7
        vadduwm 16, 16, 20
        vadduwm 17, 17, 21
        vadduwm 18, 18, 22
        vadduwm 19, 19, 23

        xxlor 0, 32+25, 32+25
        xxlor 32+25, 22, 22
        vpermxor 12, 12, 0, 25
        vpermxor 13, 13, 1, 25
        vpermxor 14, 14, 2, 25
        vpermxor 15, 15, 3, 25
        vpermxor 28, 28, 16, 25
        vpermxor 29, 29, 17, 25
        vpermxor 30, 30, 18, 25
        vpermxor 31, 31, 19, 25
        xxlor 32+25, 0, 0
        vadduwm 8, 8, 12
        vadduwm 9, 9, 13
        vadduwm 10, 10, 14
        vadduwm 11, 11, 15
        vadduwm 24, 24, 28
        vadduwm 25, 25, 29
        vadduwm 26, 26, 30
        vadduwm 27, 27, 31
        xxlor 0, 32+28, 32+28
        xxlor 32+28, 23, 23
        vxor 4, 4, 8
        vxor 5, 5, 9
        vxor 6, 6, 10
        vxor 7, 7, 11
        vxor 20, 20, 24
        vxor 21, 21, 25
        vxor 22, 22, 26
        vxor 23, 23, 27
        vrlw 4, 4, 28           # rotate by 7
        vrlw 5, 5, 28
        vrlw 6, 6, 28
        vrlw 7, 7, 28
        vrlw 20, 20, 28         # rotate by 7
        vrlw 21, 21, 28
        vrlw 22, 22, 28
        vrlw 23, 23, 28
        xxlor 32+28, 0, 0

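        # Note (descriptive only): the 8-block path keeps all 32 VRs busy with
        # working state, so the rotate amounts and vpermxor masks live in VSX
        # scratch (vs20 - vs23).  v25 and v28 are borrowed as constant
        # registers around each rotate; their live state values are parked in
        # vs0 by the surrounding xxlor pairs.  The diagonal round below uses
        # the same pattern.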
        # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
        xxlor 0, 32+25, 32+25
        xxlor 32+25, 20, 20
        vadduwm 0, 0, 5
        vadduwm 1, 1, 6
        vadduwm 2, 2, 7
        vadduwm 3, 3, 4
        vadduwm 16, 16, 21
        vadduwm 17, 17, 22
        vadduwm 18, 18, 23
        vadduwm 19, 19, 20

        vpermxor 15, 15, 0, 25
        vpermxor 12, 12, 1, 25
        vpermxor 13, 13, 2, 25
        vpermxor 14, 14, 3, 25
        vpermxor 31, 31, 16, 25
        vpermxor 28, 28, 17, 25
        vpermxor 29, 29, 18, 25
        vpermxor 30, 30, 19, 25

        xxlor 32+25, 0, 0
        vadduwm 10, 10, 15
        vadduwm 11, 11, 12
        vadduwm 8, 8, 13
        vadduwm 9, 9, 14
        vadduwm 26, 26, 31
        vadduwm 27, 27, 28
        vadduwm 24, 24, 29
        vadduwm 25, 25, 30
        vxor 5, 5, 10
        vxor 6, 6, 11
        vxor 7, 7, 8
        vxor 4, 4, 9
        vxor 21, 21, 26
        vxor 22, 22, 27
        vxor 23, 23, 24
        vxor 20, 20, 25

        xxlor 0, 32+25, 32+25
        xxlor 32+25, 21, 21
        vrlw 5, 5, 25
        vrlw 6, 6, 25
        vrlw 7, 7, 25
        vrlw 4, 4, 25
        vrlw 21, 21, 25
        vrlw 22, 22, 25
        vrlw 23, 23, 25
        vrlw 20, 20, 25
        xxlor 32+25, 0, 0

        vadduwm 0, 0, 5
        vadduwm 1, 1, 6
        vadduwm 2, 2, 7
        vadduwm 3, 3, 4
        vadduwm 16, 16, 21
        vadduwm 17, 17, 22
        vadduwm 18, 18, 23
        vadduwm 19, 19, 20

        xxlor 0, 32+25, 32+25
        xxlor 32+25, 22, 22
        vpermxor 15, 15, 0, 25
        vpermxor 12, 12, 1, 25
        vpermxor 13, 13, 2, 25
        vpermxor 14, 14, 3, 25
        vpermxor 31, 31, 16, 25
        vpermxor 28, 28, 17, 25
        vpermxor 29, 29, 18, 25
        vpermxor 30, 30, 19, 25
        xxlor 32+25, 0, 0

        vadduwm 10, 10, 15
        vadduwm 11, 11, 12
        vadduwm 8, 8, 13
        vadduwm 9, 9, 14
        vadduwm 26, 26, 31
        vadduwm 27, 27, 28
        vadduwm 24, 24, 29
        vadduwm 25, 25, 30

        xxlor 0, 32+28, 32+28
        xxlor 32+28, 23, 23
        vxor 5, 5, 10
        vxor 6, 6, 11
        vxor 7, 7, 8
        vxor 4, 4, 9
        vxor 21, 21, 26
        vxor 22, 22, 27
        vxor 23, 23, 24
        vxor 20, 20, 25
        vrlw 5, 5, 28
        vrlw 6, 6, 28
        vrlw 7, 7, 28
        vrlw 4, 4, 28
        vrlw 21, 21, 28
        vrlw 22, 22, 28
        vrlw 23, 23, 28
        vrlw 20, 20, 28
        xxlor 32+28, 0, 0
.endm

.macro QT_loop_4x
        # QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
        vadduwm 0, 0, 4
        vadduwm 1, 1, 5
        vadduwm 2, 2, 6
        vadduwm 3, 3, 7
        vpermxor 12, 12, 0, 20
        vpermxor 13, 13, 1, 20
        vpermxor 14, 14, 2, 20
        vpermxor 15, 15, 3, 20
        vadduwm 8, 8, 12
        vadduwm 9, 9, 13
        vadduwm 10, 10, 14
        vadduwm 11, 11, 15
        vxor 4, 4, 8
        vxor 5, 5, 9
        vxor 6, 6, 10
        vxor 7, 7, 11
        vrlw 4, 4, 21
        vrlw 5, 5, 21
        vrlw 6, 6, 21
        vrlw 7, 7, 21
        vadduwm 0, 0, 4
        vadduwm 1, 1, 5
        vadduwm 2, 2, 6
        vadduwm 3, 3, 7
        vpermxor 12, 12, 0, 22
        vpermxor 13, 13, 1, 22
        vpermxor 14, 14, 2, 22
        vpermxor 15, 15, 3, 22
        vadduwm 8, 8, 12
        vadduwm 9, 9, 13
        vadduwm 10, 10, 14
        vadduwm 11, 11, 15
        vxor 4, 4, 8
        vxor 5, 5, 9
        vxor 6, 6, 10
        vxor 7, 7, 11
        vrlw 4, 4, 23
        vrlw 5, 5, 23
        vrlw 6, 6, 23
        vrlw 7, 7, 23

        # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
        vadduwm 0, 0, 5
        vadduwm 1, 1, 6
        vadduwm 2, 2, 7
        vadduwm 3, 3, 4
        vpermxor 15, 15, 0, 20
        vpermxor 12, 12, 1, 20
        vpermxor 13, 13, 2, 20
        vpermxor 14, 14, 3, 20
        vadduwm 10, 10, 15
        vadduwm 11, 11, 12
        vadduwm 8, 8, 13
        vadduwm 9, 9, 14
        vxor 5, 5, 10
        vxor 6, 6, 11
        vxor 7, 7, 8
        vxor 4, 4, 9
        vrlw 5, 5, 21
        vrlw 6, 6, 21
        vrlw 7, 7, 21
        vrlw 4, 4, 21
        vadduwm 0, 0, 5
        vadduwm 1, 1, 6
        vadduwm 2, 2, 7
        vadduwm 3, 3, 4
        vpermxor 15, 15, 0, 22
        vpermxor 12, 12, 1, 22
        vpermxor 13, 13, 2, 22
        vpermxor 14, 14, 3, 22
        vadduwm 10, 10, 15
        vadduwm 11, 11, 12
        vadduwm 8, 8, 13
        vadduwm 9, 9, 14
        vxor 5, 5, 10
        vxor 6, 6, 11
        vxor 7, 7, 8
        vxor 4, 4, 9
        vrlw 5, 5, 23
        vrlw 6, 6, 23
        vrlw 7, 7, 23
        vrlw 4, 4, 23
.endm

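#
# For reference, each vadduwm/vpermxor/vxor/vrlw group in the macros above
# maps to one step of the scalar quarter round described in the header.  A
# minimal C sketch of that quarter round (illustrative only, not part of
# this file's interface; rol32() as in <linux/bitops.h>):
#
#       static void chacha_qr(u32 *a, u32 *b, u32 *c, u32 *d)
#       {
#               *a += *b; *d ^= *a; *d = rol32(*d, 16);
#               *c += *d; *b ^= *c; *b = rol32(*b, 12);
#               *a += *b; *d ^= *a; *d = rol32(*d, 8);
#               *c += *d; *b ^= *c; *b = rol32(*b, 7);
#       }
#
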
# Transpose
.macro TP_4x a0 a1 a2 a3
        xxmrghw  10, 32+\a0, 32+\a1     # a0, a1, b0, b1
        xxmrghw  11, 32+\a2, 32+\a3     # a2, a3, b2, b3
        xxmrglw  12, 32+\a0, 32+\a1     # c0, c1, d0, d1
        xxmrglw  13, 32+\a2, 32+\a3     # c2, c3, d2, d3
        xxpermdi 32+\a0, 10, 11, 0      # a0, a1, a2, a3
        xxpermdi 32+\a1, 10, 11, 3      # b0, b1, b2, b3
        xxpermdi 32+\a2, 12, 13, 0      # c0, c1, c2, c3
        xxpermdi 32+\a3, 12, 13, 3      # d0, d1, d2, d3
.endm

# key stream = working state + state
.macro Add_state S
        vadduwm \S+0, \S+0, 16-\S
        vadduwm \S+4, \S+4, 17-\S
        vadduwm \S+8, \S+8, 18-\S
        vadduwm \S+12, \S+12, 19-\S

        vadduwm \S+1, \S+1, 16-\S
        vadduwm \S+5, \S+5, 17-\S
        vadduwm \S+9, \S+9, 18-\S
        vadduwm \S+13, \S+13, 19-\S

        vadduwm \S+2, \S+2, 16-\S
        vadduwm \S+6, \S+6, 17-\S
        vadduwm \S+10, \S+10, 18-\S
        vadduwm \S+14, \S+14, 19-\S

        vadduwm \S+3, \S+3, 16-\S
        vadduwm \S+7, \S+7, 17-\S
        vadduwm \S+11, \S+11, 18-\S
        vadduwm \S+15, \S+15, 19-\S
.endm

#
# write 256 bytes
#
.macro Write_256 S
        add 9, 14, 5
        add 16, 14, 4
        lxvw4x 0, 0, 9
        lxvw4x 1, 17, 9
        lxvw4x 2, 18, 9
        lxvw4x 3, 19, 9
        lxvw4x 4, 20, 9
        lxvw4x 5, 21, 9
        lxvw4x 6, 22, 9
        lxvw4x 7, 23, 9
        lxvw4x 8, 24, 9
        lxvw4x 9, 25, 9
        lxvw4x 10, 26, 9
        lxvw4x 11, 27, 9
        lxvw4x 12, 28, 9
        lxvw4x 13, 29, 9
        lxvw4x 14, 30, 9
        lxvw4x 15, 31, 9

        xxlxor \S+32, \S+32, 0
        xxlxor \S+36, \S+36, 1
        xxlxor \S+40, \S+40, 2
        xxlxor \S+44, \S+44, 3
        xxlxor \S+33, \S+33, 4
        xxlxor \S+37, \S+37, 5
        xxlxor \S+41, \S+41, 6
        xxlxor \S+45, \S+45, 7
        xxlxor \S+34, \S+34, 8
        xxlxor \S+38, \S+38, 9
        xxlxor \S+42, \S+42, 10
        xxlxor \S+46, \S+46, 11
        xxlxor \S+35, \S+35, 12
        xxlxor \S+39, \S+39, 13
        xxlxor \S+43, \S+43, 14
        xxlxor \S+47, \S+47, 15

        stxvw4x \S+32, 0, 16
        stxvw4x \S+36, 17, 16
        stxvw4x \S+40, 18, 16
        stxvw4x \S+44, 19, 16

        stxvw4x \S+33, 20, 16
        stxvw4x \S+37, 21, 16
        stxvw4x \S+41, 22, 16
        stxvw4x \S+45, 23, 16

        stxvw4x \S+34, 24, 16
        stxvw4x \S+38, 25, 16
        stxvw4x \S+42, 26, 16
        stxvw4x \S+46, 27, 16

        stxvw4x \S+35, 28, 16
        stxvw4x \S+39, 29, 16
        stxvw4x \S+43, 30, 16
        stxvw4x \S+47, 31, 16

.endm

#
# chacha_p10le_8x(u32 *state, byte *dst, const byte *src, size_t len, int nrounds);
#
SYM_FUNC_START(chacha_p10le_8x)
.align 5
        cmpdi 6, 0
        ble Out_no_chacha

        SAVE_REGS

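        # Register usage (a rough map, following the ELFv2 argument order of
        # the prototype above):
        #   r3 = state, r4 = dst, r5 = src, r6 = len, r7 = nrounds
        #   r14 = running offset into src/dst, r15 = remaining length
        #   r8 = nrounds/2 (CTR count), r9/r16 = scratch pointers
        #   vs16-vs19 = saved state rows, vs20-vs23 = vpermxor masks / rotate amounts
        #   vs24/vs25 = splat(4)/splat(8), vs30/vs31 = per-lane counter offsets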
        # r17 - r31 mainly for Write_256 macro.
        li 17, 16
        li 18, 32
        li 19, 48
        li 20, 64
        li 21, 80
        li 22, 96
        li 23, 112
        li 24, 128
        li 25, 144
        li 26, 160
        li 27, 176
        li 28, 192
        li 29, 208
        li 30, 224
        li 31, 240

        mr 15, 6                        # len
        li 14, 0                        # offset to inp and outp

        lxvw4x 48, 0, 3                 # vr16, constants
        lxvw4x 49, 17, 3                # vr17, key 1
        lxvw4x 50, 18, 3                # vr18, key 2
        lxvw4x 51, 19, 3                # vr19, counter, nonce

        # create (0, 1, 2, 3) counters
        vspltisw 0, 0
        vspltisw 1, 1
        vspltisw 2, 2
        vspltisw 3, 3
        vmrghw 4, 0, 1
        vmrglw 5, 2, 3
        vsldoi 30, 4, 5, 8              # vr30 counter, 4 (0, 1, 2, 3)

        vspltisw 21, 12
        vspltisw 23, 7

        addis 11, 2, permx@toc@ha
        addi 11, 11, permx@toc@l
        lxvw4x 32+20, 0, 11
        lxvw4x 32+22, 17, 11

        sradi 8, 7, 1

        mtctr 8

        # save constants to vsx
        xxlor 16, 48, 48
        xxlor 17, 49, 49
        xxlor 18, 50, 50
        xxlor 19, 51, 51

        vspltisw 25, 4
        vspltisw 26, 8

        xxlor 25, 32+26, 32+26
        xxlor 24, 32+25, 32+25

        vadduwm 31, 30, 25              # counter = (0, 1, 2, 3) + (4, 4, 4, 4)
        xxlor 30, 32+30, 32+30
        xxlor 31, 32+31, 32+31

        xxlor 20, 32+20, 32+20
        xxlor 21, 32+21, 32+21
        xxlor 22, 32+22, 32+22
        xxlor 23, 32+23, 32+23

        cmpdi 6, 512
        blt Loop_last

Loop_8x:
        xxspltw 32+0, 16, 0
        xxspltw 32+1, 16, 1
        xxspltw 32+2, 16, 2
        xxspltw 32+3, 16, 3

        xxspltw 32+4, 17, 0
        xxspltw 32+5, 17, 1
        xxspltw 32+6, 17, 2
        xxspltw 32+7, 17, 3
        xxspltw 32+8, 18, 0
        xxspltw 32+9, 18, 1
        xxspltw 32+10, 18, 2
        xxspltw 32+11, 18, 3
        xxspltw 32+12, 19, 0
        xxspltw 32+13, 19, 1
        xxspltw 32+14, 19, 2
        xxspltw 32+15, 19, 3
        vadduwm 12, 12, 30              # increase counter

        xxspltw 32+16, 16, 0
        xxspltw 32+17, 16, 1
        xxspltw 32+18, 16, 2
        xxspltw 32+19, 16, 3

        xxspltw 32+20, 17, 0
        xxspltw 32+21, 17, 1
        xxspltw 32+22, 17, 2
        xxspltw 32+23, 17, 3
        xxspltw 32+24, 18, 0
        xxspltw 32+25, 18, 1
        xxspltw 32+26, 18, 2
        xxspltw 32+27, 18, 3
        xxspltw 32+28, 19, 0
        xxspltw 32+29, 19, 1
        vadduwm 28, 28, 31              # increase counter
        xxspltw 32+30, 19, 2
        xxspltw 32+31, 19, 3

.align 5
quarter_loop_8x:
        QT_loop_8x

        bdnz quarter_loop_8x

        xxlor 0, 32+30, 32+30
        xxlor 32+30, 30, 30
        vadduwm 12, 12, 30
        xxlor 32+30, 0, 0
        TP_4x 0, 1, 2, 3
        TP_4x 4, 5, 6, 7
        TP_4x 8, 9, 10, 11
        TP_4x 12, 13, 14, 15

        xxlor 0, 48, 48
        xxlor 1, 49, 49
        xxlor 2, 50, 50
        xxlor 3, 51, 51
        xxlor 48, 16, 16
        xxlor 49, 17, 17
        xxlor 50, 18, 18
        xxlor 51, 19, 19
        Add_state 0
        xxlor 48, 0, 0
        xxlor 49, 1, 1
        xxlor 50, 2, 2
        xxlor 51, 3, 3
        Write_256 0
        addi 14, 14, 256                # offset += 256
        addi 15, 15, -256               # len -= 256

        xxlor 5, 32+31, 32+31
        xxlor 32+31, 31, 31
        vadduwm 28, 28, 31
        xxlor 32+31, 5, 5
        TP_4x 16+0, 16+1, 16+2, 16+3
        TP_4x 16+4, 16+5, 16+6, 16+7
        TP_4x 16+8, 16+9, 16+10, 16+11
        TP_4x 16+12, 16+13, 16+14, 16+15

        xxlor 32, 16, 16
        xxlor 33, 17, 17
        xxlor 34, 18, 18
        xxlor 35, 19, 19
        Add_state 16
        Write_256 16
        addi 14, 14, 256                # offset += 256
        addi 15, 15, -256               # len -= 256

        xxlor 32+24, 24, 24
        xxlor 32+25, 25, 25
        xxlor 32+30, 30, 30
        vadduwm 30, 30, 25
        vadduwm 31, 30, 24
        xxlor 30, 32+30, 32+30
        xxlor 31, 32+31, 32+31

        cmpdi 15, 0
        beq Out_loop

        cmpdi 15, 512
        blt Loop_last

        mtctr 8
        b Loop_8x

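#
# Tail path: once fewer than 512 bytes remain, fall back to a single
# 4-block (256-byte per iteration) datapath.  The state, rotate amounts and
# vpermxor masks are reloaded here because the 8-block loop above clobbers
# v16 - v31.
#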
Loop_last:
        lxvw4x 48, 0, 3                 # vr16, constants
        lxvw4x 49, 17, 3                # vr17, key 1
        lxvw4x 50, 18, 3                # vr18, key 2
        lxvw4x 51, 19, 3                # vr19, counter, nonce

        vspltisw 21, 12
        vspltisw 23, 7
        addis 11, 2, permx@toc@ha
        addi 11, 11, permx@toc@l
        lxvw4x 32+20, 0, 11
        lxvw4x 32+22, 17, 11

        sradi 8, 7, 1
        mtctr 8

Loop_4x:
        vspltw 0, 16, 0
        vspltw 1, 16, 1
        vspltw 2, 16, 2
        vspltw 3, 16, 3

        vspltw 4, 17, 0
        vspltw 5, 17, 1
        vspltw 6, 17, 2
        vspltw 7, 17, 3
        vspltw 8, 18, 0
        vspltw 9, 18, 1
        vspltw 10, 18, 2
        vspltw 11, 18, 3
        vspltw 12, 19, 0
        vadduwm 12, 12, 30              # increase counter
        vspltw 13, 19, 1
        vspltw 14, 19, 2
        vspltw 15, 19, 3

.align 5
quarter_loop:
        QT_loop_4x

        bdnz quarter_loop

        vadduwm 12, 12, 30
        TP_4x 0, 1, 2, 3
        TP_4x 4, 5, 6, 7
        TP_4x 8, 9, 10, 11
        TP_4x 12, 13, 14, 15

        Add_state 0
        Write_256 0
        addi 14, 14, 256                # offset += 256
        addi 15, 15, -256               # len -= 256

        # Update state counter
        vspltisw 25, 4
        vadduwm 30, 30, 25

        cmpdi 15, 0
        beq Out_loop
        cmpdi 15, 256
        blt Out_loop

        mtctr 8
        b Loop_4x

Out_loop:
        RESTORE_REGS
        blr

Out_no_chacha:
        li 3, 0
        blr
SYM_FUNC_END(chacha_p10le_8x)

SYM_DATA_START_LOCAL(PERMX)
.align 5
permx:
.long 0x22330011, 0x66774455, 0xaabb8899, 0xeeffccdd
.long 0x11223300, 0x55667744, 0x99aabb88, 0xddeeffcc
SYM_DATA_END(PERMX)
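
#
# Note (descriptive only): the two vectors above are the select masks loaded
# for vpermxor.  As used in the quarter-round macros, the first mask stands
# in for the "d ^= a; d <<<= 16" step and the second for the
# "d ^= a; d <<<= 8" step, i.e. each vpermxor replaces a vxor plus a rotate
# by a multiple of 8 bits.
#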