/*
 * ChaCha 256-bit cipher algorithm, x64 AVX2 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>

.section .rodata.cst32.ROT8, "aM", @progbits, 32
.align 32
ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
	.octa 0x0e0d0c0f0a09080b0605040702010003

.section .rodata.cst32.ROT16, "aM", @progbits, 32
.align 32
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
	.octa 0x0d0c0f0e09080b0a0504070601000302

.section .rodata.cst32.CTRINC, "aM", @progbits, 32
.align 32
CTRINC:	.octa 0x00000003000000020000000100000000
	.octa 0x00000007000000060000000500000004

.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
.align 32
CTR2BL:	.octa 0x00000000000000000000000000000000
	.octa 0x00000000000000000000000000000001

.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
.align 32
CTR4BL:	.octa 0x00000000000000000000000000000002
	.octa 0x00000000000000000000000000000003

.text

ENTRY(chacha_2block_xor_avx2)
	# %rdi: Input state matrix, s
	# %rsi: up to 2 data blocks output, o
	# %rdx: up to 2 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts two ChaCha blocks by loading the state
	# matrix twice across four AVX registers. It performs matrix operations
	# on four words in each matrix in parallel, but requires shuffling to
	# rearrange the words after each round.
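	#
	# For reference, one ChaCha quarter-round applied to whole rows
	# (a = x0, b = x1, c = x2, d = x3), in the same pseudocode used in
	# the per-instruction comments below:
	#
	#	a += b;  d = rotl32(d ^ a, 16);
	#	c += d;  b = rotl32(b ^ c, 12);
	#	a += b;  d = rotl32(d ^ a,  8);
	#	c += d;  b = rotl32(b ^ c,  7);
	#
	# Each ymm register holds the same row of both blocks (one block per
	# 128-bit lane), so each instruction advances both blocks at once.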

	vzeroupper

	# x0..3[0-1] = s0..3
	vbroadcasti128 0x00(%rdi),%ymm0
	vbroadcasti128 0x10(%rdi),%ymm1
	vbroadcasti128 0x20(%rdi),%ymm2
	vbroadcasti128 0x30(%rdi),%ymm3

	vpaddd CTR2BL(%rip),%ymm3,%ymm3

	vmovdqa %ymm0,%ymm8
	vmovdqa %ymm1,%ymm9
	vmovdqa %ymm2,%ymm10
	vmovdqa %ymm3,%ymm11

	vmovdqa ROT8(%rip),%ymm4
	vmovdqa ROT16(%rip),%ymm5

	mov %rcx,%rax

.Ldoubleround:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxor %ymm0,%ymm3,%ymm3
	vpshufb %ymm5,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxor %ymm2,%ymm1,%ymm1
	vmovdqa %ymm1,%ymm6
	vpslld $12,%ymm6,%ymm6
	vpsrld $20,%ymm1,%ymm1
	vpor %ymm6,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxor %ymm0,%ymm3,%ymm3
	vpshufb %ymm4,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxor %ymm2,%ymm1,%ymm1
	vmovdqa %ymm1,%ymm7
	vpslld $7,%ymm7,%ymm7
	vpsrld $25,%ymm1,%ymm1
	vpor %ymm7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd $0x39,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd $0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd $0x93,%ymm3,%ymm3

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxor %ymm0,%ymm3,%ymm3
	vpshufb %ymm5,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxor %ymm2,%ymm1,%ymm1
	vmovdqa %ymm1,%ymm6
	vpslld $12,%ymm6,%ymm6
	vpsrld $20,%ymm1,%ymm1
	vpor %ymm6,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxor %ymm0,%ymm3,%ymm3
	vpshufb %ymm4,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxor %ymm2,%ymm1,%ymm1
	vmovdqa %ymm1,%ymm7
	vpslld $7,%ymm7,%ymm7
	vpsrld $25,%ymm1,%ymm1
	vpor %ymm7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd $0x93,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd $0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd $0x39,%ymm3,%ymm3

	sub $2,%r8d
	jnz .Ldoubleround

	# o0 = i0 ^ (x0 + s0)
	vpaddd %ymm8,%ymm0,%ymm7
	cmp $0x10,%rax
	jl .Lxorpart2
	vpxor 0x00(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x00(%rsi)
	vextracti128 $1,%ymm7,%xmm0
	# o1 = i1 ^ (x1 + s1)
	vpaddd %ymm9,%ymm1,%ymm7
	cmp $0x20,%rax
	jl .Lxorpart2
	vpxor 0x10(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x10(%rsi)
	vextracti128 $1,%ymm7,%xmm1
	# o2 = i2 ^ (x2 + s2)
	vpaddd %ymm10,%ymm2,%ymm7
	cmp $0x30,%rax
	jl .Lxorpart2
	vpxor 0x20(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x20(%rsi)
	vextracti128 $1,%ymm7,%xmm2
	# o3 = i3 ^ (x3 + s3)
	vpaddd %ymm11,%ymm3,%ymm7
	cmp $0x40,%rax
	jl .Lxorpart2
	vpxor 0x30(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x30(%rsi)
	vextracti128 $1,%ymm7,%xmm3

	# xor and write second block
	vmovdqa %xmm0,%xmm7
	cmp $0x50,%rax
	jl .Lxorpart2
	vpxor 0x40(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x40(%rsi)

	vmovdqa %xmm1,%xmm7
	cmp $0x60,%rax
	jl .Lxorpart2
	vpxor 0x50(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x50(%rsi)

	vmovdqa %xmm2,%xmm7
	cmp $0x70,%rax
	jl .Lxorpart2
	vpxor 0x60(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x60(%rsi)

	vmovdqa %xmm3,%xmm7
	cmp $0x80,%rax
	jl .Lxorpart2
	vpxor 0x70(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x70(%rsi)

.Ldone2:
	vzeroupper
	ret

.Lxorpart2:
	# xor remaining bytes from partial register into output
	mov %rax,%r9
	and $0x0f,%r9
	jz .Ldone2
	and $~0x0f,%rax

	mov %rsi,%r11

	lea 8(%rsp),%r10
	sub $0x10,%rsp
	and $~31,%rsp

	lea (%rdx,%rax),%rsi
	mov %rsp,%rdi
	mov %r9,%rcx
	rep movsb

	vpxor 0x00(%rsp),%xmm7,%xmm7
	vmovdqa %xmm7,0x00(%rsp)

	mov %rsp,%rsi
	lea (%r11,%rax),%rdi
	mov %r9,%rcx
	rep movsb

	lea -8(%r10),%rsp
	jmp .Ldone2

ENDPROC(chacha_2block_xor_avx2)

ENTRY(chacha_4block_xor_avx2)
	# %rdi: Input state matrix, s
	# %rsi: up to 4 data blocks output, o
	# %rdx: up to 4 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts four ChaCha blocks by loading the state
	# matrix four times across eight AVX registers. It performs matrix
	# operations on four words in two matrices in parallel, interleaved
	# with the corresponding operations on the four words of the other
	# two matrices. Since the required word shuffling has a rather high
	# latency, the arithmetic on two matrix pairs can be done without
	# much slowdown.
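	#
	# The first matrix pair lives in ymm0..3 and the second in ymm4..7.
	# CTR2BL gives the first pair block counters +0/+1 and CTR4BL gives
	# the second pair +2/+3, so the four blocks use consecutive counters.
	# Issuing the second pair's instructions right after the first
	# pair's lets them execute while the first pair's word shuffles are
	# still in flight.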

	vzeroupper

	# x0..3[0-3] = s0..3
	vbroadcasti128 0x00(%rdi),%ymm0
	vbroadcasti128 0x10(%rdi),%ymm1
	vbroadcasti128 0x20(%rdi),%ymm2
	vbroadcasti128 0x30(%rdi),%ymm3

	vmovdqa %ymm0,%ymm4
	vmovdqa %ymm1,%ymm5
	vmovdqa %ymm2,%ymm6
	vmovdqa %ymm3,%ymm7

	vpaddd CTR2BL(%rip),%ymm3,%ymm3
	vpaddd CTR4BL(%rip),%ymm7,%ymm7

	vmovdqa %ymm0,%ymm11
	vmovdqa %ymm1,%ymm12
	vmovdqa %ymm2,%ymm13
	vmovdqa %ymm3,%ymm14
	vmovdqa %ymm7,%ymm15

	vmovdqa ROT8(%rip),%ymm8
	vmovdqa ROT16(%rip),%ymm9

	mov %rcx,%rax

.Ldoubleround4:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxor %ymm0,%ymm3,%ymm3
	vpshufb %ymm9,%ymm3,%ymm3

	vpaddd %ymm5,%ymm4,%ymm4
	vpxor %ymm4,%ymm7,%ymm7
	vpshufb %ymm9,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxor %ymm2,%ymm1,%ymm1
	vmovdqa %ymm1,%ymm10
	vpslld $12,%ymm10,%ymm10
	vpsrld $20,%ymm1,%ymm1
	vpor %ymm10,%ymm1,%ymm1

	vpaddd %ymm7,%ymm6,%ymm6
	vpxor %ymm6,%ymm5,%ymm5
	vmovdqa %ymm5,%ymm10
	vpslld $12,%ymm10,%ymm10
	vpsrld $20,%ymm5,%ymm5
	vpor %ymm10,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxor %ymm0,%ymm3,%ymm3
	vpshufb %ymm8,%ymm3,%ymm3

	vpaddd %ymm5,%ymm4,%ymm4
	vpxor %ymm4,%ymm7,%ymm7
	vpshufb %ymm8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxor %ymm2,%ymm1,%ymm1
	vmovdqa %ymm1,%ymm10
	vpslld $7,%ymm10,%ymm10
	vpsrld $25,%ymm1,%ymm1
	vpor %ymm10,%ymm1,%ymm1

	vpaddd %ymm7,%ymm6,%ymm6
	vpxor %ymm6,%ymm5,%ymm5
	vmovdqa %ymm5,%ymm10
	vpslld $7,%ymm10,%ymm10
	vpsrld $25,%ymm5,%ymm5
	vpor %ymm10,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd $0x39,%ymm1,%ymm1
	vpshufd $0x39,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd $0x4e,%ymm2,%ymm2
	vpshufd $0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd $0x93,%ymm3,%ymm3
	vpshufd $0x93,%ymm7,%ymm7

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxor %ymm0,%ymm3,%ymm3
	vpshufb %ymm9,%ymm3,%ymm3

	vpaddd %ymm5,%ymm4,%ymm4
	vpxor %ymm4,%ymm7,%ymm7
	vpshufb %ymm9,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxor %ymm2,%ymm1,%ymm1
	vmovdqa %ymm1,%ymm10
	vpslld $12,%ymm10,%ymm10
	vpsrld $20,%ymm1,%ymm1
	vpor %ymm10,%ymm1,%ymm1

	vpaddd %ymm7,%ymm6,%ymm6
	vpxor %ymm6,%ymm5,%ymm5
	vmovdqa %ymm5,%ymm10
	vpslld $12,%ymm10,%ymm10
	vpsrld $20,%ymm5,%ymm5
	vpor %ymm10,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxor %ymm0,%ymm3,%ymm3
	vpshufb %ymm8,%ymm3,%ymm3

	vpaddd %ymm5,%ymm4,%ymm4
	vpxor %ymm4,%ymm7,%ymm7
	vpshufb %ymm8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxor %ymm2,%ymm1,%ymm1
	vmovdqa %ymm1,%ymm10
	vpslld $7,%ymm10,%ymm10
	vpsrld $25,%ymm1,%ymm1
	vpor %ymm10,%ymm1,%ymm1

	vpaddd %ymm7,%ymm6,%ymm6
	vpxor %ymm6,%ymm5,%ymm5
	vmovdqa %ymm5,%ymm10
	vpslld $7,%ymm10,%ymm10
	vpsrld $25,%ymm5,%ymm5
	vpor %ymm10,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd $0x93,%ymm1,%ymm1
	vpshufd $0x93,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd $0x4e,%ymm2,%ymm2
	vpshufd $0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd $0x39,%ymm3,%ymm3
	vpshufd $0x39,%ymm7,%ymm7

	sub $2,%r8d
	jnz .Ldoubleround4

	# o0 = i0 ^ (x0 + s0), first block
	vpaddd %ymm11,%ymm0,%ymm10
	cmp $0x10,%rax
	jl .Lxorpart4
	vpxor 0x00(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x00(%rsi)
	vextracti128 $1,%ymm10,%xmm0
	# o1 = i1 ^ (x1 + s1), first block
	vpaddd %ymm12,%ymm1,%ymm10
	cmp $0x20,%rax
	jl .Lxorpart4
	vpxor 0x10(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x10(%rsi)
	vextracti128 $1,%ymm10,%xmm1
	# o2 = i2 ^ (x2 + s2), first block
	vpaddd %ymm13,%ymm2,%ymm10
	cmp $0x30,%rax
	jl .Lxorpart4
	vpxor 0x20(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x20(%rsi)
	vextracti128 $1,%ymm10,%xmm2
	# o3 = i3 ^ (x3 + s3), first block
	vpaddd %ymm14,%ymm3,%ymm10
	cmp $0x40,%rax
	jl .Lxorpart4
	vpxor 0x30(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x30(%rsi)
	vextracti128 $1,%ymm10,%xmm3

	# xor and write second block
	vmovdqa %xmm0,%xmm10
	cmp $0x50,%rax
	jl .Lxorpart4
	vpxor 0x40(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x40(%rsi)

	vmovdqa %xmm1,%xmm10
	cmp $0x60,%rax
	jl .Lxorpart4
	vpxor 0x50(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x50(%rsi)

	vmovdqa %xmm2,%xmm10
	cmp $0x70,%rax
	jl .Lxorpart4
	vpxor 0x60(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x60(%rsi)

	vmovdqa %xmm3,%xmm10
	cmp $0x80,%rax
	jl .Lxorpart4
	vpxor 0x70(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x70(%rsi)

	# o0 = i0 ^ (x0 + s0), third block
	vpaddd %ymm11,%ymm4,%ymm10
	cmp $0x90,%rax
	jl .Lxorpart4
	vpxor 0x80(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x80(%rsi)
	vextracti128 $1,%ymm10,%xmm4
	# o1 = i1 ^ (x1 + s1), third block
	vpaddd %ymm12,%ymm5,%ymm10
	cmp $0xa0,%rax
	jl .Lxorpart4
	vpxor 0x90(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x90(%rsi)
	vextracti128 $1,%ymm10,%xmm5
	# o2 = i2 ^ (x2 + s2), third block
	vpaddd %ymm13,%ymm6,%ymm10
	cmp $0xb0,%rax
	jl .Lxorpart4
	vpxor 0xa0(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0xa0(%rsi)
	vextracti128 $1,%ymm10,%xmm6
	# o3 = i3 ^ (x3 + s3), third block
	vpaddd %ymm15,%ymm7,%ymm10
	cmp $0xc0,%rax
	jl .Lxorpart4
	vpxor 0xb0(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0xb0(%rsi)
	vextracti128 $1,%ymm10,%xmm7

	# xor and write fourth block
	vmovdqa %xmm4,%xmm10
	cmp $0xd0,%rax
	jl .Lxorpart4
	vpxor 0xc0(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0xc0(%rsi)

	vmovdqa %xmm5,%xmm10
	cmp $0xe0,%rax
	jl .Lxorpart4
	vpxor 0xd0(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0xd0(%rsi)

	vmovdqa %xmm6,%xmm10
	cmp $0xf0,%rax
	jl .Lxorpart4
	vpxor 0xe0(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0xe0(%rsi)

	vmovdqa %xmm7,%xmm10
	cmp $0x100,%rax
	jl .Lxorpart4
	vpxor 0xf0(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0xf0(%rsi)

.Ldone4:
	vzeroupper
	ret

.Lxorpart4:
	# xor remaining bytes from partial register into output
	mov %rax,%r9
	and $0x0f,%r9
	jz .Ldone4
	and $~0x0f,%rax

	mov %rsi,%r11

	lea 8(%rsp),%r10
	sub $0x10,%rsp
	and $~31,%rsp

	lea (%rdx,%rax),%rsi
	mov %rsp,%rdi
	mov %r9,%rcx
	rep movsb

	vpxor 0x00(%rsp),%xmm10,%xmm10
	vmovdqa %xmm10,0x00(%rsp)

	mov %rsp,%rsi
	lea (%r11,%rax),%rdi
	mov %r9,%rcx
	rep movsb

	lea -8(%r10),%rsp
	jmp .Ldone4

ENDPROC(chacha_4block_xor_avx2)

ENTRY(chacha_8block_xor_avx2)
	# %rdi: Input state matrix, s
	# %rsi: up to 8 data blocks output, o
	# %rdx: up to 8 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts eight consecutive ChaCha blocks by loading
	# the state matrix into AVX registers eight times. As we need some
	# scratch registers, we save the first four registers on the stack.
	# The algorithm performs each operation on the corresponding word of
	# each state matrix, hence requires no word shuffling. For the final
	# XORing step we transpose the matrix by interleaving 32-, 64- and
	# then 128-bit words, which allows us to do the XOR in AVX registers.
	# 8/16-bit word rotation is done with the slightly better performing
	# byte shuffling, while 7/12-bit word rotation uses a traditional
	# shift+OR.
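	#
	# Rotation of a 32-bit word by 16 or 8 bits is a fixed byte
	# permutation, so it is done with a single vpshufb using the
	# ROT16/ROT8 masks; rotation by 12 or 7 bits has no byte-granular
	# equivalent and is built from vpslld/vpsrld/vpor. With one ymm
	# register (or stack slot) per state word, dword lane i of every
	# register belongs to block i, which is why the results must be
	# transposed back into whole blocks before the XOR with the input.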

	vzeroupper
	# 4 * 32 byte stack, 32-byte aligned
	lea 8(%rsp),%r10
	and $~31, %rsp
	sub $0x80, %rsp
	mov %rcx,%rax

	# x0..15[0-7] = s[0..15]
	vpbroadcastd 0x00(%rdi),%ymm0
	vpbroadcastd 0x04(%rdi),%ymm1
	vpbroadcastd 0x08(%rdi),%ymm2
	vpbroadcastd 0x0c(%rdi),%ymm3
	vpbroadcastd 0x10(%rdi),%ymm4
	vpbroadcastd 0x14(%rdi),%ymm5
	vpbroadcastd 0x18(%rdi),%ymm6
	vpbroadcastd 0x1c(%rdi),%ymm7
	vpbroadcastd 0x20(%rdi),%ymm8
	vpbroadcastd 0x24(%rdi),%ymm9
	vpbroadcastd 0x28(%rdi),%ymm10
	vpbroadcastd 0x2c(%rdi),%ymm11
	vpbroadcastd 0x30(%rdi),%ymm12
	vpbroadcastd 0x34(%rdi),%ymm13
	vpbroadcastd 0x38(%rdi),%ymm14
	vpbroadcastd 0x3c(%rdi),%ymm15
	# x0..3 on stack
	vmovdqa %ymm0,0x00(%rsp)
	vmovdqa %ymm1,0x20(%rsp)
	vmovdqa %ymm2,0x40(%rsp)
	vmovdqa %ymm3,0x60(%rsp)

	vmovdqa CTRINC(%rip),%ymm1
	vmovdqa ROT8(%rip),%ymm2
	vmovdqa ROT16(%rip),%ymm3

	# x12 += counter values 0-7
	vpaddd %ymm1,%ymm12,%ymm12

.Ldoubleround8:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	vpaddd 0x00(%rsp),%ymm4,%ymm0
	vmovdqa %ymm0,0x00(%rsp)
	vpxor %ymm0,%ymm12,%ymm12
	vpshufb %ymm3,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	vpaddd 0x20(%rsp),%ymm5,%ymm0
	vmovdqa %ymm0,0x20(%rsp)
	vpxor %ymm0,%ymm13,%ymm13
	vpshufb %ymm3,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	vpaddd 0x40(%rsp),%ymm6,%ymm0
	vmovdqa %ymm0,0x40(%rsp)
	vpxor %ymm0,%ymm14,%ymm14
	vpshufb %ymm3,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	vpaddd 0x60(%rsp),%ymm7,%ymm0
	vmovdqa %ymm0,0x60(%rsp)
	vpxor %ymm0,%ymm15,%ymm15
	vpshufb %ymm3,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	vpaddd %ymm12,%ymm8,%ymm8
	vpxor %ymm8,%ymm4,%ymm4
	vpslld $12,%ymm4,%ymm0
	vpsrld $20,%ymm4,%ymm4
	vpor %ymm0,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	vpaddd %ymm13,%ymm9,%ymm9
	vpxor %ymm9,%ymm5,%ymm5
	vpslld $12,%ymm5,%ymm0
	vpsrld $20,%ymm5,%ymm5
	vpor %ymm0,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	vpaddd %ymm14,%ymm10,%ymm10
	vpxor %ymm10,%ymm6,%ymm6
	vpslld $12,%ymm6,%ymm0
	vpsrld $20,%ymm6,%ymm6
	vpor %ymm0,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	vpaddd %ymm15,%ymm11,%ymm11
	vpxor %ymm11,%ymm7,%ymm7
	vpslld $12,%ymm7,%ymm0
	vpsrld $20,%ymm7,%ymm7
	vpor %ymm0,%ymm7,%ymm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	vpaddd 0x00(%rsp),%ymm4,%ymm0
	vmovdqa %ymm0,0x00(%rsp)
	vpxor %ymm0,%ymm12,%ymm12
	vpshufb %ymm2,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	vpaddd 0x20(%rsp),%ymm5,%ymm0
	vmovdqa %ymm0,0x20(%rsp)
	vpxor %ymm0,%ymm13,%ymm13
	vpshufb %ymm2,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	vpaddd 0x40(%rsp),%ymm6,%ymm0
	vmovdqa %ymm0,0x40(%rsp)
	vpxor %ymm0,%ymm14,%ymm14
	vpshufb %ymm2,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	vpaddd 0x60(%rsp),%ymm7,%ymm0
	vmovdqa %ymm0,0x60(%rsp)
	vpxor %ymm0,%ymm15,%ymm15
	vpshufb %ymm2,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	vpaddd %ymm12,%ymm8,%ymm8
	vpxor %ymm8,%ymm4,%ymm4
	vpslld $7,%ymm4,%ymm0
	vpsrld $25,%ymm4,%ymm4
	vpor %ymm0,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	vpaddd %ymm13,%ymm9,%ymm9
	vpxor %ymm9,%ymm5,%ymm5
	vpslld $7,%ymm5,%ymm0
	vpsrld $25,%ymm5,%ymm5
	vpor %ymm0,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	vpaddd %ymm14,%ymm10,%ymm10
	vpxor %ymm10,%ymm6,%ymm6
	vpslld $7,%ymm6,%ymm0
	vpsrld $25,%ymm6,%ymm6
	vpor %ymm0,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	vpaddd %ymm15,%ymm11,%ymm11
	vpxor %ymm11,%ymm7,%ymm7
	vpslld $7,%ymm7,%ymm0
	vpsrld $25,%ymm7,%ymm7
	vpor %ymm0,%ymm7,%ymm7

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	vpaddd 0x00(%rsp),%ymm5,%ymm0
	vmovdqa %ymm0,0x00(%rsp)
	vpxor %ymm0,%ymm15,%ymm15
	vpshufb %ymm3,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	vpaddd 0x20(%rsp),%ymm6,%ymm0
	vmovdqa %ymm0,0x20(%rsp)
	vpxor %ymm0,%ymm12,%ymm12
	vpshufb %ymm3,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	vpaddd 0x40(%rsp),%ymm7,%ymm0
	vmovdqa %ymm0,0x40(%rsp)
	vpxor %ymm0,%ymm13,%ymm13
	vpshufb %ymm3,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	vpaddd 0x60(%rsp),%ymm4,%ymm0
	vmovdqa %ymm0,0x60(%rsp)
	vpxor %ymm0,%ymm14,%ymm14
	vpshufb %ymm3,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	vpaddd %ymm15,%ymm10,%ymm10
	vpxor %ymm10,%ymm5,%ymm5
	vpslld $12,%ymm5,%ymm0
	vpsrld $20,%ymm5,%ymm5
	vpor %ymm0,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	vpaddd %ymm12,%ymm11,%ymm11
	vpxor %ymm11,%ymm6,%ymm6
	vpslld $12,%ymm6,%ymm0
	vpsrld $20,%ymm6,%ymm6
	vpor %ymm0,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	vpaddd %ymm13,%ymm8,%ymm8
	vpxor %ymm8,%ymm7,%ymm7
	vpslld $12,%ymm7,%ymm0
	vpsrld $20,%ymm7,%ymm7
	vpor %ymm0,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	vpaddd %ymm14,%ymm9,%ymm9
	vpxor %ymm9,%ymm4,%ymm4
	vpslld $12,%ymm4,%ymm0
	vpsrld $20,%ymm4,%ymm4
	vpor %ymm0,%ymm4,%ymm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	vpaddd 0x00(%rsp),%ymm5,%ymm0
	vmovdqa %ymm0,0x00(%rsp)
	vpxor %ymm0,%ymm15,%ymm15
	vpshufb %ymm2,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	vpaddd 0x20(%rsp),%ymm6,%ymm0
	vmovdqa %ymm0,0x20(%rsp)
	vpxor %ymm0,%ymm12,%ymm12
	vpshufb %ymm2,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	vpaddd 0x40(%rsp),%ymm7,%ymm0
	vmovdqa %ymm0,0x40(%rsp)
	vpxor %ymm0,%ymm13,%ymm13
	vpshufb %ymm2,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	vpaddd 0x60(%rsp),%ymm4,%ymm0
	vmovdqa %ymm0,0x60(%rsp)
	vpxor %ymm0,%ymm14,%ymm14
	vpshufb %ymm2,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	vpaddd %ymm15,%ymm10,%ymm10
	vpxor %ymm10,%ymm5,%ymm5
	vpslld $7,%ymm5,%ymm0
	vpsrld $25,%ymm5,%ymm5
	vpor %ymm0,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	vpaddd %ymm12,%ymm11,%ymm11
	vpxor %ymm11,%ymm6,%ymm6
	vpslld $7,%ymm6,%ymm0
	vpsrld $25,%ymm6,%ymm6
	vpor %ymm0,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	vpaddd %ymm13,%ymm8,%ymm8
	vpxor %ymm8,%ymm7,%ymm7
	vpslld $7,%ymm7,%ymm0
	vpsrld $25,%ymm7,%ymm7
	vpor %ymm0,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	vpaddd %ymm14,%ymm9,%ymm9
	vpxor %ymm9,%ymm4,%ymm4
	vpslld $7,%ymm4,%ymm0
	vpsrld $25,%ymm4,%ymm4
	vpor %ymm0,%ymm4,%ymm4

	sub $2,%r8d
	jnz .Ldoubleround8

	# x0..15[0-7] += s[0..15]
	vpbroadcastd 0x00(%rdi),%ymm0
	vpaddd 0x00(%rsp),%ymm0,%ymm0
	vmovdqa %ymm0,0x00(%rsp)
	vpbroadcastd 0x04(%rdi),%ymm0
	vpaddd 0x20(%rsp),%ymm0,%ymm0
	vmovdqa %ymm0,0x20(%rsp)
	vpbroadcastd 0x08(%rdi),%ymm0
	vpaddd 0x40(%rsp),%ymm0,%ymm0
	vmovdqa %ymm0,0x40(%rsp)
	vpbroadcastd 0x0c(%rdi),%ymm0
	vpaddd 0x60(%rsp),%ymm0,%ymm0
	vmovdqa %ymm0,0x60(%rsp)
	vpbroadcastd 0x10(%rdi),%ymm0
	vpaddd %ymm0,%ymm4,%ymm4
	vpbroadcastd 0x14(%rdi),%ymm0
	vpaddd %ymm0,%ymm5,%ymm5
	vpbroadcastd 0x18(%rdi),%ymm0
	vpaddd %ymm0,%ymm6,%ymm6
	vpbroadcastd 0x1c(%rdi),%ymm0
	vpaddd %ymm0,%ymm7,%ymm7
	vpbroadcastd 0x20(%rdi),%ymm0
	vpaddd %ymm0,%ymm8,%ymm8
	vpbroadcastd 0x24(%rdi),%ymm0
	vpaddd %ymm0,%ymm9,%ymm9
	vpbroadcastd 0x28(%rdi),%ymm0
	vpaddd %ymm0,%ymm10,%ymm10
	vpbroadcastd 0x2c(%rdi),%ymm0
	vpaddd %ymm0,%ymm11,%ymm11
	vpbroadcastd 0x30(%rdi),%ymm0
	vpaddd %ymm0,%ymm12,%ymm12
	vpbroadcastd 0x34(%rdi),%ymm0
	vpaddd %ymm0,%ymm13,%ymm13
	vpbroadcastd 0x38(%rdi),%ymm0
	vpaddd %ymm0,%ymm14,%ymm14
	vpbroadcastd 0x3c(%rdi),%ymm0
	vpaddd %ymm0,%ymm15,%ymm15

	# x12 += counter values 0-7
	vpaddd %ymm1,%ymm12,%ymm12

	# interleave 32-bit words in state n, n+1
	vmovdqa 0x00(%rsp),%ymm0
	vmovdqa 0x20(%rsp),%ymm1
	vpunpckldq %ymm1,%ymm0,%ymm2
	vpunpckhdq %ymm1,%ymm0,%ymm1
	vmovdqa %ymm2,0x00(%rsp)
	vmovdqa %ymm1,0x20(%rsp)
	vmovdqa 0x40(%rsp),%ymm0
	vmovdqa 0x60(%rsp),%ymm1
	vpunpckldq %ymm1,%ymm0,%ymm2
	vpunpckhdq %ymm1,%ymm0,%ymm1
	vmovdqa %ymm2,0x40(%rsp)
	vmovdqa %ymm1,0x60(%rsp)
	vmovdqa %ymm4,%ymm0
	vpunpckldq %ymm5,%ymm0,%ymm4
	vpunpckhdq %ymm5,%ymm0,%ymm5
	vmovdqa %ymm6,%ymm0
	vpunpckldq %ymm7,%ymm0,%ymm6
	vpunpckhdq %ymm7,%ymm0,%ymm7
	vmovdqa %ymm8,%ymm0
	vpunpckldq %ymm9,%ymm0,%ymm8
	vpunpckhdq %ymm9,%ymm0,%ymm9
	vmovdqa %ymm10,%ymm0
	vpunpckldq %ymm11,%ymm0,%ymm10
	vpunpckhdq %ymm11,%ymm0,%ymm11
	vmovdqa %ymm12,%ymm0
	vpunpckldq %ymm13,%ymm0,%ymm12
	vpunpckhdq %ymm13,%ymm0,%ymm13
	vmovdqa %ymm14,%ymm0
	vpunpckldq %ymm15,%ymm0,%ymm14
	vpunpckhdq %ymm15,%ymm0,%ymm15

	# interleave 64-bit words in state n, n+2
	vmovdqa 0x00(%rsp),%ymm0
	vmovdqa 0x40(%rsp),%ymm2
	vpunpcklqdq %ymm2,%ymm0,%ymm1
	vpunpckhqdq %ymm2,%ymm0,%ymm2
	vmovdqa %ymm1,0x00(%rsp)
	vmovdqa %ymm2,0x40(%rsp)
	vmovdqa 0x20(%rsp),%ymm0
	vmovdqa 0x60(%rsp),%ymm2
	vpunpcklqdq %ymm2,%ymm0,%ymm1
	vpunpckhqdq %ymm2,%ymm0,%ymm2
	vmovdqa %ymm1,0x20(%rsp)
	vmovdqa %ymm2,0x60(%rsp)
	vmovdqa %ymm4,%ymm0
	vpunpcklqdq %ymm6,%ymm0,%ymm4
	vpunpckhqdq %ymm6,%ymm0,%ymm6
	vmovdqa %ymm5,%ymm0
	vpunpcklqdq %ymm7,%ymm0,%ymm5
	vpunpckhqdq %ymm7,%ymm0,%ymm7
	vmovdqa %ymm8,%ymm0
	vpunpcklqdq %ymm10,%ymm0,%ymm8
	vpunpckhqdq %ymm10,%ymm0,%ymm10
	vmovdqa %ymm9,%ymm0
	vpunpcklqdq %ymm11,%ymm0,%ymm9
	vpunpckhqdq %ymm11,%ymm0,%ymm11
	vmovdqa %ymm12,%ymm0
	vpunpcklqdq %ymm14,%ymm0,%ymm12
	vpunpckhqdq %ymm14,%ymm0,%ymm14
	vmovdqa %ymm13,%ymm0
	vpunpcklqdq %ymm15,%ymm0,%ymm13
	vpunpckhqdq %ymm15,%ymm0,%ymm15

	# interleave 128-bit words in state n, n+4
	# xor/write first four blocks
	vmovdqa 0x00(%rsp),%ymm1
	vperm2i128 $0x20,%ymm4,%ymm1,%ymm0
	cmp $0x0020,%rax
	jl .Lxorpart8
	vpxor 0x0000(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0000(%rsi)
	vperm2i128 $0x31,%ymm4,%ymm1,%ymm4

	vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
	cmp $0x0040,%rax
	jl .Lxorpart8
	vpxor 0x0020(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0020(%rsi)
	vperm2i128 $0x31,%ymm12,%ymm8,%ymm12

	vmovdqa 0x40(%rsp),%ymm1
	vperm2i128 $0x20,%ymm6,%ymm1,%ymm0
	cmp $0x0060,%rax
	jl .Lxorpart8
	vpxor 0x0040(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0040(%rsi)
	vperm2i128 $0x31,%ymm6,%ymm1,%ymm6

	vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
	cmp $0x0080,%rax
	jl .Lxorpart8
	vpxor 0x0060(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0060(%rsi)
	vperm2i128 $0x31,%ymm14,%ymm10,%ymm14

	vmovdqa 0x20(%rsp),%ymm1
	vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
	cmp $0x00a0,%rax
	jl .Lxorpart8
	vpxor 0x0080(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0080(%rsi)
	vperm2i128 $0x31,%ymm5,%ymm1,%ymm5

	vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
	cmp $0x00c0,%rax
	jl .Lxorpart8
	vpxor 0x00a0(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x00a0(%rsi)
	vperm2i128 $0x31,%ymm13,%ymm9,%ymm13

	vmovdqa 0x60(%rsp),%ymm1
	vperm2i128 $0x20,%ymm7,%ymm1,%ymm0
	cmp $0x00e0,%rax
	jl .Lxorpart8
	vpxor 0x00c0(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x00c0(%rsi)
	vperm2i128 $0x31,%ymm7,%ymm1,%ymm7

	vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
	cmp $0x0100,%rax
	jl .Lxorpart8
	vpxor 0x00e0(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x00e0(%rsi)
	vperm2i128 $0x31,%ymm15,%ymm11,%ymm15

	# xor remaining blocks, write to output
	vmovdqa %ymm4,%ymm0
	cmp $0x0120,%rax
	jl .Lxorpart8
	vpxor 0x0100(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0100(%rsi)

	vmovdqa %ymm12,%ymm0
	cmp $0x0140,%rax
	jl .Lxorpart8
	vpxor 0x0120(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0120(%rsi)

	vmovdqa %ymm6,%ymm0
	cmp $0x0160,%rax
	jl .Lxorpart8
	vpxor 0x0140(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0140(%rsi)

	vmovdqa %ymm14,%ymm0
	cmp $0x0180,%rax
	jl .Lxorpart8
	vpxor 0x0160(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0160(%rsi)

	vmovdqa %ymm5,%ymm0
	cmp $0x01a0,%rax
	jl .Lxorpart8
	vpxor 0x0180(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0180(%rsi)

	vmovdqa %ymm13,%ymm0
	cmp $0x01c0,%rax
	jl .Lxorpart8
	vpxor 0x01a0(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x01a0(%rsi)

	vmovdqa %ymm7,%ymm0
	cmp $0x01e0,%rax
	jl .Lxorpart8
	vpxor 0x01c0(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x01c0(%rsi)

	vmovdqa %ymm15,%ymm0
	cmp $0x0200,%rax
	jl .Lxorpart8
	vpxor 0x01e0(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x01e0(%rsi)

.Ldone8:
	vzeroupper
	lea -8(%r10),%rsp
	ret

.Lxorpart8:
	# xor remaining bytes from partial register into output
	mov %rax,%r9
	and $0x1f,%r9
	jz .Ldone8
	and $~0x1f,%rax

	mov %rsi,%r11

	lea (%rdx,%rax),%rsi
	mov %rsp,%rdi
	mov %r9,%rcx
	rep movsb

	vpxor 0x00(%rsp),%ymm0,%ymm0
	vmovdqa %ymm0,0x00(%rsp)

	mov %rsp,%rsi
	lea (%r11,%rax),%rdi
	mov %r9,%rcx
	rep movsb

	jmp .Ldone8

ENDPROC(chacha_8block_xor_avx2)