/*
 * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/frame.h>

.section	.rodata.cst16.ROT8, "aM", @progbits, 16
.align 16
ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
.section	.rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
.section	.rodata.cst16.CTRINC, "aM", @progbits, 16
.align 16
CTRINC:	.octa 0x00000003000000020000000100000000

.text

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This
 * function performs matrix operations on four words in parallel, but requires
 * shuffling to rearrange the words after each round. 8/16-bit word rotation is
 * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word
 * rotation uses traditional shift+OR.
 *
 * The round count is given in %r8d.
 *
 * Clobbers: %r8d, %xmm4-%xmm7
 */
chacha_permute:

	movdqa		ROT8(%rip),%xmm4
	movdqa		ROT16(%rip),%xmm5

.Ldoubleround:
	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm6
	pslld		$12,%xmm6
	psrld		$20,%xmm1
	por		%xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm7
	pslld		$7,%xmm7
	psrld		$25,%xmm1
	por		%xmm7,%xmm1

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	pshufd		$0x39,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd		$0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	pshufd		$0x93,%xmm3,%xmm3

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm6
	pslld		$12,%xmm6
	psrld		$20,%xmm1
	por		%xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm7
	pslld		$7,%xmm7
	psrld		$25,%xmm1
	por		%xmm7,%xmm1

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	pshufd		$0x93,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd		$0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	pshufd		$0x39,%xmm3,%xmm3

	sub		$2,%r8d
	jnz		.Ldoubleround

	ret
ENDPROC(chacha_permute)

ENTRY(chacha_block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: up to 1 data block output, o
	# %rdx: up to 1 data block input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds
	FRAME_BEGIN

	# x0..3 = s0..3
	movdqa		0x00(%rdi),%xmm0
	movdqa		0x10(%rdi),%xmm1
	movdqa		0x20(%rdi),%xmm2
	movdqa		0x30(%rdi),%xmm3
	movdqa		%xmm0,%xmm8
	movdqa		%xmm1,%xmm9
	movdqa		%xmm2,%xmm10
	movdqa		%xmm3,%xmm11

	mov		%rcx,%rax
	call		chacha_permute

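	# Add the saved state back in and XOR the resulting keystream into
	# the data, 16 bytes at a time. %rax holds the requested length: a
	# 16-byte word is only handled in full if the length covers it;
	# otherwise we branch to .Lxorpart with the needed keystream word in
	# %xmm0, and the trailing 1..15 bytes are XORed via a stack buffer.
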
	# o0 = i0 ^ (x0 + s0)
	paddd		%xmm8,%xmm0
	cmp		$0x10,%rax
	jl		.Lxorpart
	movdqu		0x00(%rdx),%xmm4
	pxor		%xmm4,%xmm0
	movdqu		%xmm0,0x00(%rsi)
	# o1 = i1 ^ (x1 + s1)
	paddd		%xmm9,%xmm1
	movdqa		%xmm1,%xmm0
	cmp		$0x20,%rax
	jl		.Lxorpart
	movdqu		0x10(%rdx),%xmm0
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x10(%rsi)
	# o2 = i2 ^ (x2 + s2)
	paddd		%xmm10,%xmm2
	movdqa		%xmm2,%xmm0
	cmp		$0x30,%rax
	jl		.Lxorpart
	movdqu		0x20(%rdx),%xmm0
	pxor		%xmm2,%xmm0
	movdqu		%xmm0,0x20(%rsi)
	# o3 = i3 ^ (x3 + s3)
	paddd		%xmm11,%xmm3
	movdqa		%xmm3,%xmm0
	cmp		$0x40,%rax
	jl		.Lxorpart
	movdqu		0x30(%rdx),%xmm0
	pxor		%xmm3,%xmm0
	movdqu		%xmm0,0x30(%rsi)

.Ldone:
	FRAME_END
	ret

.Lxorpart:
	# xor remaining bytes from partial register into output
	mov		%rax,%r9
	and		$0x0f,%r9
	jz		.Ldone
	and		$~0x0f,%rax

	mov		%rsi,%r11

	lea		8(%rsp),%r10
	sub		$0x10,%rsp
	and		$~31,%rsp

	lea		(%rdx,%rax),%rsi
	mov		%rsp,%rdi
	mov		%r9,%rcx
	rep movsb

	pxor		0x00(%rsp),%xmm0
	movdqa		%xmm0,0x00(%rsp)

	mov		%rsp,%rsi
	lea		(%r11,%rax),%rdi
	mov		%r9,%rcx
	rep movsb

	lea		-8(%r10),%rsp
	jmp		.Ldone

ENDPROC(chacha_block_xor_ssse3)

ENTRY(hchacha_block_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: output (8 32-bit words)
	# %edx: nrounds
	FRAME_BEGIN

	movdqa		0x00(%rdi),%xmm0
	movdqa		0x10(%rdi),%xmm1
	movdqa		0x20(%rdi),%xmm2
	movdqa		0x30(%rdi),%xmm3

	mov		%edx,%r8d
	call		chacha_permute

	movdqu		%xmm0,0x00(%rsi)
	movdqu		%xmm3,0x10(%rsi)

	FRAME_END
	ret
ENDPROC(hchacha_block_ssse3)

ENTRY(chacha_4block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: up to 4 data blocks output, o
	# %rdx: up to 4 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts four consecutive ChaCha blocks by loading
	# the state matrix in SSE registers four times. As we need some
	# scratch registers, we save the first four registers on the stack.
	# The algorithm performs each operation on the corresponding word of
	# each state matrix, hence requires no word shuffling. For the final
	# XORing step we transpose the matrix by interleaving 32- and then
	# 64-bit words, which allows us to do XOR in SSE registers. 8/16-bit
	# word rotation is done with the slightly better performing SSSE3
	# byte shuffling, 7/12-bit word rotation uses traditional shift+OR.

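	# Register/stack layout inside .Ldoubleround4 below:
	#   x0..x3  are kept at 0x00..0x30(%rsp)
	#   x4..x15 are kept in %xmm4..%xmm15
	#   %xmm0 is scratch, %xmm1 = CTRINC, %xmm2 = ROT8, %xmm3 = ROT16
	# Each register holds the same state word of all four blocks, so one
	# SSE instruction advances that word of all four blocks at once.
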
	lea		8(%rsp),%r10
	sub		$0x80,%rsp
	and		$~63,%rsp
	mov		%rcx,%rax

	# x0..15[0-3] = s0..3[0..3]
	movq		0x00(%rdi),%xmm1
	pshufd		$0x00,%xmm1,%xmm0
	pshufd		$0x55,%xmm1,%xmm1
	movq		0x08(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	movq		0x10(%rdi),%xmm5
	pshufd		$0x00,%xmm5,%xmm4
	pshufd		$0x55,%xmm5,%xmm5
	movq		0x18(%rdi),%xmm7
	pshufd		$0x00,%xmm7,%xmm6
	pshufd		$0x55,%xmm7,%xmm7
	movq		0x20(%rdi),%xmm9
	pshufd		$0x00,%xmm9,%xmm8
	pshufd		$0x55,%xmm9,%xmm9
	movq		0x28(%rdi),%xmm11
	pshufd		$0x00,%xmm11,%xmm10
	pshufd		$0x55,%xmm11,%xmm11
	movq		0x30(%rdi),%xmm13
	pshufd		$0x00,%xmm13,%xmm12
	pshufd		$0x55,%xmm13,%xmm13
	movq		0x38(%rdi),%xmm15
	pshufd		$0x00,%xmm15,%xmm14
	pshufd		$0x55,%xmm15,%xmm15
	# x0..3 on stack
	movdqa		%xmm0,0x00(%rsp)
	movdqa		%xmm1,0x10(%rsp)
	movdqa		%xmm2,0x20(%rsp)
	movdqa		%xmm3,0x30(%rsp)

	movdqa		CTRINC(%rip),%xmm1
	movdqa		ROT8(%rip),%xmm2
	movdqa		ROT16(%rip),%xmm3

	# x12 += counter values 0-3
	paddd		%xmm1,%xmm12

.Ldoubleround4:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm3,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm3,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm3,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm3,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	paddd		%xmm12,%xmm8
	pxor		%xmm8,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm4
	por		%xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	paddd		%xmm13,%xmm9
	pxor		%xmm9,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm5
	por		%xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	paddd		%xmm14,%xmm10
	pxor		%xmm10,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm6
	por		%xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	paddd		%xmm15,%xmm11
	pxor		%xmm11,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm7
	por		%xmm0,%xmm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm2,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm2,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm2,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm2,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	paddd		%xmm12,%xmm8
	pxor		%xmm8,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm4
	por		%xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	paddd		%xmm13,%xmm9
	pxor		%xmm9,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm5
	por		%xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	paddd		%xmm14,%xmm10
	pxor		%xmm10,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm6
	por		%xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	paddd		%xmm15,%xmm11
	pxor		%xmm11,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm7
	por		%xmm0,%xmm7

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm3,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm3,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm3,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm3,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	paddd		%xmm15,%xmm10
	pxor		%xmm10,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm5
	por		%xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	paddd		%xmm12,%xmm11
	pxor		%xmm11,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm6
	por		%xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	paddd		%xmm13,%xmm8
	pxor		%xmm8,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm7
	por		%xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	paddd		%xmm14,%xmm9
	pxor		%xmm9,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm4
	por		%xmm0,%xmm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm2,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm2,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm2,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm2,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	paddd		%xmm15,%xmm10
	pxor		%xmm10,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm5
	por		%xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	paddd		%xmm12,%xmm11
	pxor		%xmm11,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm6
	por		%xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	paddd		%xmm13,%xmm8
	pxor		%xmm8,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm7
	por		%xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	paddd		%xmm14,%xmm9
	pxor		%xmm9,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm4
	por		%xmm0,%xmm4

	sub		$2,%r8d
	jnz		.Ldoubleround4

	# x0[0-3] += s0[0]
	# x1[0-3] += s0[1]
	movq		0x00(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		0x00(%rsp),%xmm2
	movdqa		%xmm2,0x00(%rsp)
	paddd		0x10(%rsp),%xmm3
	movdqa		%xmm3,0x10(%rsp)
	# x2[0-3] += s0[2]
	# x3[0-3] += s0[3]
	movq		0x08(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		0x20(%rsp),%xmm2
	movdqa		%xmm2,0x20(%rsp)
	paddd		0x30(%rsp),%xmm3
	movdqa		%xmm3,0x30(%rsp)

	# x4[0-3] += s1[0]
	# x5[0-3] += s1[1]
	movq		0x10(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm4
	paddd		%xmm3,%xmm5
	# x6[0-3] += s1[2]
	# x7[0-3] += s1[3]
	movq		0x18(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm6
	paddd		%xmm3,%xmm7

	# x8[0-3] += s2[0]
	# x9[0-3] += s2[1]
	movq		0x20(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm8
	paddd		%xmm3,%xmm9
	# x10[0-3] += s2[2]
	# x11[0-3] += s2[3]
	movq		0x28(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm10
	paddd		%xmm3,%xmm11

	# x12[0-3] += s3[0]
	# x13[0-3] += s3[1]
	movq		0x30(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm12
	paddd		%xmm3,%xmm13
	# x14[0-3] += s3[2]
	# x15[0-3] += s3[3]
	movq		0x38(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm14
	paddd		%xmm3,%xmm15

	# x12 += counter values 0-3
	paddd		%xmm1,%xmm12

	# interleave 32-bit words in state n, n+1
	movdqa		0x00(%rsp),%xmm0
	movdqa		0x10(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpckldq	%xmm1,%xmm2
	punpckhdq	%xmm1,%xmm0
	movdqa		%xmm2,0x00(%rsp)
	movdqa		%xmm0,0x10(%rsp)
	movdqa		0x20(%rsp),%xmm0
	movdqa		0x30(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpckldq	%xmm1,%xmm2
	punpckhdq	%xmm1,%xmm0
	movdqa		%xmm2,0x20(%rsp)
	movdqa		%xmm0,0x30(%rsp)
	movdqa		%xmm4,%xmm0
	punpckldq	%xmm5,%xmm4
	punpckhdq	%xmm5,%xmm0
	movdqa		%xmm0,%xmm5
	movdqa		%xmm6,%xmm0
	punpckldq	%xmm7,%xmm6
	punpckhdq	%xmm7,%xmm0
	movdqa		%xmm0,%xmm7
	movdqa		%xmm8,%xmm0
	punpckldq	%xmm9,%xmm8
	punpckhdq	%xmm9,%xmm0
	movdqa		%xmm0,%xmm9
	movdqa		%xmm10,%xmm0
	punpckldq	%xmm11,%xmm10
	punpckhdq	%xmm11,%xmm0
	movdqa		%xmm0,%xmm11
	movdqa		%xmm12,%xmm0
	punpckldq	%xmm13,%xmm12
	punpckhdq	%xmm13,%xmm0
	movdqa		%xmm0,%xmm13
	movdqa		%xmm14,%xmm0
	punpckldq	%xmm15,%xmm14
	punpckhdq	%xmm15,%xmm0
	movdqa		%xmm0,%xmm15

	# interleave 64-bit words in state n, n+2
	movdqa		0x00(%rsp),%xmm0
	movdqa		0x20(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpcklqdq	%xmm1,%xmm2
	punpckhqdq	%xmm1,%xmm0
	movdqa		%xmm2,0x00(%rsp)
	movdqa		%xmm0,0x20(%rsp)
	movdqa		0x10(%rsp),%xmm0
	movdqa		0x30(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpcklqdq	%xmm1,%xmm2
	punpckhqdq	%xmm1,%xmm0
	movdqa		%xmm2,0x10(%rsp)
	movdqa		%xmm0,0x30(%rsp)
	movdqa		%xmm4,%xmm0
	punpcklqdq	%xmm6,%xmm4
	punpckhqdq	%xmm6,%xmm0
	movdqa		%xmm0,%xmm6
	movdqa		%xmm5,%xmm0
	punpcklqdq	%xmm7,%xmm5
	punpckhqdq	%xmm7,%xmm0
	movdqa		%xmm0,%xmm7
	movdqa		%xmm8,%xmm0
	punpcklqdq	%xmm10,%xmm8
	punpckhqdq	%xmm10,%xmm0
	movdqa		%xmm0,%xmm10
	movdqa		%xmm9,%xmm0
	punpcklqdq	%xmm11,%xmm9
	punpckhqdq	%xmm11,%xmm0
	movdqa		%xmm0,%xmm11
	movdqa		%xmm12,%xmm0
	punpcklqdq	%xmm14,%xmm12
	punpckhqdq	%xmm14,%xmm0
	movdqa		%xmm0,%xmm14
	movdqa		%xmm13,%xmm0
	punpcklqdq	%xmm15,%xmm13
	punpckhqdq	%xmm15,%xmm0
	movdqa		%xmm0,%xmm15

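	# After the two interleave passes each 16-byte chunk holds four
	# consecutive words of one block's state:
	#   block 0: 0x00(%rsp), %xmm4, %xmm8,  %xmm12
	#   block 1: 0x20(%rsp), %xmm6, %xmm10, %xmm14
	#   block 2: 0x10(%rsp), %xmm5, %xmm9,  %xmm13
	#   block 3: 0x30(%rsp), %xmm7, %xmm11, %xmm15
	# which is the order the chunks are consumed below. As in the single
	# block case, %rax holds the requested length and a chunk is written
	# in full only if the length covers it; otherwise we branch to
	# .Lxorpart4 with the needed keystream chunk in %xmm0.
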
	# xor with corresponding input, write to output
	movdqa		0x00(%rsp),%xmm0
	cmp		$0x10,%rax
	jl		.Lxorpart4
	movdqu		0x00(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x00(%rsi)

	movdqu		%xmm4,%xmm0
	cmp		$0x20,%rax
	jl		.Lxorpart4
	movdqu		0x10(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x10(%rsi)

	movdqu		%xmm8,%xmm0
	cmp		$0x30,%rax
	jl		.Lxorpart4
	movdqu		0x20(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x20(%rsi)

	movdqu		%xmm12,%xmm0
	cmp		$0x40,%rax
	jl		.Lxorpart4
	movdqu		0x30(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x30(%rsi)

	movdqa		0x20(%rsp),%xmm0
	cmp		$0x50,%rax
	jl		.Lxorpart4
	movdqu		0x40(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x40(%rsi)

	movdqu		%xmm6,%xmm0
	cmp		$0x60,%rax
	jl		.Lxorpart4
	movdqu		0x50(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x50(%rsi)

	movdqu		%xmm10,%xmm0
	cmp		$0x70,%rax
	jl		.Lxorpart4
	movdqu		0x60(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x60(%rsi)

	movdqu		%xmm14,%xmm0
	cmp		$0x80,%rax
	jl		.Lxorpart4
	movdqu		0x70(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x70(%rsi)

	movdqa		0x10(%rsp),%xmm0
	cmp		$0x90,%rax
	jl		.Lxorpart4
	movdqu		0x80(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x80(%rsi)

	movdqu		%xmm5,%xmm0
	cmp		$0xa0,%rax
	jl		.Lxorpart4
	movdqu		0x90(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x90(%rsi)

	movdqu		%xmm9,%xmm0
	cmp		$0xb0,%rax
	jl		.Lxorpart4
	movdqu		0xa0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xa0(%rsi)

	movdqu		%xmm13,%xmm0
	cmp		$0xc0,%rax
	jl		.Lxorpart4
	movdqu		0xb0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xb0(%rsi)

	movdqa		0x30(%rsp),%xmm0
	cmp		$0xd0,%rax
	jl		.Lxorpart4
	movdqu		0xc0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xc0(%rsi)

	movdqu		%xmm7,%xmm0
	cmp		$0xe0,%rax
	jl		.Lxorpart4
	movdqu		0xd0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xd0(%rsi)

	movdqu		%xmm11,%xmm0
	cmp		$0xf0,%rax
	jl		.Lxorpart4
	movdqu		0xe0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xe0(%rsi)

	movdqu		%xmm15,%xmm0
	cmp		$0x100,%rax
	jl		.Lxorpart4
	movdqu		0xf0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xf0(%rsi)

.Ldone4:
	lea		-8(%r10),%rsp
	ret

.Lxorpart4:
	# xor remaining bytes from partial register into output
	mov		%rax,%r9
	and		$0x0f,%r9
	jz		.Ldone4
	and		$~0x0f,%rax

	mov		%rsi,%r11

	lea		(%rdx,%rax),%rsi
	mov		%rsp,%rdi
	mov		%r9,%rcx
	rep movsb

	pxor		0x00(%rsp),%xmm0
	movdqa		%xmm0,0x00(%rsp)

	mov		%rsp,%rsi
	lea		(%r11,%rax),%rdi
	mov		%r9,%rcx
	rep movsb

	jmp		.Ldone4

ENDPROC(chacha_4block_xor_ssse3)