/*
 * ChaCha/XChaCha NEON helper functions
 *
 * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Originally based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

	.text
	.align		6

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is stored in the four NEON
 * registers v0-v3. It performs matrix operations on four words in parallel,
 * but requires shuffling to rearrange the words after each round.
 *
 * The round count is given in w3.
 *
 * Clobbers: w3, x10, v4, v12
 */
chacha_permute:

	adr_l		x10, ROT8
	ld1		{v12.4s}, [x10]

.Ldoubleround:
	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	rev32		v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #12
	sri		v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	tbl		v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #7
	sri		v1.4s, v4.4s, #25

	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	ext		v1.16b, v1.16b, v1.16b, #4
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext		v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	ext		v3.16b, v3.16b, v3.16b, #12

	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	rev32		v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #12
	sri		v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	tbl		v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #7
	sri		v1.4s, v4.4s, #25

	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	ext		v1.16b, v1.16b, v1.16b, #12
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext		v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	ext		v3.16b, v3.16b, v3.16b, #4

	subs		w3, w3, #2
	b.ne		.Ldoubleround

	ret
ENDPROC(chacha_permute)
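
/*
 * For reference, each group of instructions above implements one line of the
 * standard ChaCha quarter-round, which (using the same rotl32 notation as the
 * comments above; an illustrative sketch, not kernel C code) reads:
 *
 *	a += b; d = rotl32(d ^ a, 16);
 *	c += d; b = rotl32(b ^ c, 12);
 *	a += b; d = rotl32(d ^ a,  8);
 *	c += d; b = rotl32(b ^ c,  7);
 *
 * One iteration of .Ldoubleround performs the four column quarter-rounds in
 * parallel (one per 32-bit lane of v0-v3), then uses ext to rotate v1-v3 so
 * that the same code performs the four diagonal quarter-rounds, which is why
 * w3 is decremented by 2 per iteration.
 */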

ENTRY(chacha_block_xor_neon)
	// x0: Input state matrix, s
	// x1: 1 data block output, o
	// x2: 1 data block input, i
	// w3: nrounds

	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	// x0..3 = s0..3
	ld1		{v0.4s-v3.4s}, [x0]
	ld1		{v8.4s-v11.4s}, [x0]

	bl		chacha_permute

	ld1		{v4.16b-v7.16b}, [x2]

	// o0 = i0 ^ (x0 + s0)
	add		v0.4s, v0.4s, v8.4s
	eor		v0.16b, v0.16b, v4.16b

	// o1 = i1 ^ (x1 + s1)
	add		v1.4s, v1.4s, v9.4s
	eor		v1.16b, v1.16b, v5.16b

	// o2 = i2 ^ (x2 + s2)
	add		v2.4s, v2.4s, v10.4s
	eor		v2.16b, v2.16b, v6.16b

	// o3 = i3 ^ (x3 + s3)
	add		v3.4s, v3.4s, v11.4s
	eor		v3.16b, v3.16b, v7.16b

	st1		{v0.16b-v3.16b}, [x1]

	ldp		x29, x30, [sp], #16
	ret
ENDPROC(chacha_block_xor_neon)

ENTRY(hchacha_block_neon)
	// x0: Input state matrix, s
	// x1: output (8 32-bit words)
	// w2: nrounds

	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{v0.4s-v3.4s}, [x0]

	mov		w3, w2
	bl		chacha_permute

	st1		{v0.16b}, [x1], #16
	st1		{v3.16b}, [x1]

	ldp		x29, x30, [sp], #16
	ret
ENDPROC(hchacha_block_neon)

	a0		.req	w12
	a1		.req	w13
	a2		.req	w14
	a3		.req	w15
	a4		.req	w16
	a5		.req	w17
	a6		.req	w19
	a7		.req	w20
	a8		.req	w21
	a9		.req	w22
	a10		.req	w23
	a11		.req	w24
	a12		.req	w25
	a13		.req	w26
	a14		.req	w27
	a15		.req	w28

	.align		6
ENTRY(chacha_4block_xor_neon)
	frame_push	10

	// x0: Input state matrix, s
	// x1: 4 data blocks output, o
	// x2: 4 data blocks input, i
	// w3: nrounds
	// x4: byte count

	adr_l		x10, .Lpermute
	and		x5, x4, #63
	add		x10, x10, x5
	add		x11, x10, #64

	//
	// This function encrypts four consecutive ChaCha blocks by loading
	// the state matrix into NEON registers four times. The algorithm
	// performs each operation on the corresponding word of each state
	// matrix, hence requires no word shuffling. For the final XOR step we
	// transpose the matrix by interleaving 32- and then 64-bit words,
	// which allows us to do XOR in NEON registers.
	//
	// At the same time, a fifth block is encrypted in parallel using
	// scalar registers.
	//
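	// In rough C terms (an illustrative sketch only, using hypothetical
	// array names), the register allocation used below is
	//
	//	u32 x[16][4];	// x[n][i] = word n of block i, in v(n).s[i]
	//	u32 a[16];	// fifth block, in the scalar registers a0-a15
	//
	// so that each NEON instruction advances word n of all four vector
	// blocks at once, while the interleaved ALU instructions advance the
	// corresponding word of the fifth block.
	//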
	adr_l		x9, CTRINC		// ... and ROT8
	ld1		{v30.4s-v31.4s}, [x9]

	// x0..15[0-3] = s0..3[0..3]
	add		x8, x0, #16
	ld4r		{ v0.4s- v3.4s}, [x0]
	ld4r		{ v4.4s- v7.4s}, [x8], #16
	ld4r		{ v8.4s-v11.4s}, [x8], #16
	ld4r		{v12.4s-v15.4s}, [x8]

	mov		a0, v0.s[0]
	mov		a1, v1.s[0]
	mov		a2, v2.s[0]
	mov		a3, v3.s[0]
	mov		a4, v4.s[0]
	mov		a5, v5.s[0]
	mov		a6, v6.s[0]
	mov		a7, v7.s[0]
	mov		a8, v8.s[0]
	mov		a9, v9.s[0]
	mov		a10, v10.s[0]
	mov		a11, v11.s[0]
	mov		a12, v12.s[0]
	mov		a13, v13.s[0]
	mov		a14, v14.s[0]
	mov		a15, v15.s[0]

	// x12 += counter values 1-4
	add		v12.4s, v12.4s, v30.4s

.Ldoubleround4:
	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	add		v0.4s, v0.4s, v4.4s
	add		a0, a0, a4
	add		v1.4s, v1.4s, v5.4s
	add		a1, a1, a5
	add		v2.4s, v2.4s, v6.4s
	add		a2, a2, a6
	add		v3.4s, v3.4s, v7.4s
	add		a3, a3, a7

	eor		v12.16b, v12.16b, v0.16b
	eor		a12, a12, a0
	eor		v13.16b, v13.16b, v1.16b
	eor		a13, a13, a1
	eor		v14.16b, v14.16b, v2.16b
	eor		a14, a14, a2
	eor		v15.16b, v15.16b, v3.16b
	eor		a15, a15, a3

	rev32		v12.8h, v12.8h
	ror		a12, a12, #16
	rev32		v13.8h, v13.8h
	ror		a13, a13, #16
	rev32		v14.8h, v14.8h
	ror		a14, a14, #16
	rev32		v15.8h, v15.8h
	ror		a15, a15, #16

	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	add		v8.4s, v8.4s, v12.4s
	add		a8, a8, a12
	add		v9.4s, v9.4s, v13.4s
	add		a9, a9, a13
	add		v10.4s, v10.4s, v14.4s
	add		a10, a10, a14
	add		v11.4s, v11.4s, v15.4s
	add		a11, a11, a15

	eor		v16.16b, v4.16b, v8.16b
	eor		a4, a4, a8
	eor		v17.16b, v5.16b, v9.16b
	eor		a5, a5, a9
	eor		v18.16b, v6.16b, v10.16b
	eor		a6, a6, a10
	eor		v19.16b, v7.16b, v11.16b
	eor		a7, a7, a11

	shl		v4.4s, v16.4s, #12
	shl		v5.4s, v17.4s, #12
	shl		v6.4s, v18.4s, #12
	shl		v7.4s, v19.4s, #12

	sri		v4.4s, v16.4s, #20
	ror		a4, a4, #20
	sri		v5.4s, v17.4s, #20
	ror		a5, a5, #20
	sri		v6.4s, v18.4s, #20
	ror		a6, a6, #20
	sri		v7.4s, v19.4s, #20
	ror		a7, a7, #20

	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	add		v0.4s, v0.4s, v4.4s
	add		a0, a0, a4
	add		v1.4s, v1.4s, v5.4s
	add		a1, a1, a5
	add		v2.4s, v2.4s, v6.4s
	add		a2, a2, a6
	add		v3.4s, v3.4s, v7.4s
	add		a3, a3, a7

	eor		v12.16b, v12.16b, v0.16b
	eor		a12, a12, a0
	eor		v13.16b, v13.16b, v1.16b
	eor		a13, a13, a1
	eor		v14.16b, v14.16b, v2.16b
	eor		a14, a14, a2
	eor		v15.16b, v15.16b, v3.16b
	eor		a15, a15, a3

	tbl		v12.16b, {v12.16b}, v31.16b
	ror		a12, a12, #24
	tbl		v13.16b, {v13.16b}, v31.16b
	ror		a13, a13, #24
	tbl		v14.16b, {v14.16b}, v31.16b
	ror		a14, a14, #24
	tbl		v15.16b, {v15.16b}, v31.16b
	ror		a15, a15, #24

	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	add		v8.4s, v8.4s, v12.4s
	add		a8, a8, a12
	add		v9.4s, v9.4s, v13.4s
	add		a9, a9, a13
	add		v10.4s, v10.4s, v14.4s
	add		a10, a10, a14
	add		v11.4s, v11.4s, v15.4s
	add		a11, a11, a15

	eor		v16.16b, v4.16b, v8.16b
	eor		a4, a4, a8
	eor		v17.16b, v5.16b, v9.16b
	eor		a5, a5, a9
	eor		v18.16b, v6.16b, v10.16b
	eor		a6, a6, a10
	eor		v19.16b, v7.16b, v11.16b
	eor		a7, a7, a11

	shl		v4.4s, v16.4s, #7
	shl		v5.4s, v17.4s, #7
	shl		v6.4s, v18.4s, #7
	shl		v7.4s, v19.4s, #7

	sri		v4.4s, v16.4s, #25
	ror		a4, a4, #25
	sri		v5.4s, v17.4s, #25
	ror		a5, a5, #25
	sri		v6.4s, v18.4s, #25
	ror		a6, a6, #25
	sri		v7.4s, v19.4s, #25
	ror		a7, a7, #25

	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	add		v0.4s, v0.4s, v5.4s
	add		a0, a0, a5
	add		v1.4s, v1.4s, v6.4s
	add		a1, a1, a6
	add		v2.4s, v2.4s, v7.4s
	add		a2, a2, a7
	add		v3.4s, v3.4s, v4.4s
	add		a3, a3, a4

	eor		v15.16b, v15.16b, v0.16b
	eor		a15, a15, a0
	eor		v12.16b, v12.16b, v1.16b
	eor		a12, a12, a1
	eor		v13.16b, v13.16b, v2.16b
	eor		a13, a13, a2
	eor		v14.16b, v14.16b, v3.16b
	eor		a14, a14, a3

	rev32		v15.8h, v15.8h
	ror		a15, a15, #16
	rev32		v12.8h, v12.8h
	ror		a12, a12, #16
	rev32		v13.8h, v13.8h
	ror		a13, a13, #16
	rev32		v14.8h, v14.8h
	ror		a14, a14, #16

	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	add		v10.4s, v10.4s, v15.4s
	add		a10, a10, a15
	add		v11.4s, v11.4s, v12.4s
	add		a11, a11, a12
	add		v8.4s, v8.4s, v13.4s
	add		a8, a8, a13
	add		v9.4s, v9.4s, v14.4s
	add		a9, a9, a14

	eor		v16.16b, v5.16b, v10.16b
	eor		a5, a5, a10
	eor		v17.16b, v6.16b, v11.16b
	eor		a6, a6, a11
	eor		v18.16b, v7.16b, v8.16b
	eor		a7, a7, a8
	eor		v19.16b, v4.16b, v9.16b
	eor		a4, a4, a9

	shl		v5.4s, v16.4s, #12
	shl		v6.4s, v17.4s, #12
	shl		v7.4s, v18.4s, #12
	shl		v4.4s, v19.4s, #12

	sri		v5.4s, v16.4s, #20
	ror		a5, a5, #20
	sri		v6.4s, v17.4s, #20
	ror		a6, a6, #20
	sri		v7.4s, v18.4s, #20
	ror		a7, a7, #20
	sri		v4.4s, v19.4s, #20
	ror		a4, a4, #20

	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	add		v0.4s, v0.4s, v5.4s
	add		a0, a0, a5
	add		v1.4s, v1.4s, v6.4s
	add		a1, a1, a6
	add		v2.4s, v2.4s, v7.4s
	add		a2, a2, a7
	add		v3.4s, v3.4s, v4.4s
	add		a3, a3, a4

	eor		v15.16b, v15.16b, v0.16b
	eor		a15, a15, a0
	eor		v12.16b, v12.16b, v1.16b
	eor		a12, a12, a1
	eor		v13.16b, v13.16b, v2.16b
	eor		a13, a13, a2
	eor		v14.16b, v14.16b, v3.16b
	eor		a14, a14, a3

	tbl		v15.16b, {v15.16b}, v31.16b
	ror		a15, a15, #24
	tbl		v12.16b, {v12.16b}, v31.16b
	ror		a12, a12, #24
	tbl		v13.16b, {v13.16b}, v31.16b
	ror		a13, a13, #24
	tbl		v14.16b, {v14.16b}, v31.16b
	ror		a14, a14, #24

	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	add		v10.4s, v10.4s, v15.4s
	add		a10, a10, a15
	add		v11.4s, v11.4s, v12.4s
	add		a11, a11, a12
	add		v8.4s, v8.4s, v13.4s
	add		a8, a8, a13
	add		v9.4s, v9.4s, v14.4s
	add		a9, a9, a14

	eor		v16.16b, v5.16b, v10.16b
	eor		a5, a5, a10
	eor		v17.16b, v6.16b, v11.16b
	eor		a6, a6, a11
	eor		v18.16b, v7.16b, v8.16b
	eor		a7, a7, a8
	eor		v19.16b, v4.16b, v9.16b
	eor		a4, a4, a9

	shl		v5.4s, v16.4s, #7
	shl		v6.4s, v17.4s, #7
	shl		v7.4s, v18.4s, #7
	shl		v4.4s, v19.4s, #7

	sri		v5.4s, v16.4s, #25
	ror		a5, a5, #25
	sri		v6.4s, v17.4s, #25
	ror		a6, a6, #25
	sri		v7.4s, v18.4s, #25
	ror		a7, a7, #25
	sri		v4.4s, v19.4s, #25
	ror		a4, a4, #25

	subs		w3, w3, #2
	b.ne		.Ldoubleround4

	ld4r		{v16.4s-v19.4s}, [x0], #16
	ld4r		{v20.4s-v23.4s}, [x0], #16

	// x12 += counter values 0-3
	add		v12.4s, v12.4s, v30.4s

	// x0[0-3] += s0[0]
	// x1[0-3] += s0[1]
	// x2[0-3] += s0[2]
	// x3[0-3] += s0[3]
	add		v0.4s, v0.4s, v16.4s
	mov		w6, v16.s[0]
	mov		w7, v17.s[0]
	add		v1.4s, v1.4s, v17.4s
	mov		w8, v18.s[0]
	mov		w9, v19.s[0]
	add		v2.4s, v2.4s, v18.4s
	add		a0, a0, w6
	add		a1, a1, w7
	add		v3.4s, v3.4s, v19.4s
	add		a2, a2, w8
	add		a3, a3, w9

	ld4r		{v24.4s-v27.4s}, [x0], #16
	ld4r		{v28.4s-v31.4s}, [x0]

	// x4[0-3] += s1[0]
	// x5[0-3] += s1[1]
	// x6[0-3] += s1[2]
	// x7[0-3] += s1[3]
	add		v4.4s, v4.4s, v20.4s
	mov		w6, v20.s[0]
	mov		w7, v21.s[0]
	add		v5.4s, v5.4s, v21.4s
	mov		w8, v22.s[0]
	mov		w9, v23.s[0]
	add		v6.4s, v6.4s, v22.4s
	add		a4, a4, w6
	add		a5, a5, w7
	add		v7.4s, v7.4s, v23.4s
	add		a6, a6, w8
	add		a7, a7, w9

	// x8[0-3] += s2[0]
	// x9[0-3] += s2[1]
	// x10[0-3] += s2[2]
	// x11[0-3] += s2[3]
	add		v8.4s, v8.4s, v24.4s
	mov		w6, v24.s[0]
	mov		w7, v25.s[0]
	add		v9.4s, v9.4s, v25.4s
	mov		w8, v26.s[0]
	mov		w9, v27.s[0]
	add		v10.4s, v10.4s, v26.4s
	add		a8, a8, w6
	add		a9, a9, w7
	add		v11.4s, v11.4s, v27.4s
	add		a10, a10, w8
	add		a11, a11, w9

	// x12[0-3] += s3[0]
	// x13[0-3] += s3[1]
	// x14[0-3] += s3[2]
	// x15[0-3] += s3[3]
	add		v12.4s, v12.4s, v28.4s
	mov		w6, v28.s[0]
	mov		w7, v29.s[0]
	add		v13.4s, v13.4s, v29.4s
	mov		w8, v30.s[0]
	mov		w9, v31.s[0]
	add		v14.4s, v14.4s, v30.4s
	add		a12, a12, w6
	add		a13, a13, w7
	add		v15.4s, v15.4s, v31.4s
	add		a14, a14, w8
	add		a15, a15, w9

	// interleave 32-bit words in state n, n+1
	ldp		w6, w7, [x2], #64
	zip1		v16.4s, v0.4s, v1.4s
	ldp		w8, w9, [x2, #-56]
	eor		a0, a0, w6
	zip2		v17.4s, v0.4s, v1.4s
	eor		a1, a1, w7
	zip1		v18.4s, v2.4s, v3.4s
	eor		a2, a2, w8
	zip2		v19.4s, v2.4s, v3.4s
	eor		a3, a3, w9
	ldp		w6, w7, [x2, #-48]
	zip1		v20.4s, v4.4s, v5.4s
	ldp		w8, w9, [x2, #-40]
	eor		a4, a4, w6
	zip2		v21.4s, v4.4s, v5.4s
	eor		a5, a5, w7
	zip1		v22.4s, v6.4s, v7.4s
	eor		a6, a6, w8
	zip2		v23.4s, v6.4s, v7.4s
	eor		a7, a7, w9
	ldp		w6, w7, [x2, #-32]
	zip1		v24.4s, v8.4s, v9.4s
	ldp		w8, w9, [x2, #-24]
	eor		a8, a8, w6
	zip2		v25.4s, v8.4s, v9.4s
	eor		a9, a9, w7
	zip1		v26.4s, v10.4s, v11.4s
	eor		a10, a10, w8
	zip2		v27.4s, v10.4s, v11.4s
	eor		a11, a11, w9
	ldp		w6, w7, [x2, #-16]
	zip1		v28.4s, v12.4s, v13.4s
	ldp		w8, w9, [x2, #-8]
	eor		a12, a12, w6
	zip2		v29.4s, v12.4s, v13.4s
	eor		a13, a13, w7
	zip1		v30.4s, v14.4s, v15.4s
	eor		a14, a14, w8
	zip2		v31.4s, v14.4s, v15.4s
	eor		a15, a15, w9
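
	// From here on, each of the four NEON blocks is loaded, XORed and
	// stored as a separate 64-byte chunk so that a partial tail can be
	// handled. Roughly, each subs/ccmp/csel group below checks whether a
	// full 64 bytes of input remain for the next chunk; if not, x2 is
	// rewound so that the load ends at the last input byte and the
	// post-increment in x3 is cleared, and the sign of x5/x6/x7/x8 later
	// steers execution into the tail fixup paths at 0:/1:/2:/3:.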

	mov		x3, #64
	subs		x5, x4, #128
	add		x6, x5, x2
	csel		x3, x3, xzr, ge
	csel		x2, x2, x6, ge

	// interleave 64-bit words in state n, n+2
	zip1		v0.2d, v16.2d, v18.2d
	zip2		v4.2d, v16.2d, v18.2d
	stp		a0, a1, [x1], #64
	zip1		v8.2d, v17.2d, v19.2d
	zip2		v12.2d, v17.2d, v19.2d
	stp		a2, a3, [x1, #-56]
	ld1		{v16.16b-v19.16b}, [x2], x3

	subs		x6, x4, #192
	ccmp		x3, xzr, #4, lt
	add		x7, x6, x2
	csel		x3, x3, xzr, eq
	csel		x2, x2, x7, eq

	zip1		v1.2d, v20.2d, v22.2d
	zip2		v5.2d, v20.2d, v22.2d
	stp		a4, a5, [x1, #-48]
	zip1		v9.2d, v21.2d, v23.2d
	zip2		v13.2d, v21.2d, v23.2d
	stp		a6, a7, [x1, #-40]
	ld1		{v20.16b-v23.16b}, [x2], x3

	subs		x7, x4, #256
	ccmp		x3, xzr, #4, lt
	add		x8, x7, x2
	csel		x3, x3, xzr, eq
	csel		x2, x2, x8, eq

	zip1		v2.2d, v24.2d, v26.2d
	zip2		v6.2d, v24.2d, v26.2d
	stp		a8, a9, [x1, #-32]
	zip1		v10.2d, v25.2d, v27.2d
	zip2		v14.2d, v25.2d, v27.2d
	stp		a10, a11, [x1, #-24]
	ld1		{v24.16b-v27.16b}, [x2], x3

	subs		x8, x4, #320
	ccmp		x3, xzr, #4, lt
	add		x9, x8, x2
	csel		x2, x2, x9, eq

	zip1		v3.2d, v28.2d, v30.2d
	zip2		v7.2d, v28.2d, v30.2d
	stp		a12, a13, [x1, #-16]
	zip1		v11.2d, v29.2d, v31.2d
	zip2		v15.2d, v29.2d, v31.2d
	stp		a14, a15, [x1, #-8]
	ld1		{v28.16b-v31.16b}, [x2]

	// xor with corresponding input, write to output
	tbnz		x5, #63, 0f
	eor		v16.16b, v16.16b, v0.16b
	eor		v17.16b, v17.16b, v1.16b
	eor		v18.16b, v18.16b, v2.16b
	eor		v19.16b, v19.16b, v3.16b
	st1		{v16.16b-v19.16b}, [x1], #64
	cbz		x5, .Lout

	tbnz		x6, #63, 1f
	eor		v20.16b, v20.16b, v4.16b
	eor		v21.16b, v21.16b, v5.16b
	eor		v22.16b, v22.16b, v6.16b
	eor		v23.16b, v23.16b, v7.16b
	st1		{v20.16b-v23.16b}, [x1], #64
	cbz		x6, .Lout

	tbnz		x7, #63, 2f
	eor		v24.16b, v24.16b, v8.16b
	eor		v25.16b, v25.16b, v9.16b
	eor		v26.16b, v26.16b, v10.16b
	eor		v27.16b, v27.16b, v11.16b
	st1		{v24.16b-v27.16b}, [x1], #64
	cbz		x7, .Lout

	tbnz		x8, #63, 3f
	eor		v28.16b, v28.16b, v12.16b
	eor		v29.16b, v29.16b, v13.16b
	eor		v30.16b, v30.16b, v14.16b
	eor		v31.16b, v31.16b, v15.16b
	st1		{v28.16b-v31.16b}, [x1]

.Lout:	frame_pop
	ret
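
	// The paths below are taken when the last 64-byte chunk is only
	// partially valid. Roughly: x10 and x11 point into the .Lpermute
	// index ramp at offsets (byte count % 64) and (byte count % 64) + 64,
	// so that tbl shifts the keystream bytes that are still needed to the
	// end of a 64-byte window while tbx back-fills the start of the
	// window with output bytes that have already been produced; the final
	// 64-byte store then ends exactly at the last output byte, rewriting
	// already-correct bytes and appending the tail.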

	// fewer than 128 bytes of in/output
0:	ld1		{v8.16b}, [x10]
	ld1		{v9.16b}, [x11]
	movi		v10.16b, #16
	sub		x2, x1, #64
	add		x1, x1, x5
	ld1		{v16.16b-v19.16b}, [x2]
	tbl		v4.16b, {v0.16b-v3.16b}, v8.16b
	tbx		v20.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v5.16b, {v0.16b-v3.16b}, v8.16b
	tbx		v21.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v6.16b, {v0.16b-v3.16b}, v8.16b
	tbx		v22.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v7.16b, {v0.16b-v3.16b}, v8.16b
	tbx		v23.16b, {v16.16b-v19.16b}, v9.16b

	eor		v20.16b, v20.16b, v4.16b
	eor		v21.16b, v21.16b, v5.16b
	eor		v22.16b, v22.16b, v6.16b
	eor		v23.16b, v23.16b, v7.16b
	st1		{v20.16b-v23.16b}, [x1]
	b		.Lout

	// fewer than 192 bytes of in/output
1:	ld1		{v8.16b}, [x10]
	ld1		{v9.16b}, [x11]
	movi		v10.16b, #16
	add		x1, x1, x6
	tbl		v0.16b, {v4.16b-v7.16b}, v8.16b
	tbx		v20.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v1.16b, {v4.16b-v7.16b}, v8.16b
	tbx		v21.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v2.16b, {v4.16b-v7.16b}, v8.16b
	tbx		v22.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v3.16b, {v4.16b-v7.16b}, v8.16b
	tbx		v23.16b, {v16.16b-v19.16b}, v9.16b

	eor		v20.16b, v20.16b, v0.16b
	eor		v21.16b, v21.16b, v1.16b
	eor		v22.16b, v22.16b, v2.16b
	eor		v23.16b, v23.16b, v3.16b
	st1		{v20.16b-v23.16b}, [x1]
	b		.Lout

	// fewer than 256 bytes of in/output
2:	ld1		{v4.16b}, [x10]
	ld1		{v5.16b}, [x11]
	movi		v6.16b, #16
	add		x1, x1, x7
	tbl		v0.16b, {v8.16b-v11.16b}, v4.16b
	tbx		v24.16b, {v20.16b-v23.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v1.16b, {v8.16b-v11.16b}, v4.16b
	tbx		v25.16b, {v20.16b-v23.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v2.16b, {v8.16b-v11.16b}, v4.16b
	tbx		v26.16b, {v20.16b-v23.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v3.16b, {v8.16b-v11.16b}, v4.16b
	tbx		v27.16b, {v20.16b-v23.16b}, v5.16b

	eor		v24.16b, v24.16b, v0.16b
	eor		v25.16b, v25.16b, v1.16b
	eor		v26.16b, v26.16b, v2.16b
	eor		v27.16b, v27.16b, v3.16b
	st1		{v24.16b-v27.16b}, [x1]
	b		.Lout

	// fewer than 320 bytes of in/output
3:	ld1		{v4.16b}, [x10]
	ld1		{v5.16b}, [x11]
	movi		v6.16b, #16
	add		x1, x1, x8
	tbl		v0.16b, {v12.16b-v15.16b}, v4.16b
	tbx		v28.16b, {v24.16b-v27.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v1.16b, {v12.16b-v15.16b}, v4.16b
	tbx		v29.16b, {v24.16b-v27.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v2.16b, {v12.16b-v15.16b}, v4.16b
	tbx		v30.16b, {v24.16b-v27.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v3.16b, {v12.16b-v15.16b}, v4.16b
	tbx		v31.16b, {v24.16b-v27.16b}, v5.16b

	eor		v28.16b, v28.16b, v0.16b
	eor		v29.16b, v29.16b, v1.16b
	eor		v30.16b, v30.16b, v2.16b
	eor		v31.16b, v31.16b, v3.16b
	st1		{v28.16b-v31.16b}, [x1]
	b		.Lout
ENDPROC(chacha_4block_xor_neon)

	.section	".rodata", "a", %progbits
	.align		L1_CACHE_SHIFT
.Lpermute:
	.set		.Li, 0
	.rept		192
	.byte		(.Li - 64)
	.set		.Li, .Li + 1
	.endr

CTRINC:	.word		1, 2, 3, 4
ROT8:	.word		0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
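
/*
 * For reference: CTRINC holds the block counter offsets 1-4 that are added
 * to word 12 of the four vector blocks in chacha_4block_xor_neon; the fifth
 * (scalar) block keeps the unmodified counter.
 *
 * ROT8 is a TBL index vector: within each 32-bit lane it maps destination
 * bytes {0,1,2,3} to source bytes {3,0,1,2}, i.e. a rotate left by 8 bits in
 * a single instruction instead of a shl/sri pair.
 *
 * .Lpermute is the byte ramp -64..127; indexing it at (byte count % 64) and
 * at that offset plus 64 yields the TBL/TBX index vectors used by the tail
 * handling code above.
 */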