/*
 * ChaCha/XChaCha NEON helper functions
 *
 * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>

	.text
	.align		6

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is stored in the four NEON
 * registers v0-v3. It performs matrix operations on four words in parallel,
 * but requires shuffling to rearrange the words after each round.
 *
 * The round count is given in w3.
 *
 * Clobbers: w3, x10, v4, v12
 */
chacha_permute:

	adr		x10, ROT8
	ld1		{v12.4s}, [x10]

.Ldoubleround:
	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	rev32		v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #12
	sri		v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	tbl		v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #7
	sri		v1.4s, v4.4s, #25

	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	ext		v1.16b, v1.16b, v1.16b, #4
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext		v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	ext		v3.16b, v3.16b, v3.16b, #12

	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	rev32		v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #12
	sri		v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	tbl		v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #7
	sri		v1.4s, v4.4s, #25

	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	ext		v1.16b, v1.16b, v1.16b, #12
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext		v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	ext		v3.16b, v3.16b, v3.16b, #4

	subs		w3, w3, #2
	b.ne		.Ldoubleround

	ret
ENDPROC(chacha_permute)
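
/*
 * For reference: one pass of .Ldoubleround above computes the same
 * permutation as this scalar C sketch of the ChaCha quarter round,
 * applied first to the four columns and then to the four diagonals of
 * the 4x4 state (rol32_ref and chacha_qround_ref are illustrative names,
 * not part of this file):
 *
 *	static inline u32 rol32_ref(u32 v, int n)
 *	{
 *		return (v << n) | (v >> (32 - n));
 *	}
 *
 *	static void chacha_qround_ref(u32 *a, u32 *b, u32 *c, u32 *d)
 *	{
 *		*a += *b; *d = rol32_ref(*d ^ *a, 16);
 *		*c += *d; *b = rol32_ref(*b ^ *c, 12);
 *		*a += *b; *d = rol32_ref(*d ^ *a, 8);
 *		*c += *d; *b = rol32_ref(*b ^ *c, 7);
 *	}
 *
 * Each NEON register holds one row of the state, so a single quarter
 * round operates on all four columns at once; the ext instructions
 * rotate rows 1-3 so that the same column code then covers the diagonals.
 */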

ENTRY(chacha_block_xor_neon)
	// x0: Input state matrix, s
	// x1: 1 data block output, o
	// x2: 1 data block input, i
	// w3: nrounds

	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	// x0..3 = s0..3
	ld1		{v0.4s-v3.4s}, [x0]
	ld1		{v8.4s-v11.4s}, [x0]

	bl		chacha_permute

	ld1		{v4.16b-v7.16b}, [x2]

	// o0 = i0 ^ (x0 + s0)
	add		v0.4s, v0.4s, v8.4s
	eor		v0.16b, v0.16b, v4.16b

	// o1 = i1 ^ (x1 + s1)
	add		v1.4s, v1.4s, v9.4s
	eor		v1.16b, v1.16b, v5.16b

	// o2 = i2 ^ (x2 + s2)
	add		v2.4s, v2.4s, v10.4s
	eor		v2.16b, v2.16b, v6.16b

	// o3 = i3 ^ (x3 + s3)
	add		v3.4s, v3.4s, v11.4s
	eor		v3.16b, v3.16b, v7.16b

	st1		{v0.16b-v3.16b}, [x1]

	ldp		x29, x30, [sp], #16
	ret
ENDPROC(chacha_block_xor_neon)

ENTRY(hchacha_block_neon)
	// x0: Input state matrix, s
	// x1: output (8 32-bit words)
	// w2: nrounds

	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{v0.4s-v3.4s}, [x0]

	mov		w3, w2
	bl		chacha_permute

	st1		{v0.16b}, [x1], #16
	st1		{v3.16b}, [x1]

	ldp		x29, x30, [sp], #16
	ret
ENDPROC(hchacha_block_neon)
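
/*
 * Note on hchacha_block_neon: HChaCha returns only words 0-3 and 12-15 of
 * the permuted state and omits the final feed-forward addition, which is
 * why only v0 and v3 are stored above. XChaCha glue code can use it to
 * derive a 256-bit subkey; a rough caller-side sketch, where
 * chacha_init_ref is a hypothetical helper that fills the state with the
 * RFC 7539 constants, the key, and the first 128 bits of the 192-bit
 * XChaCha nonce:
 *
 *	u32 state[16], subkey[8];
 *
 *	chacha_init_ref(state, key, xnonce);
 *	hchacha_block_neon(state, subkey, 20);
 *	// subkey plus the remaining 64 nonce bits then key a regular ChaCha
 */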

	.align		6
ENTRY(chacha_4block_xor_neon)
	// x0: Input state matrix, s
	// x1: 4 data blocks output, o
	// x2: 4 data blocks input, i
	// w3: nrounds

	//
	// This function encrypts four consecutive ChaCha blocks by loading
	// the state matrix in NEON registers four times. The algorithm performs
	// each operation on the corresponding word of each state matrix, hence
	// requires no word shuffling. For the final XOR step we transpose the
	// matrix by interleaving 32- and then 64-bit words, which allows us to
	// do XOR in NEON registers.
	//
	adr		x9, CTRINC		// ... and ROT8
	ld1		{v30.4s-v31.4s}, [x9]

	// x0..15[0-3] = s0..3[0-3]
	mov		x4, x0
	ld4r		{ v0.4s- v3.4s}, [x4], #16
	ld4r		{ v4.4s- v7.4s}, [x4], #16
	ld4r		{ v8.4s-v11.4s}, [x4], #16
	ld4r		{v12.4s-v15.4s}, [x4]

	// x12 += counter values 0-3
	add		v12.4s, v12.4s, v30.4s

.Ldoubleround4:
	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	add		v0.4s, v0.4s, v4.4s
	add		v1.4s, v1.4s, v5.4s
	add		v2.4s, v2.4s, v6.4s
	add		v3.4s, v3.4s, v7.4s

	eor		v12.16b, v12.16b, v0.16b
	eor		v13.16b, v13.16b, v1.16b
	eor		v14.16b, v14.16b, v2.16b
	eor		v15.16b, v15.16b, v3.16b

	rev32		v12.8h, v12.8h
	rev32		v13.8h, v13.8h
	rev32		v14.8h, v14.8h
	rev32		v15.8h, v15.8h

	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	add		v8.4s, v8.4s, v12.4s
	add		v9.4s, v9.4s, v13.4s
	add		v10.4s, v10.4s, v14.4s
	add		v11.4s, v11.4s, v15.4s

	eor		v16.16b, v4.16b, v8.16b
	eor		v17.16b, v5.16b, v9.16b
	eor		v18.16b, v6.16b, v10.16b
	eor		v19.16b, v7.16b, v11.16b

	shl		v4.4s, v16.4s, #12
	shl		v5.4s, v17.4s, #12
	shl		v6.4s, v18.4s, #12
	shl		v7.4s, v19.4s, #12

	sri		v4.4s, v16.4s, #20
	sri		v5.4s, v17.4s, #20
	sri		v6.4s, v18.4s, #20
	sri		v7.4s, v19.4s, #20

	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	add		v0.4s, v0.4s, v4.4s
	add		v1.4s, v1.4s, v5.4s
	add		v2.4s, v2.4s, v6.4s
	add		v3.4s, v3.4s, v7.4s

	eor		v12.16b, v12.16b, v0.16b
	eor		v13.16b, v13.16b, v1.16b
	eor		v14.16b, v14.16b, v2.16b
	eor		v15.16b, v15.16b, v3.16b

	tbl		v12.16b, {v12.16b}, v31.16b
	tbl		v13.16b, {v13.16b}, v31.16b
	tbl		v14.16b, {v14.16b}, v31.16b
	tbl		v15.16b, {v15.16b}, v31.16b

	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	add		v8.4s, v8.4s, v12.4s
	add		v9.4s, v9.4s, v13.4s
	add		v10.4s, v10.4s, v14.4s
	add		v11.4s, v11.4s, v15.4s

	eor		v16.16b, v4.16b, v8.16b
	eor		v17.16b, v5.16b, v9.16b
	eor		v18.16b, v6.16b, v10.16b
	eor		v19.16b, v7.16b, v11.16b

	shl		v4.4s, v16.4s, #7
	shl		v5.4s, v17.4s, #7
	shl		v6.4s, v18.4s, #7
	shl		v7.4s, v19.4s, #7

	sri		v4.4s, v16.4s, #25
	sri		v5.4s, v17.4s, #25
	sri		v6.4s, v18.4s, #25
	sri		v7.4s, v19.4s, #25

	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	add		v0.4s, v0.4s, v5.4s
	add		v1.4s, v1.4s, v6.4s
	add		v2.4s, v2.4s, v7.4s
	add		v3.4s, v3.4s, v4.4s

	eor		v15.16b, v15.16b, v0.16b
	eor		v12.16b, v12.16b, v1.16b
	eor		v13.16b, v13.16b, v2.16b
	eor		v14.16b, v14.16b, v3.16b

	rev32		v15.8h, v15.8h
	rev32		v12.8h, v12.8h
	rev32		v13.8h, v13.8h
	rev32		v14.8h, v14.8h

	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	add		v10.4s, v10.4s, v15.4s
	add		v11.4s, v11.4s, v12.4s
	add		v8.4s, v8.4s, v13.4s
	add		v9.4s, v9.4s, v14.4s

	eor		v16.16b, v5.16b, v10.16b
	eor		v17.16b, v6.16b, v11.16b
	eor		v18.16b, v7.16b, v8.16b
	eor		v19.16b, v4.16b, v9.16b

	shl		v5.4s, v16.4s, #12
	shl		v6.4s, v17.4s, #12
	shl		v7.4s, v18.4s, #12
	shl		v4.4s, v19.4s, #12

	sri		v5.4s, v16.4s, #20
	sri		v6.4s, v17.4s, #20
	sri		v7.4s, v18.4s, #20
	sri		v4.4s, v19.4s, #20

	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	add		v0.4s, v0.4s, v5.4s
	add		v1.4s, v1.4s, v6.4s
	add		v2.4s, v2.4s, v7.4s
	add		v3.4s, v3.4s, v4.4s

	eor		v15.16b, v15.16b, v0.16b
	eor		v12.16b, v12.16b, v1.16b
	eor		v13.16b, v13.16b, v2.16b
	eor		v14.16b, v14.16b, v3.16b

	tbl		v15.16b, {v15.16b}, v31.16b
	tbl		v12.16b, {v12.16b}, v31.16b
	tbl		v13.16b, {v13.16b}, v31.16b
	tbl		v14.16b, {v14.16b}, v31.16b

	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	add		v10.4s, v10.4s, v15.4s
	add		v11.4s, v11.4s, v12.4s
	add		v8.4s, v8.4s, v13.4s
	add		v9.4s, v9.4s, v14.4s

	eor		v16.16b, v5.16b, v10.16b
	eor		v17.16b, v6.16b, v11.16b
	eor		v18.16b, v7.16b, v8.16b
	eor		v19.16b, v4.16b, v9.16b

	shl		v5.4s, v16.4s, #7
	shl		v6.4s, v17.4s, #7
	shl		v7.4s, v18.4s, #7
	shl		v4.4s, v19.4s, #7

	sri		v5.4s, v16.4s, #25
	sri		v6.4s, v17.4s, #25
	sri		v7.4s, v18.4s, #25
	sri		v4.4s, v19.4s, #25

	subs		w3, w3, #2
	b.ne		.Ldoubleround4

	ld4r		{v16.4s-v19.4s}, [x0], #16
	ld4r		{v20.4s-v23.4s}, [x0], #16

	// x12 += counter values 0-3
	add		v12.4s, v12.4s, v30.4s

	// x0[0-3] += s0[0]
	// x1[0-3] += s0[1]
	// x2[0-3] += s0[2]
	// x3[0-3] += s0[3]
	add		v0.4s, v0.4s, v16.4s
	add		v1.4s, v1.4s, v17.4s
	add		v2.4s, v2.4s, v18.4s
	add		v3.4s, v3.4s, v19.4s

	ld4r		{v24.4s-v27.4s}, [x0], #16
	ld4r		{v28.4s-v31.4s}, [x0]

	// x4[0-3] += s1[0]
	// x5[0-3] += s1[1]
	// x6[0-3] += s1[2]
	// x7[0-3] += s1[3]
	add		v4.4s, v4.4s, v20.4s
	add		v5.4s, v5.4s, v21.4s
	add		v6.4s, v6.4s, v22.4s
	add		v7.4s, v7.4s, v23.4s

	// x8[0-3] += s2[0]
	// x9[0-3] += s2[1]
	// x10[0-3] += s2[2]
	// x11[0-3] += s2[3]
	add		v8.4s, v8.4s, v24.4s
	add		v9.4s, v9.4s, v25.4s
	add		v10.4s, v10.4s, v26.4s
	add		v11.4s, v11.4s, v27.4s

	// x12[0-3] += s3[0]
	// x13[0-3] += s3[1]
	// x14[0-3] += s3[2]
	// x15[0-3] += s3[3]
	add		v12.4s, v12.4s, v28.4s
	add		v13.4s, v13.4s, v29.4s
	add		v14.4s, v14.4s, v30.4s
	add		v15.4s, v15.4s, v31.4s

	// interleave 32-bit words in state n, n+1
	zip1		v16.4s, v0.4s, v1.4s
	zip2		v17.4s, v0.4s, v1.4s
	zip1		v18.4s, v2.4s, v3.4s
	zip2		v19.4s, v2.4s, v3.4s
	zip1		v20.4s, v4.4s, v5.4s
	zip2		v21.4s, v4.4s, v5.4s
	zip1		v22.4s, v6.4s, v7.4s
	zip2		v23.4s, v6.4s, v7.4s
	zip1		v24.4s, v8.4s, v9.4s
	zip2		v25.4s, v8.4s, v9.4s
	zip1		v26.4s, v10.4s, v11.4s
	zip2		v27.4s, v10.4s, v11.4s
	zip1		v28.4s, v12.4s, v13.4s
	zip2		v29.4s, v12.4s, v13.4s
	zip1		v30.4s, v14.4s, v15.4s
	zip2		v31.4s, v14.4s, v15.4s

	// interleave 64-bit words in state n, n+2
	zip1		v0.2d, v16.2d, v18.2d
	zip2		v4.2d, v16.2d, v18.2d
	zip1		v8.2d, v17.2d, v19.2d
	zip2		v12.2d, v17.2d, v19.2d
	ld1		{v16.16b-v19.16b}, [x2], #64

	zip1		v1.2d, v20.2d, v22.2d
	zip2		v5.2d, v20.2d, v22.2d
	zip1		v9.2d, v21.2d, v23.2d
	zip2		v13.2d, v21.2d, v23.2d
	ld1		{v20.16b-v23.16b}, [x2], #64

	zip1		v2.2d, v24.2d, v26.2d
	zip2		v6.2d, v24.2d, v26.2d
	zip1		v10.2d, v25.2d, v27.2d
	zip2		v14.2d, v25.2d, v27.2d
	ld1		{v24.16b-v27.16b}, [x2], #64

	zip1		v3.2d, v28.2d, v30.2d
	zip2		v7.2d, v28.2d, v30.2d
	zip1		v11.2d, v29.2d, v31.2d
	zip2		v15.2d, v29.2d, v31.2d
	ld1		{v28.16b-v31.16b}, [x2]

	// xor with corresponding input, write to output
	eor		v16.16b, v16.16b, v0.16b
	eor		v17.16b, v17.16b, v1.16b
	eor		v18.16b, v18.16b, v2.16b
	eor		v19.16b, v19.16b, v3.16b
	eor		v20.16b, v20.16b, v4.16b
	eor		v21.16b, v21.16b, v5.16b
	st1		{v16.16b-v19.16b}, [x1], #64
	eor		v22.16b, v22.16b, v6.16b
	eor		v23.16b, v23.16b, v7.16b
	eor		v24.16b, v24.16b, v8.16b
	eor		v25.16b, v25.16b, v9.16b
	st1		{v20.16b-v23.16b}, [x1], #64
	eor		v26.16b, v26.16b, v10.16b
	eor		v27.16b, v27.16b, v11.16b
	eor		v28.16b, v28.16b, v12.16b
	st1		{v24.16b-v27.16b}, [x1], #64
	eor		v29.16b, v29.16b, v13.16b
	eor		v30.16b, v30.16b, v14.16b
	eor		v31.16b, v31.16b, v15.16b
	st1		{v28.16b-v31.16b}, [x1]

	ret
ENDPROC(chacha_4block_xor_neon)
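
/*
 * CTRINC supplies the per-block counter increments 0-3 that are added to
 * the replicated counter word x12 in chacha_4block_xor_neon. ROT8 is the
 * tbl index vector implementing rotl32(x, 8): for each little-endian
 * 32-bit lane with bytes [b0, b1, b2, b3] it selects [b3, b0, b1, b2].
 * For instance, the first word 0x02010003 is the byte sequence
 * 03 00 01 02 in memory, i.e. source indices 3, 0, 1, 2 for destination
 * bytes 0-3 of the first lane.
 */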
CTRINC:	.word		0, 1, 2, 3
ROT8:	.word		0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
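
/*
 * These routines clobber the NEON register file, so kernel callers must
 * bracket them with kernel_neon_begin()/kernel_neon_end(). A hedged sketch
 * of the C-side declarations glue code would pair with this file (the
 * prototypes shown are assumptions for illustration, not exported here):
 *
 *	asmlinkage void chacha_block_xor_neon(u32 *state, u8 *dst,
 *					      const u8 *src, int nrounds);
 *	asmlinkage void chacha_4block_xor_neon(u32 *state, u8 *dst,
 *					       const u8 *src, int nrounds);
 *	asmlinkage void hchacha_block_neon(const u32 *state, u32 *out,
 *					   int nrounds);
 *
 *	kernel_neon_begin();
 *	chacha_4block_xor_neon(state, dst, src, 20);	// 4 x 64 bytes
 *	kernel_neon_end();
 *
 * Note that the assembly does not advance the block counter in memory;
 * the caller is expected to update state[12] itself.
 */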