/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2018 Google, Inc.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/*
 * Design notes:
 *
 * 16 registers would be needed to hold the state matrix, but only 14 are
 * available because 'sp' and 'pc' cannot be used. So we spill the elements
 * (x8, x9) to the stack and swap them out with (x10, x11). This adds one
 * 'ldrd' and one 'strd' instruction per round.
 *
 * All rotates are performed using the implicit rotate operand accepted by the
 * 'add' and 'eor' instructions. This is faster than using explicit rotate
 * instructions. To make this work, we allow the values in the second and last
 * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
 * wrong rotation amount. The rotation amount is then fixed up just in time
 * when the values are used. 'brot' is the number of bits the values in row 'b'
 * need to be rotated right to arrive at the correct values, and 'drot'
 * similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such
 * that they end up as (25, 24) after every round.
 */

	// ChaCha state registers
	X0	.req	r0
	X1	.req	r1
	X2	.req	r2
	X3	.req	r3
	X4	.req	r4
	X5	.req	r5
	X6	.req	r6
	X7	.req	r7
	X8_X10	.req	r8	// shared by x8 and x10
	X9_X11	.req	r9	// shared by x9 and x11
	X12	.req	r10
	X13	.req	r11
	X14	.req	r12
	X15	.req	r14

.macro __rev		out, in,  t0, t1, t2
.if __LINUX_ARM_ARCH__ >= 6
	rev		\out, \in
.else
	lsl		\t0, \in, #24
	and		\t1, \in, #0xff00
	and		\t2, \in, #0xff0000
	orr		\out, \t0, \in, lsr #24
	orr		\out, \out, \t1, lsl #8
	orr		\out, \out, \t2, lsr #8
.endif
.endm

.macro _le32_bswap	x, t0, t1, t2
#ifdef __ARMEB__
	__rev		\x, \x,  \t0, \t1, \t2
#endif
.endm

.macro _le32_bswap_4x	a, b, c, d,  t0, t1, t2
	_le32_bswap	\a,  \t0, \t1, \t2
	_le32_bswap	\b,  \t0, \t1, \t2
	_le32_bswap	\c,  \t0, \t1, \t2
	_le32_bswap	\d,  \t0, \t1, \t2
.endm

.macro __ldrd		a, b, src, offset
#if __LINUX_ARM_ARCH__ >= 6
	ldrd		\a, \b, [\src, #\offset]
#else
	ldr		\a, [\src, #\offset]
	ldr		\b, [\src, #\offset + 4]
#endif
.endm

.macro __strd		a, b, dst, offset
#if __LINUX_ARM_ARCH__ >= 6
	strd		\a, \b, [\dst, #\offset]
#else
	str		\a, [\dst, #\offset]
	str		\b, [\dst, #\offset + 4]
#endif
.endm

.macro _halfround	a1, b1, c1, d1,  a2, b2, c2, d2

	// a += b; d ^= a; d = rol(d, 16);
	add		\a1, \a1, \b1, ror #brot
	add		\a2, \a2, \b2, ror #brot
	eor		\d1, \a1, \d1, ror #drot
	eor		\d2, \a2, \d2, ror #drot
	// drot == 32 - 16 == 16

	// c += d; b ^= c; b = rol(b, 12);
	add		\c1, \c1, \d1, ror #16
	add		\c2, \c2, \d2, ror #16
	eor		\b1, \c1, \b1, ror #brot
	eor		\b2, \c2, \b2, ror #brot
	// brot == 32 - 12 == 20

	// a += b; d ^= a; d = rol(d, 8);
	add		\a1, \a1, \b1, ror #20
	add		\a2, \a2, \b2, ror #20
	eor		\d1, \a1, \d1, ror #16
	eor		\d2, \a2, \d2, ror #16
	// drot == 32 - 8 == 24

	// c += d; b ^= c; b = rol(b, 7);
	add		\c1, \c1, \d1, ror #24
	add		\c2, \c2, \d2, ror #24
	eor		\b1, \c1, \b1, ror #20
	eor		\b2, \c2, \b2, ror #20
	// brot == 32 - 7 == 25
.endm

.macro _doubleround

	// column round

	// quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
	_halfround	X0, X4, X8_X10, X12,  X1, X5, X9_X11, X13

	// save (x8, x9); restore (x10, x11)
	__strd		X8_X10, X9_X11, sp, 0
	__ldrd		X8_X10, X9_X11, sp, 8

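	// X8_X10 and X9_X11 now hold x10 and x11 for the second pair of
	// column quarterrounds.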
	// quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
	_halfround	X2, X6, X8_X10, X14,  X3, X7, X9_X11, X15

	.set brot, 25
	.set drot, 24

	// diagonal round

	// quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
	_halfround	X0, X5, X8_X10, X15,  X1, X6, X9_X11, X12

	// save (x10, x11); restore (x8, x9)
	__strd		X8_X10, X9_X11, sp, 8
	__ldrd		X8_X10, X9_X11, sp, 0

	// quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
	_halfround	X2, X7, X8_X10, X13,  X3, X4, X9_X11, X14
.endm

.macro _chacha_permute	nrounds
	.set brot, 0
	.set drot, 0
	.rept \nrounds / 2
	_doubleround
	.endr
.endm

.macro _chacha		nrounds

.Lnext_block\@:
	// Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
	// Registers contain x0-x9,x12-x15.

	// Do the core ChaCha permutation to update x0-x15.
	_chacha_permute	\nrounds

	add		sp, #8
	// Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers contain x0-x9,x12-x15.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
	push		{X8_X10, X9_X11, X12, X13, X14, X15}

	// Load (OUT, IN, LEN).
	ldr		r14, [sp, #96]
	ldr		r12, [sp, #100]
	ldr		r11, [sp, #104]

	orr		r10, r14, r12

	// Use slow path if fewer than 64 bytes remain.
	cmp		r11, #64
	blt		.Lxor_slowpath\@

	// Use slow path if IN and/or OUT isn't 4-byte aligned. Needed even on
	// ARMv6+, since ldmia and stmia (used below) still require alignment.
	tst		r10, #3
	bne		.Lxor_slowpath\@

	// Fast path: XOR 64 bytes of aligned data.

	// Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
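	//
	// Each group of four words below follows the same pattern: reload the
	// original state words from the stack, add them to the permuted words
	// (applying any pending 'brot'/'drot' rotation), byte-swap on
	// big-endian, then XOR with the input and store to the output.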

	// x0-x3
	__ldrd		r8, r9, sp, 32
	__ldrd		r10, r11, sp, 40
	add		X0, X0, r8
	add		X1, X1, r9
	add		X2, X2, r10
	add		X3, X3, r11
	_le32_bswap_4x	X0, X1, X2, X3,  r8, r9, r10
	ldmia		r12!, {r8-r11}
	eor		X0, X0, r8
	eor		X1, X1, r9
	eor		X2, X2, r10
	eor		X3, X3, r11
	stmia		r14!, {X0-X3}

	// x4-x7
	__ldrd		r8, r9, sp, 48
	__ldrd		r10, r11, sp, 56
	add		X4, r8, X4, ror #brot
	add		X5, r9, X5, ror #brot
	ldmia		r12!, {X0-X3}
	add		X6, r10, X6, ror #brot
	add		X7, r11, X7, ror #brot
	_le32_bswap_4x	X4, X5, X6, X7,  r8, r9, r10
	eor		X4, X4, X0
	eor		X5, X5, X1
	eor		X6, X6, X2
	eor		X7, X7, X3
	stmia		r14!, {X4-X7}

	// x8-x15
	pop		{r0-r7}			// (x8-x9,x12-x15,x10-x11)
	__ldrd		r8, r9, sp, 32
	__ldrd		r10, r11, sp, 40
	add		r0, r0, r8		// x8
	add		r1, r1, r9		// x9
	add		r6, r6, r10		// x10
	add		r7, r7, r11		// x11
	_le32_bswap_4x	r0, r1, r6, r7,  r8, r9, r10
	ldmia		r12!, {r8-r11}
	eor		r0, r0, r8		// x8
	eor		r1, r1, r9		// x9
	eor		r6, r6, r10		// x10
	eor		r7, r7, r11		// x11
	stmia		r14!, {r0,r1,r6,r7}
	ldmia		r12!, {r0,r1,r6,r7}
	__ldrd		r8, r9, sp, 48
	__ldrd		r10, r11, sp, 56
	add		r2, r8, r2, ror #drot	// x12
	add		r3, r9, r3, ror #drot	// x13
	add		r4, r10, r4, ror #drot	// x14
	add		r5, r11, r5, ror #drot	// x15
	_le32_bswap_4x	r2, r3, r4, r5,  r9, r10, r11
	ldr		r9, [sp, #72]		// load LEN
	eor		r2, r2, r0		// x12
	eor		r3, r3, r1		// x13
	eor		r4, r4, r6		// x14
	eor		r5, r5, r7		// x15
	subs		r9, #64			// decrement and check LEN
	stmia		r14!, {r2-r5}

	beq		.Ldone\@

.Lprepare_for_next_block\@:

	// Stack: x0-x15 OUT IN LEN

	// Increment block counter (x12)
	add		r8, #1

	// Store updated (OUT, IN, LEN)
	str		r14, [sp, #64]
	str		r12, [sp, #68]
	str		r9, [sp, #72]

	mov		r14, sp

	// Store updated block counter (x12)
	str		r8, [sp, #48]

	sub		sp, #16

	// Reload state and do next block
	ldmia		r14!, {r0-r11}		// load x0-x11
	__strd		r10, r11, sp, 8		// store x10-x11 before state
	ldmia		r14, {r10-r12,r14}	// load x12-x15
	b		.Lnext_block\@

.Lxor_slowpath\@:
	// Slow path: < 64 bytes remaining, or unaligned input or output buffer.
	// We handle it by storing the 64 bytes of keystream to the stack, then
	// XOR-ing the needed portion with the data.

	// Allocate keystream buffer
	sub		sp, #64
	mov		r14, sp

	// Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
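	//
	// Write the full 64-byte keystream block to the buffer at r14; the
	// needed portion of it is XORed with the data further below.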

	// Save keystream for x0-x3
	__ldrd		r8, r9, sp, 96
	__ldrd		r10, r11, sp, 104
	add		X0, X0, r8
	add		X1, X1, r9
	add		X2, X2, r10
	add		X3, X3, r11
	_le32_bswap_4x	X0, X1, X2, X3,  r8, r9, r10
	stmia		r14!, {X0-X3}

	// Save keystream for x4-x7
	__ldrd		r8, r9, sp, 112
	__ldrd		r10, r11, sp, 120
	add		X4, r8, X4, ror #brot
	add		X5, r9, X5, ror #brot
	add		X6, r10, X6, ror #brot
	add		X7, r11, X7, ror #brot
	_le32_bswap_4x	X4, X5, X6, X7,  r8, r9, r10
	add		r8, sp, #64
	stmia		r14!, {X4-X7}

	// Save keystream for x8-x15
	ldm		r8, {r0-r7}		// (x8-x9,x12-x15,x10-x11)
	__ldrd		r8, r9, sp, 128
	__ldrd		r10, r11, sp, 136
	add		r0, r0, r8		// x8
	add		r1, r1, r9		// x9
	add		r6, r6, r10		// x10
	add		r7, r7, r11		// x11
	_le32_bswap_4x	r0, r1, r6, r7,  r8, r9, r10
	stmia		r14!, {r0,r1,r6,r7}
	__ldrd		r8, r9, sp, 144
	__ldrd		r10, r11, sp, 152
	add		r2, r8, r2, ror #drot	// x12
	add		r3, r9, r3, ror #drot	// x13
	add		r4, r10, r4, ror #drot	// x14
	add		r5, r11, r5, ror #drot	// x15
	_le32_bswap_4x	r2, r3, r4, r5,  r9, r10, r11
	stmia		r14, {r2-r5}

	// Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
	// Registers: r8 is block counter, r12 is IN.

	ldr		r9, [sp, #168]		// LEN
	ldr		r14, [sp, #160]		// OUT
	cmp		r9, #64
	mov		r0, sp
	movle		r1, r9
	movgt		r1, #64
	// r1 is number of bytes to XOR, in range [1, 64]

.if __LINUX_ARM_ARCH__ < 6
	orr		r2, r12, r14
	tst		r2, #3			// IN or OUT misaligned?
	bne		.Lxor_next_byte\@
.endif

	// XOR a word at a time
.rept 16
	subs		r1, #4
	blt		.Lxor_words_done\@
	ldr		r2, [r12], #4
	ldr		r3, [r0], #4
	eor		r2, r2, r3
	str		r2, [r14], #4
.endr
	b		.Lxor_slowpath_done\@
.Lxor_words_done\@:
	ands		r1, r1, #3
	beq		.Lxor_slowpath_done\@

	// XOR a byte at a time
.Lxor_next_byte\@:
	ldrb		r2, [r12], #1
	ldrb		r3, [r0], #1
	eor		r2, r2, r3
	strb		r2, [r14], #1
	subs		r1, #1
	bne		.Lxor_next_byte\@

.Lxor_slowpath_done\@:
	subs		r9, #64
	add		sp, #96
	bgt		.Lprepare_for_next_block\@

.Ldone\@:
.endm	// _chacha

/*
 * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
 *		     const u32 *state, int nrounds);
 */
ENTRY(chacha_doarm)
	cmp		r2, #0			// len == 0?
	reteq		lr

	ldr		ip, [sp]
	cmp		ip, #12

	push		{r0-r2,r4-r11,lr}

	// Push state x0-x15 onto stack.
	// Also store an extra copy of x10-x11 just before the state.

	add		X12, r3, #48
	ldm		X12, {X12,X13,X14,X15}
	push		{X12,X13,X14,X15}
	sub		sp, sp, #64

	__ldrd		X8_X10, X9_X11, r3, 40
	__strd		X8_X10, X9_X11, sp, 8
	__strd		X8_X10, X9_X11, sp, 56
	ldm		r3, {X0-X9_X11}
	__strd		X0, X1, sp, 16
	__strd		X2, X3, sp, 24
	__strd		X4, X5, sp, 32
	__strd		X6, X7, sp, 40
	__strd		X8_X10, X9_X11, sp, 48

	beq		1f
	_chacha		20

0:	add		sp, #76
	pop		{r4-r11, pc}

1:	_chacha		12
	b		0b
ENDPROC(chacha_doarm)

/*
 * void hchacha_block_arm(const u32 state[16], u32 out[8], int nrounds);
 */
ENTRY(hchacha_block_arm)
	push		{r1,r4-r11,lr}

	cmp		r2, #12			// ChaCha12 ?
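	// Load x0-x15 into registers, spilling x10-x11 to the stack and
	// reserving 8 bytes below them for the x8-x9 spill slot that
	// _chacha_permute (via _doubleround) expects at sp.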

	mov		r14, r0
	ldmia		r14!, {r0-r11}		// load x0-x11
	push		{r10-r11}		// store x10-x11 to stack
	ldm		r14, {r10-r12,r14}	// load x12-x15
	sub		sp, #8

	beq		1f
	_chacha_permute	20

	// Skip over (unused0-unused1, x10-x11)
0:	add		sp, #16

	// Fix up rotations of x12-x15
	ror		X12, X12, #drot
	ror		X13, X13, #drot
	pop		{r4}			// load 'out'
	ror		X14, X14, #drot
	ror		X15, X15, #drot

	// Store (x0-x3,x12-x15) to 'out'
	stm		r4, {X0,X1,X2,X3,X12,X13,X14,X15}

	pop		{r4-r11,pc}

1:	_chacha_permute	12
	b		0b
ENDPROC(hchacha_block_arm)