/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2018 Google, Inc.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/*
 * Design notes:
 *
 * 16 registers would be needed to hold the state matrix, but only 14 are
 * available because 'sp' and 'pc' cannot be used.  So we spill the elements
 * (x8, x9) to the stack and swap them out with (x10, x11).  This adds one
 * 'ldrd' and one 'strd' instruction per round.
 *
 * All rotates are performed using the implicit rotate operand accepted by the
 * 'add' and 'eor' instructions.  This is faster than using explicit rotate
 * instructions.  To make this work, we allow the values in the second and last
 * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
 * wrong rotation amount.  The rotation amount is then fixed up just in time
 * when the values are used.  'brot' is the number of bits the values in row 'b'
 * need to be rotated right to arrive at the correct values, and 'drot'
 * similarly for row 'd'.  (brot, drot) start out as (0, 0) but we make it such
 * that they end up as (25, 24) after every round.
 */

	// ChaCha state registers
	X0	.req	r0
	X1	.req	r1
	X2	.req	r2
	X3	.req	r3
	X4	.req	r4
	X5	.req	r5
	X6	.req	r6
	X7	.req	r7
	X8_X10	.req	r8	// shared by x8 and x10
	X9_X11	.req	r9	// shared by x9 and x11
	X12	.req	r10
	X13	.req	r11
	X14	.req	r12
	X15	.req	r14

.macro _le32_bswap_4x	a, b, c, d,  tmp
#ifdef __ARMEB__
	rev_l		\a,  \tmp
	rev_l		\b,  \tmp
	rev_l		\c,  \tmp
	rev_l		\d,  \tmp
#endif
.endm

.macro __ldrd		a, b, src, offset
#if __LINUX_ARM_ARCH__ >= 6
	ldrd		\a, \b, [\src, #\offset]
#else
	ldr		\a, [\src, #\offset]
	ldr		\b, [\src, #\offset + 4]
#endif
.endm

.macro __strd		a, b, dst, offset
#if __LINUX_ARM_ARCH__ >= 6
	strd		\a, \b, [\dst, #\offset]
#else
	str		\a, [\dst, #\offset]
	str		\b, [\dst, #\offset + 4]
#endif
.endm
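
/*
 * Illustration of the implicit-rotate trick (a sketch for exposition only,
 * not assembled): suppose 'd' is currently stored rotated right by 'drot'.
 * A naive "d ^= a; d = rol(d, 16)" would first have to materialize the true
 * value:
 *
 *	ror	d, d, #drot		// recover the true value of d
 *	eor	d, d, a			// d ^= a
 *	ror	d, d, #16		// d = rol(d, 16)
 *
 * Folding the fixup into the eor and deferring the rol leaves a single
 * instruction:
 *
 *	eor	d, a, d, ror #drot	// d = a ^ ror(d, drot)
 *
 * after which d is simply considered to be stored rotated right by 16, i.e.
 * drot becomes 16 until the next use fixes it up again.
 */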

.macro _halfround	a1, b1, c1, d1,  a2, b2, c2, d2

	// a += b; d ^= a; d = rol(d, 16);
	add		\a1, \a1, \b1, ror #brot
	add		\a2, \a2, \b2, ror #brot
	eor		\d1, \a1, \d1, ror #drot
	eor		\d2, \a2, \d2, ror #drot
	// drot == 32 - 16 == 16

	// c += d; b ^= c; b = rol(b, 12);
	add		\c1, \c1, \d1, ror #16
	add		\c2, \c2, \d2, ror #16
	eor		\b1, \c1, \b1, ror #brot
	eor		\b2, \c2, \b2, ror #brot
	// brot == 32 - 12 == 20

	// a += b; d ^= a; d = rol(d, 8);
	add		\a1, \a1, \b1, ror #20
	add		\a2, \a2, \b2, ror #20
	eor		\d1, \a1, \d1, ror #16
	eor		\d2, \a2, \d2, ror #16
	// drot == 32 - 8 == 24

	// c += d; b ^= c; b = rol(b, 7);
	add		\c1, \c1, \d1, ror #24
	add		\c2, \c2, \d2, ror #24
	eor		\b1, \c1, \b1, ror #20
	eor		\b2, \c2, \b2, ror #20
	// brot == 32 - 7 == 25
.endm

.macro _doubleround

	// column round

	// quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
	_halfround	X0, X4, X8_X10, X12,  X1, X5, X9_X11, X13

	// save (x8, x9); restore (x10, x11)
	__strd		X8_X10, X9_X11, sp, 0
	__ldrd		X8_X10, X9_X11, sp, 8

	// quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
	_halfround	X2, X6, X8_X10, X14,  X3, X7, X9_X11, X15

	.set brot, 25
	.set drot, 24

	// diagonal round

	// quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
	_halfround	X0, X5, X8_X10, X15,  X1, X6, X9_X11, X12

	// save (x10, x11); restore (x8, x9)
	__strd		X8_X10, X9_X11, sp, 8
	__ldrd		X8_X10, X9_X11, sp, 0

	// quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
	_halfround	X2, X7, X8_X10, X13,  X3, X4, X9_X11, X14
.endm

.macro _chacha_permute	nrounds
	.set brot, 0
	.set drot, 0
	.rept \nrounds / 2
	_doubleround
	.endr
.endm
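
/*
 * For reference, a sketch of how the register assignment above maps onto the
 * usual 4x4 view of the ChaCha state (x8-x11 share two registers via the
 * stack spill):
 *
 *	x0  x1  x2  x3		row 'a'
 *	x4  x5  x6  x7		row 'b'	(kept rotated right by brot)
 *	x8  x9  x10 x11		row 'c'
 *	x12 x13 x14 x15		row 'd'	(kept rotated right by drot)
 *
 * A column round runs the quarterround on each column, and a diagonal round
 * on each left-to-right diagonal, exactly as the _halfround pairs in
 * _doubleround do.
 */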

.macro _chacha		nrounds

.Lnext_block\@:
	// Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
	// Registers contain x0-x9,x12-x15.

	// Do the core ChaCha permutation to update x0-x15.
	_chacha_permute	\nrounds

	add		sp, #8
	// Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers contain x0-x9,x12-x15.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
	push		{X8_X10, X9_X11, X12, X13, X14, X15}

	// Load (OUT, IN, LEN).
	ldr		r14, [sp, #96]
	ldr		r12, [sp, #100]
	ldr		r11, [sp, #104]

	orr		r10, r14, r12

	// Use slow path if fewer than 64 bytes remain.
	cmp		r11, #64
	blt		.Lxor_slowpath\@

	// Use slow path if IN and/or OUT isn't 4-byte aligned.  Needed even on
	// ARMv6+, since ldmia and stmia (used below) still require alignment.
	tst		r10, #3
	bne		.Lxor_slowpath\@

	// Fast path: XOR 64 bytes of aligned data.

	// Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
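	// (For reference: in this frame orig_x0-orig_x15 sit at sp+32..sp+92
	//  and OUT/IN/LEN at sp+96/sp+100/sp+104, which is where the __ldrd
	//  and ldr offsets above and below come from.)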

	// x0-x3
	__ldrd		r8, r9, sp, 32
	__ldrd		r10, r11, sp, 40
	add		X0, X0, r8
	add		X1, X1, r9
	add		X2, X2, r10
	add		X3, X3, r11
	_le32_bswap_4x	X0, X1, X2, X3,  r8
	ldmia		r12!, {r8-r11}
	eor		X0, X0, r8
	eor		X1, X1, r9
	eor		X2, X2, r10
	eor		X3, X3, r11
	stmia		r14!, {X0-X3}

	// x4-x7
	__ldrd		r8, r9, sp, 48
	__ldrd		r10, r11, sp, 56
	add		X4, r8, X4, ror #brot
	add		X5, r9, X5, ror #brot
	ldmia		r12!, {X0-X3}
	add		X6, r10, X6, ror #brot
	add		X7, r11, X7, ror #brot
	_le32_bswap_4x	X4, X5, X6, X7,  r8
	eor		X4, X4, X0
	eor		X5, X5, X1
	eor		X6, X6, X2
	eor		X7, X7, X3
	stmia		r14!, {X4-X7}

	// x8-x15
	pop		{r0-r7}			// (x8-x9,x12-x15,x10-x11)
	__ldrd		r8, r9, sp, 32
	__ldrd		r10, r11, sp, 40
	add		r0, r0, r8		// x8
	add		r1, r1, r9		// x9
	add		r6, r6, r10		// x10
	add		r7, r7, r11		// x11
	_le32_bswap_4x	r0, r1, r6, r7,  r8
	ldmia		r12!, {r8-r11}
	eor		r0, r0, r8		// x8
	eor		r1, r1, r9		// x9
	eor		r6, r6, r10		// x10
	eor		r7, r7, r11		// x11
	stmia		r14!, {r0,r1,r6,r7}
	ldmia		r12!, {r0,r1,r6,r7}
	__ldrd		r8, r9, sp, 48
	__ldrd		r10, r11, sp, 56
	add		r2, r8, r2, ror #drot	// x12
	add		r3, r9, r3, ror #drot	// x13
	add		r4, r10, r4, ror #drot	// x14
	add		r5, r11, r5, ror #drot	// x15
	_le32_bswap_4x	r2, r3, r4, r5,  r9
	ldr		r9, [sp, #72]		// load LEN
	eor		r2, r2, r0		// x12
	eor		r3, r3, r1		// x13
	eor		r4, r4, r6		// x14
	eor		r5, r5, r7		// x15
	subs		r9, #64			// decrement and check LEN
	stmia		r14!, {r2-r5}

	beq		.Ldone\@

.Lprepare_for_next_block\@:

	// Stack: x0-x15 OUT IN LEN

	// Increment block counter (x12)
	add		r8, #1

	// Store updated (OUT, IN, LEN)
	str		r14, [sp, #64]
	str		r12, [sp, #68]
	str		r9, [sp, #72]

	mov		r14, sp

	// Store updated block counter (x12)
	str		r8, [sp, #48]

	sub		sp, #16

	// Reload state and do next block
	ldmia		r14!, {r0-r11}		// load x0-x11
	__strd		r10, r11, sp, 8		// store x10-x11 before state
	ldmia		r14, {r10-r12,r14}	// load x12-x15
	b		.Lnext_block\@

.Lxor_slowpath\@:
	// Slow path: < 64 bytes remaining, or unaligned input or output buffer.
	// We handle it by storing the 64 bytes of keystream to the stack, then
	// XOR-ing the needed portion with the data.

	// Allocate keystream buffer
	sub		sp, #64
	mov		r14, sp

	// Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// Save keystream for x0-x3
	__ldrd		r8, r9, sp, 96
	__ldrd		r10, r11, sp, 104
	add		X0, X0, r8
	add		X1, X1, r9
	add		X2, X2, r10
	add		X3, X3, r11
	_le32_bswap_4x	X0, X1, X2, X3,  r8
	stmia		r14!, {X0-X3}

	// Save keystream for x4-x7
	__ldrd		r8, r9, sp, 112
	__ldrd		r10, r11, sp, 120
	add		X4, r8, X4, ror #brot
	add		X5, r9, X5, ror #brot
	add		X6, r10, X6, ror #brot
	add		X7, r11, X7, ror #brot
	_le32_bswap_4x	X4, X5, X6, X7,  r8
	add		r8, sp, #64
	stmia		r14!, {X4-X7}

	// Save keystream for x8-x15
	ldm		r8, {r0-r7}		// (x8-x9,x12-x15,x10-x11)
	__ldrd		r8, r9, sp, 128
	__ldrd		r10, r11, sp, 136
	add		r0, r0, r8		// x8
	add		r1, r1, r9		// x9
	add		r6, r6, r10		// x10
	add		r7, r7, r11		// x11
	_le32_bswap_4x	r0, r1, r6, r7,  r8
	stmia		r14!, {r0,r1,r6,r7}
	__ldrd		r8, r9, sp, 144
	__ldrd		r10, r11, sp, 152
	add		r2, r8, r2, ror #drot	// x12
	add		r3, r9, r3, ror #drot	// x13
	add		r4, r10, r4, ror #drot	// x14
	add		r5, r11, r5, ror #drot	// x15
	_le32_bswap_4x	r2, r3, r4, r5,  r9
	stmia		r14, {r2-r5}

	// Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
	// Registers: r8 is block counter, r12 is IN.

	ldr		r9, [sp, #168]		// LEN
	ldr		r14, [sp, #160]		// OUT
	cmp		r9, #64
	mov		r0, sp
	movle		r1, r9
	movgt		r1, #64
	// r1 is number of bytes to XOR, in range [1, 64]

.if __LINUX_ARM_ARCH__ < 6
	orr		r2, r12, r14
	tst		r2, #3			// IN or OUT misaligned?
	bne		.Lxor_next_byte\@
.endif

	// XOR a word at a time
.rept 16
	subs		r1, #4
	blt		.Lxor_words_done\@
	ldr		r2, [r12], #4
	ldr		r3, [r0], #4
	eor		r2, r2, r3
	str		r2, [r14], #4
.endr
	b		.Lxor_slowpath_done\@
.Lxor_words_done\@:
	ands		r1, r1, #3
	beq		.Lxor_slowpath_done\@

	// XOR a byte at a time
.Lxor_next_byte\@:
	ldrb		r2, [r12], #1
	ldrb		r3, [r0], #1
	eor		r2, r2, r3
	strb		r2, [r14], #1
	subs		r1, #1
	bne		.Lxor_next_byte\@

.Lxor_slowpath_done\@:
	subs		r9, #64
	add		sp, #96
	bgt		.Lprepare_for_next_block\@

.Ldone\@:
.endm	// _chacha

/*
 * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
 *		     const u32 *state, int nrounds);
 */
ENTRY(chacha_doarm)
	cmp		r2, #0			// len == 0?
	reteq		lr

	ldr		ip, [sp]
	cmp		ip, #12

	push		{r0-r2,r4-r11,lr}

	// Push state x0-x15 onto stack.
	// Also store an extra copy of x10-x11 just before the state.
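	// (Frame being built, from low to high address: unused0-unused1, the
	//  x10-x11 copy, x0-x15, then OUT IN LEN from the push above; this is
	//  the layout .Lnext_block expects.)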

	add		X12, r3, #48
	ldm		X12, {X12,X13,X14,X15}
	push		{X12,X13,X14,X15}
	sub		sp, sp, #64

	__ldrd		X8_X10, X9_X11, r3, 40
	__strd		X8_X10, X9_X11, sp, 8
	__strd		X8_X10, X9_X11, sp, 56
	ldm		r3, {X0-X9_X11}
	__strd		X0, X1, sp, 16
	__strd		X2, X3, sp, 24
	__strd		X4, X5, sp, 32
	__strd		X6, X7, sp, 40
	__strd		X8_X10, X9_X11, sp, 48

	beq		1f
	_chacha		20

0:	add		sp, #76
	pop		{r4-r11, pc}

1:	_chacha		12
	b		0b
ENDPROC(chacha_doarm)

/*
 * void hchacha_block_arm(const u32 state[16], u32 out[8], int nrounds);
 */
ENTRY(hchacha_block_arm)
	push		{r1,r4-r11,lr}

	cmp		r2, #12			// ChaCha12 ?

	mov		r14, r0
	ldmia		r14!, {r0-r11}		// load x0-x11
	push		{r10-r11}		// store x10-x11 to stack
	ldm		r14, {r10-r12,r14}	// load x12-x15
	sub		sp, #8

	beq		1f
	_chacha_permute	20

	// Skip over (unused0-unused1, x10-x11)
0:	add		sp, #16

	// Fix up rotations of x12-x15
	ror		X12, X12, #drot
	ror		X13, X13, #drot
	pop		{r4}			// load 'out'
	ror		X14, X14, #drot
	ror		X15, X15, #drot

	// Store (x0-x3,x12-x15) to 'out'
	stm		r4, {X0,X1,X2,X3,X12,X13,X14,X15}

	pop		{r4-r11,pc}

1:	_chacha_permute	12
	b		0b
ENDPROC(hchacha_block_arm)
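
/*
 * Usage sketch (illustrative only; the real callers live in the C glue code).
 * To encrypt 'bytes' bytes given a prepared 16-word ChaCha state with the
 * block counter in state[12]:
 *
 *	chacha_doarm(dst, src, bytes, state, 20);
 *	state[12] += DIV_ROUND_UP(bytes, 64);	// counter is not written back
 *
 * hchacha_block_arm() runs only the permutation and returns words 0-3 and
 * 12-15, as used for XChaCha key derivation.
 */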