/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * BLAKE2s digest algorithm, ARM scalar implementation
 *
 * Copyright 2020 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>

	// Registers used to hold message words temporarily.  There aren't
	// enough ARM registers to hold the whole message block, so we have to
	// load the words on-demand.  (r12 = IP and r14 = LR are both
	// caller-clobbered scratch registers here.)
	M_0		.req	r12
	M_1		.req	r14

// The BLAKE2s initialization vector (the same constants as SHA-256's IV)
.Lblake2s_IV:
	.word	0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
	.word	0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19

// Load the two consecutive 32-bit words at [\src + \offset] into \a and \b.
// ARMv6+ can do this with a single ldrd; older architectures fall back to two
// ldr instructions.  \a and \b must be an even/odd register pair for the ldrd
// form, and \offset is in bytes.
.macro __ldrd		a, b, src, offset
#if __LINUX_ARM_ARCH__ >= 6
	ldrd		\a, \b, [\src, #\offset]
#else
	ldr		\a, [\src, #\offset]
	ldr		\b, [\src, #\offset + 4]
#endif
.endm

// Store \a and \b to the two consecutive 32-bit words at [\dst + \offset].
// Counterpart of __ldrd; same constraints.
.macro __strd		a, b, dst, offset
#if __LINUX_ARM_ARCH__ >= 6
	strd		\a, \b, [\dst, #\offset]
#else
	str		\a, [\dst, #\offset]
	str		\b, [\dst, #\offset + 4]
#endif
.endm

// Execute a quarter-round of BLAKE2s by mixing two columns or two diagonals.
// (a0, b0, c0, d0) and (a1, b1, c1, d1) give the registers containing the two
// columns/diagonals.  s0-s1 are the word offsets to the message words the first
// column/diagonal needs, and likewise s2-s3 for the second column/diagonal.
// M_0 and M_1 are free to use, and the message block can be found at sp + 32.
//
// Note that to save instructions, the rotations don't happen when the
// pseudocode says they should, but rather they are delayed until the values are
// used.  See the comment above _blake2s_round().
.macro _blake2s_quarterround  a0, b0, c0, d0,  a1, b1, c1, d1,  s0, s1, s2, s3

	// Load the first message word for each of the two columns/diagonals.
	// The message block was copied to sp + 32 by the caller.
	ldr		M_0, [sp, #32 + 4 * \s0]
	ldr		M_1, [sp, #32 + 4 * \s2]

	// a += b + m[blake2s_sigma[r][2*i + 0]];
	// 'b' still carries a pending right-rotation of 'brot' bits from the
	// previous round/step, so apply it via the shifted operand here.
	add		\a0, \a0, \b0, ror #brot
	add		\a1, \a1, \b1, ror #brot
	add		\a0, \a0, M_0
	add		\a1, \a1, M_1

	// d = ror32(d ^ a, 16);
	// 'd' has a pending rotation of 'drot' bits; the new ror #16 is NOT
	// applied yet — it becomes the new pending rotation.
	eor		\d0, \a0, \d0, ror #drot
	eor		\d1, \a1, \d1, ror #drot

	// c += d;   (apply d's pending ror #16 on use)
	add		\c0, \c0, \d0, ror #16
	add		\c1, \c1, \d1, ror #16

	// b = ror32(b ^ c, 12);   (old pending brot applied; new pending = 12)
	eor		\b0, \c0, \b0, ror #brot
	eor		\b1, \c1, \b1, ror #brot

	// Load the second message word for each column/diagonal.
	ldr		M_0, [sp, #32 + 4 * \s1]
	ldr		M_1, [sp, #32 + 4 * \s3]

	// a += b + m[blake2s_sigma[r][2*i + 1]];   (apply b's pending ror #12)
	add		\a0, \a0, \b0, ror #12
	add		\a1, \a1, \b1, ror #12
	add		\a0, \a0, M_0
	add		\a1, \a1, M_1

	// d = ror32(d ^ a, 8);   (apply d's pending ror #16; new pending = 8)
	eor		\d0, \a0, \d0, ror #16
	eor		\d1, \a1, \d1, ror #16

	// c += d;   (apply d's pending ror #8)
	add		\c0, \c0, \d0, ror #8
	add		\c1, \c1, \d1, ror #8

	// b = ror32(b ^ c, 7);   (apply b's pending ror #12; new pending = 7)
	// On exit, rows 'b' and 'd' carry pending rotations of 7 and 8 bits
	// respectively, matching the (brot, drot) = (7, 8) set by the caller.
	eor		\b0, \c0, \b0, ror #12
	eor		\b1, \c1, \b1, ror #12
.endm

// Execute one round of BLAKE2s by updating the state matrix v[0..15].  v[0..9]
// are in r0..r9.  The stack pointer points to 8 bytes of scratch space for
// spilling v[8..9], then to v[10..15], then to the message block.  r10-r12 and
// r14 are free to use.  The macro arguments s0-s15 give the order in which the
// message words are used in this round.
//
// All rotates are performed using the implicit rotate operand accepted by the
// 'add' and 'eor' instructions.  This is faster than using explicit rotate
// instructions.  To make this work, we allow the values in the second and last
// rows of the BLAKE2s state matrix (rows 'b' and 'd') to temporarily have the
// wrong rotation amount.  The rotation amount is then fixed up just in time
// when the values are used.  'brot' is the number of bits the values in row 'b'
// need to be rotated right to arrive at the correct values, and 'drot'
// similarly for row 'd'.  (brot, drot) start out as (0, 0) but we make it such
// that they end up as (7, 8) after every round.
.macro _blake2s_round	s0, s1, s2, s3, s4, s5, s6, s7, \
			s8, s9, s10, s11, s12, s13, s14, s15

	// Mix first two columns:
	// (v[0], v[4], v[8], v[12]) and (v[1], v[5], v[9], v[13]).
	__ldrd		r10, r11, sp, 16	// load v[12] and v[13]
	_blake2s_quarterround	r0, r4, r8, r10,  r1, r5, r9, r11, \
				\s0, \s1, \s2, \s3
	__strd		r8, r9, sp, 0		// spill v[8] and v[9]
	__strd		r10, r11, sp, 16	// store v[12] and v[13]

	// Mix second two columns:
	// (v[2], v[6], v[10], v[14]) and (v[3], v[7], v[11], v[15]).
	__ldrd		r8, r9, sp, 8		// load v[10] and v[11]
	__ldrd		r10, r11, sp, 24	// load v[14] and v[15]
	_blake2s_quarterround	r2, r6, r8, r10,  r3, r7, r9, r11, \
				\s4, \s5, \s6, \s7
	str		r10, [sp, #24]		// store v[14]
	// v[10], v[11], and v[15] are used below, so no need to store them yet.

	// From here on, rows 'b' and 'd' carry pending rotations of 7 and 8.
	.set brot, 7
	.set drot, 8

	// Mix first two diagonals:
	// (v[0], v[5], v[10], v[15]) and (v[1], v[6], v[11], v[12]).
	ldr		r10, [sp, #16]		// load v[12]
	_blake2s_quarterround	r0, r5, r8, r11,  r1, r6, r9, r10, \
				\s8, \s9, \s10, \s11
	__strd		r8, r9, sp, 8		// store v[10] and v[11]
	str		r11, [sp, #28]		// store v[15]
	str		r10, [sp, #16]		// store v[12]

	// Mix second two diagonals:
	// (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]).
	__ldrd		r8, r9, sp, 0		// load v[8] and v[9]
	__ldrd		r10, r11, sp, 20	// load v[13] and v[14]
	_blake2s_quarterround	r2, r7, r8, r10,  r3, r4, r9, r11, \
				\s12, \s13, \s14, \s15
	__strd		r10, r11, sp, 20	// store v[13] and v[14]
.endm

//
// void blake2s_compress_arch(struct blake2s_state *state,
//			      const u8 *block, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2s_state are used:
//	u32 h[8];	(inout)
//	u32 t[2];	(inout)
//	u32 f[2];	(in)
//
	.align		5
ENTRY(blake2s_compress_arch)
	push		{r0-r2,r4-r11,lr}	// keep this an even number

.Lnext_block:
	// r0 is 'state'
	// r1 is 'block'
	// r3 is 'inc'

	// Load and increment the counter t[0..1] (a 64-bit value at
	// state + 32, i.e. just past h[0..7]).
	__ldrd		r10, r11, r0, 32
	adds		r10, r10, r3		// t[0] += inc (sets carry)
	adc		r11, r11, #0		// t[1] += carry
	__strd		r10, r11, r0, 32

	// _blake2s_round is very short on registers, so copy the message block
	// to the stack to save a register during the rounds.  This also has the
	// advantage that misalignment only needs to be dealt with in one place.
	sub		sp, sp, #64
	mov		r12, sp
	tst		r1, #3			// block pointer 4-byte aligned?
	bne		.Lcopy_block_misaligned
	ldmia		r1!, {r2-r9}		// copy first 32 bytes
	stmia		r12!, {r2-r9}
	ldmia		r1!, {r2-r9}		// copy last 32 bytes
	stmia		r12, {r2-r9}
.Lcopy_block_done:
	str		r1, [sp, #68]		// Update message pointer
						// (sp + 68 = saved 'block' arg)

	// Calculate v[8..15].  Push v[10..15] onto the stack, and leave space
	// for spilling v[8..9].  Leave v[8..9] in r8-r9.
	mov		r14, r0			// r14 = state
	adr		r12, .Lblake2s_IV
	ldmia		r12!, {r8-r9}		// load IV[0..1] (= v[8..9])
	__ldrd		r0, r1, r14, 40		// load f[0..1] (at state + 40)
	ldm		r12, {r2-r7}		// load IV[2..7]
	eor		r4, r4, r10		// v[12] = IV[4] ^ t[0]
	eor		r5, r5, r11		// v[13] = IV[5] ^ t[1]
	eor		r6, r6, r0		// v[14] = IV[6] ^ f[0]
	eor		r7, r7, r1		// v[15] = IV[7] ^ f[1]
	push		{r2-r7}			// push v[10..15]
	sub		sp, sp, #8		// leave space for v[8..9]

	// Load h[0..7] == v[0..7].
	ldm		r14, {r0-r7}

	// Execute the rounds.  Each round is provided the order in which it
	// needs to use the message words (the BLAKE2s sigma permutations).
	.set brot, 0
	.set drot, 0
	_blake2s_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	_blake2s_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
	_blake2s_round	11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
	_blake2s_round	7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
	_blake2s_round	9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
	_blake2s_round	2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
	_blake2s_round	12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
	_blake2s_round	13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
	_blake2s_round	6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
	_blake2s_round	10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0

	// Fold the final state matrix into the hash chaining value:
	//
	//	for (i = 0; i < 8; i++)
	//		h[i] ^= v[i] ^ v[i + 8];
	//
	// Note that v[12..15] (r4-r7 after the pops below... actually the
	// spilled words) still carry the pending (brot, drot) = (7, 8)
	// rotations, which are applied on use below.
	ldr		r14, [sp, #96]		// r14 = &h[0]
						// (sp + 96 = saved 'state' arg;
						//  h[] is its first field)
	add		sp, sp, #8		// v[8..9] are already loaded.
	pop		{r10-r11}		// load v[10..11]
	eor		r0, r0, r8		// v[0] ^= v[8]
	eor		r1, r1, r9		// v[1] ^= v[9]
	eor		r2, r2, r10		// v[2] ^= v[10]
	eor		r3, r3, r11		// v[3] ^= v[11]
	ldm		r14, {r8-r11}		// load h[0..3]
	eor		r0, r0, r8		// h[0..3] ^= v[0..3]
	eor		r1, r1, r9
	eor		r2, r2, r10
	eor		r3, r3, r11
	stmia		r14!, {r0-r3}		// store new h[0..3]
	ldm		r14, {r0-r3}		// load old h[4..7]
	pop		{r8-r11}		// load v[12..15]
	eor		r0, r0, r4, ror #brot	// h[4..7] ^= v[4..7]
	eor		r1, r1, r5, ror #brot	// (row 'b' rotation applied)
	eor		r2, r2, r6, ror #brot
	eor		r3, r3, r7, ror #brot
	eor		r0, r0, r8, ror #drot	// h[4..7] ^= v[12..15]
	eor		r1, r1, r9, ror #drot	// (row 'd' rotation applied)
	eor		r2, r2, r10, ror #drot
	eor		r3, r3, r11, ror #drot
	add		sp, sp, #64		// skip copy of message block
	stm		r14, {r0-r3}		// store new h[4..7]

	// Advance to the next block, if there is one.  Note that if there are
	// multiple blocks, then 'inc' (the counter increment amount) must be
	// 64.  So we can simply set it to 64 without re-loading it.
	ldm		sp, {r0, r1, r2}	// load (state, block, nblocks)
	mov		r3, #64			// set 'inc'
	subs		r2, r2, #1		// nblocks--
	str		r2, [sp, #8]
	bne		.Lnext_block		// nblocks != 0?

	pop		{r0-r2,r4-r11,pc}

	// The next message block (pointed to by r1) isn't 4-byte aligned, so it
	// can't be loaded using ldmia.  Copy it to the stack buffer (pointed to
	// by r12) using an alternative method.  r2-r9 are free to use.
.Lcopy_block_misaligned:
	mov		r2, #64			// r2 = bytes remaining
1:
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
	// Hardware handles unaligned word loads; copy a word at a time.
	ldr		r3, [r1], #4
#else
	// Assemble each word from four byte loads (little-endian order).
	ldrb		r3, [r1, #0]
	ldrb		r4, [r1, #1]
	ldrb		r5, [r1, #2]
	ldrb		r6, [r1, #3]
	add		r1, r1, #4
	orr		r3, r3, r4, lsl #8
	orr		r3, r3, r5, lsl #16
	orr		r3, r3, r6, lsl #24
#endif
	subs		r2, r2, #4
	str		r3, [r12], #4
	bne		1b
	b		.Lcopy_block_done
ENDPROC(blake2s_compress_arch)