/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	SHASH		.req	q0
	T1		.req	q1
	XL		.req	q2
	XM		.req	q3
	XH		.req	q4
	IN1		.req	q4

	SHASH_L		.req	d0
	SHASH_H		.req	d1
	T1_L		.req	d2
	T1_H		.req	d3
	XL_L		.req	d4
	XL_H		.req	d5
	XM_L		.req	d6
	XM_H		.req	d7
	XH_L		.req	d8

	t0l		.req	d10
	t0h		.req	d11
	t1l		.req	d12
	t1h		.req	d13
	t2l		.req	d14
	t2h		.req	d15
	t3l		.req	d16
	t3h		.req	d17
	t4l		.req	d18
	t4h		.req	d19

	t0q		.req	q5
	t1q		.req	q6
	t2q		.req	q7
	t3q		.req	q8
	t4q		.req	q9
	T2		.req	q9

	s1l		.req	d20
	s1h		.req	d21
	s2l		.req	d22
	s2h		.req	d23
	s3l		.req	d24
	s3h		.req	d25
	s4l		.req	d26
	s4h		.req	d27

	MASK		.req	d28
	SHASH2_p8	.req	d28

	k16		.req	d29
	k32		.req	d30
	k48		.req	d31
	SHASH2_p64	.req	d31

	HH		.req	q10
	HH3		.req	q11
	HH4		.req	q12
	HH34		.req	q13

	HH_L		.req	d20
	HH_H		.req	d21
	HH3_L		.req	d22
	HH3_H		.req	d23
	HH4_L		.req	d24
	HH4_H		.req	d25
	HH34_L		.req	d26
	HH34_H		.req	d27
	SHASH2_H	.req	d29

	XL2		.req	q5
	XM2		.req	q6
	XH2		.req	q7
	T3		.req	q8

	XL2_L		.req	d10
	XL2_H		.req	d11
	XM2_L		.req	d12
	XM2_H		.req	d13
	T3_L		.req	d16
	T3_H		.req	d17

	.text
	.fpu		crypto-neon-fp-armv8

	.macro		__pmull_p64, rd, rn, rm, b1, b2, b3, b4
	vmull.p64	\rd, \rn, \rm
	.endm

	/*
	 * This implementation of 64x64 -> 128 bit polynomial multiplication
	 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
	 * "Fast Software Polynomial Multiplication on ARM Processors Using
	 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
	 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
	 *
	 * It has been slightly tweaked for in-order performance, and to allow
	 * 'rq' to overlap with 'ad' or 'bd'.
	 */
	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
	vext.8		t0l, \ad, \ad, #1	@ A1
	.ifc		\b1, t4l
	vext.8		t4l, \bd, \bd, #1	@ B1
	.endif
	vmull.p8	t0q, t0l, \bd		@ F = A1*B
	vext.8		t1l, \ad, \ad, #2	@ A2
	vmull.p8	t4q, \ad, \b1		@ E = A*B1
	.ifc		\b2, t3l
	vext.8		t3l, \bd, \bd, #2	@ B2
	.endif
	vmull.p8	t1q, t1l, \bd		@ H = A2*B
	vext.8		t2l, \ad, \ad, #3	@ A3
	vmull.p8	t3q, \ad, \b2		@ G = A*B2
	veor		t0q, t0q, t4q		@ L = E + F
	.ifc		\b3, t4l
	vext.8		t4l, \bd, \bd, #3	@ B3
	.endif
	vmull.p8	t2q, t2l, \bd		@ J = A3*B
	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
	veor		t1q, t1q, t3q		@ M = G + H
	.ifc		\b4, t3l
	vext.8		t3l, \bd, \bd, #4	@ B4
	.endif
	vmull.p8	t4q, \ad, \b3		@ I = A*B3
	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
	vmull.p8	t3q, \ad, \b4		@ K = A*B4
	vand		t0h, t0h, k48
	vand		t1h, t1h, k32
	veor		t2q, t2q, t4q		@ N = I + J
	veor		t0l, t0l, t0h
	veor		t1l, t1l, t1h
	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
	vand		t2h, t2h, k16
	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	t3h, #0
	vext.8		t0q, t0q, t0q, #15
	veor		t2l, t2l, t2h
	vext.8		t1q, t1q, t1q, #14
	vmull.p8	\rq, \ad, \bd		@ D = A*B
	vext.8		t2q, t2q, t2q, #13
	vext.8		t3q, t3q, t3q, #12
	veor		t0q, t0q, t1q
	veor		t2q, t2q, t3q
	veor		\rq, \rq, t0q
	veor		\rq, \rq, t2q
	.endm
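
	/*
	 * The reduction macros below operate on a double-width product held
	 * as three 128-bit values: the low half in XL, the high half in XH
	 * and the Karatsuba middle term in XM, as produced by the __pmull_*
	 * invocations in the ghash_update macro further down. A rough,
	 * C-like sketch of the fold-and-reduce step performed around these
	 * macros (illustrative only; the bit-reflected bit order required
	 * by GCM is handled by the vrev64.8/vext.8 shuffles elsewhere in
	 * this file):
	 *
	 *	xm ^= xl ^ xh;				// fold middle term
	 *	product = (xh << 128) ^ (xm << 64) ^ xl;	// 256-bit result
	 *	xl = product mod G;	// G = x^128 + x^7 + x^2 + x + 1
	 */
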
	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
	.macro		__pmull_reduce_p64
	vmull.p64	T1, XL_L, MASK

	veor		XH_L, XH_L, XM_H
	vext.8		T1, T1, T1, #8
	veor		XL_H, XL_H, XM_L
	veor		T1, T1, XL

	vmull.p64	XL, T1_H, MASK
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
	.macro		__pmull_reduce_p8
	veor		XL_H, XL_H, XM_L
	veor		XH_L, XH_L, XM_H

	vshl.i64	T1, XL, #57
	vshl.i64	T2, XL, #62
	veor		T1, T1, T2
	vshl.i64	T2, XL, #63
	veor		T1, T1, T2
	veor		XL_H, XL_H, T1_L
	veor		XH_L, XH_L, T1_H

	vshr.u64	T1, XL, #1
	veor		XH, XH, XL
	veor		XL, XL, T1
	vshr.u64	T1, T1, #6
	vshr.u64	XL, XL, #1
	.endm

	.macro		ghash_update, pn
	vld1.64		{XL}, [r1]

	/* do the head block first, if supplied */
	ldr		ip, [sp]
	teq		ip, #0
	beq		0f
	vld1.64		{T1}, [ip]
	teq		r0, #0
	b		3f

0:	.ifc		\pn, p64
	tst		r0, #3			// skip until #blocks is a
	bne		2f			// round multiple of 4

	vld1.8		{XL2-XM2}, [r2]!
1:	vld1.8		{T3-T2}, [r2]!
	vrev64.8	XL2, XL2
	vrev64.8	XM2, XM2

	subs		r0, r0, #4

	vext.8		T1, XL2, XL2, #8
	veor		XL2_H, XL2_H, XL_L
	veor		XL, XL, T1

	vrev64.8	T3, T3
	vrev64.8	T1, T2

	vmull.p64	XH, HH4_H, XL_H			// a1 * b1
	veor		XL2_H, XL2_H, XL_H
	vmull.p64	XL, HH4_L, XL_L			// a0 * b0
	vmull.p64	XM, HH34_H, XL2_H		// (a1 + a0)(b1 + b0)

	vmull.p64	XH2, HH3_H, XM2_L		// a1 * b1
	veor		XM2_L, XM2_L, XM2_H
	vmull.p64	XL2, HH3_L, XM2_H		// a0 * b0
	vmull.p64	XM2, HH34_L, XM2_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, HH_H, T3_L			// a1 * b1
	veor		T3_L, T3_L, T3_H
	vmull.p64	XL2, HH_L, T3_H			// a0 * b0
	vmull.p64	XM2, SHASH2_H, T3_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, SHASH_H, T1_L		// a1 * b1
	veor		T1_L, T1_L, T1_H
	vmull.p64	XL2, SHASH_L, T1_H		// a0 * b0
	vmull.p64	XM2, SHASH2_p64, T1_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	beq		4f

	vld1.8		{XL2-XM2}, [r2]!

	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_p64

	veor		T1, T1, XH
	veor		XL, XL, T1

	b		1b
	.endif

2:	vld1.64		{T1}, [r2]!
	subs		r0, r0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
#ifndef CONFIG_CPU_BIG_ENDIAN
	vrev64.8	T1, T1
#endif
	vext.8		IN1, T1, T1, #8
	veor		T1_L, T1_L, XL_H
	veor		XL, XL, IN1

	__pmull_\pn	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
	veor		T1, T1, XL
	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)

4:	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_\pn

	veor		T1, T1, XH
	veor		XL, XL, T1

	bne		0b

	vst1.64		{XL}, [r1]
	bx		lr
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
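
	/*
	 * Per the AAPCS, the arguments above arrive as r0 = blocks, r1 = dg,
	 * r2 = src and r3 = k, with the optional 'head' pointer passed on
	 * the stack. The loads from r3 below assume that the key structure
	 * starts with the precomputed powers of H each variant needs: H
	 * alone for the p8 version, and H, H^2, H^3, H^4 (in that order)
	 * for the 4-way p64 version.
	 */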
ENTRY(pmull_ghash_update_p64)
	vld1.64		{SHASH}, [r3]!
	vld1.64		{HH}, [r3]!
	vld1.64		{HH3-HH4}, [r3]

	veor		SHASH2_p64, SHASH_L, SHASH_H
	veor		SHASH2_H, HH_L, HH_H
	veor		HH34_L, HH3_L, HH3_H
	veor		HH34_H, HH4_L, HH4_H

	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57

	ghash_update	p64
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
	vld1.64		{SHASH}, [r3]
	veor		SHASH2_p8, SHASH_L, SHASH_H

	vext.8		s1l, SHASH_L, SHASH_L, #1
	vext.8		s2l, SHASH_L, SHASH_L, #2
	vext.8		s3l, SHASH_L, SHASH_L, #3
	vext.8		s4l, SHASH_L, SHASH_L, #4
	vext.8		s1h, SHASH_H, SHASH_H, #1
	vext.8		s2h, SHASH_H, SHASH_H, #2
	vext.8		s3h, SHASH_H, SHASH_H, #3
	vext.8		s4h, SHASH_H, SHASH_H, #4

	vmov.i64	k16, #0xffff
	vmov.i64	k32, #0xffffffff
	vmov.i64	k48, #0xffffffffffff

	ghash_update	p8
ENDPROC(pmull_ghash_update_p8)
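
	/*
	 * A minimal sketch of the expected call pattern, as seen from the
	 * C glue code (illustrative only; 'have_pmull' stands in for
	 * whatever CPU feature check the caller performs, and any NEON use
	 * in kernel mode must be bracketed by kernel_neon_begin()/
	 * kernel_neon_end()):
	 *
	 *	kernel_neon_begin();
	 *	if (have_pmull)
	 *		pmull_ghash_update_p64(blocks, dg, src, key, head);
	 *	else
	 *		pmull_ghash_update_p8(blocks, dg, src, key, head);
	 *	kernel_neon_end();
	 */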