/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	SHASH		.req	q0
	T1		.req	q1
	XL		.req	q2
	XM		.req	q3
	XH		.req	q4
	IN1		.req	q4

	SHASH_L		.req	d0
	SHASH_H		.req	d1
	T1_L		.req	d2
	T1_H		.req	d3
	XL_L		.req	d4
	XL_H		.req	d5
	XM_L		.req	d6
	XM_H		.req	d7
	XH_L		.req	d8

	t0l		.req	d10
	t0h		.req	d11
	t1l		.req	d12
	t1h		.req	d13
	t2l		.req	d14
	t2h		.req	d15
	t3l		.req	d16
	t3h		.req	d17
	t4l		.req	d18
	t4h		.req	d19

	t0q		.req	q5
	t1q		.req	q6
	t2q		.req	q7
	t3q		.req	q8
	t4q		.req	q9
	T2		.req	q9

	s1l		.req	d20
	s1h		.req	d21
	s2l		.req	d22
	s2h		.req	d23
	s3l		.req	d24
	s3h		.req	d25
	s4l		.req	d26
	s4h		.req	d27

	MASK		.req	d28
	SHASH2_p8	.req	d28

	k16		.req	d29
	k32		.req	d30
	k48		.req	d31
	SHASH2_p64	.req	d31

	HH		.req	q10
	HH3		.req	q11
	HH4		.req	q12
	HH34		.req	q13

	HH_L		.req	d20
	HH_H		.req	d21
	HH3_L		.req	d22
	HH3_H		.req	d23
	HH4_L		.req	d24
	HH4_H		.req	d25
	HH34_L		.req	d26
	HH34_H		.req	d27
	SHASH2_H	.req	d29

	XL2		.req	q5
	XM2		.req	q6
	XH2		.req	q7
	T3		.req	q8

	XL2_L		.req	d10
	XL2_H		.req	d11
	XM2_L		.req	d12
	XM2_H		.req	d13
	T3_L		.req	d16
	T3_H		.req	d17

	.text
	.arch		armv8-a
	.fpu		crypto-neon-fp-armv8

	.macro		__pmull_p64, rd, rn, rm, b1, b2, b3, b4
	vmull.p64	\rd, \rn, \rm
	.endm
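
	/*
	 * Note: the b1-b4 arguments of __pmull_p64 are unused; accepting
	 * them lets the ghash_update macro below invoke either
	 * multiplication variant through __pmull_\pn with a single
	 * argument list.
	 */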

	/*
	 * This implementation of 64x64 -> 128 bit polynomial multiplication
	 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
	 * "Fast Software Polynomial Multiplication on ARM Processors Using
	 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
	 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
	 *
	 * It has been slightly tweaked for in-order performance, and to allow
	 * 'rq' to overlap with 'ad' or 'bd'.
	 */
	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
	vext.8		t0l, \ad, \ad, #1	@ A1
	.ifc		\b1, t4l
	vext.8		t4l, \bd, \bd, #1	@ B1
	.endif
	vmull.p8	t0q, t0l, \bd		@ F = A1*B
	vext.8		t1l, \ad, \ad, #2	@ A2
	vmull.p8	t4q, \ad, \b1		@ E = A*B1
	.ifc		\b2, t3l
	vext.8		t3l, \bd, \bd, #2	@ B2
	.endif
	vmull.p8	t1q, t1l, \bd		@ H = A2*B
	vext.8		t2l, \ad, \ad, #3	@ A3
	vmull.p8	t3q, \ad, \b2		@ G = A*B2
	veor		t0q, t0q, t4q		@ L = E + F
	.ifc		\b3, t4l
	vext.8		t4l, \bd, \bd, #3	@ B3
	.endif
	vmull.p8	t2q, t2l, \bd		@ J = A3*B
	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
	veor		t1q, t1q, t3q		@ M = G + H
	.ifc		\b4, t3l
	vext.8		t3l, \bd, \bd, #4	@ B4
	.endif
	vmull.p8	t4q, \ad, \b3		@ I = A*B3
	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
	vmull.p8	t3q, \ad, \b4		@ K = A*B4
	vand		t0h, t0h, k48
	vand		t1h, t1h, k32
	veor		t2q, t2q, t4q		@ N = I + J
	veor		t0l, t0l, t0h
	veor		t1l, t1l, t1h
	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
	vand		t2h, t2h, k16
	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	t3h, #0
	vext.8		t0q, t0q, t0q, #15
	veor		t2l, t2l, t2h
	vext.8		t1q, t1q, t1q, #14
	vmull.p8	\rq, \ad, \bd		@ D = A*B
	vext.8		t2q, t2q, t2q, #13
	vext.8		t3q, t3q, t3q, #12
	veor		t0q, t0q, t1q
	veor		t2q, t2q, t3q
	veor		\rq, \rq, t0q
	veor		\rq, \rq, t2q
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
	.macro		__pmull_reduce_p64
	vmull.p64	T1, XL_L, MASK

	veor		XH_L, XH_L, XM_H
	vext.8		T1, T1, T1, #8
	veor		XL_H, XL_H, XM_L
	veor		T1, T1, XL

	vmull.p64	XL, T1_H, MASK
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
	.macro		__pmull_reduce_p8
	veor		XL_H, XL_H, XM_L
	veor		XH_L, XH_L, XM_H

	vshl.i64	T1, XL, #57
	vshl.i64	T2, XL, #62
	veor		T1, T1, T2
	vshl.i64	T2, XL, #63
	veor		T1, T1, T2
	veor		XL_H, XL_H, T1_L
	veor		XH_L, XH_L, T1_H

	vshr.u64	T1, XL, #1
	veor		XH, XH, XL
	veor		XL, XL, T1
	vshr.u64	T1, T1, #6
	vshr.u64	XL, XL, #1
	.endm
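
	//
	// Both reduction macros above fold the double-width Karatsuba
	// product (low half in XL, middle term in XM, high half in XH)
	// back to 128 bits modulo the GHASH field polynomial
	// x^128 + x^7 + x^2 + x + 1. In GCM's bit-reflected representation
	// this polynomial corresponds to the 0xe1 << 57 constant that
	// pmull_ghash_update_p64 preloads into MASK; the p8 variant
	// achieves the same fold using only 64-bit shifts and XORs.
	//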

	.macro		ghash_update, pn
	vld1.64		{XL}, [r1]

	/* do the head block first, if supplied */
	ldr		ip, [sp]
	teq		ip, #0
	beq		0f
	vld1.64		{T1}, [ip]
	teq		r0, #0
	b		3f

0:	.ifc		\pn, p64
	tst		r0, #3			// skip until #blocks is a
	bne		2f			// round multiple of 4

	vld1.8		{XL2-XM2}, [r2]!
1:	vld1.8		{T3-T2}, [r2]!
	vrev64.8	XL2, XL2
	vrev64.8	XM2, XM2

	subs		r0, r0, #4

	vext.8		T1, XL2, XL2, #8
	veor		XL2_H, XL2_H, XL_L
	veor		XL, XL, T1

	vrev64.8	T3, T3
	vrev64.8	T1, T2

	vmull.p64	XH, HH4_H, XL_H		// a1 * b1
	veor		XL2_H, XL2_H, XL_H
	vmull.p64	XL, HH4_L, XL_L		// a0 * b0
	vmull.p64	XM, HH34_H, XL2_H	// (a1 + a0)(b1 + b0)

	vmull.p64	XH2, HH3_H, XM2_L	// a1 * b1
	veor		XM2_L, XM2_L, XM2_H
	vmull.p64	XL2, HH3_L, XM2_H	// a0 * b0
	vmull.p64	XM2, HH34_L, XM2_L	// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, HH_H, T3_L		// a1 * b1
	veor		T3_L, T3_L, T3_H
	vmull.p64	XL2, HH_L, T3_H		// a0 * b0
	vmull.p64	XM2, SHASH2_H, T3_L	// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, SHASH_H, T1_L	// a1 * b1
	veor		T1_L, T1_L, T1_H
	vmull.p64	XL2, SHASH_L, T1_H	// a0 * b0
	vmull.p64	XM2, SHASH2_p64, T1_L	// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	beq		4f

	vld1.8		{XL2-XM2}, [r2]!

	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_p64

	veor		T1, T1, XH
	veor		XL, XL, T1

	b		1b
	.endif

2:	vld1.64		{T1}, [r2]!
	subs		r0, r0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
#ifndef CONFIG_CPU_BIG_ENDIAN
	vrev64.8	T1, T1
#endif
	vext.8		IN1, T1, T1, #8
	veor		T1_L, T1_L, XL_H
	veor		XL, XL, IN1

	__pmull_\pn	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
	veor		T1, T1, XL
	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)

4:	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_\pn

	veor		T1, T1, XH
	veor		XL, XL, T1

	bne		0b

	vst1.64		{XL}, [r1]
	bx		lr
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
ENTRY(pmull_ghash_update_p64)
	vld1.64		{SHASH}, [r3]!
	vld1.64		{HH}, [r3]!
	vld1.64		{HH3-HH4}, [r3]

	veor		SHASH2_p64, SHASH_L, SHASH_H
	veor		SHASH2_H, HH_L, HH_H
	veor		HH34_L, HH3_L, HH3_H
	veor		HH34_H, HH4_L, HH4_H

	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57

	ghash_update	p64
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
	vld1.64		{SHASH}, [r3]
	veor		SHASH2_p8, SHASH_L, SHASH_H

	vext.8		s1l, SHASH_L, SHASH_L, #1
	vext.8		s2l, SHASH_L, SHASH_L, #2
	vext.8		s3l, SHASH_L, SHASH_L, #3
	vext.8		s4l, SHASH_L, SHASH_L, #4
	vext.8		s1h, SHASH_H, SHASH_H, #1
	vext.8		s2h, SHASH_H, SHASH_H, #2
	vext.8		s3h, SHASH_H, SHASH_H, #3
	vext.8		s4h, SHASH_H, SHASH_H, #4

	vmov.i64	k16, #0xffff
	vmov.i64	k32, #0xffffffff
	vmov.i64	k48, #0xffffffffffff

	ghash_update	p8
ENDPROC(pmull_ghash_update_p8)
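
	//
	// Key layout note (per the register naming above): the p64 entry
	// point expects [r3] to hold H, H^2, H^3 and H^4 back to back,
	// which lets the p64 path of ghash_update aggregate four blocks
	// per iteration (the block folded into the accumulator is
	// multiplied by H^4, the most recently loaded one by H). The p8
	// entry point needs only H itself, from which it precomputes the
	// byte-rotated copies s1l-s4h, along with the k16/k32/k48 masks
	// consumed by __pmull_p8.
	//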