/*
 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	SHASH		.req	q0
	T1		.req	q1
	XL		.req	q2
	XM		.req	q3
	XH		.req	q4
	IN1		.req	q4

	SHASH_L		.req	d0
	SHASH_H		.req	d1
	T1_L		.req	d2
	T1_H		.req	d3
	XL_L		.req	d4
	XL_H		.req	d5
	XM_L		.req	d6
	XM_H		.req	d7
	XH_L		.req	d8

	t0l		.req	d10
	t0h		.req	d11
	t1l		.req	d12
	t1h		.req	d13
	t2l		.req	d14
	t2h		.req	d15
	t3l		.req	d16
	t3h		.req	d17
	t4l		.req	d18
	t4h		.req	d19

	t0q		.req	q5
	t1q		.req	q6
	t2q		.req	q7
	t3q		.req	q8
	t4q		.req	q9
	T2		.req	q9

	s1l		.req	d20
	s1h		.req	d21
	s2l		.req	d22
	s2h		.req	d23
	s3l		.req	d24
	s3h		.req	d25
	s4l		.req	d26
	s4h		.req	d27

	MASK		.req	d28
	SHASH2_p8	.req	d28

	k16		.req	d29
	k32		.req	d30
	k48		.req	d31
	SHASH2_p64	.req	d31

	HH		.req	q10
	HH3		.req	q11
	HH4		.req	q12
	HH34		.req	q13

	HH_L		.req	d20
	HH_H		.req	d21
	HH3_L		.req	d22
	HH3_H		.req	d23
	HH4_L		.req	d24
	HH4_H		.req	d25
	HH34_L		.req	d26
	HH34_H		.req	d27
	SHASH2_H	.req	d29

	XL2		.req	q5
	XM2		.req	q6
	XH2		.req	q7
	T3		.req	q8

	XL2_L		.req	d10
	XL2_H		.req	d11
	XM2_L		.req	d12
	XM2_H		.req	d13
	T3_L		.req	d16
	T3_H		.req	d17

	.text
	.fpu		crypto-neon-fp-armv8

	.macro		__pmull_p64, rd, rn, rm, b1, b2, b3, b4
	vmull.p64	\rd, \rn, \rm
	.endm

	/*
	 * This implementation of 64x64 -> 128 bit polynomial multiplication
	 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
	 * "Fast Software Polynomial Multiplication on ARM Processors Using
	 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
	 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
	 *
	 * It has been slightly tweaked for in-order performance, and to allow
	 * 'rq' to overlap with 'ad' or 'bd'.
	 */
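	/*
	 * In outline (explanatory note, not taken from the paper above):
	 * vmull.p8 multiplies eight polynomial byte lanes in parallel, so
	 * the macro below assembles the 64x64 carry-less product from
	 * lane-wise products of 'ad' against byte-rotated copies of 'bd'
	 * (and of byte-rotated 'ad' against 'bd'), then masks, byte-shifts
	 * and XORs the partial products into position before folding them
	 * into 'rq' together with the direct product D = A*B.
	 */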
	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
	vext.8		t0l, \ad, \ad, #1	@ A1
	.ifc		\b1, t4l
	vext.8		t4l, \bd, \bd, #1	@ B1
	.endif
	vmull.p8	t0q, t0l, \bd		@ F = A1*B
	vext.8		t1l, \ad, \ad, #2	@ A2
	vmull.p8	t4q, \ad, \b1		@ E = A*B1
	.ifc		\b2, t3l
	vext.8		t3l, \bd, \bd, #2	@ B2
	.endif
	vmull.p8	t1q, t1l, \bd		@ H = A2*B
	vext.8		t2l, \ad, \ad, #3	@ A3
	vmull.p8	t3q, \ad, \b2		@ G = A*B2
	veor		t0q, t0q, t4q		@ L = E + F
	.ifc		\b3, t4l
	vext.8		t4l, \bd, \bd, #3	@ B3
	.endif
	vmull.p8	t2q, t2l, \bd		@ J = A3*B
	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
	veor		t1q, t1q, t3q		@ M = G + H
	.ifc		\b4, t3l
	vext.8		t3l, \bd, \bd, #4	@ B4
	.endif
	vmull.p8	t4q, \ad, \b3		@ I = A*B3
	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
	vmull.p8	t3q, \ad, \b4		@ K = A*B4
	vand		t0h, t0h, k48
	vand		t1h, t1h, k32
	veor		t2q, t2q, t4q		@ N = I + J
	veor		t0l, t0l, t0h
	veor		t1l, t1l, t1h
	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
	vand		t2h, t2h, k16
	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	t3h, #0
	vext.8		t0q, t0q, t0q, #15
	veor		t2l, t2l, t2h
	vext.8		t1q, t1q, t1q, #14
	vmull.p8	\rq, \ad, \bd		@ D = A*B
	vext.8		t2q, t2q, t2q, #13
	vext.8		t3q, t3q, t3q, #12
	veor		t0q, t0q, t1q
	veor		t2q, t2q, t3q
	veor		\rq, \rq, t0q
	veor		\rq, \rq, t2q
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
	.macro		__pmull_reduce_p64
	vmull.p64	T1, XL_L, MASK

	veor		XH_L, XH_L, XM_H
	vext.8		T1, T1, T1, #8
	veor		XL_H, XL_H, XM_L
	veor		T1, T1, XL

	vmull.p64	XL, T1_H, MASK
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
	.macro		__pmull_reduce_p8
	veor		XL_H, XL_H, XM_L
	veor		XH_L, XH_L, XM_H

	vshl.i64	T1, XL, #57
	vshl.i64	T2, XL, #62
	veor		T1, T1, T2
	vshl.i64	T2, XL, #63
	veor		T1, T1, T2
	veor		XL_H, XL_H, T1_L
	veor		XH_L, XH_L, T1_H

	vshr.u64	T1, XL, #1
	veor		XH, XH, XL
	veor		XL, XL, T1
	vshr.u64	T1, T1, #6
	vshr.u64	XL, XL, #1
	.endm

	.macro		ghash_update, pn
	vld1.64		{XL}, [r1]

	/* do the head block first, if supplied */
	ldr		ip, [sp]
	teq		ip, #0
	beq		0f
	vld1.64		{T1}, [ip]
	teq		r0, #0
	b		3f

0:	.ifc		\pn, p64
	tst		r0, #3			// skip until #blocks is a
	bne		2f			// round multiple of 4

	vld1.8		{XL2-XM2}, [r2]!
1:	vld1.8		{T3-T2}, [r2]!
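	// 4x aggregated GHASH (added note): fold the current digest into the
	// first block, multiply the four blocks by the precomputed powers of
	// H (HH4, HH3, HH, SHASH) and accumulate into XL/XM/XH, so that only
	// one reduction is performed per four blocks.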
	vrev64.8	XL2, XL2
	vrev64.8	XM2, XM2

	subs		r0, r0, #4

	vext.8		T1, XL2, XL2, #8
	veor		XL2_H, XL2_H, XL_L
	veor		XL, XL, T1

	vrev64.8	T3, T3
	vrev64.8	T1, T2

	vmull.p64	XH, HH4_H, XL_H			// a1 * b1
	veor		XL2_H, XL2_H, XL_H
	vmull.p64	XL, HH4_L, XL_L			// a0 * b0
	vmull.p64	XM, HH34_H, XL2_H		// (a1 + a0)(b1 + b0)

	vmull.p64	XH2, HH3_H, XM2_L		// a1 * b1
	veor		XM2_L, XM2_L, XM2_H
	vmull.p64	XL2, HH3_L, XM2_H		// a0 * b0
	vmull.p64	XM2, HH34_L, XM2_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, HH_H, T3_L			// a1 * b1
	veor		T3_L, T3_L, T3_H
	vmull.p64	XL2, HH_L, T3_H			// a0 * b0
	vmull.p64	XM2, SHASH2_H, T3_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, SHASH_H, T1_L		// a1 * b1
	veor		T1_L, T1_L, T1_H
	vmull.p64	XL2, SHASH_L, T1_H		// a0 * b0
	vmull.p64	XM2, SHASH2_p64, T1_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	beq		4f

	vld1.8		{XL2-XM2}, [r2]!

	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_p64

	veor		T1, T1, XH
	veor		XL, XL, T1

	b		1b
	.endif

2:	vld1.64		{T1}, [r2]!
	subs		r0, r0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
#ifndef CONFIG_CPU_BIG_ENDIAN
	vrev64.8	T1, T1
#endif
	vext.8		IN1, T1, T1, #8
	veor		T1_L, T1_L, XL_H
	veor		XL, XL, IN1

	__pmull_\pn	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
	veor		T1, T1, XL
	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)

4:	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_\pn

	veor		T1, T1, XH
	veor		XL, XL, T1

	bne		0b

	vst1.64		{XL}, [r1]
	bx		lr
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
ENTRY(pmull_ghash_update_p64)
	vld1.64		{SHASH}, [r3]!
	vld1.64		{HH}, [r3]!
	vld1.64		{HH3-HH4}, [r3]

	veor		SHASH2_p64, SHASH_L, SHASH_H
	veor		SHASH2_H, HH_L, HH_H
	veor		HH34_L, HH3_L, HH3_H
	veor		HH34_H, HH4_L, HH4_H

	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57

	ghash_update	p64
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
	vld1.64		{SHASH}, [r3]
	veor		SHASH2_p8, SHASH_L, SHASH_H

	vext.8		s1l, SHASH_L, SHASH_L, #1
	vext.8		s2l, SHASH_L, SHASH_L, #2
	vext.8		s3l, SHASH_L, SHASH_L, #3
	vext.8		s4l, SHASH_L, SHASH_L, #4
	vext.8		s1h, SHASH_H, SHASH_H, #1
	vext.8		s2h, SHASH_H, SHASH_H, #2
	vext.8		s3h, SHASH_H, SHASH_H, #3
	vext.8		s4h, SHASH_H, SHASH_H, #4

	vmov.i64	k16, #0xffff
	vmov.i64	k32, #0xffffffff
	vmov.i64	k48, #0xffffffffffff

	ghash_update	p8
ENDPROC(pmull_ghash_update_p8)