/*
 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

        SHASH           .req    q0
        T1              .req    q1
        XL              .req    q2
        XM              .req    q3
        XH              .req    q4
        IN1             .req    q4

        SHASH_L         .req    d0
        SHASH_H         .req    d1
        T1_L            .req    d2
        T1_H            .req    d3
        XL_L            .req    d4
        XL_H            .req    d5
        XM_L            .req    d6
        XM_H            .req    d7
        XH_L            .req    d8

        t0l             .req    d10
        t0h             .req    d11
        t1l             .req    d12
        t1h             .req    d13
        t2l             .req    d14
        t2h             .req    d15
        t3l             .req    d16
        t3h             .req    d17
        t4l             .req    d18
        t4h             .req    d19

        t0q             .req    q5
        t1q             .req    q6
        t2q             .req    q7
        t3q             .req    q8
        t4q             .req    q9
        T2              .req    q9

        s1l             .req    d20
        s1h             .req    d21
        s2l             .req    d22
        s2h             .req    d23
        s3l             .req    d24
        s3h             .req    d25
        s4l             .req    d26
        s4h             .req    d27

        MASK            .req    d28
        SHASH2_p8       .req    d28

        k16             .req    d29
        k32             .req    d30
        k48             .req    d31
        SHASH2_p64      .req    d31

        .text
        .fpu            crypto-neon-fp-armv8

        .macro          __pmull_p64, rd, rn, rm, b1, b2, b3, b4
        vmull.p64       \rd, \rn, \rm
        .endm
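
        /*
         * For reference: vmull.p64 performs a 64x64 -> 128 bit carry-less
         * (polynomial) multiplication. An illustrative C model of that
         * primitive (not part of this file; the helper name is ours):
         *
         *      static void clmul64(u64 a, u64 b, u64 r[2])
         *      {
         *              int i;
         *
         *              r[0] = r[1] = 0;
         *              for (i = 0; i < 64; i++)
         *                      if ((b >> i) & 1) {
         *                              r[0] ^= a << i;
         *                              if (i)
         *                                      r[1] ^= a >> (64 - i);
         *                      }
         *      }
         *
         * The __pmull_p8 macro below synthesises the same 64x64 -> 128 bit
         * result from 8x8 -> 16 bit vmull.p8 multiplies.
         */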

        /*
         * This implementation of 64x64 -> 128 bit polynomial multiplication
         * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
         * "Fast Software Polynomial Multiplication on ARM Processors Using
         * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
         * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
         *
         * It has been slightly tweaked for in-order performance, and to allow
         * 'rq' to overlap with 'ad' or 'bd'.
         */
        .macro          __pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
        vext.8          t0l, \ad, \ad, #1       @ A1
        .ifc            \b1, t4l
        vext.8          t4l, \bd, \bd, #1       @ B1
        .endif
        vmull.p8        t0q, t0l, \bd           @ F = A1*B
        vext.8          t1l, \ad, \ad, #2       @ A2
        vmull.p8        t4q, \ad, \b1           @ E = A*B1
        .ifc            \b2, t3l
        vext.8          t3l, \bd, \bd, #2       @ B2
        .endif
        vmull.p8        t1q, t1l, \bd           @ H = A2*B
        vext.8          t2l, \ad, \ad, #3       @ A3
        vmull.p8        t3q, \ad, \b2           @ G = A*B2
        veor            t0q, t0q, t4q           @ L = E + F
        .ifc            \b3, t4l
        vext.8          t4l, \bd, \bd, #3       @ B3
        .endif
        vmull.p8        t2q, t2l, \bd           @ J = A3*B
        veor            t0l, t0l, t0h           @ t0 = (L) (P0 + P1) << 8
        veor            t1q, t1q, t3q           @ M = G + H
        .ifc            \b4, t3l
        vext.8          t3l, \bd, \bd, #4       @ B4
        .endif
        vmull.p8        t4q, \ad, \b3           @ I = A*B3
        veor            t1l, t1l, t1h           @ t1 = (M) (P2 + P3) << 16
        vmull.p8        t3q, \ad, \b4           @ K = A*B4
        vand            t0h, t0h, k48
        vand            t1h, t1h, k32
        veor            t2q, t2q, t4q           @ N = I + J
        veor            t0l, t0l, t0h
        veor            t1l, t1l, t1h
        veor            t2l, t2l, t2h           @ t2 = (N) (P4 + P5) << 24
        vand            t2h, t2h, k16
        veor            t3l, t3l, t3h           @ t3 = (K) (P6 + P7) << 32
        vmov.i64        t3h, #0
        vext.8          t0q, t0q, t0q, #15
        veor            t2l, t2l, t2h
        vext.8          t1q, t1q, t1q, #14
        vmull.p8        \rq, \ad, \bd           @ D = A*B
        vext.8          t2q, t2q, t2q, #13
        vext.8          t3q, t3q, t3q, #12
        veor            t0q, t0q, t1q
        veor            t2q, t2q, t3q
        veor            \rq, \rq, t0q
        veor            \rq, \rq, t2q
        .endm

        //
        // PMULL (64x64->128) based reduction for CPUs that can do
        // it in a single instruction.
        //
        .macro          __pmull_reduce_p64
        vmull.p64       T1, XL_L, MASK

        veor            XH_L, XH_L, XM_H
        vext.8          T1, T1, T1, #8
        veor            XL_H, XL_H, XM_L
        veor            T1, T1, XL

        vmull.p64       XL, T1_H, MASK
        .endm

        //
        // Alternative reduction for CPUs that lack support for the
        // 64x64->128 PMULL instruction
        //
        .macro          __pmull_reduce_p8
        veor            XL_H, XL_H, XM_L
        veor            XH_L, XH_L, XM_H

        vshl.i64        T1, XL, #57
        vshl.i64        T2, XL, #62
        veor            T1, T1, T2
        vshl.i64        T2, XL, #63
        veor            T1, T1, T2
        veor            XL_H, XL_H, T1_L
        veor            XH_L, XH_L, T1_H

        vshr.u64        T1, XL, #1
        veor            XH, XH, XL
        veor            XL, XL, T1
        vshr.u64        T1, T1, #6
        vshr.u64        XL, XL, #1
        .endm

        .macro          ghash_update, pn
        vld1.64         {XL}, [r1]

        /* do the head block first, if supplied */
        ldr             ip, [sp]
        teq             ip, #0
        beq             0f
        vld1.64         {T1}, [ip]
        teq             r0, #0
        b               1f

0:      vld1.64         {T1}, [r2]!
        subs            r0, r0, #1

1:      /* multiply XL by SHASH in GF(2^128) */
#ifndef CONFIG_CPU_BIG_ENDIAN
        vrev64.8        T1, T1
#endif
        vext.8          IN1, T1, T1, #8
        veor            T1_L, T1_L, XL_H
        veor            XL, XL, IN1

        __pmull_\pn     XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h   @ a1 * b1
        veor            T1, T1, XL
        __pmull_\pn     XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l   @ a0 * b0
        __pmull_\pn     XM, T1_L, SHASH2_\pn                    @ (a1+a0)(b1+b0)

        veor            T1, XL, XH
        veor            XM, XM, T1

        __pmull_reduce_\pn

        veor            T1, T1, XH
        veor            XL, XL, T1

        bne             0b

        vst1.64         {XL}, [r1]
        bx              lr
        .endm

        /*
         * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
         *                         struct ghash_key const *k, const char *head)
         */
ENTRY(pmull_ghash_update_p64)
        vld1.64         {SHASH}, [r3]
        veor            SHASH2_p64, SHASH_L, SHASH_H

        vmov.i8         MASK, #0xe1
        vshl.u64        MASK, MASK, #57

        ghash_update    p64
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
        vld1.64         {SHASH}, [r3]
        veor            SHASH2_p8, SHASH_L, SHASH_H

        vext.8          s1l, SHASH_L, SHASH_L, #1
        vext.8          s2l, SHASH_L, SHASH_L, #2
        vext.8          s3l, SHASH_L, SHASH_L, #3
        vext.8          s4l, SHASH_L, SHASH_L, #4
        vext.8          s1h, SHASH_H, SHASH_H, #1
        vext.8          s2h, SHASH_H, SHASH_H, #2
        vext.8          s3h, SHASH_H, SHASH_H, #3
        vext.8          s4h, SHASH_H, SHASH_H, #4

        vmov.i64        k16, #0xffff
        vmov.i64        k32, #0xffffffff
        vmov.i64        k48, #0xffffffffffff

        ghash_update    p8
ENDPROC(pmull_ghash_update_p8)
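
/*
 * Reference note (illustrative only): both entry points above compute the
 * standard GHASH recurrence
 *
 *      dg = (dg ^ block) * SHASH       in GF(2^128)
 *
 * for each 16-byte block, with the 256-bit product reduced modulo the GHASH
 * polynomial x^128 + x^7 + x^2 + x + 1. Per the AAPCS, the prototype above
 * maps to r0 = blocks, r1 = dg, r2 = src, r3 = k, with the optional 'head'
 * pointer loaded from the stack.
 */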