xref: /openbmc/linux/arch/arm/crypto/nh-neon-core.S (revision 8795a739)
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * NH - ε-almost-universal hash function, NEON accelerated version
4 *
5 * Copyright 2018 Google LLC
6 *
7 * Author: Eric Biggers <ebiggers@google.com>
8 */
9
10#include <linux/linkage.h>
11
12	.text
13	.fpu		neon
14
15	KEY		.req	r0
16	MESSAGE		.req	r1
17	MESSAGE_LEN	.req	r2
18	HASH		.req	r3
19
20	PASS0_SUMS	.req	q0
21	PASS0_SUM_A	.req	d0
22	PASS0_SUM_B	.req	d1
23	PASS1_SUMS	.req	q1
24	PASS1_SUM_A	.req	d2
25	PASS1_SUM_B	.req	d3
26	PASS2_SUMS	.req	q2
27	PASS2_SUM_A	.req	d4
28	PASS2_SUM_B	.req	d5
29	PASS3_SUMS	.req	q3
30	PASS3_SUM_A	.req	d6
31	PASS3_SUM_B	.req	d7
32	K0		.req	q4
33	K1		.req	q5
34	K2		.req	q6
35	K3		.req	q7
36	T0		.req	q8
37	T0_L		.req	d16
38	T0_H		.req	d17
39	T1		.req	q9
40	T1_L		.req	d18
41	T1_H		.req	d19
42	T2		.req	q10
43	T2_L		.req	d20
44	T2_H		.req	d21
45	T3		.req	q11
46	T3_L		.req	d22
47	T3_H		.req	d23
48
/*
 * _nh_stride k0, k1, k2, k3
 *
 * Feed one 16-byte message stride into all four NH passes.
 *
 * On entry k0, k1 and k2 hold three consecutive key strides.  The macro reads
 * 16 more key bytes from KEY into k3, so k3 is an output as well as an input.
 * Pass n adds key stride k<n> to the four 32-bit message words, then multiplies
 * word 0 by word 2 and word 1 by word 3 (32x32 => 64) and adds each product to
 * its own 64-bit lane of PASS<n>_SUMS.
 *
 * MESSAGE and KEY are each advanced by 16 bytes.  T0-T3 are clobbered.
 */
.macro _nh_stride	k0, k1, k2, k3

	// Fetch 16 message bytes, then the key stride that completes the window
	vld1.8		{T3_L, T3_H}, [MESSAGE]!
	vld1.32		{\k3}, [KEY]!

	// Lane-wise 32-bit message + key for each pass.  T3 holds the message
	// and is also the pass 3 destination, so it has to be written last.
	vadd.i32	T0, T3, \k0
	vadd.i32	T1, T3, \k1
	vadd.i32	T2, T3, \k2
	vadd.i32	T3, T3, \k3

	// Unsigned widening multiply of the low-half words by the high-half
	// words, accumulated into the two 64-bit lanes of each pass
	vmlal.u32	PASS0_SUMS, T0_L, T0_H
	vmlal.u32	PASS1_SUMS, T1_L, T1_H
	vmlal.u32	PASS2_SUMS, T2_L, T2_H
	vmlal.u32	PASS3_SUMS, T3_L, T3_H
.endm
69
/*
 * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
 *		u8 hash[NH_HASH_BYTES])
 *
 * Run the four NH passes over 'message' and write the four 64-bit pass totals
 * to 'hash' (32 bytes in total, pass 0 first).
 *
 * The caller guarantees that message_len % 16 == 0.
 *
 * The message is consumed in 16-byte strides.  Each stride reads one further
 * key stride, so the key is read up to 48 bytes beyond the message length.
 * The K0-K3 arguments are rotated by one position on every _nh_stride call so
 * that the oldest key register is always the one that gets reloaded.
 *
 * Modifies r0-r2, the condition flags and q0-q11.
 *
 * NOTE(review): q4-q7 are callee-saved under the AAPCS and are not preserved
 * here; presumably every caller saves the NEON state around this call
 * (e.g. kernel_neon_begin()/kernel_neon_end()) - confirm against the callers.
 */
ENTRY(nh_neon)

	// Prime the key window with the first three strides (48 bytes) ...
	vld1.32		{K0,K1}, [KEY]!
	vld1.32		{K2}, [KEY]!

	// ... and clear the accumulators of all four passes
	vmov.i64	PASS0_SUMS, #0
	vmov.i64	PASS1_SUMS, #0
	vmov.i64	PASS2_SUMS, #0
	vmov.i64	PASS3_SUMS, #0

	// Bulk loop: four strides (64 message bytes) per iteration.  While in
	// this section MESSAGE_LEN holds the number of bytes left minus 64.
	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	blt		.Ltail
.Lbulk:
	_nh_stride	K0, K1, K2, K3
	_nh_stride	K1, K2, K3, K0
	_nh_stride	K2, K3, K0, K1
	_nh_stride	K3, K0, K1, K2
	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	bge		.Lbulk

.Ltail:
	// Masking with 63 undoes the bias of 64 and leaves the bytes still to
	// be hashed: 0, 16, 32 or 48 given the guarantee above
	ands		MESSAGE_LEN, MESSAGE_LEN, #63
	beq		.Lfinish
	_nh_stride	K0, K1, K2, K3

	// Exactly one stride was left?
	cmp		MESSAGE_LEN, #16
	beq		.Lfinish
	_nh_stride	K1, K2, K3, K0

	// Exactly two strides were left?
	cmp		MESSAGE_LEN, #32
	beq		.Lfinish
	_nh_stride	K2, K3, K0, K1

.Lfinish:
	// Fold the two 64-bit lanes of every pass into a single total, pack
	// the four totals into T0 and T1, and write them out as 32 bytes
	vadd.i64	T0_L, PASS0_SUM_A, PASS0_SUM_B
	vadd.i64	T0_H, PASS1_SUM_A, PASS1_SUM_B
	vadd.i64	T1_L, PASS2_SUM_A, PASS2_SUM_B
	vadd.i64	T1_H, PASS3_SUM_A, PASS3_SUM_B
	vst1.8		{T0_L, T0_H, T1_L, T1_H}, [HASH]
	bx		lr
ENDPROC(nh_neon)
117