xref: /openbmc/linux/arch/x86/crypto/aesni-intel_avx-x86_64.S (revision b4646da0573fae9dfa2b8f1f10936cb6eedd7230)
1########################################################################
2# Copyright (c) 2013, Intel Corporation
3#
4# This software is available to you under a choice of one of two
5# licenses.  You may choose to be licensed under the terms of the GNU
6# General Public License (GPL) Version 2, available from the file
7# COPYING in the main directory of this source tree, or the
8# OpenIB.org BSD license below:
9#
10# Redistribution and use in source and binary forms, with or without
11# modification, are permitted provided that the following conditions are
12# met:
13#
14# * Redistributions of source code must retain the above copyright
15#   notice, this list of conditions and the following disclaimer.
16#
17# * Redistributions in binary form must reproduce the above copyright
18#   notice, this list of conditions and the following disclaimer in the
19#   documentation and/or other materials provided with the
20#   distribution.
21#
22# * Neither the name of the Intel Corporation nor the names of its
23#   contributors may be used to endorse or promote products derived from
24#   this software without specific prior written permission.
25#
26#
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
28# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
31# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38########################################################################
39##
40## Authors:
41##	Erdinc Ozturk <erdinc.ozturk@intel.com>
42##	Vinodh Gopal <vinodh.gopal@intel.com>
43##	James Guilford <james.guilford@intel.com>
44##	Tim Chen <tim.c.chen@linux.intel.com>
45##
46## References:
47##       This code was derived and highly optimized from the code described in paper:
48##               Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation
49##			on Intel Architecture Processors. August, 2010
##       The details of the implementation are explained in:
51##               Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode
52##			on Intel Architecture Processors. October, 2012.
53##
54## Assumptions:
55##
56##
57##
58## iv:
59##       0                   1                   2                   3
60##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
61##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
62##       |                             Salt  (From the SA)               |
63##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
64##       |                     Initialization Vector                     |
65##       |         (This is the sequence number from IPSec header)       |
66##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
67##       |                              0x1                              |
68##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
69##
70##
71##
72## AAD:
73##       AAD padded to 128 bits with 0
74##       for example, assume AAD is a u32 vector
75##
76##       if AAD is 8 bytes:
##       AAD[3] = {A0, A1};
78##       padded AAD in xmm register = {A1 A0 0 0}
79##
80##       0                   1                   2                   3
81##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
82##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
83##       |                               SPI (A1)                        |
84##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
85##       |                     32-bit Sequence Number (A0)               |
86##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
87##       |                              0x0                              |
88##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
89##
90##                                       AAD Format with 32-bit Sequence Number
91##
92##       if AAD is 12 bytes:
##       AAD[3] = {A0, A1, A2};
94##       padded AAD in xmm register = {A2 A1 A0 0}
95##
96##       0                   1                   2                   3
97##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
98##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
99##       |                               SPI (A2)                        |
100##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
101##       |                 64-bit Extended Sequence Number {A1,A0}       |
102##       |                                                               |
103##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
104##       |                              0x0                              |
105##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
106##
107##        AAD Format with 64-bit Extended Sequence Number
108##
109##
110## aadLen:
111##       from the definition of the spec, aadLen can only be 8 or 12 bytes.
112##	 The code additionally supports aadLen of length 16 bytes.
113##
114## TLen:
115##       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
116##
117## poly = x^128 + x^127 + x^126 + x^121 + 1
## throughout the code, one-tab and two-tab indentations are used: one tab is
## for the GHASH part, two tabs are for the AES part.
120##
121
122#include <linux/linkage.h>
123
# Constants live in mergeable sections so the linker can reorder and merge them.

# POLY: ANDed into the carry fix-up when INIT derives HashKey<<1 mod poly.
.section	.rodata.cst16.POLY, "aM", @progbits, 16
.balign 16
POLY:
	.octa	0xC2000000000000000000000000000001

# POLY2: not referenced in the part of the file shown here.
.section	.rodata.cst16.POLY2, "aM", @progbits, 16
.balign 16
POLY2:
	.octa	0xC20000000000000000000001C2000000

# TWOONE: compared (vpcmpeqd) against the shifted-out carry bits in INIT.
.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
.balign 16
TWOONE:
	.octa	0x00000001000000000000000000000001

# SHUF_MASK: vpshufb control that reverses the 16 bytes of a register.
.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.balign 16
SHUF_MASK:
	.octa	0x000102030405060708090A0B0C0D0E0F

# ONE: added to the counter block with vpaddd to step the counter by one.
.section	.rodata.cst16.ONE, "aM", @progbits, 16
.balign 16
ONE:
	.octa	0x00000000000000000000000000000001

# ONEf: not referenced in the part of the file shown here.
.section	.rodata.cst16.ONEf, "aM", @progbits, 16
.balign 16
ONEf:
	.octa	0x01000000000000000000000000000000

# The order of the next three 16-byte blocks must not change: ALL_F has to
# follow SHIFT_MASK directly and the zero block has to follow ALL_F, because
# the code loads 16 bytes from addresses that slide across these neighbours
# (for example ALL_F-SHIFT_MASK(%r12) with r12 pointing into SHIFT_MASK).
.section	.rodata, "a", @progbits
.balign 16
SHIFT_MASK:
	.octa	0x0f0e0d0c0b0a09080706050403020100
ALL_F:
	.octa	0xffffffffffffffffffffffffffffffff
	.octa	0x00000000000000000000000000000000
156
157.text
158
159
/*
 * Byte offsets of the per-request fields inside the context block that the
 * macros address through arg2.  INIT fills them in; the names in brackets
 * are the ones used in the comments of INIT.
 */
/* running GHASH value */
#define AadHash 16*0
/* AAD length in bytes [ctx_data.aad_length] */
#define AadLen 16*1
/* bytes of text processed so far [ctx_data.in_length] */
#define InLen (16*1)+8
/* E(K, Yn) kept for an unfinished block [ctx_data.partial_block_enc_key] */
#define PBlockEncKey 16*2
/* IV block exactly as supplied [ctx_data.orig_IV] */
#define OrigIV 16*3
/* current counter block [ctx_data.current_counter] */
#define CurCount 16*4
/* bytes held in the unfinished block [ctx_data.partial_block_length] */
#define PBlockLen 16*5
167
# Context offsets (relative to arg2) of the hash-key material.
# Slots 6..13 hold successive powers of the hash key, each <<1 mod poly.
.set	HashKey,	16*6	# HashKey   <<1 mod poly
.set	HashKey_2,	16*7	# HashKey^2 <<1 mod poly
.set	HashKey_3,	16*8	# HashKey^3 <<1 mod poly
.set	HashKey_4,	16*9	# HashKey^4 <<1 mod poly
.set	HashKey_5,	16*10	# HashKey^5 <<1 mod poly
.set	HashKey_6,	16*11	# HashKey^6 <<1 mod poly
.set	HashKey_7,	16*12	# HashKey^7 <<1 mod poly
.set	HashKey_8,	16*13	# HashKey^8 <<1 mod poly
# Slots 14..21 hold the XOR values of the same powers (for Karatsuba purposes).
.set	HashKey_k,	16*14	# XOR of HashKey   <<1 mod poly
.set	HashKey_2_k,	16*15	# XOR of HashKey^2 <<1 mod poly
.set	HashKey_3_k,	16*16	# XOR of HashKey^3 <<1 mod poly
.set	HashKey_4_k,	16*17	# XOR of HashKey^4 <<1 mod poly
.set	HashKey_5_k,	16*18	# XOR of HashKey^5 <<1 mod poly
.set	HashKey_6_k,	16*19	# XOR of HashKey^6 <<1 mod poly
.set	HashKey_7_k,	16*20	# XOR of HashKey^7 <<1 mod poly
.set	HashKey_8_k,	16*21	# XOR of HashKey^8 <<1 mod poly
184
/*
 * The six integer argument registers of the System V AMD64 calling
 * convention, in argument order.
 */
#define arg1 %rdi
#define arg2 %rsi
#define arg3 %rdx
#define arg4 %rcx
#define arg5 %r8
#define arg6 %r9
/*
 * Memory operand at byte offset 2*15*16 (480) from arg1.
 * NOTE(review): presumably the key-length field that follows the round keys
 * in the AES key structure -- confirm against the C-side definition.
 */
#define keysize 2*15*16(arg1)
192
# Assembly-time counters; the macros reassign them while they expand.
.set	i, 0
.set	j, 0

# Values passed for the ordering argument of the 8-block macro.
.set	out_order, 0
.set	in_order, 1

# Values passed for the ENC_DEC macro argument.
.set	DEC, 0
.set	ENC, 1
200
# define_reg r n: bind the assembler symbol reg_<r> to register %xmm<n>.
#   r  suffix of the symbol to define (setreg passes i or j)
#   n  xmm register number
.macro define_reg r n
reg_\r = %xmm\n
.endm
204
# setreg: refresh reg_i and reg_j from the current values of the assembly-time
# counters i and j.  Under .altmacro a leading % makes the assembler pass the
# numeric value of the expression, so reg_i becomes %xmm<i> and reg_j becomes
# %xmm<j>.  Alternate macro mode is switched off again before returning.
.macro setreg
.altmacro
define_reg i %i
define_reg j %j
.noaltmacro
.endm
211
# Offsets of the scratch slots in the stack area reserved by FUNC_SAVE.
.set	TMP1, 16*0	# temporary storage for AAD
.set	TMP2, 16*1	# temporary storage for AES State 2 (State 1 stays in an XMM register)
.set	TMP3, 16*2	# temporary storage for AES State 3
.set	TMP4, 16*3	# temporary storage for AES State 4
.set	TMP5, 16*4	# temporary storage for AES State 5
.set	TMP6, 16*5	# temporary storage for AES State 6
.set	TMP7, 16*6	# temporary storage for AES State 7
.set	TMP8, 16*7	# temporary storage for AES State 8

# Total size of the scratch area (eight 16-byte slots).
.set	VARIABLE_OFFSET, 16*8
222
223################################
224# Utility Macros
225################################
226
# FUNC_SAVE: function prologue.  Saves r12, r13, r15 and rbp, keeps the
# incoming stack pointer in rbp, then reserves VARIABLE_OFFSET bytes of
# scratch space and aligns rsp down to a 64-byte boundary.
# Must be paired with FUNC_RESTORE, which pops in the reverse order.
.macro FUNC_SAVE
	pushq	%r12
	pushq	%r13
	pushq	%r15

	pushq	%rbp
	movq	%rsp, %rbp			# rbp remembers rsp for FUNC_RESTORE

	subq	$VARIABLE_OFFSET, %rsp		# room for the TMP1..TMP8 slots
	andq	$-64, %rsp			# align rsp to 64 bytes
.endm
238
# FUNC_RESTORE: function epilogue matching FUNC_SAVE.  Drops the aligned
# scratch area by reloading rsp from rbp, then restores rbp, r15, r13 and r12.
.macro FUNC_RESTORE
	movq	%rbp, %rsp			# discard scratch area and alignment padding
	popq	%rbp

	popq	%r15
	popq	%r13
	popq	%r12
.endm
247
# ENCRYPT_SINGLE_BLOCK: AES-encrypt the single block held in XMM0, in place.
#   REP   number of vaesenc rounds emitted between the initial key XOR and
#         the final vaesenclast
#   XMM0  xmm register holding the block (input and output)
# Round keys are read from consecutive 16-byte slots starting at (arg1).
# Leaves the assembly-time counter i at REP+1 and refreshes reg_i / reg_j.
.macro ENCRYPT_SINGLE_BLOCK REP XMM0
		vpxor		(arg1), \XMM0, \XMM0		# XOR with round key 0
		.set		i, 1
		setreg
.rept \REP
		vaesenc		16*i(arg1), \XMM0, \XMM0	# round i
		.set		i, i+1
		setreg
.endr
		vaesenclast	16*i(arg1), \XMM0, \XMM0	# final round
.endm
260
# GCM_ENC_DEC: bulk step shared by the GCM encrypt and decrypt functions.
#   INITIAL_BLOCKS, GHASH_8_ENCRYPT_8_PARALLEL, GHASH_LAST_8, GHASH_MUL
#            names of the implementation macros to expand
#   ENC_DEC  ENC or DEC
#   REP      AES round repeat count (see ENCRYPT_SINGLE_BLOCK)
# Register use visible in this macro:
#   arg1 = round keys, arg2 = context block, arg3 = output pointer,
#   arg4 = input pointer, arg5 = number of bytes to process
#   r11  = byte offset into input/output, r13 = bytes left in whole blocks,
#   xmm9 = counter block, xmm14 = running hash written back to AadHash
# clobbering all xmm registers
# clobbering r10, r11, r12, r13, r15, rax
.macro  GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
        vmovdqu AadHash(arg2), %xmm8
        vmovdqu  HashKey(arg2), %xmm13      # xmm13 = HashKey
        add arg5, InLen(arg2)               # account for this call in the running text length

        # initialize the data pointer offset as zero
        xor     %r11d, %r11d

        # finish a block left incomplete by an earlier call; r11 comes back
        # as the number of bytes that consumed
        PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
        sub %r11, arg5                      # arg5 = bytes still to process

        mov     arg5, %r13                  # save the number of bytes of plaintext/ciphertext
        and     $-16, %r13                  # r13 = r13 - (r13 mod 16)

        # r12 = (number of whole blocks) mod 8; that many blocks are handled
        # by INITIAL_BLOCKS before the 8-block loop
        mov     %r13, %r12
        shr     $4, %r12
        and     $7, %r12
        jz      .L_initial_num_blocks_is_0\@

        cmp     $7, %r12
        je      .L_initial_num_blocks_is_7\@
        cmp     $6, %r12
        je      .L_initial_num_blocks_is_6\@
        cmp     $5, %r12
        je      .L_initial_num_blocks_is_5\@
        cmp     $4, %r12
        je      .L_initial_num_blocks_is_4\@
        cmp     $3, %r12
        je      .L_initial_num_blocks_is_3\@
        cmp     $2, %r12
        je      .L_initial_num_blocks_is_2\@

        jmp     .L_initial_num_blocks_is_1\@

        # Each case expands INITIAL_BLOCKS with a literal block count and then
        # removes those blocks from r13.
        # NOTE(review): INITIAL_BLOCKS is defined outside this view; it
        # presumably also advances r11 and leaves the hash in xmm14 -- confirm.
.L_initial_num_blocks_is_7\@:
        \INITIAL_BLOCKS  \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*7, %r13
        jmp     .L_initial_blocks_encrypted\@

.L_initial_num_blocks_is_6\@:
        \INITIAL_BLOCKS  \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*6, %r13
        jmp     .L_initial_blocks_encrypted\@

.L_initial_num_blocks_is_5\@:
        \INITIAL_BLOCKS  \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*5, %r13
        jmp     .L_initial_blocks_encrypted\@

.L_initial_num_blocks_is_4\@:
        \INITIAL_BLOCKS  \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*4, %r13
        jmp     .L_initial_blocks_encrypted\@

.L_initial_num_blocks_is_3\@:
        \INITIAL_BLOCKS  \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*3, %r13
        jmp     .L_initial_blocks_encrypted\@

.L_initial_num_blocks_is_2\@:
        \INITIAL_BLOCKS  \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*2, %r13
        jmp     .L_initial_blocks_encrypted\@

.L_initial_num_blocks_is_1\@:
        \INITIAL_BLOCKS  \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*1, %r13
        jmp     .L_initial_blocks_encrypted\@

.L_initial_num_blocks_is_0\@:
        \INITIAL_BLOCKS  \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC


.L_initial_blocks_encrypted\@:
        # r13 is now a multiple of 128; nothing left means no 8-block work
        test    %r13, %r13
        je      .L_zero_cipher_left\@

        # exactly 128 bytes left: skip the loop and go straight to GHASH_LAST_8
        sub     $128, %r13
        je      .L_eight_cipher_left\@




        # r15d tracks the low byte of the counter so the loop can tell when
        # adding 8 would carry out of that byte
        vmovd   %xmm9, %r15d
        and     $255, %r15d
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9


.L_encrypt_by_8_new\@:
        # low counter byte above 255-8: take the in_order variant below
        cmp     $(255-8), %r15d
        jg      .L_encrypt_by_8\@



        add     $8, %r15b
        \GHASH_8_ENCRYPT_8_PARALLEL      \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
        add     $128, %r11
        sub     $128, %r13
        jne     .L_encrypt_by_8_new\@

        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        jmp     .L_eight_cipher_left\@

.L_encrypt_by_8\@:
        # byte-swap the counter around the in_order expansion, then swap back
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        add     $8, %r15b
        \GHASH_8_ENCRYPT_8_PARALLEL      \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        add     $128, %r11
        sub     $128, %r13
        jne     .L_encrypt_by_8_new\@

        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9




.L_eight_cipher_left\@:
        \GHASH_LAST_8    %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8


.L_zero_cipher_left\@:
        # persist hash and counter so a later call can continue from here
        vmovdqu %xmm14, AadHash(arg2)
        vmovdqu %xmm9, CurCount(arg2)

        # check for 0 length
        mov     arg5, %r13
        and     $15, %r13                            # r13 = (arg5 mod 16)

        je      .L_multiple_of_16_bytes\@

        # handle the last <16 Byte block separately

        mov %r13, PBlockLen(arg2)                    # remember the partial length for the next call / GCM_COMPLETE

        vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
        vmovdqu %xmm9, CurCount(arg2)
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9

        ENCRYPT_SINGLE_BLOCK    \REP, %xmm9                # E(K, Yn)
        vmovdqu %xmm9, PBlockEncKey(arg2)            # keep E(K, Yn) for PARTIAL_BLOCK in a later call

        # fewer than 16 bytes in total: a 16-byte load ending at the end of
        # the data would start before the buffer, so read byte-wise instead
        cmp $16, arg5
        jge .L_large_enough_update\@

        lea (arg4,%r11,1), %r10
        mov %r13, %r12

        READ_PARTIAL_BLOCK %r10 %r12 %xmm1

        lea     SHIFT_MASK+16(%rip), %r12
        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
                                                     # able to shift 16-r13 bytes (r13 is the
                                                     # number of bytes in plaintext mod 16)

        jmp .L_final_ghash_mul\@

.L_large_enough_update\@:
        # step r11 back so the 16-byte load ends exactly at the end of the data
        sub $16, %r11
        add %r13, %r11

        # receive the last <16 Byte block
        vmovdqu	(arg4, %r11, 1), %xmm1

        # restore r11 to the start of the trailing partial block
        sub	%r13, %r11
        add	$16, %r11

        lea	SHIFT_MASK+16(%rip), %r12
        # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
        # (r13 is the number of bytes in plaintext mod 16)
        sub	%r13, %r12
        # get the appropriate shuffle mask
        vmovdqu	(%r12), %xmm2
        # shift right 16-r13 bytes
        vpshufb  %xmm2, %xmm1, %xmm1

.L_final_ghash_mul\@:
        .if  \ENC_DEC ==  DEC
        vmovdqa %xmm1, %xmm2                         # keep the ciphertext: it is what gets hashed
        vpxor   %xmm1, %xmm9, %xmm9                  # Ciphertext XOR E(K, Yn)
        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
                                                     # mask out top 16-r13 bytes of xmm9
        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
        vpand   %xmm1, %xmm2, %xmm2
        vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
        vpxor   %xmm2, %xmm14, %xmm14

        vmovdqu %xmm14, AadHash(arg2)
        .else
        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
                                                     # mask out top 16-r13 bytes of xmm9
        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        vpxor   %xmm9, %xmm14, %xmm14

        vmovdqu %xmm14, AadHash(arg2)
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # shuffle xmm9 back to output as ciphertext
        .endif


        #############################
        # output r13 Bytes
        vmovq   %xmm9, %rax
        cmp     $8, %r13
        jle     .L_less_than_8_bytes_left\@

        # more than 8 bytes: store the low quadword, continue with the high one
        mov     %rax, (arg3 , %r11)
        add     $8, %r11
        vpsrldq $8, %xmm9, %xmm9
        vmovq   %xmm9, %rax
        sub     $8, %r13

.L_less_than_8_bytes_left\@:
        # store the remaining bytes one at a time, lowest byte first
        movb    %al, (arg3 , %r11)
        add     $1, %r11
        shr     $8, %rax
        sub     $1, %r13
        jne     .L_less_than_8_bytes_left\@
        #############################

.L_multiple_of_16_bytes\@:
.endm
487
488
# GCM_COMPLETE: finish the tag.  Hashes a still-pending partial block, mixes
# in the length block len(A)||len(C), encrypts the original IV block and
# XORs the two, then stores AUTH_TAG_LEN bytes of the result.
#   GHASH_MUL     name of the GHASH multiply macro to expand
#   REP           AES round repeat count (see ENCRYPT_SINGLE_BLOCK)
#   AUTH_TAG      operand holding the destination pointer for the tag
#   AUTH_TAG_LEN  operand holding the tag length in bytes
# Output: Authorization Tag (AUTH_TAG)
# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
	vmovdqu	HashKey(arg2), %xmm13
	vmovdqu	AadHash(arg2), %xmm14

	movq	PBlockLen(arg2), %r12
	testq	%r12, %r12
	jz	.L_gc_length_block_\@

	# a partial block was XORed into the hash but not multiplied yet
	\GHASH_MUL	%xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6

.L_gc_length_block_\@:
	movq	AadLen(arg2), %r12		# r12 = aadLen (number of bytes)
	shlq	$3, %r12			# bytes -> bits
	vmovd	%r12d, %xmm15			# len(A) in xmm15

	movq	InLen(arg2), %r12
	shlq	$3, %r12			# len(C) in bits (bytes * 8)
	vmovq	%r12, %xmm1
	vpslldq	$8, %xmm15, %xmm15		# xmm15 = len(A)|| 0x0000000000000000
	vpxor	%xmm1, %xmm15, %xmm15		# xmm15 = len(A)||len(C)

	vpxor	%xmm15, %xmm14, %xmm14
	\GHASH_MUL	%xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6	# final GHASH computation
	vpshufb	SHUF_MASK(%rip), %xmm14, %xmm14	# perform a 16Byte swap

	vmovdqu	OrigIV(arg2), %xmm9

	ENCRYPT_SINGLE_BLOCK	\REP, %xmm9	# E(K, Y0)

	vpxor	%xmm14, %xmm9, %xmm9		# xmm9 = tag

	# ---- store the tag ----
	mov	\AUTH_TAG, %r10			# r10 = authTag
	mov	\AUTH_TAG_LEN, %r11		# r11 = auth_tag_len

	cmpq	$16, %r11
	je	.L_gc_tag_16_\@

	cmpq	$8, %r11
	jl	.L_gc_tag_4_\@

	# 8 bytes or more: store the low quadword and shift it out
	vmovq	%xmm9, %rax
	movq	%rax, (%r10)
	vpsrldq	$8, %xmm9, %xmm9
	addq	$8, %r10
	subq	$8, %r11
	testq	%r11, %r11
	jz	.L_gc_tag_done_\@
.L_gc_tag_4_\@:
	# next 4 bytes
	vmovd	%xmm9, %eax
	movl	%eax, (%r10)
	vpsrldq	$4, %xmm9, %xmm9
	addq	$4, %r10
	subq	$4, %r11
	testq	%r11, %r11
	jz	.L_gc_tag_done_\@

	# 1 to 3 bytes remain
	vmovd	%xmm9, %eax
	cmpq	$2, %r11
	jl	.L_gc_tag_1_\@
	movw	%ax, (%r10)
	cmpq	$2, %r11
	je	.L_gc_tag_done_\@
	addq	$2, %r10
	sarl	$16, %eax
.L_gc_tag_1_\@:
	movb	%al, (%r10)
	jmp	.L_gc_tag_done_\@

.L_gc_tag_16_\@:
	vmovdqu	%xmm9, (%r10)

.L_gc_tag_done_\@:
.endm
570
# CALC_AAD_HASH: GHASH the additional authenticated data and store the result
# in AadHash(arg2).
#   GHASH_MUL  name of the GHASH multiply macro to expand
#   AAD        operand holding the AAD pointer
#   AADLEN     operand holding the AAD length in bytes
#   T2         xmm register holding the hash key (passed to GHASH_MUL as HK)
#   T1, T3-T6  xmm scratch registers for GHASH_MUL
#   T7, T8     xmm working registers; T7 holds the stored hash on exit
# Clobbers rax, r10, r11 and the flags.
#
# The trailing 1..15 bytes are fetched with READ_PARTIAL_BLOCK so that no
# byte beyond AAD+AADLEN is ever read.  The earlier code read 4/8-byte chunks
# past the end of the AAD and relied on other data following it in memory.
.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8

	mov     \AAD, %r10                      # r10 = AAD read pointer
	mov     \AADLEN, %r11                   # r11 = AAD bytes still to hash

	vpxor   \T8, \T8, \T8                   # T8 = running hash, starts at zero
	cmp     $16, %r11
	jb      .L_get_AAD_rest\@               # unsigned: AADLEN is a byte count

	# hash the whole 16-byte blocks
.L_get_AAD_blocks\@:
	vmovdqu (%r10), \T7
	vpshufb SHUF_MASK(%rip), \T7, \T7       # byte-reflect the AAD block
	vpxor   \T7, \T8, \T8
	\GHASH_MUL       \T8, \T2, \T1, \T3, \T4, \T5, \T6
	add     $16, %r10
	sub     $16, %r11
	cmp     $16, %r11
	jae     .L_get_AAD_blocks\@

.L_get_AAD_rest\@:
	vmovdqa \T8, \T7                        # result so far, in case nothing is left
	test    %r11, %r11
	jz      .L_get_AAD_done\@

	# 0 < r11 < 16: load exactly r11 bytes into the low bytes of T7,
	# zero padded (clobbers rax and r11)
	READ_PARTIAL_BLOCK %r10 %r11 \T7
	vpshufb SHUF_MASK(%rip), \T7, \T7       # byte-reflect the padded block
	vpxor   \T8, \T7, \T7
	\GHASH_MUL       \T7, \T2, \T1, \T3, \T4, \T5, \T6

.L_get_AAD_done\@:
        vmovdqu \T7, AadHash(arg2)
.endm
640
# INIT: set up the context block for a new GCM operation.
#   GHASH_MUL   name of the GHASH multiply macro (handed to CALC_AAD_HASH)
#   PRECOMPUTE  name of the macro that derives further hash-key values
# Register inputs as used below:
#   arg2 = context block, arg3 = pointer to the 16-byte IV block,
#   arg4 = pointer to the hash key, arg5 = AAD pointer, arg6 = AAD length
# Writes AadLen, InLen, PBlockLen, PBlockEncKey (low 8 bytes), OrigIV,
# CurCount, HashKey and (through CALC_AAD_HASH) AadHash.
# Only VEX-encoded vector instructions are used so that no legacy-SSE
# encoding is mixed into the AVX code path.
.macro INIT GHASH_MUL PRECOMPUTE
        mov arg6, %r11
        mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length
        xor %r11d, %r11d
        mov %r11, InLen(arg2) # ctx_data.in_length = 0

        mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0
        mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0
        mov arg3, %rax
        vmovdqu (%rax), %xmm0
        vmovdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv

        vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
        vmovdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv (byte-swapped)

        vmovdqu  (arg4), %xmm6              # xmm6 = HashKey

        vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
        vmovdqa  %xmm6, %xmm2
        vpsllq   $1, %xmm6, %xmm6           # shift each quadword left by one
        vpsrlq   $63, %xmm2, %xmm2          # the bit shifted out of each quadword
        vmovdqa  %xmm2, %xmm1
        vpslldq  $8, %xmm2, %xmm2           # carry of the low quadword moves into the high one
        vpsrldq  $8, %xmm1, %xmm1           # carry out of the high quadword
        vpor     %xmm2, %xmm6, %xmm6
        #reduction
        vpshufd  $0b00100100, %xmm1, %xmm2
        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
        vpand    POLY(%rip), %xmm2, %xmm2   # POLY where the compare matched, zero elsewhere
        vpxor    %xmm2, %xmm6, %xmm6        # xmm6 holds the HashKey<<1 mod poly
        #######################################################################
        vmovdqu  %xmm6, HashKey(arg2)       # store HashKey<<1 mod poly

        CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0

        \PRECOMPUTE  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
.endm
679
680
# READ_PARTIAL_BLOCK: load DLEN bytes (0 < DLEN < 16) starting at DPTR into
# the low bytes of XMMDst; the remaining bytes of XMMDst are zero.  Memory at
# or beyond DPTR+DLEN is never read.
#   DPTR    64-bit register holding the source address
#   DLEN    64-bit register holding the byte count (destroyed)
#   XMMDst  destination xmm register
# Clobbers %rax, DLEN
.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
	vpxor	\XMMDst, \XMMDst, \XMMDst

	cmp	$8, \DLEN
	jl	.L_rpb_below8_\@

	# 8..15 bytes: the first eight go straight into the low quadword
	movq	(\DPTR), %rax
	vpinsrq	$0, %rax, \XMMDst, \XMMDst
	sub	$8, \DLEN
	jz	.L_rpb_done_\@

	# collect the remaining 1..7 bytes in rax, last byte first
	xor	%eax, %eax
.L_rpb_collect_high_\@:
	shl	$8, %rax
	movb	7(\DPTR, \DLEN, 1), %al
	dec	\DLEN
	jnz	.L_rpb_collect_high_\@
	vpinsrq	$1, %rax, \XMMDst, \XMMDst
	jmp	.L_rpb_done_\@

.L_rpb_below8_\@:
	# 1..7 bytes: collect them in rax, last byte first
	xor	%eax, %eax
.L_rpb_collect_low_\@:
	shl	$8, %rax
	movb	-1(\DPTR, \DLEN, 1), %al
	dec	\DLEN
	jnz	.L_rpb_collect_low_\@
	vpinsrq	$0, %rax, \XMMDst, \XMMDst
.L_rpb_done_\@:
.endm
711
# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
# between update calls.  Does nothing when PBlockLen(arg2) is zero.
#   GHASH_MUL       name of the GHASH multiply macro to expand
#   CYPH_PLAIN_OUT  register holding the output pointer
#   PLAIN_CYPH_IN   register holding the input pointer
#   PLAIN_CYPH_LEN  register holding the number of input bytes
#   DATA_OFFSET     register holding the current offset into input and
#                   output; advanced by the number of bytes written
#   AAD_HASH        xmm register holding the running hash (updated)
#   ENC_DEC         ENC or DEC
# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
# All vector instructions are VEX encoded; no legacy-SSE encoding is mixed in.
.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
        AAD_HASH ENC_DEC
	mov	PBlockLen(arg2), %r13
	test	%r13, %r13
	je	.L_partial_block_done_\@	# Leave Macro if no partial blocks
	# Read in input data without over reading
	cmp	$16, \PLAIN_CYPH_LEN
	jl	.L_fewer_than_16_bytes_\@
	# At least 16 bytes available: one full load.  The address honours
	# DATA_OFFSET like the short read and the stores below do.
	vmovdqu	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %xmm1
	jmp	.L_data_read_\@

.L_fewer_than_16_bytes_\@:
	lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
	mov	\PLAIN_CYPH_LEN, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm1

	mov	PBlockLen(arg2), %r13

.L_data_read_\@:				# Finished reading in data

	vmovdqu	PBlockEncKey(arg2), %xmm9	# E(K, Yn) saved when the block was started
	vmovdqu	HashKey(arg2), %xmm13

	lea	SHIFT_MASK(%rip), %r12

	# adjust the shuffle mask pointer to be able to shift r13 bytes
	# (r13 is the number of bytes already held in the partial block)
	add	%r13, %r12
	vmovdqu	(%r12), %xmm2		# get the appropriate shuffle mask
	vpshufb	%xmm2, %xmm9, %xmm9		# shift right r13 bytes

.if  \ENC_DEC ==  DEC
	vmovdqa	%xmm1, %xmm3			# keep the ciphertext for hashing
	vpxor	%xmm1, %xmm9, %xmm9		# Cyphertext XOR E(K, Yn)

	mov	\PLAIN_CYPH_LEN, %r10
	add	%r13, %r10
	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
	sub	$16, %r10
	# Determine whether the partial block is not being filled and
	# shift mask accordingly (r10 is negative in that case)
	jge	.L_no_extra_mask_1_\@
	sub	%r10, %r12
.L_no_extra_mask_1_\@:

	vmovdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out bottom r13 bytes of xmm9
	vpand	%xmm1, %xmm9, %xmm9		# mask out bottom r13 bytes of xmm9

	vpand	%xmm1, %xmm3, %xmm3
	vmovdqa	SHUF_MASK(%rip), %xmm10
	vpshufb	%xmm10, %xmm3, %xmm3
	vpshufb	%xmm2, %xmm3, %xmm3
	vpxor	%xmm3, \AAD_HASH, \AAD_HASH

	test	%r10, %r10
	jl	.L_partial_incomplete_1_\@

	# GHASH computation for the last <16 Byte block
	\GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	xor	%eax,%eax

	mov	%rax, PBlockLen(arg2)		# block complete: nothing pending any more
	jmp	.L_dec_done_\@
.L_partial_incomplete_1_\@:
	add	\PLAIN_CYPH_LEN, PBlockLen(arg2)	# still incomplete: just grow it
.L_dec_done_\@:
	vmovdqu	\AAD_HASH, AadHash(arg2)
.else
	vpxor	%xmm1, %xmm9, %xmm9			# Plaintext XOR E(K, Yn)

	mov	\PLAIN_CYPH_LEN, %r10
	add	%r13, %r10
	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
	sub	$16, %r10
	# Determine whether the partial block is not being filled and
	# shift mask accordingly (r10 is negative in that case)
	jge	.L_no_extra_mask_2_\@
	sub	%r10, %r12
.L_no_extra_mask_2_\@:

	vmovdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out bottom r13 bytes of xmm9
	vpand	%xmm1, %xmm9, %xmm9

	vmovdqa	SHUF_MASK(%rip), %xmm1
	vpshufb	%xmm1, %xmm9, %xmm9
	vpshufb	%xmm2, %xmm9, %xmm9
	vpxor	%xmm9, \AAD_HASH, \AAD_HASH

	test	%r10, %r10
	jl	.L_partial_incomplete_2_\@

	# GHASH computation for the last <16 Byte block
	\GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	xor	%eax,%eax

	mov	%rax, PBlockLen(arg2)		# block complete: nothing pending any more
	jmp	.L_encode_done_\@
.L_partial_incomplete_2_\@:
	add	\PLAIN_CYPH_LEN, PBlockLen(arg2)	# still incomplete: just grow it
.L_encode_done_\@:
	vmovdqu	\AAD_HASH, AadHash(arg2)

	vmovdqa	SHUF_MASK(%rip), %xmm10
	# shuffle xmm9 back to output as ciphertext
	vpshufb	%xmm10, %xmm9, %xmm9
	vpshufb	%xmm2, %xmm9, %xmm9
.endif
	# output encrypted Bytes
	test	%r10, %r10
	jl	.L_partial_fill_\@
	mov	%r13, %r12
	mov	$16, %r13
	# Set r13 to be the number of bytes to write out
	sub	%r12, %r13
	jmp	.L_count_set_\@
.L_partial_fill_\@:
	mov	\PLAIN_CYPH_LEN, %r13
.L_count_set_\@:
	vmovdqa	%xmm9, %xmm0
	vmovq	%xmm0, %rax
	cmp	$8, %r13
	jle	.L_less_than_8_bytes_left_\@

	# more than 8 bytes: store the low quadword, continue with the high one
	mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
	add	$8, \DATA_OFFSET
	vpsrldq	$8, %xmm0, %xmm0
	vmovq	%xmm0, %rax
	sub	$8, %r13
.L_less_than_8_bytes_left_\@:
	# store the remaining bytes one at a time, lowest byte first
	movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
	add	$1, \DATA_OFFSET
	shr	$8, %rax
	sub	$1, %r13
	jne	.L_less_than_8_bytes_left_\@
.L_partial_block_done_\@:
.endm # PARTIAL_BLOCK
855
856###############################################################################
857# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
858# Input: A and B (128-bits each, bit-reflected)
859# Output: C = A*B*x mod poly, (i.e. >>1 )
860# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
861# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
862###############################################################################
863.macro  GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
        # Multiply GH by HK (carry-less) and reduce the 256-bit product back
        # to 128 bits; the result replaces GH.
        #   GH      - in/out: first operand, overwritten with the result
        #   HK      - second operand, only read
        #   T1..T5  - scratch xmm registers, all overwritten
        # The product is built Karatsuba style from three vpclmulqdq:
        # a1*b1, a0*b0 and (a1+a0)*(b1+b0), where a1/a0 and b1/b0 are the
        # high/low 64-bit halves of GH and HK ("+" meaning XOR).
        # Instruction order matters: GH is overwritten by a0*b0 only after
        # a1*b1 has been taken from it.
864
        # vpshufd with 0b01001110 swaps the two 64-bit halves, so the XOR
        # below leaves (hi ^ lo) in both halves of T2 and T3.
865        vpshufd         $0b01001110, \GH, \T2
866        vpshufd         $0b01001110, \HK, \T3
867        vpxor           \GH     , \T2, \T2      # T2 = (a1+a0)
868        vpxor           \HK     , \T3, \T3      # T3 = (b1+b0)
869
870        vpclmulqdq      $0x11, \HK, \GH, \T1    # T1 = a1*b1
871        vpclmulqdq      $0x00, \HK, \GH, \GH    # GH = a0*b0
872        vpclmulqdq      $0x00, \T3, \T2, \T2    # T2 = (a1+a0)*(b1+b0)
873        vpxor           \GH, \T2,\T2
874        vpxor           \T1, \T2,\T2            # T2 = a0*b1+a1*b0
875
        # Fold the middle term into the 256-bit product held in <T1:GH>.
876        vpslldq         $8, \T2,\T3             # T3 = T2 shifted left by 8 bytes (2 DWs)
877        vpsrldq         $8, \T2,\T2             # T2 = T2 shifted right by 8 bytes (2 DWs)
878        vpxor           \T3, \GH, \GH
879        vpxor           \T2, \T1, \T1           # <T1:GH> = GH x HK
880
881        #first phase of the reduction
882        vpslld  $31, \GH, \T2                   # packed (per dword) left shift << 31
883        vpslld  $30, \GH, \T3                   # packed (per dword) left shift << 30
884        vpslld  $25, \GH, \T4                   # packed (per dword) left shift << 25
885
886        vpxor   \T3, \T2, \T2                   # xor the shifted versions
887        vpxor   \T4, \T2, \T2
888
889        vpsrldq $4, \T2, \T5                    # T5 = T2 shifted right by 1 DW (kept for phase two)
890
891        vpslldq $12, \T2, \T2                   # shift-L T2 3 DWs
892        vpxor   \T2, \GH, \GH                   # first phase of the reduction complete
893
894        #second phase of the reduction
895
896        vpsrld  $1,\GH, \T2                     # packed (per dword) right shift >> 1
897        vpsrld  $2,\GH, \T3                     # packed (per dword) right shift >> 2
898        vpsrld  $7,\GH, \T4                     # packed (per dword) right shift >> 7
899        vpxor   \T3, \T2, \T2                   # xor the shifted versions
900        vpxor   \T4, \T2, \T2
901
902        vpxor   \T5, \T2, \T2                   # add the bits saved from phase one
903        vpxor   \T2, \GH, \GH
904        vpxor   \T1, \GH, \GH                   # the result is in GH
905
906
907.endm
908
909.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
        # Fill the hash-key table in the context addressed by arg2.
        # Starting from HK, each GHASH_MUL_AVX call multiplies the running
        # value in T5 by HK once more; the powers 2..8 are stored to
        # HashKey_2..HashKey_8.  For every power (including HK itself) the
        # matching *_k slot receives the XOR of its high and low 64-bit
        # halves, replicated in both halves.
        #   HK      - hash key, only read
        #   T1..T6  - scratch xmm registers, all overwritten
        # HashKey(arg2) itself is not written by this macro.
910
911        # HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
912        vmovdqa  \HK, \T5
913
        # vpshufd with 0b01001110 swaps the 64-bit halves; XOR with the
        # original therefore yields (hi ^ lo) in both halves.
914        vpshufd  $0b01001110, \T5, \T1
915        vpxor    \T5, \T1, \T1
916        vmovdqu  \T1, HashKey_k(arg2)
917
918        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^2<<1 mod poly
919        vmovdqu  \T5, HashKey_2(arg2)                    #  [HashKey_2] = HashKey^2<<1 mod poly
920        vpshufd  $0b01001110, \T5, \T1
921        vpxor    \T5, \T1, \T1
922        vmovdqu  \T1, HashKey_2_k(arg2)
923
924        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^3<<1 mod poly
925        vmovdqu  \T5, HashKey_3(arg2)
926        vpshufd  $0b01001110, \T5, \T1
927        vpxor    \T5, \T1, \T1
928        vmovdqu  \T1, HashKey_3_k(arg2)
929
930        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^4<<1 mod poly
931        vmovdqu  \T5, HashKey_4(arg2)
932        vpshufd  $0b01001110, \T5, \T1
933        vpxor    \T5, \T1, \T1
934        vmovdqu  \T1, HashKey_4_k(arg2)
935
936        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^5<<1 mod poly
937        vmovdqu  \T5, HashKey_5(arg2)
938        vpshufd  $0b01001110, \T5, \T1
939        vpxor    \T5, \T1, \T1
940        vmovdqu  \T1, HashKey_5_k(arg2)
941
942        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^6<<1 mod poly
943        vmovdqu  \T5, HashKey_6(arg2)
944        vpshufd  $0b01001110, \T5, \T1
945        vpxor    \T5, \T1, \T1
946        vmovdqu  \T1, HashKey_6_k(arg2)
947
948        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^7<<1 mod poly
949        vmovdqu  \T5, HashKey_7(arg2)
950        vpshufd  $0b01001110, \T5, \T1
951        vpxor    \T5, \T1, \T1
952        vmovdqu  \T1, HashKey_7_k(arg2)
953
954        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^8<<1 mod poly
955        vmovdqu  \T5, HashKey_8(arg2)
956        vpshufd  $0b01001110, \T5, \T1
957        vpxor    \T5, \T1, \T1
958        vmovdqu  \T1, HashKey_8_k(arg2)
959
960.endm
961
962## if a = number of total plaintext bytes
963## b = floor(a/16)
964## num_initial_blocks = b mod 8
965## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
966## r10, r11, r12, rax are clobbered
967## arg1, arg2, arg3, arg4 are used as pointers only, not modified
968
969.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
        # Stage A: CTR-encrypt num_initial_blocks blocks from (arg4,%r11) to
        # (arg3,%r11) and fold them into the running hash loaded from
        # AadHash(arg2).
        # Stage B (skipped when %r13 < 128): encrypt 8 further blocks into
        # XMM1..XMM8 and leave them byte-swapped for the main loop, with the
        # hash from stage A XORed into XMM1.
        #   REP       - number of vaesenc rounds between the initial key XOR
        #               and vaesenclast
        #   T2        - passed to GHASH_MUL_AVX as the hash key operand and
        #               never written here, so it must be loaded by the caller
        #   T1,T3..T6 - scratch xmm registers
        #   CTR       - counter block, loaded from CurCount(arg2) and advanced
        #               once per encrypted block; not stored back here
        #   T_key     - holds the current round key read from arg1
        #   ENC_DEC   - for DEC the input (ciphertext) is what gets hashed
        # %r11 is advanced by 16 per initial block and by 128 in stage B.
        # setreg is defined elsewhere in this file; presumably it rebinds
        # reg_i / reg_j to the xmm registers numbered i / j - TODO confirm.
        # NOTE(review): round keys are read with vmovdqa, so arg1 is assumed
        # to be 16-byte aligned; TMP1(%rsp) likewise - confirm in the caller.
970	i = (8-\num_initial_blocks)
971	setreg
972        vmovdqu AadHash(arg2), reg_i
973
974	# start AES for num_initial_blocks blocks
975	vmovdqu CurCount(arg2), \CTR
976
        # Generate one byte-swapped counter block per initial block, in
        # registers (9-num_initial_blocks) and up.
977	i = (9-\num_initial_blocks)
978	setreg
979.rep \num_initial_blocks
980                vpaddd  ONE(%rip), \CTR, \CTR		# INCR Y0
981                vmovdqa \CTR, reg_i
982                vpshufb SHUF_MASK(%rip), reg_i, reg_i   # perform a 16Byte swap
983	i = (i+1)
984	setreg
985.endr
986
        # AES round 0: XOR with the first round key.
987	vmovdqa  (arg1), \T_key
988	i = (9-\num_initial_blocks)
989	setreg
990.rep \num_initial_blocks
991                vpxor   \T_key, reg_i, reg_i
992	i = (i+1)
993	setreg
994.endr
995
        # REP middle rounds; j indexes the round key (16*j bytes into arg1).
996       j = 1
997       setreg
998.rep \REP
999       vmovdqa  16*j(arg1), \T_key
1000	i = (9-\num_initial_blocks)
1001	setreg
1002.rep \num_initial_blocks
1003        vaesenc \T_key, reg_i, reg_i
1004	i = (i+1)
1005	setreg
1006.endr
1007
1008       j = (j+1)
1009       setreg
1010.endr
1011
        # Final round; j now equals REP+1.
1012	vmovdqa  16*j(arg1), \T_key
1013	i = (9-\num_initial_blocks)
1014	setreg
1015.rep \num_initial_blocks
1016        vaesenclast      \T_key, reg_i, reg_i
1017	i = (i+1)
1018	setreg
1019.endr
1020
        # XOR the keystream with the input, store the output, and leave the
        # ciphertext (byte-swapped) in reg_i for hashing.
1021	i = (9-\num_initial_blocks)
1022	setreg
1023.rep \num_initial_blocks
1024                vmovdqu (arg4, %r11), \T1
1025                vpxor   \T1, reg_i, reg_i
1026                vmovdqu reg_i, (arg3 , %r11)           # write back ciphertext for num_initial_blocks blocks
1027                add     $16, %r11
1028.if  \ENC_DEC == DEC
1029                vmovdqa \T1, reg_i                     # decrypting: hash the input ciphertext, not the output
1030.endif
1031                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
1032	i = (i+1)
1033	setreg
1034.endr
1035
1036
        # Chain the hash: reg_i holds the running hash, reg_j the next block.
        # Each step computes reg_j = (reg_j ^ reg_i) * T2, then both move up.
1037	i = (8-\num_initial_blocks)
1038	j = (9-\num_initial_blocks)
1039	setreg
1040
1041.rep \num_initial_blocks
1042        vpxor    reg_i, reg_j, reg_j
1043        GHASH_MUL_AVX       reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
1044	i = (i+1)
1045	j = (j+1)
1046	setreg
1047.endr
1048        # XMM8 has the combined result here
1049
1050        vmovdqa  \XMM8, TMP1(%rsp)                       # saved; XORed into XMM1 at the end of stage B
1051        vmovdqa  \XMM8, \T3
1052
        # NOTE(review): %r13 is not set in this macro; presumably it is the
        # remaining byte count.  jl is a signed comparison - confirm the
        # caller never passes a value with the top bit set.
1053        cmp     $128, %r13
1054        jl      .L_initial_blocks_done\@                  # no need for precomputed constants
1055
1056###############################################################################
1057# Stage B: generate the next 8 counter blocks in XMM1..XMM8, byte-swapped
1058                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1059                vmovdqa  \CTR, \XMM1
1060                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
1061
1062                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1063                vmovdqa  \CTR, \XMM2
1064                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
1065
1066                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1067                vmovdqa  \CTR, \XMM3
1068                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
1069
1070                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1071                vmovdqa  \CTR, \XMM4
1072                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
1073
1074                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1075                vmovdqa  \CTR, \XMM5
1076                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
1077
1078                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1079                vmovdqa  \CTR, \XMM6
1080                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
1081
1082                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1083                vmovdqa  \CTR, \XMM7
1084                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
1085
1086                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1087                vmovdqa  \CTR, \XMM8
1088                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
1089
                # AES round 0 on all 8 blocks
1090                vmovdqa  (arg1), \T_key
1091                vpxor    \T_key, \XMM1, \XMM1
1092                vpxor    \T_key, \XMM2, \XMM2
1093                vpxor    \T_key, \XMM3, \XMM3
1094                vpxor    \T_key, \XMM4, \XMM4
1095                vpxor    \T_key, \XMM5, \XMM5
1096                vpxor    \T_key, \XMM6, \XMM6
1097                vpxor    \T_key, \XMM7, \XMM7
1098                vpxor    \T_key, \XMM8, \XMM8
1099
                # here i is reused as the round-key index
1100               i = 1
1101               setreg
1102.rep    \REP       # do REP rounds
1103                vmovdqa  16*i(arg1), \T_key
1104                vaesenc  \T_key, \XMM1, \XMM1
1105                vaesenc  \T_key, \XMM2, \XMM2
1106                vaesenc  \T_key, \XMM3, \XMM3
1107                vaesenc  \T_key, \XMM4, \XMM4
1108                vaesenc  \T_key, \XMM5, \XMM5
1109                vaesenc  \T_key, \XMM6, \XMM6
1110                vaesenc  \T_key, \XMM7, \XMM7
1111                vaesenc  \T_key, \XMM8, \XMM8
1112               i = (i+1)
1113               setreg
1114.endr
1115
                # final round with round key REP+1
1116                vmovdqa  16*i(arg1), \T_key
1117                vaesenclast  \T_key, \XMM1, \XMM1
1118                vaesenclast  \T_key, \XMM2, \XMM2
1119                vaesenclast  \T_key, \XMM3, \XMM3
1120                vaesenclast  \T_key, \XMM4, \XMM4
1121                vaesenclast  \T_key, \XMM5, \XMM5
1122                vaesenclast  \T_key, \XMM6, \XMM6
1123                vaesenclast  \T_key, \XMM7, \XMM7
1124                vaesenclast  \T_key, \XMM8, \XMM8
1125
                # For each of the 8 blocks: output = keystream ^ input; when
                # decrypting, keep the input ciphertext in XMMn for hashing.
1126                vmovdqu  (arg4, %r11), \T1
1127                vpxor    \T1, \XMM1, \XMM1
1128                vmovdqu  \XMM1, (arg3 , %r11)
1129                .if   \ENC_DEC == DEC
1130                vmovdqa  \T1, \XMM1
1131                .endif
1132
1133                vmovdqu  16*1(arg4, %r11), \T1
1134                vpxor    \T1, \XMM2, \XMM2
1135                vmovdqu  \XMM2, 16*1(arg3 , %r11)
1136                .if   \ENC_DEC == DEC
1137                vmovdqa  \T1, \XMM2
1138                .endif
1139
1140                vmovdqu  16*2(arg4, %r11), \T1
1141                vpxor    \T1, \XMM3, \XMM3
1142                vmovdqu  \XMM3, 16*2(arg3 , %r11)
1143                .if   \ENC_DEC == DEC
1144                vmovdqa  \T1, \XMM3
1145                .endif
1146
1147                vmovdqu  16*3(arg4, %r11), \T1
1148                vpxor    \T1, \XMM4, \XMM4
1149                vmovdqu  \XMM4, 16*3(arg3 , %r11)
1150                .if   \ENC_DEC == DEC
1151                vmovdqa  \T1, \XMM4
1152                .endif
1153
1154                vmovdqu  16*4(arg4, %r11), \T1
1155                vpxor    \T1, \XMM5, \XMM5
1156                vmovdqu  \XMM5, 16*4(arg3 , %r11)
1157                .if   \ENC_DEC == DEC
1158                vmovdqa  \T1, \XMM5
1159                .endif
1160
1161                vmovdqu  16*5(arg4, %r11), \T1
1162                vpxor    \T1, \XMM6, \XMM6
1163                vmovdqu  \XMM6, 16*5(arg3 , %r11)
1164                .if   \ENC_DEC == DEC
1165                vmovdqa  \T1, \XMM6
1166                .endif
1167
1168                vmovdqu  16*6(arg4, %r11), \T1
1169                vpxor    \T1, \XMM7, \XMM7
1170                vmovdqu  \XMM7, 16*6(arg3 , %r11)
1171                .if   \ENC_DEC == DEC
1172                vmovdqa  \T1, \XMM7
1173                .endif
1174
1175                vmovdqu  16*7(arg4, %r11), \T1
1176                vpxor    \T1, \XMM8, \XMM8
1177                vmovdqu  \XMM8, 16*7(arg3 , %r11)
1178                .if   \ENC_DEC == DEC
1179                vmovdqa  \T1, \XMM8
1180                .endif
1181
1182                add     $128, %r11
1183
1184                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
1185                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with the corresponding ciphertext
1186                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
1187                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
1188                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
1189                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
1190                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
1191                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
1192                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
1193
1194###############################################################################
1195
1196.L_initial_blocks_done\@:
1197
1198.endm
1199
1200# encrypt 8 blocks at a time
1201# ghash the 8 previously encrypted ciphertext blocks
1202# arg1, arg2, arg3, arg4 are used as pointers only, not modified
1203# r11 is the data offset value
1204.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
        # One main-loop step: AES-encrypt 8 new counter blocks while
        # GHASHing the 8 blocks passed in through XMM1..XMM8.  The AES
        # rounds and the carry-less multiplies are interleaved, so the
        # statement order below is deliberate.
        #   REP      - number of vaesenc rounds; rounds 1..9 are written out
        #              and .rep (REP-9) supplies the rest (so presumably
        #              REP >= 9 - TODO confirm against the callers)
        #   T1..T7   - scratch xmm registers
        #   CTR      - counter block, advanced by 8
        #   XMM1..8  - in: byte-swapped blocks to hash (XMM1 presumably has
        #              the running hash XORed in, as is done on exit);
        #              out: the next 8 byte-swapped blocks to hash, with the
        #              new hash value XORed into XMM1
        #   loop_idx - in_order: counters are incremented with ONE and then
        #              byte-swapped; otherwise incremented with ONEf and
        #              not swapped
        #   ENC_DEC  - selects where the output is stored and which data is
        #              left in XMM1..8 (DEC reloads the input ciphertext)
        # Round keys are read from arg1, hash-key powers from arg2, input
        # from (arg4,%r11); output goes to (arg3,%r11).  %r11 is not changed
        # here.  Stack slots TMP2..TMP8(%rsp) are overwritten.
1205
        # Park the blocks to be hashed: block 1 in T2, blocks 2..8 on the stack.
1206        vmovdqa \XMM1, \T2
1207        vmovdqa \XMM2, TMP2(%rsp)
1208        vmovdqa \XMM3, TMP3(%rsp)
1209        vmovdqa \XMM4, TMP4(%rsp)
1210        vmovdqa \XMM5, TMP5(%rsp)
1211        vmovdqa \XMM6, TMP6(%rsp)
1212        vmovdqa \XMM7, TMP7(%rsp)
1213        vmovdqa \XMM8, TMP8(%rsp)
1214
1215.if \loop_idx == in_order
1216                vpaddd  ONE(%rip), \CTR, \XMM1           # INCR CNT
1217                vpaddd  ONE(%rip), \XMM1, \XMM2
1218                vpaddd  ONE(%rip), \XMM2, \XMM3
1219                vpaddd  ONE(%rip), \XMM3, \XMM4
1220                vpaddd  ONE(%rip), \XMM4, \XMM5
1221                vpaddd  ONE(%rip), \XMM5, \XMM6
1222                vpaddd  ONE(%rip), \XMM6, \XMM7
1223                vpaddd  ONE(%rip), \XMM7, \XMM8
1224                vmovdqa \XMM8, \CTR
1225
1226                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1    # perform a 16Byte swap
1227                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2    # perform a 16Byte swap
1228                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3    # perform a 16Byte swap
1229                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4    # perform a 16Byte swap
1230                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5    # perform a 16Byte swap
1231                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6    # perform a 16Byte swap
1232                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7    # perform a 16Byte swap
1233                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8    # perform a 16Byte swap
1234.else
                # No byte swap on this path; presumably CTR is kept
                # pre-swapped by the caller and ONEf is the matching
                # increment - TODO confirm.
1235                vpaddd  ONEf(%rip), \CTR, \XMM1           # INCR CNT
1236                vpaddd  ONEf(%rip), \XMM1, \XMM2
1237                vpaddd  ONEf(%rip), \XMM2, \XMM3
1238                vpaddd  ONEf(%rip), \XMM3, \XMM4
1239                vpaddd  ONEf(%rip), \XMM4, \XMM5
1240                vpaddd  ONEf(%rip), \XMM5, \XMM6
1241                vpaddd  ONEf(%rip), \XMM6, \XMM7
1242                vpaddd  ONEf(%rip), \XMM7, \XMM8
1243                vmovdqa \XMM8, \CTR
1244.endif
1245
1246
1247        #######################################################################
1248
                # AES round 0: XOR with the first round key
1249                vmovdqu (arg1), \T1
1250                vpxor   \T1, \XMM1, \XMM1
1251                vpxor   \T1, \XMM2, \XMM2
1252                vpxor   \T1, \XMM3, \XMM3
1253                vpxor   \T1, \XMM4, \XMM4
1254                vpxor   \T1, \XMM5, \XMM5
1255                vpxor   \T1, \XMM6, \XMM6
1256                vpxor   \T1, \XMM7, \XMM7
1257                vpxor   \T1, \XMM8, \XMM8
1258
1259        #######################################################################
1260
1261
1262
1263
1264
1265                vmovdqu 16*1(arg1), \T1
1266                vaesenc \T1, \XMM1, \XMM1
1267                vaesenc \T1, \XMM2, \XMM2
1268                vaesenc \T1, \XMM3, \XMM3
1269                vaesenc \T1, \XMM4, \XMM4
1270                vaesenc \T1, \XMM5, \XMM5
1271                vaesenc \T1, \XMM6, \XMM6
1272                vaesenc \T1, \XMM7, \XMM7
1273                vaesenc \T1, \XMM8, \XMM8
1274
1275                vmovdqu 16*2(arg1), \T1
1276                vaesenc \T1, \XMM1, \XMM1
1277                vaesenc \T1, \XMM2, \XMM2
1278                vaesenc \T1, \XMM3, \XMM3
1279                vaesenc \T1, \XMM4, \XMM4
1280                vaesenc \T1, \XMM5, \XMM5
1281                vaesenc \T1, \XMM6, \XMM6
1282                vaesenc \T1, \XMM7, \XMM7
1283                vaesenc \T1, \XMM8, \XMM8
1284
1285
1286        #######################################################################
1287
        # Hash block 1 (in T2) with HashKey_8.  This starts the three
        # Karatsuba accumulators used for all 8 blocks:
        #   T4 = sum of high products, T7 = sum of low products,
        #   T6 = sum of (hi^lo)*(hi^lo) middle products.
1288        vmovdqu         HashKey_8(arg2), \T5
1289        vpclmulqdq      $0x11, \T5, \T2, \T4             # T4 = a1*b1
1290        vpclmulqdq      $0x00, \T5, \T2, \T7             # T7 = a0*b0
1291
1292        vpshufd         $0b01001110, \T2, \T6
1293        vpxor           \T2, \T6, \T6
1294
1295        vmovdqu         HashKey_8_k(arg2), \T5
1296        vpclmulqdq      $0x00, \T5, \T6, \T6
1297
1298                vmovdqu 16*3(arg1), \T1
1299                vaesenc \T1, \XMM1, \XMM1
1300                vaesenc \T1, \XMM2, \XMM2
1301                vaesenc \T1, \XMM3, \XMM3
1302                vaesenc \T1, \XMM4, \XMM4
1303                vaesenc \T1, \XMM5, \XMM5
1304                vaesenc \T1, \XMM6, \XMM6
1305                vaesenc \T1, \XMM7, \XMM7
1306                vaesenc \T1, \XMM8, \XMM8
1307
        # Block 2 with HashKey_7.  The middle product here and below uses
        # selector $0x10 (high half of the *_k operand) where block 1 used
        # $0x00; PRECOMPUTE_AVX stores (hi^lo) in both halves of every *_k
        # value, so both selectors pick the same quantity.
1308        vmovdqa         TMP2(%rsp), \T1
1309        vmovdqu         HashKey_7(arg2), \T5
1310        vpclmulqdq      $0x11, \T5, \T1, \T3
1311        vpxor           \T3, \T4, \T4
1312        vpclmulqdq      $0x00, \T5, \T1, \T3
1313        vpxor           \T3, \T7, \T7
1314
1315        vpshufd         $0b01001110, \T1, \T3
1316        vpxor           \T1, \T3, \T3
1317        vmovdqu         HashKey_7_k(arg2), \T5
1318        vpclmulqdq      $0x10, \T5, \T3, \T3
1319        vpxor           \T3, \T6, \T6
1320
1321                vmovdqu 16*4(arg1), \T1
1322                vaesenc \T1, \XMM1, \XMM1
1323                vaesenc \T1, \XMM2, \XMM2
1324                vaesenc \T1, \XMM3, \XMM3
1325                vaesenc \T1, \XMM4, \XMM4
1326                vaesenc \T1, \XMM5, \XMM5
1327                vaesenc \T1, \XMM6, \XMM6
1328                vaesenc \T1, \XMM7, \XMM7
1329                vaesenc \T1, \XMM8, \XMM8
1330
1331        #######################################################################
1332
        # Block 3 with HashKey_6
1333        vmovdqa         TMP3(%rsp), \T1
1334        vmovdqu         HashKey_6(arg2), \T5
1335        vpclmulqdq      $0x11, \T5, \T1, \T3
1336        vpxor           \T3, \T4, \T4
1337        vpclmulqdq      $0x00, \T5, \T1, \T3
1338        vpxor           \T3, \T7, \T7
1339
1340        vpshufd         $0b01001110, \T1, \T3
1341        vpxor           \T1, \T3, \T3
1342        vmovdqu         HashKey_6_k(arg2), \T5
1343        vpclmulqdq      $0x10, \T5, \T3, \T3
1344        vpxor           \T3, \T6, \T6
1345
1346                vmovdqu 16*5(arg1), \T1
1347                vaesenc \T1, \XMM1, \XMM1
1348                vaesenc \T1, \XMM2, \XMM2
1349                vaesenc \T1, \XMM3, \XMM3
1350                vaesenc \T1, \XMM4, \XMM4
1351                vaesenc \T1, \XMM5, \XMM5
1352                vaesenc \T1, \XMM6, \XMM6
1353                vaesenc \T1, \XMM7, \XMM7
1354                vaesenc \T1, \XMM8, \XMM8
1355
        # Block 4 with HashKey_5
1356        vmovdqa         TMP4(%rsp), \T1
1357        vmovdqu         HashKey_5(arg2), \T5
1358        vpclmulqdq      $0x11, \T5, \T1, \T3
1359        vpxor           \T3, \T4, \T4
1360        vpclmulqdq      $0x00, \T5, \T1, \T3
1361        vpxor           \T3, \T7, \T7
1362
1363        vpshufd         $0b01001110, \T1, \T3
1364        vpxor           \T1, \T3, \T3
1365        vmovdqu         HashKey_5_k(arg2), \T5
1366        vpclmulqdq      $0x10, \T5, \T3, \T3
1367        vpxor           \T3, \T6, \T6
1368
1369                vmovdqu 16*6(arg1), \T1
1370                vaesenc \T1, \XMM1, \XMM1
1371                vaesenc \T1, \XMM2, \XMM2
1372                vaesenc \T1, \XMM3, \XMM3
1373                vaesenc \T1, \XMM4, \XMM4
1374                vaesenc \T1, \XMM5, \XMM5
1375                vaesenc \T1, \XMM6, \XMM6
1376                vaesenc \T1, \XMM7, \XMM7
1377                vaesenc \T1, \XMM8, \XMM8
1378
1379
        # Block 5 with HashKey_4
1380        vmovdqa         TMP5(%rsp), \T1
1381        vmovdqu         HashKey_4(arg2), \T5
1382        vpclmulqdq      $0x11, \T5, \T1, \T3
1383        vpxor           \T3, \T4, \T4
1384        vpclmulqdq      $0x00, \T5, \T1, \T3
1385        vpxor           \T3, \T7, \T7
1386
1387        vpshufd         $0b01001110, \T1, \T3
1388        vpxor           \T1, \T3, \T3
1389        vmovdqu         HashKey_4_k(arg2), \T5
1390        vpclmulqdq      $0x10, \T5, \T3, \T3
1391        vpxor           \T3, \T6, \T6
1392
1393                vmovdqu 16*7(arg1), \T1
1394                vaesenc \T1, \XMM1, \XMM1
1395                vaesenc \T1, \XMM2, \XMM2
1396                vaesenc \T1, \XMM3, \XMM3
1397                vaesenc \T1, \XMM4, \XMM4
1398                vaesenc \T1, \XMM5, \XMM5
1399                vaesenc \T1, \XMM6, \XMM6
1400                vaesenc \T1, \XMM7, \XMM7
1401                vaesenc \T1, \XMM8, \XMM8
1402
        # Block 6 with HashKey_3
1403        vmovdqa         TMP6(%rsp), \T1
1404        vmovdqu         HashKey_3(arg2), \T5
1405        vpclmulqdq      $0x11, \T5, \T1, \T3
1406        vpxor           \T3, \T4, \T4
1407        vpclmulqdq      $0x00, \T5, \T1, \T3
1408        vpxor           \T3, \T7, \T7
1409
1410        vpshufd         $0b01001110, \T1, \T3
1411        vpxor           \T1, \T3, \T3
1412        vmovdqu         HashKey_3_k(arg2), \T5
1413        vpclmulqdq      $0x10, \T5, \T3, \T3
1414        vpxor           \T3, \T6, \T6
1415
1416
1417                vmovdqu 16*8(arg1), \T1
1418                vaesenc \T1, \XMM1, \XMM1
1419                vaesenc \T1, \XMM2, \XMM2
1420                vaesenc \T1, \XMM3, \XMM3
1421                vaesenc \T1, \XMM4, \XMM4
1422                vaesenc \T1, \XMM5, \XMM5
1423                vaesenc \T1, \XMM6, \XMM6
1424                vaesenc \T1, \XMM7, \XMM7
1425                vaesenc \T1, \XMM8, \XMM8
1426
        # Block 7 with HashKey_2
1427        vmovdqa         TMP7(%rsp), \T1
1428        vmovdqu         HashKey_2(arg2), \T5
1429        vpclmulqdq      $0x11, \T5, \T1, \T3
1430        vpxor           \T3, \T4, \T4
1431        vpclmulqdq      $0x00, \T5, \T1, \T3
1432        vpxor           \T3, \T7, \T7
1433
1434        vpshufd         $0b01001110, \T1, \T3
1435        vpxor           \T1, \T3, \T3
1436        vmovdqu         HashKey_2_k(arg2), \T5
1437        vpclmulqdq      $0x10, \T5, \T3, \T3
1438        vpxor           \T3, \T6, \T6
1439
1440        #######################################################################
1441
1442                vmovdqu 16*9(arg1), \T5
1443                vaesenc \T5, \XMM1, \XMM1
1444                vaesenc \T5, \XMM2, \XMM2
1445                vaesenc \T5, \XMM3, \XMM3
1446                vaesenc \T5, \XMM4, \XMM4
1447                vaesenc \T5, \XMM5, \XMM5
1448                vaesenc \T5, \XMM6, \XMM6
1449                vaesenc \T5, \XMM7, \XMM7
1450                vaesenc \T5, \XMM8, \XMM8
1451
        # Block 8 with HashKey
1452        vmovdqa         TMP8(%rsp), \T1
1453        vmovdqu         HashKey(arg2), \T5
1454        vpclmulqdq      $0x11, \T5, \T1, \T3
1455        vpxor           \T3, \T4, \T4
1456        vpclmulqdq      $0x00, \T5, \T1, \T3
1457        vpxor           \T3, \T7, \T7
1458
1459        vpshufd         $0b01001110, \T1, \T3
1460        vpxor           \T1, \T3, \T3
1461        vmovdqu         HashKey_k(arg2), \T5
1462        vpclmulqdq      $0x10, \T5, \T3, \T3
1463        vpxor           \T3, \T6, \T6
1464
        # Karatsuba fix-up: T6 = middle sum ^ high sum ^ low sum
1465        vpxor           \T4, \T6, \T6
1466        vpxor           \T7, \T6, \T6
1467
1468                vmovdqu 16*10(arg1), \T5
1469
        # Extra middle rounds when REP > 9.  Each pass applies the key held
        # in T5 and then loads the next one, so T5 always ends up holding
        # the last round key (index REP+1).
1470        i = 11
1471        setreg
1472.rep (\REP-9)
1473
1474        vaesenc \T5, \XMM1, \XMM1
1475        vaesenc \T5, \XMM2, \XMM2
1476        vaesenc \T5, \XMM3, \XMM3
1477        vaesenc \T5, \XMM4, \XMM4
1478        vaesenc \T5, \XMM5, \XMM5
1479        vaesenc \T5, \XMM6, \XMM6
1480        vaesenc \T5, \XMM7, \XMM7
1481        vaesenc \T5, \XMM8, \XMM8
1482
1483        vmovdqu 16*i(arg1), \T5
1484        i = i + 1
1485        setreg
1486.endr
1487
        # Last round.  T2 = last round key ^ input block, so vaesenclast
        # yields keystream ^ input (the output block) directly.
        # ENC keeps the output in reg_j (stored further down); DEC stores
        # the output right away and reloads the input ciphertext into reg_j
        # so that it is what gets hashed next.
1488	i = 0
1489	j = 1
1490	setreg
1491.rep 8
1492		vpxor	16*i(arg4, %r11), \T5, \T2
1493                .if \ENC_DEC == ENC
1494                vaesenclast     \T2, reg_j, reg_j
1495                .else
1496                vaesenclast     \T2, reg_j, \T3
1497                vmovdqu 16*i(arg4, %r11), reg_j
1498                vmovdqu \T3, 16*i(arg3, %r11)
1499                .endif
1500	i = (i+1)
1501	j = (j+1)
1502	setreg
1503.endr
1504	#######################################################################
1505
1506
1507	vpslldq	$8, \T6, \T3				# T3 = T6 shifted left by 2 DWs
1508	vpsrldq	$8, \T6, \T6				# T6 = T6 shifted right by 2 DWs
1509	vpxor	\T3, \T7, \T7
1510	vpxor	\T4, \T6, \T6				# accumulate the results in T6:T7
1511
1512
1513
1514	#######################################################################
1515	#first phase of the reduction
1516	#######################################################################
1517        vpslld  $31, \T7, \T2                           # packed (per dword) left shift << 31
1518        vpslld  $30, \T7, \T3                           # packed (per dword) left shift << 30
1519        vpslld  $25, \T7, \T4                           # packed (per dword) left shift << 25
1520
1521        vpxor   \T3, \T2, \T2                           # xor the shifted versions
1522        vpxor   \T4, \T2, \T2
1523
1524        vpsrldq $4, \T2, \T1                            # shift-R T1 1 DW
1525
1526        vpslldq $12, \T2, \T2                           # shift-L T2 3 DWs
1527        vpxor   \T2, \T7, \T7                           # first phase of the reduction complete
1528	#######################################################################
1529                .if \ENC_DEC == ENC
1530		vmovdqu	 \XMM1,	16*0(arg3,%r11)		# Write to the Ciphertext buffer
1531		vmovdqu	 \XMM2,	16*1(arg3,%r11)		# Write to the Ciphertext buffer
1532		vmovdqu	 \XMM3,	16*2(arg3,%r11)		# Write to the Ciphertext buffer
1533		vmovdqu	 \XMM4,	16*3(arg3,%r11)		# Write to the Ciphertext buffer
1534		vmovdqu	 \XMM5,	16*4(arg3,%r11)		# Write to the Ciphertext buffer
1535		vmovdqu	 \XMM6,	16*5(arg3,%r11)		# Write to the Ciphertext buffer
1536		vmovdqu	 \XMM7,	16*6(arg3,%r11)		# Write to the Ciphertext buffer
1537		vmovdqu	 \XMM8,	16*7(arg3,%r11)		# Write to the Ciphertext buffer
1538                .endif
1539
1540	#######################################################################
1541	#second phase of the reduction
1542        vpsrld  $1, \T7, \T2                            # packed (per dword) right shift >> 1
1543        vpsrld  $2, \T7, \T3                            # packed (per dword) right shift >> 2
1544        vpsrld  $7, \T7, \T4                            # packed (per dword) right shift >> 7
1545        vpxor   \T3, \T2, \T2                           # xor the shifted versions
1546        vpxor   \T4, \T2, \T2
1547
1548        vpxor   \T1, \T2, \T2                           # add the bits saved from phase one
1549        vpxor   \T2, \T7, \T7
1550        vpxor   \T7, \T6, \T6                           # the result is in T6
1551	#######################################################################
1552
1553		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap
1554		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap
1555		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap
1556		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap
1557		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap
1558		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap
1559		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap
1560		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap
1561
1562
        # Fold the new hash value into the first block for the next step.
1563	vpxor	\T6, \XMM1, \XMM1
1564
1565
1566
1567.endm
1568
1569
1570# GHASH the last 8 ciphertext blocks.
1571.macro  GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
1572
1573        ## Karatsuba Method
1574
1575
1576        vpshufd         $0b01001110, \XMM1, \T2
1577        vpxor           \XMM1, \T2, \T2
1578        vmovdqu         HashKey_8(arg2), \T5
1579        vpclmulqdq      $0x11, \T5, \XMM1, \T6
1580        vpclmulqdq      $0x00, \T5, \XMM1, \T7
1581
1582        vmovdqu         HashKey_8_k(arg2), \T3
1583        vpclmulqdq      $0x00, \T3, \T2, \XMM1
1584
1585        ######################
1586
1587        vpshufd         $0b01001110, \XMM2, \T2
1588        vpxor           \XMM2, \T2, \T2
1589        vmovdqu         HashKey_7(arg2), \T5
1590        vpclmulqdq      $0x11, \T5, \XMM2, \T4
1591        vpxor           \T4, \T6, \T6
1592
1593        vpclmulqdq      $0x00, \T5, \XMM2, \T4
1594        vpxor           \T4, \T7, \T7
1595
1596        vmovdqu         HashKey_7_k(arg2), \T3
1597        vpclmulqdq      $0x00, \T3, \T2, \T2
1598        vpxor           \T2, \XMM1, \XMM1
1599
1600        ######################
1601
1602        vpshufd         $0b01001110, \XMM3, \T2
1603        vpxor           \XMM3, \T2, \T2
1604        vmovdqu         HashKey_6(arg2), \T5
1605        vpclmulqdq      $0x11, \T5, \XMM3, \T4
1606        vpxor           \T4, \T6, \T6
1607
1608        vpclmulqdq      $0x00, \T5, \XMM3, \T4
1609        vpxor           \T4, \T7, \T7
1610
1611        vmovdqu         HashKey_6_k(arg2), \T3
1612        vpclmulqdq      $0x00, \T3, \T2, \T2
1613        vpxor           \T2, \XMM1, \XMM1
1614
1615        ######################
1616
1617        vpshufd         $0b01001110, \XMM4, \T2
1618        vpxor           \XMM4, \T2, \T2
1619        vmovdqu         HashKey_5(arg2), \T5
1620        vpclmulqdq      $0x11, \T5, \XMM4, \T4
1621        vpxor           \T4, \T6, \T6
1622
1623        vpclmulqdq      $0x00, \T5, \XMM4, \T4
1624        vpxor           \T4, \T7, \T7
1625
1626        vmovdqu         HashKey_5_k(arg2), \T3
1627        vpclmulqdq      $0x00, \T3, \T2, \T2
1628        vpxor           \T2, \XMM1, \XMM1
1629
1630        ######################
1631
1632        vpshufd         $0b01001110, \XMM5, \T2
1633        vpxor           \XMM5, \T2, \T2
1634        vmovdqu         HashKey_4(arg2), \T5
1635        vpclmulqdq      $0x11, \T5, \XMM5, \T4
1636        vpxor           \T4, \T6, \T6
1637
1638        vpclmulqdq      $0x00, \T5, \XMM5, \T4
1639        vpxor           \T4, \T7, \T7
1640
1641        vmovdqu         HashKey_4_k(arg2), \T3
1642        vpclmulqdq      $0x00, \T3, \T2, \T2
1643        vpxor           \T2, \XMM1, \XMM1
1644
1645        ######################
1646
1647        vpshufd         $0b01001110, \XMM6, \T2
1648        vpxor           \XMM6, \T2, \T2
1649        vmovdqu         HashKey_3(arg2), \T5
1650        vpclmulqdq      $0x11, \T5, \XMM6, \T4
1651        vpxor           \T4, \T6, \T6
1652
1653        vpclmulqdq      $0x00, \T5, \XMM6, \T4
1654        vpxor           \T4, \T7, \T7
1655
1656        vmovdqu         HashKey_3_k(arg2), \T3
1657        vpclmulqdq      $0x00, \T3, \T2, \T2
1658        vpxor           \T2, \XMM1, \XMM1
1659
1660        ######################
1661
1662        vpshufd         $0b01001110, \XMM7, \T2
1663        vpxor           \XMM7, \T2, \T2
1664        vmovdqu         HashKey_2(arg2), \T5
1665        vpclmulqdq      $0x11, \T5, \XMM7, \T4
1666        vpxor           \T4, \T6, \T6
1667
1668        vpclmulqdq      $0x00, \T5, \XMM7, \T4
1669        vpxor           \T4, \T7, \T7
1670
1671        vmovdqu         HashKey_2_k(arg2), \T3
1672        vpclmulqdq      $0x00, \T3, \T2, \T2
1673        vpxor           \T2, \XMM1, \XMM1
1674
1675        ######################
1676
1677        vpshufd         $0b01001110, \XMM8, \T2
1678        vpxor           \XMM8, \T2, \T2
1679        vmovdqu         HashKey(arg2), \T5
1680        vpclmulqdq      $0x11, \T5, \XMM8, \T4
1681        vpxor           \T4, \T6, \T6
1682
1683        vpclmulqdq      $0x00, \T5, \XMM8, \T4
1684        vpxor           \T4, \T7, \T7
1685
1686        vmovdqu         HashKey_k(arg2), \T3
1687        vpclmulqdq      $0x00, \T3, \T2, \T2
1688
1689        vpxor           \T2, \XMM1, \XMM1
1690        vpxor           \T6, \XMM1, \XMM1
1691        vpxor           \T7, \XMM1, \T2
1692
1693
1694
1695
1696        vpslldq $8, \T2, \T4
1697        vpsrldq $8, \T2, \T2
1698
1699        vpxor   \T4, \T7, \T7
1700        vpxor   \T2, \T6, \T6   # <T6:T7> holds the result of
1701				# the accumulated carry-less multiplications
1702
1703        #######################################################################
1704        #first phase of the reduction
1705        vpslld  $31, \T7, \T2   # packed right shifting << 31
1706        vpslld  $30, \T7, \T3   # packed right shifting shift << 30
1707        vpslld  $25, \T7, \T4   # packed right shifting shift << 25
1708
1709        vpxor   \T3, \T2, \T2   # xor the shifted versions
1710        vpxor   \T4, \T2, \T2
1711
1712        vpsrldq $4, \T2, \T1    # shift-R T1 1 DW
1713
1714        vpslldq $12, \T2, \T2   # shift-L T2 3 DWs
1715        vpxor   \T2, \T7, \T7   # first phase of the reduction complete
1716        #######################################################################
1717
1718
1719        #second phase of the reduction
1720        vpsrld  $1, \T7, \T2    # packed left shifting >> 1
1721        vpsrld  $2, \T7, \T3    # packed left shifting >> 2
1722        vpsrld  $7, \T7, \T4    # packed left shifting >> 7
1723        vpxor   \T3, \T2, \T2   # xor the shifted versions
1724        vpxor   \T4, \T2, \T2
1725
1726        vpxor   \T1, \T2, \T2
1727        vpxor   \T2, \T7, \T7
1728        vpxor   \T7, \T6, \T6   # the result is in T6
1729
1730.endm
1731
1732#############################################################
1733#void   aesni_gcm_init_avx_gen2
1734#        (gcm_data     *my_ctx_data,
1735#         gcm_context_data *data,
1736#        u8     *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
1737#        u8      *iv, /* Pre-counter block j0: 4 byte salt
1738#			(from Security Association) concatenated with 8 byte
1739#			Initialisation Vector (from IPSec ESP Payload)
1740#			concatenated with 0x00000001. 16-byte aligned pointer. */
1741#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
1742#        u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#
# Entry point that expands the shared INIT macro with the AVX (gen2)
# GHASH multiply and hash-key precompute macros.
# NOTE(review): this banner previously carried the name
# aesni_gcm_precomp_avx_gen2; the parameter order listed above was not
# re-checked against the C prototype -- confirm before relying on it.
1743#############################################################
1744SYM_FUNC_START(aesni_gcm_init_avx_gen2)
1745        FUNC_SAVE                               # prologue macro (defined elsewhere in this file)
1746        INIT GHASH_MUL_AVX, PRECOMPUTE_AVX      # INIT is told which multiply/precompute macros to use
1747        FUNC_RESTORE                            # epilogue macro paired with FUNC_SAVE
1748        RET
1749SYM_FUNC_END(aesni_gcm_init_avx_gen2)
1750
###############################################################################
#void   aesni_gcm_enc_update_avx_gen2(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
#        const   u8 *in, /* Plaintext input */
#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
#
# Reads the key size through `keysize` and expands GCM_ENC_DEC (ENC variant,
# AVX gen2 helper macros) with the matching trailing round-count argument:
#       16 -> 9,  32 -> 13,  anything else is treated as 192-bit -> 11.
# Every variant ends with its own FUNC_RESTORE/RET, so the three expansions
# never fall through into one another.
#
# The branch targets are assembler-local (.L) labels so they do not appear
# as extra symbols in the middle of this function.
# Clobbers: %eax and flags here, plus whatever GCM_ENC_DEC touches.
###############################################################################
SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2)
        FUNC_SAVE
        mov     keysize, %eax
        cmp     $32, %eax
        je      .Lkey_256_enc_update
        cmp     $16, %eax
        je      .Lkey_128_enc_update
        # must be 192
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
        FUNC_RESTORE
        RET
.Lkey_128_enc_update:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
        FUNC_RESTORE
        RET
.Lkey_256_enc_update:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
        FUNC_RESTORE
        RET
SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2)
1779
###############################################################################
#void   aesni_gcm_dec_update_avx_gen2(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
#        const   u8 *in, /* Ciphertext input */
#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
#
# Reads the key size through `keysize` and expands GCM_ENC_DEC (DEC variant,
# AVX gen2 helper macros) with the matching trailing round-count argument:
#       16 -> 9,  32 -> 13,  anything else is treated as 192-bit -> 11.
# Every variant ends with its own FUNC_RESTORE/RET, so the three expansions
# never fall through into one another.
#
# The branch targets are assembler-local (.L) labels so they do not appear
# as extra symbols in the middle of this function.
# Clobbers: %eax and flags here, plus whatever GCM_ENC_DEC touches.
###############################################################################
SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2)
        FUNC_SAVE
        mov     keysize, %eax
        cmp     $32, %eax
        je      .Lkey_256_dec_update
        cmp     $16, %eax
        je      .Lkey_128_dec_update
        # must be 192
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
        FUNC_RESTORE
        RET
.Lkey_128_dec_update:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
        FUNC_RESTORE
        RET
.Lkey_256_dec_update:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
        FUNC_RESTORE
        RET
SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2)
1808
###############################################################################
#void   aesni_gcm_finalize_avx_gen2(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
#				Valid values are 16 (most likely), 12 or 8. */
#
# Reads the key size through `keysize` and expands GCM_COMPLETE with
# GHASH_MUL_AVX, the matching round-count argument and arg3/arg4:
#       16 -> 9,  32 -> 13,  anything else is treated as 192-bit -> 11.
# Every variant ends with its own FUNC_RESTORE/RET, so the three expansions
# never fall through into one another.
#
# The branch targets are assembler-local (.L) labels so they do not appear
# as extra symbols in the middle of this function.
# Clobbers: %eax and flags here, plus whatever GCM_COMPLETE touches.
###############################################################################
SYM_FUNC_START(aesni_gcm_finalize_avx_gen2)
        FUNC_SAVE
        mov     keysize, %eax
        cmp     $32, %eax
        je      .Lkey_256_finalize
        cmp     $16, %eax
        je      .Lkey_128_finalize
        # must be 192
        GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
        FUNC_RESTORE
        RET
.Lkey_128_finalize:
        GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
        FUNC_RESTORE
        RET
.Lkey_256_finalize:
        GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
        FUNC_RESTORE
        RET
SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
1837
1838###############################################################################
1839# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
1840# Input: A and B (128-bits each, bit-reflected)
1841# Output: C = A*B*x mod poly, (i.e. >>1 )
1842# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1843# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
#
# Operands:
#   GH      in/out: multiplicand "a"; overwritten with the reduced product
#   HK      in:     multiplicand "b"; only read
#   T1-T3   scratch, clobbered
#   T4, T5  accepted but never referenced in the body
# The reduction constant is fetched from POLY2 with an aligned,
# RIP-relative load (vmovdqa).
1844###############################################################################
1845.macro  GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1846
        # Four 64x64 carry-less partial products (schoolbook, no Karatsuba)
1847        vpclmulqdq      $0x11,\HK,\GH,\T1      # T1 = a1*b1
1848        vpclmulqdq      $0x00,\HK,\GH,\T2      # T2 = a0*b0
1849        vpclmulqdq      $0x01,\HK,\GH,\T3      # T3 = a1*b0
1850        vpclmulqdq      $0x10,\HK,\GH,\GH      # GH = a0*b1
1851        vpxor           \T3, \GH, \GH          # GH = a1*b0 ^ a0*b1 (middle term)
1852
1853
        # Split the middle term across the two halves of the 256-bit product
1854        vpsrldq         $8 , \GH, \T3          # shift-R GH 2 DWs
1855        vpslldq         $8 , \GH, \GH          # shift-L GH 2 DWs
1856
1857        vpxor           \T3, \T1, \T1          # T1 = high 128 bits of the product
1858        vpxor           \T2, \GH, \GH          # GH = low 128 bits of the product
1859
1860        #######################################################################
1861        #first phase of the reduction
1862        vmovdqa         POLY2(%rip), \T3
1863
1864        vpclmulqdq      $0x01, \GH, \T3, \T2
1865        vpslldq         $8, \T2, \T2           # shift-L T2 2 DWs
1866
1867        vpxor           \T2, \GH, \GH          # first phase of the reduction complete
1868        #######################################################################
1869        #second phase of the reduction
1870        vpclmulqdq      $0x00, \GH, \T3, \T2
1871        vpsrldq         $4, \T2, \T2           # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1872
1873        vpclmulqdq      $0x10, \GH, \T3, \GH
1874        vpslldq         $4, \GH, \GH           # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
1875
1876        vpxor           \T2, \GH, \GH          # second phase of the reduction complete
1877        #######################################################################
1878        vpxor           \T1, \GH, \GH          # the result is in GH
1879
1880
1881.endm
1882
# PRECOMPUTE_AVX2: starting from \HK, repeatedly multiply by \HK with
# GHASH_MUL_AVX2 and store the seven successive results to HashKey_2 ..
# HashKey_8 in the context addressed by arg2.
#   HK      in:  hash key in the form GHASH_MUL_AVX2 expects; only read
#   T5      running power of the hash key (overwritten)
#   T1, T3, T4  scratch consumed by GHASH_MUL_AVX2 (clobbered)
#   T6, T2  only forwarded into GHASH_MUL_AVX2's two unused scratch slots
1883.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
1884
1885        # No HashKey_i_k (XOR of low and high halves) values are written by this macro
1886        vmovdqa  \HK, \T5
1887        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^2<<1 mod poly
1888        vmovdqu  \T5, HashKey_2(arg2)                       #  [HashKey_2] = HashKey^2<<1 mod poly
1889
1890        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^3<<1 mod poly
1891        vmovdqu  \T5, HashKey_3(arg2)
1892
1893        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^4<<1 mod poly
1894        vmovdqu  \T5, HashKey_4(arg2)
1895
1896        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^5<<1 mod poly
1897        vmovdqu  \T5, HashKey_5(arg2)
1898
1899        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^6<<1 mod poly
1900        vmovdqu  \T5, HashKey_6(arg2)
1901
1902        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^7<<1 mod poly
1903        vmovdqu  \T5, HashKey_7(arg2)
1904
1905        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^8<<1 mod poly
1906        vmovdqu  \T5, HashKey_8(arg2)
1907
1908.endm
1909
1910## if a = number of total plaintext bytes
1911## b = floor(a/16)
1912## num_initial_blocks = b mod 4
##   NOTE(review): the register indexing below (8-n .. 8) leaves room for up
##   to 7 initial blocks, so the caller may really use "mod 8" -- confirm.
1913## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
1914## r10, r11, r12, rax are clobbered
##   NOTE(review): in the body below only %r11 is written (advanced by 16 per
##   initial block and by 128 for the 8-block batch); %r13 is only compared.
##   The r10/r12/rax entries look stale -- confirm.
1915## arg1, arg2, arg3, arg4 are used as pointers only, not modified
##
## Parameters as used by the body:
##   REP                number of vaesenc rounds between the initial key XOR
##                      and vaesenclast
##   num_initial_blocks assemble-time count of leading blocks handled here
##   T1, T3-T6          scratch
##   T2                 only read: passed as the hash-key operand of
##                      GHASH_MUL_AVX2, so it must be loaded by the caller
##   CTR                counter block, loaded from CurCount(arg2) and advanced
##   XMM1..XMM8         on exit hold the byte-swapped data of the 8-block
##                      batch when that batch ran
##   T_key              current AES round key
##   ENC_DEC            ENC or DEC; for DEC the input (ciphertext) is what
##                      gets hashed
##   VER                accepted but never referenced in the body
## reg_i / reg_j are provided by setreg (defined elsewhere); presumably they
## name %xmm<i> / %xmm<j> -- confirm.
## Uses the stack slot TMP1(%rsp) to park the running hash.
1916
1917.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1918	i = (8-\num_initial_blocks)
1919	setreg
1920	vmovdqu AadHash(arg2), reg_i		# running hash goes in the register just below the first block
1921
1922	# start AES for num_initial_blocks blocks
1923	vmovdqu CurCount(arg2), \CTR
1924
1925	i = (9-\num_initial_blocks)
1926	setreg
1927.rep \num_initial_blocks
1928                vpaddd  ONE(%rip), \CTR, \CTR   # INCR Y0
1929                vmovdqa \CTR, reg_i
1930                vpshufb SHUF_MASK(%rip), reg_i, reg_i     # perform a 16Byte swap
1931	i = (i+1)
1932	setreg
1933.endr
1934
	# round 0: XOR the first round key into every initial counter block
1935	vmovdqa  (arg1), \T_key
1936	i = (9-\num_initial_blocks)
1937	setreg
1938.rep \num_initial_blocks
1939                vpxor   \T_key, reg_i, reg_i
1940	i = (i+1)
1941	setreg
1942.endr
1943
	# rounds 1..REP: j indexes the round key, i walks the block registers
1944	j = 1
1945	setreg
1946.rep \REP
1947	vmovdqa  16*j(arg1), \T_key
1948	i = (9-\num_initial_blocks)
1949	setreg
1950.rep \num_initial_blocks
1951        vaesenc \T_key, reg_i, reg_i
1952	i = (i+1)
1953	setreg
1954.endr
1955
1956	j = (j+1)
1957	setreg
1958.endr
1959
1960
	# final round: j == REP+1 here
1961	vmovdqa  16*j(arg1), \T_key
1962	i = (9-\num_initial_blocks)
1963	setreg
1964.rep \num_initial_blocks
1965        vaesenclast      \T_key, reg_i, reg_i
1966	i = (i+1)
1967	setreg
1968.endr
1969
	# XOR the keystream with the input at offset %r11 and store the output
1970	i = (9-\num_initial_blocks)
1971	setreg
1972.rep \num_initial_blocks
1973                vmovdqu (arg4, %r11), \T1
1974                vpxor   \T1, reg_i, reg_i
1975                vmovdqu reg_i, (arg3 , %r11)           # write back ciphertext for
1976						       # num_initial_blocks blocks
1977                add     $16, %r11
1978.if  \ENC_DEC == DEC
1979                vmovdqa \T1, reg_i                     # when decrypting, hash the input block instead
1980.endif
1981                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
1982	i = (i+1)
1983	setreg
1984.endr
1985
1986
	# chain the hash: reg_j = (reg_j ^ reg_i) * \T2, moving up one register per block
1987	i = (8-\num_initial_blocks)
1988	j = (9-\num_initial_blocks)
1989	setreg
1990
1991.rep \num_initial_blocks
1992        vpxor    reg_i, reg_j, reg_j
1993        GHASH_MUL_AVX2       reg_j, \T2, \T1, \T3, \T4, \T5, \T6  # apply GHASH on num_initial_blocks blocks
1994	i = (i+1)
1995	j = (j+1)
1996	setreg
1997.endr
1998        # XMM8 has the combined result here
1999
2000        vmovdqa  \XMM8, TMP1(%rsp)              # park the hash; XMM8 is reused as a counter block below
2001        vmovdqa  \XMM8, \T3
2002
	# NOTE(review): jl is a signed compare; %r13 is presumably the caller's
	# remaining byte count -- confirm.
2003        cmp     $128, %r13
2004        jl      .L_initial_blocks_done\@                  # no need for precomputed constants
2005
2006###############################################################################
2007# Build the next 8 counter blocks in XMM1..XMM8, encrypt them and process 128 bytes
2008                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2009                vmovdqa  \CTR, \XMM1
2010                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
2011
2012                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2013                vmovdqa  \CTR, \XMM2
2014                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
2015
2016                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2017                vmovdqa  \CTR, \XMM3
2018                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
2019
2020                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2021                vmovdqa  \CTR, \XMM4
2022                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
2023
2024                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2025                vmovdqa  \CTR, \XMM5
2026                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
2027
2028                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2029                vmovdqa  \CTR, \XMM6
2030                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
2031
2032                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2033                vmovdqa  \CTR, \XMM7
2034                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
2035
2036                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2037                vmovdqa  \CTR, \XMM8
2038                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
2039
                # round 0 for all eight blocks
2040                vmovdqa  (arg1), \T_key
2041                vpxor    \T_key, \XMM1, \XMM1
2042                vpxor    \T_key, \XMM2, \XMM2
2043                vpxor    \T_key, \XMM3, \XMM3
2044                vpxor    \T_key, \XMM4, \XMM4
2045                vpxor    \T_key, \XMM5, \XMM5
2046                vpxor    \T_key, \XMM6, \XMM6
2047                vpxor    \T_key, \XMM7, \XMM7
2048                vpxor    \T_key, \XMM8, \XMM8
2049
2050		i = 1
2051		setreg
2052.rep    \REP       # do REP rounds
2053                vmovdqa  16*i(arg1), \T_key
2054                vaesenc  \T_key, \XMM1, \XMM1
2055                vaesenc  \T_key, \XMM2, \XMM2
2056                vaesenc  \T_key, \XMM3, \XMM3
2057                vaesenc  \T_key, \XMM4, \XMM4
2058                vaesenc  \T_key, \XMM5, \XMM5
2059                vaesenc  \T_key, \XMM6, \XMM6
2060                vaesenc  \T_key, \XMM7, \XMM7
2061                vaesenc  \T_key, \XMM8, \XMM8
2062		i = (i+1)
2063		setreg
2064.endr
2065
2066
                # final round: i == REP+1 here
2067                vmovdqa  16*i(arg1), \T_key
2068                vaesenclast  \T_key, \XMM1, \XMM1
2069                vaesenclast  \T_key, \XMM2, \XMM2
2070                vaesenclast  \T_key, \XMM3, \XMM3
2071                vaesenclast  \T_key, \XMM4, \XMM4
2072                vaesenclast  \T_key, \XMM5, \XMM5
2073                vaesenclast  \T_key, \XMM6, \XMM6
2074                vaesenclast  \T_key, \XMM7, \XMM7
2075                vaesenclast  \T_key, \XMM8, \XMM8
2076
                # per block: XOR keystream with input, store output, and for
                # DEC keep the input block in the register for hashing
2077                vmovdqu  (arg4, %r11), \T1
2078                vpxor    \T1, \XMM1, \XMM1
2079                vmovdqu  \XMM1, (arg3 , %r11)
2080                .if   \ENC_DEC == DEC
2081                vmovdqa  \T1, \XMM1
2082                .endif
2083
2084                vmovdqu  16*1(arg4, %r11), \T1
2085                vpxor    \T1, \XMM2, \XMM2
2086                vmovdqu  \XMM2, 16*1(arg3 , %r11)
2087                .if   \ENC_DEC == DEC
2088                vmovdqa  \T1, \XMM2
2089                .endif
2090
2091                vmovdqu  16*2(arg4, %r11), \T1
2092                vpxor    \T1, \XMM3, \XMM3
2093                vmovdqu  \XMM3, 16*2(arg3 , %r11)
2094                .if   \ENC_DEC == DEC
2095                vmovdqa  \T1, \XMM3
2096                .endif
2097
2098                vmovdqu  16*3(arg4, %r11), \T1
2099                vpxor    \T1, \XMM4, \XMM4
2100                vmovdqu  \XMM4, 16*3(arg3 , %r11)
2101                .if   \ENC_DEC == DEC
2102                vmovdqa  \T1, \XMM4
2103                .endif
2104
2105                vmovdqu  16*4(arg4, %r11), \T1
2106                vpxor    \T1, \XMM5, \XMM5
2107                vmovdqu  \XMM5, 16*4(arg3 , %r11)
2108                .if   \ENC_DEC == DEC
2109                vmovdqa  \T1, \XMM5
2110                .endif
2111
2112                vmovdqu  16*5(arg4, %r11), \T1
2113                vpxor    \T1, \XMM6, \XMM6
2114                vmovdqu  \XMM6, 16*5(arg3 , %r11)
2115                .if   \ENC_DEC == DEC
2116                vmovdqa  \T1, \XMM6
2117                .endif
2118
2119                vmovdqu  16*6(arg4, %r11), \T1
2120                vpxor    \T1, \XMM7, \XMM7
2121                vmovdqu  \XMM7, 16*6(arg3 , %r11)
2122                .if   \ENC_DEC == DEC
2123                vmovdqa  \T1, \XMM7
2124                .endif
2125
2126                vmovdqu  16*7(arg4, %r11), \T1
2127                vpxor    \T1, \XMM8, \XMM8
2128                vmovdqu  \XMM8, 16*7(arg3 , %r11)
2129                .if   \ENC_DEC == DEC
2130                vmovdqa  \T1, \XMM8
2131                .endif
2132
2133                add     $128, %r11                         # advance the data offset past the 8 blocks
2134
2135                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
2136                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with
2137							   # the corresponding ciphertext
2138                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
2139                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
2140                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
2141                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
2142                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
2143                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
2144                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
2145
2146###############################################################################
2147
2148.L_initial_blocks_done\@:
2149
2150
2151.endm
2152
2153
2154
2155# encrypt 8 blocks at a time
2156# ghash the 8 previously encrypted ciphertext blocks
2157# arg1, arg2, arg3, arg4 are used as pointers only, not modified
2158# r11 is the data offset value
2159.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
2160
2161        vmovdqa \XMM1, \T2
2162        vmovdqa \XMM2, TMP2(%rsp)
2163        vmovdqa \XMM3, TMP3(%rsp)
2164        vmovdqa \XMM4, TMP4(%rsp)
2165        vmovdqa \XMM5, TMP5(%rsp)
2166        vmovdqa \XMM6, TMP6(%rsp)
2167        vmovdqa \XMM7, TMP7(%rsp)
2168        vmovdqa \XMM8, TMP8(%rsp)
2169
2170.if \loop_idx == in_order
2171                vpaddd  ONE(%rip), \CTR, \XMM1            # INCR CNT
2172                vpaddd  ONE(%rip), \XMM1, \XMM2
2173                vpaddd  ONE(%rip), \XMM2, \XMM3
2174                vpaddd  ONE(%rip), \XMM3, \XMM4
2175                vpaddd  ONE(%rip), \XMM4, \XMM5
2176                vpaddd  ONE(%rip), \XMM5, \XMM6
2177                vpaddd  ONE(%rip), \XMM6, \XMM7
2178                vpaddd  ONE(%rip), \XMM7, \XMM8
2179                vmovdqa \XMM8, \CTR
2180
2181                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
2182                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
2183                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
2184                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
2185                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
2186                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
2187                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
2188                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
2189.else
2190                vpaddd  ONEf(%rip), \CTR, \XMM1            # INCR CNT
2191                vpaddd  ONEf(%rip), \XMM1, \XMM2
2192                vpaddd  ONEf(%rip), \XMM2, \XMM3
2193                vpaddd  ONEf(%rip), \XMM3, \XMM4
2194                vpaddd  ONEf(%rip), \XMM4, \XMM5
2195                vpaddd  ONEf(%rip), \XMM5, \XMM6
2196                vpaddd  ONEf(%rip), \XMM6, \XMM7
2197                vpaddd  ONEf(%rip), \XMM7, \XMM8
2198                vmovdqa \XMM8, \CTR
2199.endif
2200
2201
2202        #######################################################################
2203
2204                vmovdqu (arg1), \T1
2205                vpxor   \T1, \XMM1, \XMM1
2206                vpxor   \T1, \XMM2, \XMM2
2207                vpxor   \T1, \XMM3, \XMM3
2208                vpxor   \T1, \XMM4, \XMM4
2209                vpxor   \T1, \XMM5, \XMM5
2210                vpxor   \T1, \XMM6, \XMM6
2211                vpxor   \T1, \XMM7, \XMM7
2212                vpxor   \T1, \XMM8, \XMM8
2213
2214        #######################################################################
2215
2216
2217
2218
2219
2220                vmovdqu 16*1(arg1), \T1
2221                vaesenc \T1, \XMM1, \XMM1
2222                vaesenc \T1, \XMM2, \XMM2
2223                vaesenc \T1, \XMM3, \XMM3
2224                vaesenc \T1, \XMM4, \XMM4
2225                vaesenc \T1, \XMM5, \XMM5
2226                vaesenc \T1, \XMM6, \XMM6
2227                vaesenc \T1, \XMM7, \XMM7
2228                vaesenc \T1, \XMM8, \XMM8
2229
2230                vmovdqu 16*2(arg1), \T1
2231                vaesenc \T1, \XMM1, \XMM1
2232                vaesenc \T1, \XMM2, \XMM2
2233                vaesenc \T1, \XMM3, \XMM3
2234                vaesenc \T1, \XMM4, \XMM4
2235                vaesenc \T1, \XMM5, \XMM5
2236                vaesenc \T1, \XMM6, \XMM6
2237                vaesenc \T1, \XMM7, \XMM7
2238                vaesenc \T1, \XMM8, \XMM8
2239
2240
2241        #######################################################################
2242
2243        vmovdqu         HashKey_8(arg2), \T5
2244        vpclmulqdq      $0x11, \T5, \T2, \T4              # T4 = a1*b1
2245        vpclmulqdq      $0x00, \T5, \T2, \T7              # T7 = a0*b0
2246        vpclmulqdq      $0x01, \T5, \T2, \T6              # T6 = a1*b0
2247        vpclmulqdq      $0x10, \T5, \T2, \T5              # T5 = a0*b1
2248        vpxor           \T5, \T6, \T6
2249
2250                vmovdqu 16*3(arg1), \T1
2251                vaesenc \T1, \XMM1, \XMM1
2252                vaesenc \T1, \XMM2, \XMM2
2253                vaesenc \T1, \XMM3, \XMM3
2254                vaesenc \T1, \XMM4, \XMM4
2255                vaesenc \T1, \XMM5, \XMM5
2256                vaesenc \T1, \XMM6, \XMM6
2257                vaesenc \T1, \XMM7, \XMM7
2258                vaesenc \T1, \XMM8, \XMM8
2259
2260        vmovdqa         TMP2(%rsp), \T1
2261        vmovdqu         HashKey_7(arg2), \T5
2262        vpclmulqdq      $0x11, \T5, \T1, \T3
2263        vpxor           \T3, \T4, \T4
2264
2265        vpclmulqdq      $0x00, \T5, \T1, \T3
2266        vpxor           \T3, \T7, \T7
2267
2268        vpclmulqdq      $0x01, \T5, \T1, \T3
2269        vpxor           \T3, \T6, \T6
2270
2271        vpclmulqdq      $0x10, \T5, \T1, \T3
2272        vpxor           \T3, \T6, \T6
2273
2274                vmovdqu 16*4(arg1), \T1
2275                vaesenc \T1, \XMM1, \XMM1
2276                vaesenc \T1, \XMM2, \XMM2
2277                vaesenc \T1, \XMM3, \XMM3
2278                vaesenc \T1, \XMM4, \XMM4
2279                vaesenc \T1, \XMM5, \XMM5
2280                vaesenc \T1, \XMM6, \XMM6
2281                vaesenc \T1, \XMM7, \XMM7
2282                vaesenc \T1, \XMM8, \XMM8
2283
2284        #######################################################################
2285
2286        vmovdqa         TMP3(%rsp), \T1
2287        vmovdqu         HashKey_6(arg2), \T5
2288        vpclmulqdq      $0x11, \T5, \T1, \T3
2289        vpxor           \T3, \T4, \T4
2290
2291        vpclmulqdq      $0x00, \T5, \T1, \T3
2292        vpxor           \T3, \T7, \T7
2293
2294        vpclmulqdq      $0x01, \T5, \T1, \T3
2295        vpxor           \T3, \T6, \T6
2296
2297        vpclmulqdq      $0x10, \T5, \T1, \T3
2298        vpxor           \T3, \T6, \T6
2299
2300                vmovdqu 16*5(arg1), \T1
2301                vaesenc \T1, \XMM1, \XMM1
2302                vaesenc \T1, \XMM2, \XMM2
2303                vaesenc \T1, \XMM3, \XMM3
2304                vaesenc \T1, \XMM4, \XMM4
2305                vaesenc \T1, \XMM5, \XMM5
2306                vaesenc \T1, \XMM6, \XMM6
2307                vaesenc \T1, \XMM7, \XMM7
2308                vaesenc \T1, \XMM8, \XMM8
2309
2310        vmovdqa         TMP4(%rsp), \T1
2311        vmovdqu         HashKey_5(arg2), \T5
2312        vpclmulqdq      $0x11, \T5, \T1, \T3
2313        vpxor           \T3, \T4, \T4
2314
2315        vpclmulqdq      $0x00, \T5, \T1, \T3
2316        vpxor           \T3, \T7, \T7
2317
2318        vpclmulqdq      $0x01, \T5, \T1, \T3
2319        vpxor           \T3, \T6, \T6
2320
2321        vpclmulqdq      $0x10, \T5, \T1, \T3
2322        vpxor           \T3, \T6, \T6
2323
2324                vmovdqu 16*6(arg1), \T1
2325                vaesenc \T1, \XMM1, \XMM1
2326                vaesenc \T1, \XMM2, \XMM2
2327                vaesenc \T1, \XMM3, \XMM3
2328                vaesenc \T1, \XMM4, \XMM4
2329                vaesenc \T1, \XMM5, \XMM5
2330                vaesenc \T1, \XMM6, \XMM6
2331                vaesenc \T1, \XMM7, \XMM7
2332                vaesenc \T1, \XMM8, \XMM8
2333
2334
2335        vmovdqa         TMP5(%rsp), \T1
2336        vmovdqu         HashKey_4(arg2), \T5
2337        vpclmulqdq      $0x11, \T5, \T1, \T3
2338        vpxor           \T3, \T4, \T4
2339
2340        vpclmulqdq      $0x00, \T5, \T1, \T3
2341        vpxor           \T3, \T7, \T7
2342
2343        vpclmulqdq      $0x01, \T5, \T1, \T3
2344        vpxor           \T3, \T6, \T6
2345
2346        vpclmulqdq      $0x10, \T5, \T1, \T3
2347        vpxor           \T3, \T6, \T6
2348
2349                vmovdqu 16*7(arg1), \T1
2350                vaesenc \T1, \XMM1, \XMM1
2351                vaesenc \T1, \XMM2, \XMM2
2352                vaesenc \T1, \XMM3, \XMM3
2353                vaesenc \T1, \XMM4, \XMM4
2354                vaesenc \T1, \XMM5, \XMM5
2355                vaesenc \T1, \XMM6, \XMM6
2356                vaesenc \T1, \XMM7, \XMM7
2357                vaesenc \T1, \XMM8, \XMM8
2358
2359        vmovdqa         TMP6(%rsp), \T1
2360        vmovdqu         HashKey_3(arg2), \T5
2361        vpclmulqdq      $0x11, \T5, \T1, \T3
2362        vpxor           \T3, \T4, \T4
2363
2364        vpclmulqdq      $0x00, \T5, \T1, \T3
2365        vpxor           \T3, \T7, \T7
2366
2367        vpclmulqdq      $0x01, \T5, \T1, \T3
2368        vpxor           \T3, \T6, \T6
2369
2370        vpclmulqdq      $0x10, \T5, \T1, \T3
2371        vpxor           \T3, \T6, \T6
2372
2373                vmovdqu 16*8(arg1), \T1
2374                vaesenc \T1, \XMM1, \XMM1
2375                vaesenc \T1, \XMM2, \XMM2
2376                vaesenc \T1, \XMM3, \XMM3
2377                vaesenc \T1, \XMM4, \XMM4
2378                vaesenc \T1, \XMM5, \XMM5
2379                vaesenc \T1, \XMM6, \XMM6
2380                vaesenc \T1, \XMM7, \XMM7
2381                vaesenc \T1, \XMM8, \XMM8
2382
2383        vmovdqa         TMP7(%rsp), \T1
2384        vmovdqu         HashKey_2(arg2), \T5
2385        vpclmulqdq      $0x11, \T5, \T1, \T3
2386        vpxor           \T3, \T4, \T4
2387
2388        vpclmulqdq      $0x00, \T5, \T1, \T3
2389        vpxor           \T3, \T7, \T7
2390
2391        vpclmulqdq      $0x01, \T5, \T1, \T3
2392        vpxor           \T3, \T6, \T6
2393
2394        vpclmulqdq      $0x10, \T5, \T1, \T3
2395        vpxor           \T3, \T6, \T6
2396
2397
2398        #######################################################################
2399
2400                vmovdqu 16*9(arg1), \T5
2401                vaesenc \T5, \XMM1, \XMM1
2402                vaesenc \T5, \XMM2, \XMM2
2403                vaesenc \T5, \XMM3, \XMM3
2404                vaesenc \T5, \XMM4, \XMM4
2405                vaesenc \T5, \XMM5, \XMM5
2406                vaesenc \T5, \XMM6, \XMM6
2407                vaesenc \T5, \XMM7, \XMM7
2408                vaesenc \T5, \XMM8, \XMM8
2409
2410        vmovdqa         TMP8(%rsp), \T1
2411        vmovdqu         HashKey(arg2), \T5
2412
2413        vpclmulqdq      $0x00, \T5, \T1, \T3
2414        vpxor           \T3, \T7, \T7
2415
2416        vpclmulqdq      $0x01, \T5, \T1, \T3
2417        vpxor           \T3, \T6, \T6
2418
2419        vpclmulqdq      $0x10, \T5, \T1, \T3
2420        vpxor           \T3, \T6, \T6
2421
2422        vpclmulqdq      $0x11, \T5, \T1, \T3
2423        vpxor           \T3, \T4, \T1
2424
2425
2426                vmovdqu 16*10(arg1), \T5
2427
2428        i = 11
2429        setreg
2430.rep (\REP-9)
2431        vaesenc \T5, \XMM1, \XMM1
2432        vaesenc \T5, \XMM2, \XMM2
2433        vaesenc \T5, \XMM3, \XMM3
2434        vaesenc \T5, \XMM4, \XMM4
2435        vaesenc \T5, \XMM5, \XMM5
2436        vaesenc \T5, \XMM6, \XMM6
2437        vaesenc \T5, \XMM7, \XMM7
2438        vaesenc \T5, \XMM8, \XMM8
2439
2440        vmovdqu 16*i(arg1), \T5
2441        i = i + 1
2442        setreg
2443.endr
2444
2445	i = 0
2446	j = 1
2447	setreg
2448.rep 8
2449		vpxor	16*i(arg4, %r11), \T5, \T2
2450                .if \ENC_DEC == ENC
2451                vaesenclast     \T2, reg_j, reg_j
2452                .else
2453                vaesenclast     \T2, reg_j, \T3
2454                vmovdqu 16*i(arg4, %r11), reg_j
2455                vmovdqu \T3, 16*i(arg3, %r11)
2456                .endif
2457	i = (i+1)
2458	j = (j+1)
2459	setreg
2460.endr
2461	#######################################################################
2462
2463
2464	vpslldq	$8, \T6, \T3				# shift-L T3 2 DWs
2465	vpsrldq	$8, \T6, \T6				# shift-R T2 2 DWs
2466	vpxor	\T3, \T7, \T7
2467	vpxor	\T6, \T1, \T1				# accumulate the results in T1:T7
2468
2469
2470
2471	#######################################################################
2472	#first phase of the reduction
2473	vmovdqa         POLY2(%rip), \T3
2474
2475	vpclmulqdq	$0x01, \T7, \T3, \T2
2476	vpslldq		$8, \T2, \T2			# shift-L xmm2 2 DWs
2477
2478	vpxor		\T2, \T7, \T7			# first phase of the reduction complete
2479	#######################################################################
2480                .if \ENC_DEC == ENC
2481		vmovdqu	 \XMM1,	16*0(arg3,%r11)		# Write to the Ciphertext buffer
2482		vmovdqu	 \XMM2,	16*1(arg3,%r11)		# Write to the Ciphertext buffer
2483		vmovdqu	 \XMM3,	16*2(arg3,%r11)		# Write to the Ciphertext buffer
2484		vmovdqu	 \XMM4,	16*3(arg3,%r11)		# Write to the Ciphertext buffer
2485		vmovdqu	 \XMM5,	16*4(arg3,%r11)		# Write to the Ciphertext buffer
2486		vmovdqu	 \XMM6,	16*5(arg3,%r11)		# Write to the Ciphertext buffer
2487		vmovdqu	 \XMM7,	16*6(arg3,%r11)		# Write to the Ciphertext buffer
2488		vmovdqu	 \XMM8,	16*7(arg3,%r11)		# Write to the Ciphertext buffer
2489                .endif
2490
2491	#######################################################################
2492	#second phase of the reduction
2493	vpclmulqdq	$0x00, \T7, \T3, \T2
2494	vpsrldq		$4, \T2, \T2			# shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2495
2496	vpclmulqdq	$0x10, \T7, \T3, \T4
2497	vpslldq		$4, \T4, \T4			# shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
2498
2499	vpxor		\T2, \T4, \T4			# second phase of the reduction complete
2500	#######################################################################
2501	vpxor		\T4, \T1, \T1			# the result is in T1
2502
2503		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap
2504		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap
2505		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap
2506		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap
2507		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap
2508		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap
2509		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap
2510		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap
2511
2512
2513	vpxor	\T1, \XMM1, \XMM1
2514
2515
2516
2517.endm
2518
2519
2520# GHASH the last 8 ciphertext blocks.
2521.macro  GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
2522        # Carry-less multiply XMM1..XMM8 by HashKey_8(arg2)..HashKey(arg2) respectively, XOR the products together and reduce; the result is left in T6.
2523        ## Karatsuba Method: per block, hi*hi is summed into T6, lo*lo into T7 and (hi^lo)*(hi^lo) into XMM1 (hi/lo = upper/lower 64 bits)
2524        # Overwrites T2..T7 and XMM1; XMM2..XMM8 are only read; T1 is accepted but never referenced in this body.
2525        vmovdqu         HashKey_8(arg2), \T5
2526
2527        vpshufd         $0b01001110, \XMM1, \T2            # T2 = XMM1 with its two 64-bit halves swapped
2528        vpshufd         $0b01001110, \T5, \T3              # T3 = T5 (the key) with its two 64-bit halves swapped
2529        vpxor           \XMM1, \T2, \T2                    # both halves of T2 = hi ^ lo of XMM1
2530        vpxor           \T5, \T3, \T3                      # both halves of T3 = hi ^ lo of the key
2531
2532        vpclmulqdq      $0x11, \T5, \XMM1, \T6             # T6 = hi * hi (the first block initialises T6)
2533        vpclmulqdq      $0x00, \T5, \XMM1, \T7             # T7 = lo * lo (the first block initialises T7)
2534
2535        vpclmulqdq      $0x00, \T3, \T2, \XMM1             # XMM1 = (hi^lo) * (hi^lo); the input block in XMM1 is overwritten here
2536
2537        ###################### XMM2 * HashKey_7: same steps, but products go through T4/T2 and are XORed into T6 / T7 / XMM1
2538
2539        vmovdqu         HashKey_7(arg2), \T5
2540        vpshufd         $0b01001110, \XMM2, \T2
2541        vpshufd         $0b01001110, \T5, \T3
2542        vpxor           \XMM2, \T2, \T2
2543        vpxor           \T5, \T3, \T3
2544
2545        vpclmulqdq      $0x11, \T5, \XMM2, \T4
2546        vpxor           \T4, \T6, \T6                      # T6 ^= hi * hi
2547
2548        vpclmulqdq      $0x00, \T5, \XMM2, \T4
2549        vpxor           \T4, \T7, \T7                      # T7 ^= lo * lo
2550
2551        vpclmulqdq      $0x00, \T3, \T2, \T2
2552
2553        vpxor           \T2, \XMM1, \XMM1                  # XMM1 ^= (hi^lo) * (hi^lo)
2554
2555        ###################### XMM3 * HashKey_6
2556
2557        vmovdqu         HashKey_6(arg2), \T5
2558        vpshufd         $0b01001110, \XMM3, \T2
2559        vpshufd         $0b01001110, \T5, \T3
2560        vpxor           \XMM3, \T2, \T2
2561        vpxor           \T5, \T3, \T3
2562
2563        vpclmulqdq      $0x11, \T5, \XMM3, \T4
2564        vpxor           \T4, \T6, \T6
2565
2566        vpclmulqdq      $0x00, \T5, \XMM3, \T4
2567        vpxor           \T4, \T7, \T7
2568
2569        vpclmulqdq      $0x00, \T3, \T2, \T2
2570
2571        vpxor           \T2, \XMM1, \XMM1
2572
2573        ###################### XMM4 * HashKey_5
2574
2575        vmovdqu         HashKey_5(arg2), \T5
2576        vpshufd         $0b01001110, \XMM4, \T2
2577        vpshufd         $0b01001110, \T5, \T3
2578        vpxor           \XMM4, \T2, \T2
2579        vpxor           \T5, \T3, \T3
2580
2581        vpclmulqdq      $0x11, \T5, \XMM4, \T4
2582        vpxor           \T4, \T6, \T6
2583
2584        vpclmulqdq      $0x00, \T5, \XMM4, \T4
2585        vpxor           \T4, \T7, \T7
2586
2587        vpclmulqdq      $0x00, \T3, \T2, \T2
2588
2589        vpxor           \T2, \XMM1, \XMM1
2590
2591        ###################### XMM5 * HashKey_4
2592
2593        vmovdqu         HashKey_4(arg2), \T5
2594        vpshufd         $0b01001110, \XMM5, \T2
2595        vpshufd         $0b01001110, \T5, \T3
2596        vpxor           \XMM5, \T2, \T2
2597        vpxor           \T5, \T3, \T3
2598
2599        vpclmulqdq      $0x11, \T5, \XMM5, \T4
2600        vpxor           \T4, \T6, \T6
2601
2602        vpclmulqdq      $0x00, \T5, \XMM5, \T4
2603        vpxor           \T4, \T7, \T7
2604
2605        vpclmulqdq      $0x00, \T3, \T2, \T2
2606
2607        vpxor           \T2, \XMM1, \XMM1
2608
2609        ###################### XMM6 * HashKey_3
2610
2611        vmovdqu         HashKey_3(arg2), \T5
2612        vpshufd         $0b01001110, \XMM6, \T2
2613        vpshufd         $0b01001110, \T5, \T3
2614        vpxor           \XMM6, \T2, \T2
2615        vpxor           \T5, \T3, \T3
2616
2617        vpclmulqdq      $0x11, \T5, \XMM6, \T4
2618        vpxor           \T4, \T6, \T6
2619
2620        vpclmulqdq      $0x00, \T5, \XMM6, \T4
2621        vpxor           \T4, \T7, \T7
2622
2623        vpclmulqdq      $0x00, \T3, \T2, \T2
2624
2625        vpxor           \T2, \XMM1, \XMM1
2626
2627        ###################### XMM7 * HashKey_2
2628
2629        vmovdqu         HashKey_2(arg2), \T5
2630        vpshufd         $0b01001110, \XMM7, \T2
2631        vpshufd         $0b01001110, \T5, \T3
2632        vpxor           \XMM7, \T2, \T2
2633        vpxor           \T5, \T3, \T3
2634
2635        vpclmulqdq      $0x11, \T5, \XMM7, \T4
2636        vpxor           \T4, \T6, \T6
2637
2638        vpclmulqdq      $0x00, \T5, \XMM7, \T4
2639        vpxor           \T4, \T7, \T7
2640
2641        vpclmulqdq      $0x00, \T3, \T2, \T2
2642
2643        vpxor           \T2, \XMM1, \XMM1
2644
2645        ###################### XMM8 * HashKey
2646
2647        vmovdqu         HashKey(arg2), \T5
2648        vpshufd         $0b01001110, \XMM8, \T2
2649        vpshufd         $0b01001110, \T5, \T3
2650        vpxor           \XMM8, \T2, \T2
2651        vpxor           \T5, \T3, \T3
2652
2653        vpclmulqdq      $0x11, \T5, \XMM8, \T4
2654        vpxor           \T4, \T6, \T6
2655
2656        vpclmulqdq      $0x00, \T5, \XMM8, \T4
2657        vpxor           \T4, \T7, \T7
2658
2659        vpclmulqdq      $0x00, \T3, \T2, \T2
2660
2661        vpxor           \T2, \XMM1, \XMM1
2662        vpxor           \T6, \XMM1, \XMM1                  # fold the hi*hi sum into the middle-term sum
2663        vpxor           \T7, \XMM1, \T2                    # T2 = middle ^ hi*hi ^ lo*lo, i.e. the Karatsuba cross term
2664
2665
2666
2667        # split the cross term across the two 128-bit halves of the 256-bit product
2668        vpslldq $8, \T2, \T4                               # T4 = cross term shifted left by 8 bytes
2669        vpsrldq $8, \T2, \T2                               # T2 = cross term shifted right by 8 bytes
2670
2671        vpxor   \T4, \T7, \T7
2672        vpxor   \T2, \T6, \T6                      # <T6:T7> holds the result of the
2673						   # accumulated carry-less multiplications
2674
2675        #######################################################################
2676        #first phase of the reduction
2677        vmovdqa         POLY2(%rip), \T3                   # T3 = POLY2 constant, used by both reduction phases
2678
2679        vpclmulqdq      $0x01, \T7, \T3, \T2
2680        vpslldq         $8, \T2, \T2               # shift-L T2 2 DWs
2681
2682        vpxor           \T2, \T7, \T7              # first phase of the reduction complete
2683        #######################################################################
2684
2685
2686        #second phase of the reduction
2687        vpclmulqdq      $0x00, \T7, \T3, \T2
2688        vpsrldq         $4, \T2, \T2               # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2689
2690        vpclmulqdq      $0x10, \T7, \T3, \T4
2691        vpslldq         $4, \T4, \T4               # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2692
2693        vpxor           \T2, \T4, \T4              # second phase of the reduction complete
2694        #######################################################################
2695        vpxor           \T4, \T6, \T6              # the result is in T6
2696.endm
2697
2698
2699
2700#############################################################
2701#void   aesni_gcm_init_avx_gen4
2702#        (gcm_data     *my_ctx_data,
2703#         gcm_context_data *data,
2704#        u8      *iv, /* Pre-counter block j0: 4 byte salt
2705#			(from Security Association) concatenated with 8 byte
2706#			Initialisation Vector (from IPSec ESP Payload)
2707#			concatenated with 0x00000001. 16-byte aligned pointer. */
2708#        u8     *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
2709#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
2710#        u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
2711#############################################################
2712SYM_FUNC_START(aesni_gcm_init_avx_gen4)
2713        FUNC_SAVE                                  # macro not defined in this section; paired with FUNC_RESTORE below
2714        INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2       # whole body is the INIT macro, given the names GHASH_MUL_AVX2 and PRECOMPUTE_AVX2 (presumably macros it invokes - confirm against INIT)
2715        FUNC_RESTORE
2716        RET
2717SYM_FUNC_END(aesni_gcm_init_avx_gen4)
2718
2719###############################################################################
2720#void   aesni_gcm_enc_update_avx_gen4(
2721#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
2722#        gcm_context_data *data,
2723#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
2724#        const   u8 *in, /* Plaintext input */
2725#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
2726###############################################################################
2727SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4)
2728        FUNC_SAVE
2729        mov     keysize,%eax                       # dispatch on keysize to one of three ENC expansions of GCM_ENC_DEC
2730        cmp     $32, %eax                          # NOTE(review): 13/9/11 below are presumably AES round counts for 256/128/192-bit keys - confirm against GCM_ENC_DEC
2731        je      key_256_enc_update4                # keysize == 32: expansion with last argument 13
2732        cmp     $16, %eax
2733        je      key_128_enc_update4                # keysize == 16: expansion with last argument 9
2734        # must be 192: there is no explicit check, every keysize other than 32 and 16 falls through to the expansion with last argument 11
2735        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
2736        FUNC_RESTORE
2737	RET
2738key_128_enc_update4:                               # reached only through the keysize == 16 branch
2739        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
2740        FUNC_RESTORE
2741	RET
2742key_256_enc_update4:                               # reached only through the keysize == 32 branch
2743        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
2744        FUNC_RESTORE
2745	RET
2746SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4)
2747
2748###############################################################################
2749#void   aesni_gcm_dec_update_avx_gen4(
2750#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
2751#        gcm_context_data *data,
2752#        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
2753#        const   u8 *in, /* Ciphertext input */
2754#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
2755###############################################################################
2756SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4)
2757        FUNC_SAVE
2758        mov     keysize,%eax                       # dispatch on keysize to one of three DEC expansions of GCM_ENC_DEC
2759        cmp     $32, %eax                          # NOTE(review): 13/9/11 below are presumably AES round counts for 256/128/192-bit keys - confirm against GCM_ENC_DEC
2760        je      key_256_dec_update4                # keysize == 32: expansion with last argument 13
2761        cmp     $16, %eax
2762        je      key_128_dec_update4                # keysize == 16: expansion with last argument 9
2763        # must be 192: there is no explicit check, every keysize other than 32 and 16 falls through to the expansion with last argument 11
2764        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
2765        FUNC_RESTORE
2766        RET
2767key_128_dec_update4:                               # reached only through the keysize == 16 branch
2768        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
2769        FUNC_RESTORE
2770        RET
2771key_256_dec_update4:                               # reached only through the keysize == 32 branch
2772        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
2773        FUNC_RESTORE
2774        RET
2775SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4)
2776
2777###############################################################################
2778#void   aesni_gcm_finalize_avx_gen4(
2779#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
2780#        gcm_context_data *data,
2781#        u8      *auth_tag, /* Authenticated Tag output. */
2782#        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
2783#                              Valid values are 16 (most likely), 12 or 8. */
2784###############################################################################
2785SYM_FUNC_START(aesni_gcm_finalize_avx_gen4)
2786        FUNC_SAVE
2787        mov	keysize,%eax                       # dispatch on keysize to one of three expansions of GCM_COMPLETE
2788        cmp     $32, %eax                          # NOTE(review): 13/9/11 below are presumably AES round counts for 256/128/192-bit keys - confirm against GCM_COMPLETE
2789        je      key_256_finalize4                  # keysize == 32: expansion with second argument 13
2790        cmp     $16, %eax
2791        je      key_128_finalize4                  # keysize == 16: expansion with second argument 9
2792        # must be 192: there is no explicit check, every keysize other than 32 and 16 falls through to the expansion with second argument 11
2793        GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4        # arg3 / arg4 are forwarded unchanged in all three expansions
2794        FUNC_RESTORE
2795        RET
2796key_128_finalize4:                                 # reached only through the keysize == 16 branch
2797        GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
2798        FUNC_RESTORE
2799        RET
2800key_256_finalize4:                                 # reached only through the keysize == 32 branch
2801        GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
2802        FUNC_RESTORE
2803        RET
2804SYM_FUNC_END(aesni_gcm_finalize_avx_gen4)
2805