1########################################################################
2# Copyright (c) 2013, Intel Corporation
3#
4# This software is available to you under a choice of one of two
5# licenses.  You may choose to be licensed under the terms of the GNU
6# General Public License (GPL) Version 2, available from the file
7# COPYING in the main directory of this source tree, or the
8# OpenIB.org BSD license below:
9#
10# Redistribution and use in source and binary forms, with or without
11# modification, are permitted provided that the following conditions are
12# met:
13#
14# * Redistributions of source code must retain the above copyright
15#   notice, this list of conditions and the following disclaimer.
16#
17# * Redistributions in binary form must reproduce the above copyright
18#   notice, this list of conditions and the following disclaimer in the
19#   documentation and/or other materials provided with the
20#   distribution.
21#
22# * Neither the name of the Intel Corporation nor the names of its
23#   contributors may be used to endorse or promote products derived from
24#   this software without specific prior written permission.
25#
26#
27# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
28# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
31# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, DATA, OR
34# PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38########################################################################
39##
40## Authors:
41##	Erdinc Ozturk <erdinc.ozturk@intel.com>
42##	Vinodh Gopal <vinodh.gopal@intel.com>
43##	James Guilford <james.guilford@intel.com>
44##	Tim Chen <tim.c.chen@linux.intel.com>
45##
46## References:
47##       This code was derived and highly optimized from the code described in paper:
48##               Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation
49##			on Intel Architecture Processors. August, 2010
##       The details of the implementation are explained in:
51##               Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode
52##			on Intel Architecture Processors. October, 2012.
53##
54## Assumptions:
55##
56##
57##
58## iv:
59##       0                   1                   2                   3
60##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
61##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
62##       |                             Salt  (From the SA)               |
63##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
64##       |                     Initialization Vector                     |
65##       |         (This is the sequence number from IPSec header)       |
66##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
67##       |                              0x1                              |
68##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
69##
70##
71##
72## AAD:
73##       AAD padded to 128 bits with 0
74##       for example, assume AAD is a u32 vector
75##
76##       if AAD is 8 bytes:
77##       AAD[3] = {A0, A1}#
78##       padded AAD in xmm register = {A1 A0 0 0}
79##
80##       0                   1                   2                   3
81##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
82##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
83##       |                               SPI (A1)                        |
84##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
85##       |                     32-bit Sequence Number (A0)               |
86##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
87##       |                              0x0                              |
88##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
89##
90##                                       AAD Format with 32-bit Sequence Number
91##
92##       if AAD is 12 bytes:
93##       AAD[3] = {A0, A1, A2}#
94##       padded AAD in xmm register = {A2 A1 A0 0}
95##
96##       0                   1                   2                   3
97##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
98##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
99##       |                               SPI (A2)                        |
100##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
101##       |                 64-bit Extended Sequence Number {A1,A0}       |
102##       |                                                               |
103##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
104##       |                              0x0                              |
105##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
106##
107##        AAD Format with 64-bit Extended Sequence Number
108##
109##
110## aadLen:
111##       from the definition of the spec, aadLen can only be 8 or 12 bytes.
112##	 The code additionally supports aadLen of length 16 bytes.
113##
114## TLen:
115##       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
116##
117## poly = x^128 + x^127 + x^126 + x^121 + 1
118## throughout the code, one tab and two tab indentations are used. one tab is
119## for GHASH part, two tabs is for AES part.
120##
121
122#include <linux/linkage.h>
123
# constants in mergeable sections, linker can reorder and merge
.section	.rodata.cst16.POLY, "aM", @progbits, 16
.align 16
# GHASH reduction polynomial (bit-reflected representation)
POLY:            .octa     0xC2000000000000000000000000000001

.section	.rodata.cst16.POLY2, "aM", @progbits, 16
.align 16
# polynomial constant laid out for the two-step (shifted) reduction
POLY2:           .octa     0xC20000000000000000000001C2000000

.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
# compare pattern used while computing HashKey<<1 mod poly (see INIT)
TWOONE:          .octa     0x00000001000000000000000000000001

.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
# vpshufb mask that byte-reverses a 16-byte block (endianness swap)
SHUF_MASK:       .octa     0x000102030405060708090A0B0C0D0E0F

.section	.rodata.cst16.ONE, "aM", @progbits, 16
.align 16
# counter increment, little-endian lane order
ONE:             .octa     0x00000000000000000000000000000001

.section	.rodata.cst16.ONEf, "aM", @progbits, 16
.align 16
# counter increment, byte-swapped (big-endian) lane order
ONEf:            .octa     0x01000000000000000000000000000000

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
# (code below addresses ALL_F relative to SHIFT_MASK, and reads past ALL_F
# into the zero block when masking partial blocks)
.section	.rodata, "a", @progbits
.align 16
SHIFT_MASK:      .octa     0x0f0e0d0c0b0a09080706050403020100
ALL_F:           .octa     0xffffffffffffffffffffffffffffffff
                 .octa     0x00000000000000000000000000000000

.section .rodata
.align 16
.type aad_shift_arr, @object
.size aad_shift_arr, 272
# 17 vpshufb masks (indexed by 16 * remaining-AAD-length, 0..16) used by
# CALC_AAD_HASH to left-align the last partial AAD block; 0xff lanes zero
# the destination byte
aad_shift_arr:
        .octa     0xffffffffffffffffffffffffffffffff
        .octa     0xffffffffffffffffffffffffffffff0C
        .octa     0xffffffffffffffffffffffffffff0D0C
        .octa     0xffffffffffffffffffffffffff0E0D0C
        .octa     0xffffffffffffffffffffffff0F0E0D0C
        .octa     0xffffffffffffffffffffff0C0B0A0908
        .octa     0xffffffffffffffffffff0D0C0B0A0908
        .octa     0xffffffffffffffffff0E0D0C0B0A0908
        .octa     0xffffffffffffffff0F0E0D0C0B0A0908
        .octa     0xffffffffffffff0C0B0A090807060504
        .octa     0xffffffffffff0D0C0B0A090807060504
        .octa     0xffffffffff0E0D0C0B0A090807060504
        .octa     0xffffffff0F0E0D0C0B0A090807060504
        .octa     0xffffff0C0B0A09080706050403020100
        .octa     0xffff0D0C0B0A09080706050403020100
        .octa     0xff0E0D0C0B0A09080706050403020100
        .octa     0x0F0E0D0C0B0A09080706050403020100


.text
182
183
# Byte offsets of the fields of the gcm context structure, addressed
# off arg2 throughout this file.
#define AadHash 16*0
#define AadLen 16*1
#define InLen (16*1)+8
#define PBlockEncKey 16*2
#define OrigIV 16*3
#define CurCount 16*4
#define PBlockLen 16*5

# Offsets (off arg2) of the precomputed hash-key powers written by the
# PRECOMPUTE macros and consumed by the GHASH routines.
HashKey        = 16*6   # store HashKey <<1 mod poly here
HashKey_2      = 16*7   # store HashKey^2 <<1 mod poly here
HashKey_3      = 16*8   # store HashKey^3 <<1 mod poly here
HashKey_4      = 16*9   # store HashKey^4 <<1 mod poly here
HashKey_5      = 16*10   # store HashKey^5 <<1 mod poly here
HashKey_6      = 16*11   # store HashKey^6 <<1 mod poly here
HashKey_7      = 16*12   # store HashKey^7 <<1 mod poly here
HashKey_8      = 16*13   # store HashKey^8 <<1 mod poly here
HashKey_k      = 16*14   # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
HashKey_2_k    = 16*15   # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
HashKey_3_k    = 16*16   # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
HashKey_4_k    = 16*17   # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
HashKey_5_k    = 16*18   # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
HashKey_6_k    = 16*19   # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
HashKey_7_k    = 16*20   # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
HashKey_8_k    = 16*21   # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)

# SysV AMD64 argument registers.  arg7..arg10 live on the caller's stack
# and are reached through %r14, which FUNC_SAVE sets up as a frame pointer
# (STACK_OFFSET accounts for the registers FUNC_SAVE pushes).
#define arg1 %rdi
#define arg2 %rsi
#define arg3 %rdx
#define arg4 %rcx
#define arg5 %r8
#define arg6 %r9
#define arg7 STACK_OFFSET+8*1(%r14)
#define arg8 STACK_OFFSET+8*2(%r14)
#define arg9 STACK_OFFSET+8*3(%r14)
#define arg10 STACK_OFFSET+8*4(%r14)
#define keysize 2*15*16(arg1)

# assembler-time counters driven by the define_reg/setreg machinery below
i = 0
j = 0

# flags selecting the counter-block byte-order handling inside
# GHASH_8_ENCRYPT_8_PARALLEL, and the encrypt/decrypt direction
out_order = 0
in_order = 1
DEC = 0
ENC = 1
228
# define_reg r, n: bind the assembler symbol reg_<r> to register %xmm<n>.
# Only meaningful under .altmacro (see setreg), where the caller can pass
# the *value* of a symbol via %sym.
.macro define_reg r n
reg_\r = %xmm\n
.endm
232
# setreg: re-bind reg_i/reg_j to %xmm<i>/%xmm<j> after the symbols i or j
# have been changed.  .altmacro is needed so %i/%j expand to their current
# numeric values; it is switched off again to keep the rest of the file in
# normal macro mode.
.macro setreg
.altmacro
define_reg i %i
define_reg j %j
.noaltmacro
.endm
239
# need to push 4 registers into stack to maintain
# (STACK_OFFSET must equal 8 * number-of-pushes in FUNC_SAVE: it is the
# distance from the saved %r14 to the caller's stack arguments)
STACK_OFFSET = 8*4

# Scratch slots in the local stack frame, relative to %rsp after FUNC_SAVE
# has aligned it.
TMP1 =   16*0    # Temporary storage for AAD
TMP2 =   16*1    # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
TMP3 =   16*2    # Temporary storage for AES State 3
TMP4 =   16*3    # Temporary storage for AES State 4
TMP5 =   16*4    # Temporary storage for AES State 5
TMP6 =   16*5    # Temporary storage for AES State 6
TMP7 =   16*6    # Temporary storage for AES State 7
TMP8 =   16*7    # Temporary storage for AES State 8

# total bytes of local scratch reserved by FUNC_SAVE
VARIABLE_OFFSET = 16*8
253
254################################
255# Utility Macros
256################################
257
# FUNC_SAVE: function prologue.  Saves the callee-saved registers this file
# uses, establishes %r14 as a frame pointer (used by arg7..arg10 and by
# FUNC_RESTORE), and carves out VARIABLE_OFFSET bytes of 64-byte-aligned
# scratch space.
.macro FUNC_SAVE
        #the number of pushes must equal STACK_OFFSET
        push    %r12
        push    %r13
        push    %r14
        push    %r15

        mov     %rsp, %r14      # frame pointer: restores %rsp and anchors stack args

        sub     $VARIABLE_OFFSET, %rsp
        and     $~63, %rsp                    # align rsp to 64 bytes
.endm
272
# FUNC_RESTORE: function epilogue.  Undoes FUNC_SAVE: restores %rsp from
# the %r14 frame pointer (discarding the aligned scratch area) and pops the
# callee-saved registers in reverse order.
.macro FUNC_RESTORE
        mov     %r14, %rsp

        pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12
.endm
281
# Encryption of a single block
#   REP  = number of middle AES rounds to run (key-size dependent)
#   XMM0 = in/out: block to encrypt, replaced by its AES encryption
# arg1 points at the expanded key schedule (round key k at 16*k(arg1)).
# Clobbers the assembler symbol i.
.macro ENCRYPT_SINGLE_BLOCK REP XMM0
                vpxor    (arg1), \XMM0, \XMM0   # round 0: whitening XOR
               i = 1
               setreg
.rep \REP
                vaesenc  16*i(arg1), \XMM0, \XMM0
               i = (i+1)
               setreg
.endr
                vaesenclast 16*i(arg1), \XMM0, \XMM0    # final round
.endm
294
# combined for GCM encrypt and decrypt functions
# clobbering all xmm registers
# clobbering r10, r11, r12, r13, r14, r15
#
# Implicit arguments (set up by the callers per the SysV convention above):
#   arg2 = gcm context, arg3 = output buffer, arg4 = input buffer,
#   arg5 = number of bytes to process in this call.
# Macro parameters name the key-size-specialized helper macros; REP is the
# AES middle-round count and ENC_DEC selects ENC or DEC.
.macro  GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
        vmovdqu AadHash(arg2), %xmm8        # xmm8 = current GHASH state
        vmovdqu  HashKey(arg2), %xmm13      # xmm13 = HashKey
        add arg5, InLen(arg2)               # running total of bytes processed

        # initialize the data pointer offset as zero
        xor     %r11d, %r11d

        # first consume any partial block left over from a previous call;
        # r11 advances past the bytes it handled
        PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
        sub %r11, arg5

        mov     arg5, %r13                  # save the number of bytes of plaintext/ciphertext
        and     $-16, %r13                  # r13 = r13 - (r13 mod 16)

        # peel off 0..7 blocks so the remainder is a multiple of 8 blocks
        mov     %r13, %r12
        shr     $4, %r12
        and     $7, %r12
        jz      _initial_num_blocks_is_0\@

        cmp     $7, %r12
        je      _initial_num_blocks_is_7\@
        cmp     $6, %r12
        je      _initial_num_blocks_is_6\@
        cmp     $5, %r12
        je      _initial_num_blocks_is_5\@
        cmp     $4, %r12
        je      _initial_num_blocks_is_4\@
        cmp     $3, %r12
        je      _initial_num_blocks_is_3\@
        cmp     $2, %r12
        je      _initial_num_blocks_is_2\@

        jmp     _initial_num_blocks_is_1\@

_initial_num_blocks_is_7\@:
        \INITIAL_BLOCKS  \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*7, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_6\@:
        \INITIAL_BLOCKS  \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*6, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_5\@:
        \INITIAL_BLOCKS  \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*5, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_4\@:
        \INITIAL_BLOCKS  \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*4, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_3\@:
        \INITIAL_BLOCKS  \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*3, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_2\@:
        \INITIAL_BLOCKS  \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*2, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_1\@:
        \INITIAL_BLOCKS  \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*1, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_0\@:
        \INITIAL_BLOCKS  \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC


_initial_blocks_encrypted\@:
        cmp     $0, %r13
        je      _zero_cipher_left\@

        sub     $128, %r13
        je      _eight_cipher_left\@

        # r15d tracks the low byte of the counter in xmm9 so the 8-wide
        # loop can tell when adding 8 would carry out of that byte
        vmovd   %xmm9, %r15d
        and     $255, %r15d
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9


_encrypt_by_8_new\@:
        cmp     $(255-8), %r15d
        jg      _encrypt_by_8\@     # low counter byte about to wrap: slow path

        # fast path: counters can be incremented without a byte swap
        add     $8, %r15b
        \GHASH_8_ENCRYPT_8_PARALLEL      \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
        add     $128, %r11
        sub     $128, %r13
        jne     _encrypt_by_8_new\@

        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        jmp     _eight_cipher_left\@

_encrypt_by_8\@:
        # slow path: byte-swap the counter around the call so the carry
        # propagates correctly across bytes
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        add     $8, %r15b
        \GHASH_8_ENCRYPT_8_PARALLEL      \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        add     $128, %r11
        sub     $128, %r13
        jne     _encrypt_by_8_new\@

        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9


_eight_cipher_left\@:
        # fold the 8 pending ciphertext blocks into the GHASH state (xmm14)
        \GHASH_LAST_8    %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8


_zero_cipher_left\@:
        vmovdqu %xmm14, AadHash(arg2)       # persist GHASH state
        vmovdqu %xmm9, CurCount(arg2)       # persist counter

        # check for 0 length
        mov     arg5, %r13
        and     $15, %r13                            # r13 = (arg5 mod 16)

        je      _multiple_of_16_bytes\@

        # handle the last <16 Byte block separately

        mov %r13, PBlockLen(arg2)           # record partial length for the next call

        vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
        vmovdqu %xmm9, CurCount(arg2)
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9

        ENCRYPT_SINGLE_BLOCK    \REP, %xmm9                # E(K, Yn)
        vmovdqu %xmm9, PBlockEncKey(arg2)   # stash keystream for the next call

        cmp $16, arg5
        jge _large_enough_update\@

        # total input is < 16 bytes: read byte-by-byte to avoid overreading
        lea (arg4,%r11,1), %r10
        mov %r13, %r12

        READ_PARTIAL_BLOCK %r10 %r12 %xmm1

        lea     SHIFT_MASK+16(%rip), %r12
        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
						     # able to shift 16-r13 bytes (r13 is the
	# number of bytes in plaintext mod 16)

        jmp _final_ghash_mul\@

_large_enough_update\@:
        # at least 16 bytes were processed: safe to read the last 16 bytes
        # of input ending at the partial block, then shift into place
        sub $16, %r11
        add %r13, %r11

        # receive the last <16 Byte block
        vmovdqu	(arg4, %r11, 1), %xmm1

        sub	%r13, %r11
        add	$16, %r11

        lea	SHIFT_MASK+16(%rip), %r12
        # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
        # (r13 is the number of bytes in plaintext mod 16)
        sub	%r13, %r12
        # get the appropriate shuffle mask
        vmovdqu	(%r12), %xmm2
        # shift right 16-r13 bytes
        vpshufb  %xmm2, %xmm1, %xmm1

_final_ghash_mul\@:
        .if  \ENC_DEC ==  DEC
        # decrypt: GHASH is computed over the *ciphertext* (xmm1)
        vmovdqa %xmm1, %xmm2
        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
						     # mask out top 16-r13 bytes of xmm9
        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
        vpand   %xmm1, %xmm2, %xmm2
        vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
        vpxor   %xmm2, %xmm14, %xmm14

        vmovdqu %xmm14, AadHash(arg2)
        .else
        # encrypt: GHASH is computed over the freshly produced ciphertext
        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
						     # mask out top 16-r13 bytes of xmm9
        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        vpxor   %xmm9, %xmm14, %xmm14

        vmovdqu %xmm14, AadHash(arg2)
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # shuffle xmm9 back to output as ciphertext
        .endif


        #############################
        # output r13 Bytes
        vmovq   %xmm9, %rax
        cmp     $8, %r13
        jle     _less_than_8_bytes_left\@

        mov     %rax, (arg3 , %r11)
        add     $8, %r11
        vpsrldq $8, %xmm9, %xmm9
        vmovq   %xmm9, %rax
        sub     $8, %r13

_less_than_8_bytes_left\@:
        movb    %al, (arg3 , %r11)
        add     $1, %r11
        shr     $8, %rax
        sub     $1, %r13
        jne     _less_than_8_bytes_left\@
        #############################

_multiple_of_16_bytes\@:
.endm
521
522
# GCM_COMPLETE Finishes update of tag of last partial block
# Output: Authorization Tag (AUTH_TAG)
# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
# Folds any pending partial block and the length block into the GHASH
# state, encrypts the original counter Y0, and writes the requested number
# of tag bytes to AUTH_TAG.
.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
        vmovdqu AadHash(arg2), %xmm14       # xmm14 = accumulated GHASH state
        vmovdqu HashKey(arg2), %xmm13       # xmm13 = HashKey <<1 mod poly

        mov PBlockLen(arg2), %r12
        cmp $0, %r12
        je _partial_done\@

	#GHASH computation for the last <16 Byte block
        \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6

_partial_done\@:
        # build the length block len(A)||len(C), in bits
        mov AadLen(arg2), %r12                          # r12 = aadLen (number of bytes)
        shl     $3, %r12                             # convert into number of bits
        vmovd   %r12d, %xmm15                        # len(A) in xmm15

        mov InLen(arg2), %r12
        shl     $3, %r12                        # len(C) in bits  (*128)
        vmovq   %r12, %xmm1
        vpslldq $8, %xmm15, %xmm15                   # xmm15 = len(A)|| 0x0000000000000000
        vpxor   %xmm1, %xmm15, %xmm15                # xmm15 = len(A)||len(C)

        vpxor   %xmm15, %xmm14, %xmm14
        \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6    # final GHASH computation
        vpshufb SHUF_MASK(%rip), %xmm14, %xmm14      # perform a 16Byte swap

        vmovdqu OrigIV(arg2), %xmm9

        ENCRYPT_SINGLE_BLOCK    \REP, %xmm9                # E(K, Y0)

        vpxor   %xmm14, %xmm9, %xmm9        # xmm9 = tag = GHASH XOR E(K, Y0)


# copy out the low AUTH_TAG_LEN bytes of xmm9, 8/4/2/1 bytes at a time
_return_T\@:
        mov     \AUTH_TAG, %r10              # r10 = authTag
        mov     \AUTH_TAG_LEN, %r11              # r11 = auth_tag_len

        cmp     $16, %r11
        je      _T_16\@

        cmp     $8, %r11
        jl      _T_4\@

_T_8\@:
        vmovq   %xmm9, %rax
        mov     %rax, (%r10)
        add     $8, %r10
        sub     $8, %r11
        vpsrldq $8, %xmm9, %xmm9
        cmp     $0, %r11
        je     _return_T_done\@
_T_4\@:
        vmovd   %xmm9, %eax
        mov     %eax, (%r10)
        add     $4, %r10
        sub     $4, %r11
        vpsrldq     $4, %xmm9, %xmm9
        cmp     $0, %r11
        je     _return_T_done\@
_T_123\@:
        # 1, 2 or 3 bytes remain
        vmovd     %xmm9, %eax
        cmp     $2, %r11
        jl     _T_1\@
        mov     %ax, (%r10)
        cmp     $2, %r11
        je     _return_T_done\@
        add     $2, %r10
        sar     $16, %eax
_T_1\@:
        mov     %al, (%r10)
        jmp     _return_T_done\@

_T_16\@:
        vmovdqu %xmm9, (%r10)

_return_T_done\@:
.endm
604
# CALC_AAD_HASH: hash AADLEN bytes of additional authenticated data (at
# AAD) into a single GHASH value and store it in the context's AadHash.
# T1..T8 are caller-supplied scratch xmm registers; T2 must contain the
# HashKey on entry (it is the key operand passed to GHASH_MUL).
# Clobbers r10, r11, r12, rax.
.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8

	mov     \AAD, %r10                      # r10 = AAD
	mov     \AADLEN, %r12                      # r12 = aadLen


	mov     %r12, %r11                      # r11 = bytes left to read

	vpxor   \T8, \T8, \T8
	vpxor   \T7, \T7, \T7
	cmp     $16, %r11
	jl      _get_AAD_rest8\@
_get_AAD_blocks\@:
	# hash one full 16-byte AAD block per iteration
	vmovdqu (%r10), \T7
	vpshufb SHUF_MASK(%rip), \T7, \T7
	vpxor   \T7, \T8, \T8
	\GHASH_MUL       \T8, \T2, \T1, \T3, \T4, \T5, \T6
	add     $16, %r10
	sub     $16, %r12
	sub     $16, %r11
	cmp     $16, %r11
	jge     _get_AAD_blocks\@
	vmovdqu \T8, \T7
	cmp     $0, %r11
	je      _get_AAD_done\@

	vpxor   \T7, \T7, \T7

	/* read the last <16B of AAD. since we have at least 4B of
	data right after the AAD (the ICV, and maybe some CT), we can
	read 4B/8B blocks safely, and then get rid of the extra stuff */
_get_AAD_rest8\@:
	cmp     $4, %r11
	jle     _get_AAD_rest4\@
	movq    (%r10), \T1
	add     $8, %r10
	sub     $8, %r11
	vpslldq $8, \T1, \T1            # accumulate new bytes in the high lane
	vpsrldq $8, \T7, \T7            # ...shifting older bytes down
	vpxor   \T1, \T7, \T7
	jmp     _get_AAD_rest8\@
_get_AAD_rest4\@:
	cmp     $0, %r11
	jle      _get_AAD_rest0\@
	mov     (%r10), %eax
	movq    %rax, \T1
	add     $4, %r10
	sub     $4, %r11
	vpslldq $12, \T1, \T1
	vpsrldq $4, \T7, \T7
	vpxor   \T1, \T7, \T7
_get_AAD_rest0\@:
	/* finalize: shift out the extra bytes we read, and align
	left. since pslldq can only shift by an immediate, we use
	vpshufb and an array of shuffle masks */
	# r12 still holds the remaining AAD length (< 16); it indexes the
	# 16-byte shuffle masks in aad_shift_arr
	movq    %r12, %r11
	salq    $4, %r11
	vmovdqu  aad_shift_arr(%r11), \T1
	vpshufb \T1, \T7, \T7
_get_AAD_rest_final\@:
	vpshufb SHUF_MASK(%rip), \T7, \T7
	vpxor   \T8, \T7, \T7
	\GHASH_MUL       \T7, \T2, \T1, \T3, \T4, \T5, \T6

_get_AAD_done\@:
        vmovdqu \T7, AadHash(arg2)
.endm
672
# INIT: set up the gcm context (arg2) for a new request.
#   arg3 = IV pointer, arg4 = pointer to the raw hash subkey H,
#   arg5 = AAD pointer, arg6 = AAD length in bytes.
# Stores lengths and IV, derives HashKey<<1 mod poly from H, hashes the
# AAD into AadHash, and precomputes the HashKey powers via PRECOMPUTE.
# Uses VEX-encoded moves throughout: the original mixed legacy-SSE movdqu
# into otherwise all-AVX code, which can incur AVX<->SSE transition
# penalties and was inconsistent with the rest of this file.
.macro INIT GHASH_MUL PRECOMPUTE
        mov arg6, %r11
        mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length
        xor %r11d, %r11d
        mov %r11, InLen(arg2) # ctx_data.in_length = 0

        mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0
        mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0
        mov arg3, %rax
        vmovdqu (%rax), %xmm0
        vmovdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv

        vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
        vmovdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv

        vmovdqu  (arg4), %xmm6              # xmm6 = HashKey

        vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
        vmovdqa  %xmm6, %xmm2
        vpsllq   $1, %xmm6, %xmm6           # shift each 64-bit lane left by 1
        vpsrlq   $63, %xmm2, %xmm2          # extract the carry bit of each lane
        vmovdqa  %xmm2, %xmm1
        vpslldq  $8, %xmm2, %xmm2           # carry of low lane -> into high lane
        vpsrldq  $8, %xmm1, %xmm1           # carry out of the high lane
        vpor     %xmm2, %xmm6, %xmm6
        #reduction
        vpshufd  $0b00100100, %xmm1, %xmm2
        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
        vpand    POLY(%rip), %xmm2, %xmm2   # conditionally XOR in the poly if a bit carried out
        vpxor    %xmm2, %xmm6, %xmm6        # xmm6 holds the HashKey<<1 mod poly
        #######################################################################
        vmovdqu  %xmm6, HashKey(arg2)       # store HashKey<<1 mod poly

        CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0

        \PRECOMPUTE  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
.endm
711
712
# Reads DLEN bytes starting at DPTR and stores in XMMDst
# where 0 < DLEN < 16
# Clobbers %rax, DLEN
# Never reads past DPTR+DLEN: one qword load when DLEN >= 8, then
# byte-by-byte for the remainder.
.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
        vpxor \XMMDst, \XMMDst, \XMMDst         # start from an all-zero block

        cmp $8, \DLEN
        jl _read_lt8_\@
        mov (\DPTR), %rax                       # bulk-load the low qword
        vpinsrq $0, %rax, \XMMDst, \XMMDst
        sub $8, \DLEN
        jz _done_read_partial_block_\@
        xor %eax, %eax
_read_next_byte_\@:
        # accumulate bytes [8, 8+DLEN) into rax, highest address first
        shl $8, %rax
        mov 7(\DPTR, \DLEN, 1), %al
        dec \DLEN
        jnz _read_next_byte_\@
        vpinsrq $1, %rax, \XMMDst, \XMMDst
        jmp _done_read_partial_block_\@
_read_lt8_\@:
        xor %eax, %eax
_read_next_byte_lt8_\@:
        # DLEN < 8: accumulate bytes [0, DLEN) into rax, highest first
        shl $8, %rax
        mov -1(\DPTR, \DLEN, 1), %al
        dec \DLEN
        jnz _read_next_byte_lt8_\@
        vpinsrq $0, %rax, \XMMDst, \XMMDst
_done_read_partial_block_\@:
.endm
743
# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
# between update calls.
# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
# Uses VEX-encoded forms throughout: the original mixed legacy-SSE pxor and
# psrldq into otherwise all-AVX code (potential AVX<->SSE transition
# penalties); the replacements are operand-for-operand equivalent.
.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
        AAD_HASH ENC_DEC
        mov 	PBlockLen(arg2), %r13
        cmp	$0, %r13
        je	_partial_block_done_\@	# Leave Macro if no partial blocks
        # Read in input data without over reading
        cmp	$16, \PLAIN_CYPH_LEN
        jl	_fewer_than_16_bytes_\@
        vmovdqu	(\PLAIN_CYPH_IN), %xmm1	# If more than 16 bytes, just fill xmm
        jmp	_data_read_\@

_fewer_than_16_bytes_\@:
        lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
        mov	\PLAIN_CYPH_LEN, %r12
        READ_PARTIAL_BLOCK %r10 %r12 %xmm1

        mov PBlockLen(arg2), %r13

_data_read_\@:				# Finished reading in data

        vmovdqu	PBlockEncKey(arg2), %xmm9	# saved E(K, Yn) keystream
        vmovdqu	HashKey(arg2), %xmm13

        lea	SHIFT_MASK(%rip), %r12

        # adjust the shuffle mask pointer to be able to shift r13 bytes
        # (16-r13 is the number of bytes in plaintext mod 16)
        add	%r13, %r12
        vmovdqu	(%r12), %xmm2		# get the appropriate shuffle mask
        vpshufb %xmm2, %xmm9, %xmm9		# shift right r13 bytes

.if  \ENC_DEC ==  DEC
        vmovdqa	%xmm1, %xmm3		# keep a copy of the ciphertext for GHASH
        vpxor	%xmm1, %xmm9, %xmm9	# Cyphertext XOR E(K, Yn)

        mov	\PLAIN_CYPH_LEN, %r10
        add	%r13, %r10
        # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
        sub	$16, %r10
        # Determine if the partial block is not being filled and
        # shift mask accordingly
        jge	_no_extra_mask_1_\@
        sub	%r10, %r12
_no_extra_mask_1_\@:

        vmovdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
        # get the appropriate mask to mask out bottom r13 bytes of xmm9
        vpand	%xmm1, %xmm9, %xmm9		# mask out bottom r13 bytes of xmm9

        vpand	%xmm1, %xmm3, %xmm3
        vmovdqa	SHUF_MASK(%rip), %xmm10
        vpshufb	%xmm10, %xmm3, %xmm3
        vpshufb	%xmm2, %xmm3, %xmm3
        vpxor	%xmm3, \AAD_HASH, \AAD_HASH

        cmp	$0, %r10
        jl	_partial_incomplete_1_\@

        # GHASH computation for the last <16 Byte block
        \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
        xor	%eax,%eax

        mov	%rax, PBlockLen(arg2)
        jmp	_dec_done_\@
_partial_incomplete_1_\@:
        add	\PLAIN_CYPH_LEN, PBlockLen(arg2)
_dec_done_\@:
        vmovdqu	\AAD_HASH, AadHash(arg2)
.else
        vpxor	%xmm1, %xmm9, %xmm9			# Plaintext XOR E(K, Yn)

        mov	\PLAIN_CYPH_LEN, %r10
        add	%r13, %r10
        # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
        sub	$16, %r10
        # Determine if the partial block is not being filled and
        # shift mask accordingly
        jge	_no_extra_mask_2_\@
        sub	%r10, %r12
_no_extra_mask_2_\@:

        vmovdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
        # get the appropriate mask to mask out bottom r13 bytes of xmm9
        vpand	%xmm1, %xmm9, %xmm9

        vmovdqa	SHUF_MASK(%rip), %xmm1
        vpshufb %xmm1, %xmm9, %xmm9
        vpshufb %xmm2, %xmm9, %xmm9
        vpxor	%xmm9, \AAD_HASH, \AAD_HASH

        cmp	$0, %r10
        jl	_partial_incomplete_2_\@

        # GHASH computation for the last <16 Byte block
        \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
        xor	%eax,%eax

        mov	%rax, PBlockLen(arg2)
        jmp	_encode_done_\@
_partial_incomplete_2_\@:
        add	\PLAIN_CYPH_LEN, PBlockLen(arg2)
_encode_done_\@:
        vmovdqu	\AAD_HASH, AadHash(arg2)

        vmovdqa	SHUF_MASK(%rip), %xmm10
        # shuffle xmm9 back to output as ciphertext
        vpshufb	%xmm10, %xmm9, %xmm9
        vpshufb	%xmm2, %xmm9, %xmm9
.endif
        # output encrypted Bytes
        cmp	$0, %r10
        jl	_partial_fill_\@
        mov	%r13, %r12
        mov	$16, %r13
        # Set r13 to be the number of bytes to write out
        sub	%r12, %r13
        jmp	_count_set_\@
_partial_fill_\@:
        mov	\PLAIN_CYPH_LEN, %r13
_count_set_\@:
        vmovdqa	%xmm9, %xmm0
        vmovq	%xmm0, %rax
        cmp	$8, %r13
        jle	_less_than_8_bytes_left_\@

        mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
        add	$8, \DATA_OFFSET
        vpsrldq	$8, %xmm0, %xmm0
        vmovq	%xmm0, %rax
        sub	$8, %r13
_less_than_8_bytes_left_\@:
        movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
        add	$1, \DATA_OFFSET
        shr	$8, %rax
        sub	$1, %r13
        jne	_less_than_8_bytes_left_\@
_partial_block_done_\@:
.endm # PARTIAL_BLOCK
887
888###############################################################################
889# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
890# Input: A and B (128-bits each, bit-reflected)
891# Output: C = A*B*x mod poly, (i.e. >>1 )
892# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
893# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
894###############################################################################
.macro  GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5

        # Karatsuba: split each 128-bit operand into 64-bit halves
        # (a1:a0 = GH, b1:b0 = HK) and form the cross-term operands.
        vpshufd         $0b01001110, \GH, \T2   # T2 = {a0:a1} (qwords swapped)
        vpshufd         $0b01001110, \HK, \T3   # T3 = {b0:b1} (qwords swapped)
        vpxor           \GH     , \T2, \T2      # T2 = (a1+a0)
        vpxor           \HK     , \T3, \T3      # T3 = (b1+b0)

        vpclmulqdq      $0x11, \HK, \GH, \T1    # T1 = a1*b1
        vpclmulqdq      $0x00, \HK, \GH, \GH    # GH = a0*b0
        vpclmulqdq      $0x00, \T3, \T2, \T2    # T2 = (a1+a0)*(b1+b0)
        vpxor           \GH, \T2,\T2
        vpxor           \T1, \T2,\T2            # T2 = a0*b1+a1*b0 (middle term)

        # fold the 128-bit middle term into the 256-bit product <T1:GH>
        vpslldq         $8, \T2,\T3             # shift-L T3 2 DWs
        vpsrldq         $8, \T2,\T2             # shift-R T2 2 DWs
        vpxor           \T3, \GH, \GH
        vpxor           \T2, \T1, \T1           # <T1:GH> = GH x HK

        # first phase of the reduction (operands are bit-reflected, so the
        # polynomial taps appear as packed LEFT shifts by 31/30/25)
        vpslld  $31, \GH, \T2                   # packed left shift << 31
        vpslld  $30, \GH, \T3                   # packed left shift << 30
        vpslld  $25, \GH, \T4                   # packed left shift << 25

        vpxor   \T3, \T2, \T2                   # xor the shifted versions
        vpxor   \T4, \T2, \T2

        vpsrldq $4, \T2, \T5                    # shift-R T5 1 DW

        vpslldq $12, \T2, \T2                   # shift-L T2 3 DWs
        vpxor   \T2, \GH, \GH                   # first phase of the reduction complete

        # second phase of the reduction

        vpsrld  $1,\GH, \T2                     # packed right shift >> 1
        vpsrld  $2,\GH, \T3                     # packed right shift >> 2
        vpsrld  $7,\GH, \T4                     # packed right shift >> 7
        vpxor   \T3, \T2, \T2                   # xor the shifted versions
        vpxor   \T4, \T2, \T2

        vpxor   \T5, \T2, \T2
        vpxor   \T2, \GH, \GH
        vpxor   \T1, \GH, \GH                   # the result is in GH


.endm
940
###############################################################################
# PRECOMPUTE_AVX: precompute HashKey^i (i = 2..8) and, for each power, the
# Karatsuba helper value HashKey_i_k = hi64(HashKey^i) XOR lo64(HashKey^i).
# All values are stored into the key context pointed to by arg2.
# In:    HK = HashKey<<1 mod poly
# Clobb: T1-T6 (temporaries for GHASH_MUL_AVX)
###############################################################################
.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6

        # Hashkey_i_k holds XORed values of the low and high parts of Hashkey_i
        vmovdqa  \HK, \T5

        vpshufd  $0b01001110, \T5, \T1          # swap qwords
        vpxor    \T5, \T1, \T1                  # T1 = hi64 ^ lo64
        vmovdqu  \T1, HashKey_k(arg2)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^2<<1 mod poly
        vmovdqu  \T5, HashKey_2(arg2)                    #  [HashKey_2] = HashKey^2<<1 mod poly
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_2_k(arg2)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^3<<1 mod poly
        vmovdqu  \T5, HashKey_3(arg2)
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_3_k(arg2)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^4<<1 mod poly
        vmovdqu  \T5, HashKey_4(arg2)
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_4_k(arg2)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^5<<1 mod poly
        vmovdqu  \T5, HashKey_5(arg2)
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_5_k(arg2)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^6<<1 mod poly
        vmovdqu  \T5, HashKey_6(arg2)
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_6_k(arg2)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^7<<1 mod poly
        vmovdqu  \T5, HashKey_7(arg2)
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_7_k(arg2)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^8<<1 mod poly
        vmovdqu  \T5, HashKey_8(arg2)
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_8_k(arg2)

.endm
993
## if a = number of total plaintext bytes
## b = floor(a/16)
## num_initial_blocks = b mod 4
## (NOTE(review): this AVX path prepares up to 8 initial blocks via
##  registers xmm1-xmm8 - confirm whether "mod 4" above is stale)
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered
## arg1, arg3, arg4, r14 are used as pointers only, not modified
1000
.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
	# 'i'/'j' are assembler symbols; 'setreg' (helper macro defined
	# earlier in this file) rebinds reg_i/reg_j to the matching %xmm regs.
	i = (8-\num_initial_blocks)
	setreg
        vmovdqu AadHash(arg2), reg_i            # reg_i = current GHASH accumulator

	# start AES for num_initial_blocks blocks
	vmovdqu CurCount(arg2), \CTR

	# build num_initial_blocks counter blocks
	i = (9-\num_initial_blocks)
	setreg
.rep \num_initial_blocks
                vpaddd  ONE(%rip), \CTR, \CTR		# INCR Y0
                vmovdqa \CTR, reg_i
                vpshufb SHUF_MASK(%rip), reg_i, reg_i   # perform a 16Byte swap
	i = (i+1)
	setreg
.endr

	# AES round 0: XOR the whitening key into every counter block
	vmovdqa  (arg1), \T_key
	i = (9-\num_initial_blocks)
	setreg
.rep \num_initial_blocks
                vpxor   \T_key, reg_i, reg_i
	i = (i+1)
	setreg
.endr

	# rounds 1..REP for all blocks; round key j lives at arg1 + 16*j
       j = 1
       setreg
.rep \REP
       vmovdqa  16*j(arg1), \T_key
	i = (9-\num_initial_blocks)
	setreg
.rep \num_initial_blocks
        vaesenc \T_key, reg_i, reg_i
	i = (i+1)
	setreg
.endr

       j = (j+1)
       setreg
.endr

	# final AES round
	vmovdqa  16*j(arg1), \T_key
	i = (9-\num_initial_blocks)
	setreg
.rep \num_initial_blocks
        vaesenclast      \T_key, reg_i, reg_i
	i = (i+1)
	setreg
.endr

	# XOR keystream with input (arg4), store to output (arg3); for DEC the
	# original ciphertext (T1) is kept in reg_i for the GHASH below
	i = (9-\num_initial_blocks)
	setreg
.rep \num_initial_blocks
                vmovdqu (arg4, %r11), \T1
                vpxor   \T1, reg_i, reg_i
                vmovdqu reg_i, (arg3 , %r11)           # write back ciphertext for num_initial_blocks blocks
                add     $16, %r11
.if  \ENC_DEC == DEC
                vmovdqa \T1, reg_i
.endif
                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
	i = (i+1)
	setreg
.endr


	# chain the hash through the blocks: reg_j = GHASH(reg_j ^ reg_i)
	i = (8-\num_initial_blocks)
	j = (9-\num_initial_blocks)
	setreg

.rep \num_initial_blocks
        vpxor    reg_i, reg_j, reg_j
        GHASH_MUL_AVX       reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
	i = (i+1)
	j = (j+1)
	setreg
.endr
        # XMM8 has the combined result here

        vmovdqa  \XMM8, TMP1(%rsp)
        vmovdqa  \XMM8, \T3

        cmp     $128, %r13
        jl      _initial_blocks_done\@                  # fewer than 128 bytes left: skip the 8-block setup

###############################################################################
# At least 128 bytes remain: prepare and encrypt the next 8 counter blocks
# (XMM1..XMM8) to prime the 8-way parallel main loop.
                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
                vmovdqa  \CTR, \XMM1
                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap

                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
                vmovdqa  \CTR, \XMM2
                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap

                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
                vmovdqa  \CTR, \XMM3
                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap

                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
                vmovdqa  \CTR, \XMM4
                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap

                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
                vmovdqa  \CTR, \XMM5
                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap

                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
                vmovdqa  \CTR, \XMM6
                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap

                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
                vmovdqa  \CTR, \XMM7
                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap

                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
                vmovdqa  \CTR, \XMM8
                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap

                # AES round 0 for all 8 blocks
                vmovdqa  (arg1), \T_key
                vpxor    \T_key, \XMM1, \XMM1
                vpxor    \T_key, \XMM2, \XMM2
                vpxor    \T_key, \XMM3, \XMM3
                vpxor    \T_key, \XMM4, \XMM4
                vpxor    \T_key, \XMM5, \XMM5
                vpxor    \T_key, \XMM6, \XMM6
                vpxor    \T_key, \XMM7, \XMM7
                vpxor    \T_key, \XMM8, \XMM8

               i = 1
               setreg
.rep    \REP       # do REP rounds
                vmovdqa  16*i(arg1), \T_key
                vaesenc  \T_key, \XMM1, \XMM1
                vaesenc  \T_key, \XMM2, \XMM2
                vaesenc  \T_key, \XMM3, \XMM3
                vaesenc  \T_key, \XMM4, \XMM4
                vaesenc  \T_key, \XMM5, \XMM5
                vaesenc  \T_key, \XMM6, \XMM6
                vaesenc  \T_key, \XMM7, \XMM7
                vaesenc  \T_key, \XMM8, \XMM8
               i = (i+1)
               setreg
.endr

                # final AES round for all 8 blocks
                vmovdqa  16*i(arg1), \T_key
                vaesenclast  \T_key, \XMM1, \XMM1
                vaesenclast  \T_key, \XMM2, \XMM2
                vaesenclast  \T_key, \XMM3, \XMM3
                vaesenclast  \T_key, \XMM4, \XMM4
                vaesenclast  \T_key, \XMM5, \XMM5
                vaesenclast  \T_key, \XMM6, \XMM6
                vaesenclast  \T_key, \XMM7, \XMM7
                vaesenclast  \T_key, \XMM8, \XMM8

                # XOR the 8 keystream blocks with input, write output; for DEC
                # keep the ciphertext in XMM1..XMM8 for GHASH
                vmovdqu  (arg4, %r11), \T1
                vpxor    \T1, \XMM1, \XMM1
                vmovdqu  \XMM1, (arg3 , %r11)
                .if   \ENC_DEC == DEC
                vmovdqa  \T1, \XMM1
                .endif

                vmovdqu  16*1(arg4, %r11), \T1
                vpxor    \T1, \XMM2, \XMM2
                vmovdqu  \XMM2, 16*1(arg3 , %r11)
                .if   \ENC_DEC == DEC
                vmovdqa  \T1, \XMM2
                .endif

                vmovdqu  16*2(arg4, %r11), \T1
                vpxor    \T1, \XMM3, \XMM3
                vmovdqu  \XMM3, 16*2(arg3 , %r11)
                .if   \ENC_DEC == DEC
                vmovdqa  \T1, \XMM3
                .endif

                vmovdqu  16*3(arg4, %r11), \T1
                vpxor    \T1, \XMM4, \XMM4
                vmovdqu  \XMM4, 16*3(arg3 , %r11)
                .if   \ENC_DEC == DEC
                vmovdqa  \T1, \XMM4
                .endif

                vmovdqu  16*4(arg4, %r11), \T1
                vpxor    \T1, \XMM5, \XMM5
                vmovdqu  \XMM5, 16*4(arg3 , %r11)
                .if   \ENC_DEC == DEC
                vmovdqa  \T1, \XMM5
                .endif

                vmovdqu  16*5(arg4, %r11), \T1
                vpxor    \T1, \XMM6, \XMM6
                vmovdqu  \XMM6, 16*5(arg3 , %r11)
                .if   \ENC_DEC == DEC
                vmovdqa  \T1, \XMM6
                .endif

                vmovdqu  16*6(arg4, %r11), \T1
                vpxor    \T1, \XMM7, \XMM7
                vmovdqu  \XMM7, 16*6(arg3 , %r11)
                .if   \ENC_DEC == DEC
                vmovdqa  \T1, \XMM7
                .endif

                vmovdqu  16*7(arg4, %r11), \T1
                vpxor    \T1, \XMM8, \XMM8
                vmovdqu  \XMM8, 16*7(arg3 , %r11)
                .if   \ENC_DEC == DEC
                vmovdqa  \T1, \XMM8
                .endif

                add     $128, %r11

                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with the corresponding ciphertext
                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap

###############################################################################

_initial_blocks_done\@:

.endm
1231
1232# encrypt 8 blocks at a time
1233# ghash the 8 previously encrypted ciphertext blocks
1234# arg1, arg3, arg4 are used as pointers only, not modified
1235# r11 is the data offset value
.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
	# On entry XMM1..XMM8 hold the previous 8 (byte-swapped) ciphertext
	# blocks; they are staged to T2/TMP2..TMP8 and GHASHed (flush-left
	# instructions) while the next 8 counter blocks are AES-encrypted
	# (indented instructions). T1-T7 are scratch; CTR is the counter.
        vmovdqa \XMM1, \T2
        vmovdqa \XMM2, TMP2(%rsp)
        vmovdqa \XMM3, TMP3(%rsp)
        vmovdqa \XMM4, TMP4(%rsp)
        vmovdqa \XMM5, TMP5(%rsp)
        vmovdqa \XMM6, TMP6(%rsp)
        vmovdqa \XMM7, TMP7(%rsp)
        vmovdqa \XMM8, TMP8(%rsp)

.if \loop_idx == in_order
                vpaddd  ONE(%rip), \CTR, \XMM1           # INCR CNT
                vpaddd  ONE(%rip), \XMM1, \XMM2
                vpaddd  ONE(%rip), \XMM2, \XMM3
                vpaddd  ONE(%rip), \XMM3, \XMM4
                vpaddd  ONE(%rip), \XMM4, \XMM5
                vpaddd  ONE(%rip), \XMM5, \XMM6
                vpaddd  ONE(%rip), \XMM6, \XMM7
                vpaddd  ONE(%rip), \XMM7, \XMM8
                vmovdqa \XMM8, \CTR

                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1    # perform a 16Byte swap
                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2    # perform a 16Byte swap
                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3    # perform a 16Byte swap
                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4    # perform a 16Byte swap
                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5    # perform a 16Byte swap
                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6    # perform a 16Byte swap
                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7    # perform a 16Byte swap
                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8    # perform a 16Byte swap
.else
                # counters kept in output byte order; increment via ONEf
                vpaddd  ONEf(%rip), \CTR, \XMM1           # INCR CNT
                vpaddd  ONEf(%rip), \XMM1, \XMM2
                vpaddd  ONEf(%rip), \XMM2, \XMM3
                vpaddd  ONEf(%rip), \XMM3, \XMM4
                vpaddd  ONEf(%rip), \XMM4, \XMM5
                vpaddd  ONEf(%rip), \XMM5, \XMM6
                vpaddd  ONEf(%rip), \XMM6, \XMM7
                vpaddd  ONEf(%rip), \XMM7, \XMM8
                vmovdqa \XMM8, \CTR
.endif


        #######################################################################

                # AES round 0 for all 8 counter blocks
                vmovdqu (arg1), \T1
                vpxor   \T1, \XMM1, \XMM1
                vpxor   \T1, \XMM2, \XMM2
                vpxor   \T1, \XMM3, \XMM3
                vpxor   \T1, \XMM4, \XMM4
                vpxor   \T1, \XMM5, \XMM5
                vpxor   \T1, \XMM6, \XMM6
                vpxor   \T1, \XMM7, \XMM7
                vpxor   \T1, \XMM8, \XMM8

        #######################################################################




                vmovdqu 16*1(arg1), \T1
                vaesenc \T1, \XMM1, \XMM1
                vaesenc \T1, \XMM2, \XMM2
                vaesenc \T1, \XMM3, \XMM3
                vaesenc \T1, \XMM4, \XMM4
                vaesenc \T1, \XMM5, \XMM5
                vaesenc \T1, \XMM6, \XMM6
                vaesenc \T1, \XMM7, \XMM7
                vaesenc \T1, \XMM8, \XMM8

                vmovdqu 16*2(arg1), \T1
                vaesenc \T1, \XMM1, \XMM1
                vaesenc \T1, \XMM2, \XMM2
                vaesenc \T1, \XMM3, \XMM3
                vaesenc \T1, \XMM4, \XMM4
                vaesenc \T1, \XMM5, \XMM5
                vaesenc \T1, \XMM6, \XMM6
                vaesenc \T1, \XMM7, \XMM7
                vaesenc \T1, \XMM8, \XMM8


        #######################################################################
        # GHASH block 1 (oldest, in T2) against HashKey^8; accumulate the
        # Karatsuba partial products in T4 (high), T7 (low), T6 (middle)
        vmovdqu         HashKey_8(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T2, \T4             # T4 = a1*b1
        vpclmulqdq      $0x00, \T5, \T2, \T7             # T7 = a0*b0

        vpshufd         $0b01001110, \T2, \T6
        vpxor           \T2, \T6, \T6

        vmovdqu         HashKey_8_k(arg2), \T5
        vpclmulqdq      $0x00, \T5, \T6, \T6

                vmovdqu 16*3(arg1), \T1
                vaesenc \T1, \XMM1, \XMM1
                vaesenc \T1, \XMM2, \XMM2
                vaesenc \T1, \XMM3, \XMM3
                vaesenc \T1, \XMM4, \XMM4
                vaesenc \T1, \XMM5, \XMM5
                vaesenc \T1, \XMM6, \XMM6
                vaesenc \T1, \XMM7, \XMM7
                vaesenc \T1, \XMM8, \XMM8

        vmovdqa         TMP2(%rsp), \T1
        vmovdqu         HashKey_7(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpshufd         $0b01001110, \T1, \T3
        vpxor           \T1, \T3, \T3
        vmovdqu         HashKey_7_k(arg2), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor           \T3, \T6, \T6

                vmovdqu 16*4(arg1), \T1
                vaesenc \T1, \XMM1, \XMM1
                vaesenc \T1, \XMM2, \XMM2
                vaesenc \T1, \XMM3, \XMM3
                vaesenc \T1, \XMM4, \XMM4
                vaesenc \T1, \XMM5, \XMM5
                vaesenc \T1, \XMM6, \XMM6
                vaesenc \T1, \XMM7, \XMM7
                vaesenc \T1, \XMM8, \XMM8

        #######################################################################

        vmovdqa         TMP3(%rsp), \T1
        vmovdqu         HashKey_6(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpshufd         $0b01001110, \T1, \T3
        vpxor           \T1, \T3, \T3
        vmovdqu         HashKey_6_k(arg2), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor           \T3, \T6, \T6

                vmovdqu 16*5(arg1), \T1
                vaesenc \T1, \XMM1, \XMM1
                vaesenc \T1, \XMM2, \XMM2
                vaesenc \T1, \XMM3, \XMM3
                vaesenc \T1, \XMM4, \XMM4
                vaesenc \T1, \XMM5, \XMM5
                vaesenc \T1, \XMM6, \XMM6
                vaesenc \T1, \XMM7, \XMM7
                vaesenc \T1, \XMM8, \XMM8

        vmovdqa         TMP4(%rsp), \T1
        vmovdqu         HashKey_5(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpshufd         $0b01001110, \T1, \T3
        vpxor           \T1, \T3, \T3
        vmovdqu         HashKey_5_k(arg2), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor           \T3, \T6, \T6

                vmovdqu 16*6(arg1), \T1
                vaesenc \T1, \XMM1, \XMM1
                vaesenc \T1, \XMM2, \XMM2
                vaesenc \T1, \XMM3, \XMM3
                vaesenc \T1, \XMM4, \XMM4
                vaesenc \T1, \XMM5, \XMM5
                vaesenc \T1, \XMM6, \XMM6
                vaesenc \T1, \XMM7, \XMM7
                vaesenc \T1, \XMM8, \XMM8


        vmovdqa         TMP5(%rsp), \T1
        vmovdqu         HashKey_4(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpshufd         $0b01001110, \T1, \T3
        vpxor           \T1, \T3, \T3
        vmovdqu         HashKey_4_k(arg2), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor           \T3, \T6, \T6

                vmovdqu 16*7(arg1), \T1
                vaesenc \T1, \XMM1, \XMM1
                vaesenc \T1, \XMM2, \XMM2
                vaesenc \T1, \XMM3, \XMM3
                vaesenc \T1, \XMM4, \XMM4
                vaesenc \T1, \XMM5, \XMM5
                vaesenc \T1, \XMM6, \XMM6
                vaesenc \T1, \XMM7, \XMM7
                vaesenc \T1, \XMM8, \XMM8

        vmovdqa         TMP6(%rsp), \T1
        vmovdqu         HashKey_3(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpshufd         $0b01001110, \T1, \T3
        vpxor           \T1, \T3, \T3
        vmovdqu         HashKey_3_k(arg2), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor           \T3, \T6, \T6


                vmovdqu 16*8(arg1), \T1
                vaesenc \T1, \XMM1, \XMM1
                vaesenc \T1, \XMM2, \XMM2
                vaesenc \T1, \XMM3, \XMM3
                vaesenc \T1, \XMM4, \XMM4
                vaesenc \T1, \XMM5, \XMM5
                vaesenc \T1, \XMM6, \XMM6
                vaesenc \T1, \XMM7, \XMM7
                vaesenc \T1, \XMM8, \XMM8

        vmovdqa         TMP7(%rsp), \T1
        vmovdqu         HashKey_2(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpshufd         $0b01001110, \T1, \T3
        vpxor           \T1, \T3, \T3
        vmovdqu         HashKey_2_k(arg2), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor           \T3, \T6, \T6

        #######################################################################

                vmovdqu 16*9(arg1), \T5
                vaesenc \T5, \XMM1, \XMM1
                vaesenc \T5, \XMM2, \XMM2
                vaesenc \T5, \XMM3, \XMM3
                vaesenc \T5, \XMM4, \XMM4
                vaesenc \T5, \XMM5, \XMM5
                vaesenc \T5, \XMM6, \XMM6
                vaesenc \T5, \XMM7, \XMM7
                vaesenc \T5, \XMM8, \XMM8

        # GHASH block 8 (newest) against HashKey^1
        vmovdqa         TMP8(%rsp), \T1
        vmovdqu         HashKey(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpshufd         $0b01001110, \T1, \T3
        vpxor           \T1, \T3, \T3
        vmovdqu         HashKey_k(arg2), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor           \T3, \T6, \T6

        vpxor           \T4, \T6, \T6
        vpxor           \T7, \T6, \T6

                vmovdqu 16*10(arg1), \T5

        # extra AES rounds for AES-192/256 (REP > 9)
        i = 11
        setreg
.rep (\REP-9)

        vaesenc \T5, \XMM1, \XMM1
        vaesenc \T5, \XMM2, \XMM2
        vaesenc \T5, \XMM3, \XMM3
        vaesenc \T5, \XMM4, \XMM4
        vaesenc \T5, \XMM5, \XMM5
        vaesenc \T5, \XMM6, \XMM6
        vaesenc \T5, \XMM7, \XMM7
        vaesenc \T5, \XMM8, \XMM8

        vmovdqu 16*i(arg1), \T5
        i = i + 1
        setreg
.endr

	# last AES round fused with the plaintext XOR (T5 = last round key);
	# for DEC, keep the input ciphertext in reg_j for the next GHASH pass
	i = 0
	j = 1
	setreg
.rep 8
		vpxor	16*i(arg4, %r11), \T5, \T2
                .if \ENC_DEC == ENC
                vaesenclast     \T2, reg_j, reg_j
                .else
                vaesenclast     \T2, reg_j, \T3
                vmovdqu 16*i(arg4, %r11), reg_j
                vmovdqu \T3, 16*i(arg3, %r11)
                .endif
	i = (i+1)
	j = (j+1)
	setreg
.endr
	#######################################################################


	vpslldq	$8, \T6, \T3				# shift-L T3 2 DWs
	vpsrldq	$8, \T6, \T6				# shift-R T6 2 DWs
	vpxor	\T3, \T7, \T7
	vpxor	\T4, \T6, \T6				# accumulate the results in T6:T7



	#######################################################################
	#first phase of the reduction
	#######################################################################
        vpslld  $31, \T7, \T2                           # packed left shift << 31
        vpslld  $30, \T7, \T3                           # packed left shift << 30
        vpslld  $25, \T7, \T4                           # packed left shift << 25

        vpxor   \T3, \T2, \T2                           # xor the shifted versions
        vpxor   \T4, \T2, \T2

        vpsrldq $4, \T2, \T1                            # shift-R T1 1 DW

        vpslldq $12, \T2, \T2                           # shift-L T2 3 DWs
        vpxor   \T2, \T7, \T7                           # first phase of the reduction complete
	#######################################################################
                .if \ENC_DEC == ENC
		vmovdqu	 \XMM1,	16*0(arg3,%r11)		# Write to the Ciphertext buffer
		vmovdqu	 \XMM2,	16*1(arg3,%r11)		# Write to the Ciphertext buffer
		vmovdqu	 \XMM3,	16*2(arg3,%r11)		# Write to the Ciphertext buffer
		vmovdqu	 \XMM4,	16*3(arg3,%r11)		# Write to the Ciphertext buffer
		vmovdqu	 \XMM5,	16*4(arg3,%r11)		# Write to the Ciphertext buffer
		vmovdqu	 \XMM6,	16*5(arg3,%r11)		# Write to the Ciphertext buffer
		vmovdqu	 \XMM7,	16*6(arg3,%r11)		# Write to the Ciphertext buffer
		vmovdqu	 \XMM8,	16*7(arg3,%r11)		# Write to the Ciphertext buffer
                .endif

	#######################################################################
	#second phase of the reduction
        vpsrld  $1, \T7, \T2                            # packed right shift >> 1
        vpsrld  $2, \T7, \T3                            # packed right shift >> 2
        vpsrld  $7, \T7, \T4                            # packed right shift >> 7
        vpxor   \T3, \T2, \T2                           # xor the shifted versions
        vpxor   \T4, \T2, \T2

        vpxor   \T1, \T2, \T2
        vpxor   \T2, \T7, \T7
        vpxor   \T7, \T6, \T6                           # the result is in T6
	#######################################################################

		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap
		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap
		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap
		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap
		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap
		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap
		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap
		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap

	# fold the reduced hash into the first block for the next iteration
	vpxor	\T6, \XMM1, \XMM1



.endm
1600
1601
# GHASH the last 8 ciphertext blocks.
########################################################################
# GHASH_LAST_8_AVX:
# GHASH the last 8 ciphertext blocks (\XMM1..\XMM8) into a single digest.
#
# Karatsuba method: block i is multiplied by HashKey_(9-i), and three
# partial products are accumulated across all 8 blocks:
#   \T6   accumulates a1*b1            (high halves)
#   \T7   accumulates a0*b0            (low halves)
#   \XMM1 accumulates (a1^a0)*(b1^b0)  (middle terms, via the
#                                       precomputed HashKey_i_k values)
# The sums are then recombined into a 256-bit product <T6:T7> and
# reduced modulo the GHASH polynomial in two phases.
#
# Output:   the reduced 128-bit digest is left in \T6.
# Clobbers: \T1-\T7 and \XMM1.  \XMM2-\XMM8 are read-only.
# arg2 is the context pointer holding the precomputed hash keys.
########################################################################
.macro  GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8

        ## Karatsuba Method

        # block 1 * HashKey^8 -- initializes the three accumulators
        vpshufd         $0b01001110, \XMM1, \T2
        vpxor           \XMM1, \T2, \T2         # T2 = a1^a0
        vmovdqu         HashKey_8(arg2), \T5
        vpclmulqdq      $0x11, \T5, \XMM1, \T6  # T6 = a1*b1
        vpclmulqdq      $0x00, \T5, \XMM1, \T7  # T7 = a0*b0

        vmovdqu         HashKey_8_k(arg2), \T3
        vpclmulqdq      $0x00, \T3, \T2, \XMM1  # XMM1 = (a1^a0)*(b1^b0)

        ###################### block 2 * HashKey^7

        vpshufd         $0b01001110, \XMM2, \T2
        vpxor           \XMM2, \T2, \T2
        vmovdqu         HashKey_7(arg2), \T5
        vpclmulqdq      $0x11, \T5, \XMM2, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM2, \T4
        vpxor           \T4, \T7, \T7

        vmovdqu         HashKey_7_k(arg2), \T3
        vpclmulqdq      $0x00, \T3, \T2, \T2
        vpxor           \T2, \XMM1, \XMM1

        ###################### block 3 * HashKey^6

        vpshufd         $0b01001110, \XMM3, \T2
        vpxor           \XMM3, \T2, \T2
        vmovdqu         HashKey_6(arg2), \T5
        vpclmulqdq      $0x11, \T5, \XMM3, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM3, \T4
        vpxor           \T4, \T7, \T7

        vmovdqu         HashKey_6_k(arg2), \T3
        vpclmulqdq      $0x00, \T3, \T2, \T2
        vpxor           \T2, \XMM1, \XMM1

        ###################### block 4 * HashKey^5

        vpshufd         $0b01001110, \XMM4, \T2
        vpxor           \XMM4, \T2, \T2
        vmovdqu         HashKey_5(arg2), \T5
        vpclmulqdq      $0x11, \T5, \XMM4, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM4, \T4
        vpxor           \T4, \T7, \T7

        vmovdqu         HashKey_5_k(arg2), \T3
        vpclmulqdq      $0x00, \T3, \T2, \T2
        vpxor           \T2, \XMM1, \XMM1

        ###################### block 5 * HashKey^4

        vpshufd         $0b01001110, \XMM5, \T2
        vpxor           \XMM5, \T2, \T2
        vmovdqu         HashKey_4(arg2), \T5
        vpclmulqdq      $0x11, \T5, \XMM5, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM5, \T4
        vpxor           \T4, \T7, \T7

        vmovdqu         HashKey_4_k(arg2), \T3
        vpclmulqdq      $0x00, \T3, \T2, \T2
        vpxor           \T2, \XMM1, \XMM1

        ###################### block 6 * HashKey^3

        vpshufd         $0b01001110, \XMM6, \T2
        vpxor           \XMM6, \T2, \T2
        vmovdqu         HashKey_3(arg2), \T5
        vpclmulqdq      $0x11, \T5, \XMM6, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM6, \T4
        vpxor           \T4, \T7, \T7

        vmovdqu         HashKey_3_k(arg2), \T3
        vpclmulqdq      $0x00, \T3, \T2, \T2
        vpxor           \T2, \XMM1, \XMM1

        ###################### block 7 * HashKey^2

        vpshufd         $0b01001110, \XMM7, \T2
        vpxor           \XMM7, \T2, \T2
        vmovdqu         HashKey_2(arg2), \T5
        vpclmulqdq      $0x11, \T5, \XMM7, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM7, \T4
        vpxor           \T4, \T7, \T7

        vmovdqu         HashKey_2_k(arg2), \T3
        vpclmulqdq      $0x00, \T3, \T2, \T2
        vpxor           \T2, \XMM1, \XMM1

        ###################### block 8 * HashKey^1

        vpshufd         $0b01001110, \XMM8, \T2
        vpxor           \XMM8, \T2, \T2
        vmovdqu         HashKey(arg2), \T5
        vpclmulqdq      $0x11, \T5, \XMM8, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM8, \T4
        vpxor           \T4, \T7, \T7

        vmovdqu         HashKey_k(arg2), \T3
        vpclmulqdq      $0x00, \T3, \T2, \T2

        # Karatsuba recombination: middle = sum(mid) ^ sum(hi) ^ sum(lo)
        vpxor           \T2, \XMM1, \XMM1
        vpxor           \T6, \XMM1, \XMM1
        vpxor           \T7, \XMM1, \T2         # T2 = middle-term correction


        # split the 128-bit correction across the two product halves
        vpslldq $8, \T2, \T4
        vpsrldq $8, \T2, \T2

        vpxor   \T4, \T7, \T7
        vpxor   \T2, \T6, \T6   # <T6:T7> holds the result of
				# the accumulated carry-less multiplications

        #######################################################################
        #first phase of the reduction
        vpslld  $31, \T7, \T2   # packed left shifting << 31
        vpslld  $30, \T7, \T3   # packed left shifting << 30
        vpslld  $25, \T7, \T4   # packed left shifting << 25

        vpxor   \T3, \T2, \T2   # xor the shifted versions
        vpxor   \T4, \T2, \T2

        vpsrldq $4, \T2, \T1    # shift-R T1 1 DW

        vpslldq $12, \T2, \T2   # shift-L T2 3 DWs
        vpxor   \T2, \T7, \T7   # first phase of the reduction complete
        #######################################################################


        #second phase of the reduction
        vpsrld  $1, \T7, \T2    # packed right shifting >> 1
        vpsrld  $2, \T7, \T3    # packed right shifting >> 2
        vpsrld  $7, \T7, \T4    # packed right shifting >> 7
        vpxor   \T3, \T2, \T2   # xor the shifted versions
        vpxor   \T4, \T2, \T2

        vpxor   \T1, \T2, \T2
        vpxor   \T2, \T7, \T7
        vpxor   \T7, \T6, \T6   # the result is in T6

.endm
1763
1764#############################################################
1765#void   aesni_gcm_precomp_avx_gen2
1766#        (gcm_data     *my_ctx_data,
1767#         gcm_context_data *data,
1768#        u8     *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
1769#        u8      *iv, /* Pre-counter block j0: 4 byte salt
1770#			(from Security Association) concatenated with 8 byte
1771#			Initialisation Vector (from IPSec ESP Payload)
1772#			concatenated with 0x00000001. 16-byte aligned pointer. */
1773#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
1774#        u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1775#############################################################
SYM_FUNC_START(aesni_gcm_init_avx_gen2)
        FUNC_SAVE
        # Derive the hash subkey and precompute its powers (AVX code path).
        INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
        FUNC_RESTORE
        ret
SYM_FUNC_END(aesni_gcm_init_avx_gen2)
1782
1783###############################################################################
1784#void   aesni_gcm_enc_update_avx_gen2(
1785#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
1786#        gcm_context_data *data,
1787#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
1788#        const   u8 *in, /* Plaintext input */
1789#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
1790###############################################################################
SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2)
        FUNC_SAVE
        # Dispatch on AES key size (in bytes).  The last GCM_ENC_DEC
        # argument is the number of vaesenc rounds before the final
        # vaesenclast: 9/11/13 for AES-128/192/256 respectively.
        mov     keysize, %eax
        cmp     $32, %eax
        je      key_256_enc_update
        cmp     $16, %eax
        je      key_128_enc_update
        # must be 192
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
        FUNC_RESTORE
        ret
key_128_enc_update:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
        FUNC_RESTORE
        ret
key_256_enc_update:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
        FUNC_RESTORE
        ret
SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2)
1811
1812###############################################################################
1813#void   aesni_gcm_dec_update_avx_gen2(
1814#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
1815#        gcm_context_data *data,
1816#        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
1817#        const   u8 *in, /* Ciphertext input */
#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
1819###############################################################################
SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2)
        FUNC_SAVE
        # Dispatch on AES key size (in bytes); same round counts as the
        # encrypt path (9/11/13 for AES-128/192/256), direction = DEC.
        mov     keysize,%eax
        cmp     $32, %eax
        je      key_256_dec_update
        cmp     $16, %eax
        je      key_128_dec_update
        # must be 192
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
        FUNC_RESTORE
        ret
key_128_dec_update:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
        FUNC_RESTORE
        ret
key_256_dec_update:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
        FUNC_RESTORE
        ret
SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2)
1840
1841###############################################################################
1842#void   aesni_gcm_finalize_avx_gen2(
1843#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
1844#        gcm_context_data *data,
1845#        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
1847#				Valid values are 16 (most likely), 12 or 8. */
1848###############################################################################
SYM_FUNC_START(aesni_gcm_finalize_avx_gen2)
        FUNC_SAVE
        # Dispatch on AES key size; GCM_COMPLETE writes the tag to
        # arg3 (auth_tag) with length arg4 (auth_tag_len).
        mov	keysize,%eax
        cmp     $32, %eax
        je      key_256_finalize
        cmp     $16, %eax
        je      key_128_finalize
        # must be 192
        GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
        FUNC_RESTORE
        ret
key_128_finalize:
        GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
        FUNC_RESTORE
        ret
key_256_finalize:
        GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
        FUNC_RESTORE
        ret
SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
1869
1870###############################################################################
1871# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
1872# Input: A and B (128-bits each, bit-reflected)
1873# Output: C = A*B*x mod poly, (i.e. >>1 )
1874# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1875# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
1876###############################################################################
.macro  GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
        # Computes \GH = \GH * \HK mod the GHASH polynomial, using four
        # vpclmulqdq products and a two-phase reduction via POLY2.
        # \T1-\T3 are scratch; \T4 and \T5 are not used by this variant
        # (kept so callers can pass the same register list as the other
        # GHASH_MUL macros).

        vpclmulqdq      $0x11,\HK,\GH,\T1      # T1 = a1*b1
        vpclmulqdq      $0x00,\HK,\GH,\T2      # T2 = a0*b0
        vpclmulqdq      $0x01,\HK,\GH,\T3      # T3 = a1*b0
        vpclmulqdq      $0x10,\HK,\GH,\GH      # GH = a0*b1
        vpxor           \T3, \GH, \GH          # GH = middle terms a1*b0 ^ a0*b1


        # split the 128-bit middle sum across the high (T1) and low (GH) halves
        vpsrldq         $8 , \GH, \T3          # shift-R GH 2 DWs
        vpslldq         $8 , \GH, \GH          # shift-L GH 2 DWs

        vpxor           \T3, \T1, \T1          # T1 = high 128 bits of product
        vpxor           \T2, \GH, \GH          # GH = low 128 bits of product

        #######################################################################
        #first phase of the reduction
        vmovdqa         POLY2(%rip), \T3

        vpclmulqdq      $0x01, \GH, \T3, \T2
        vpslldq         $8, \T2, \T2           # shift-L T2 2 DWs

        vpxor           \T2, \GH, \GH          # first phase of the reduction complete
        #######################################################################
        #second phase of the reduction
        vpclmulqdq      $0x00, \GH, \T3, \T2
        vpsrldq         $4, \T2, \T2           # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      $0x10, \GH, \T3, \GH
        vpslldq         $4, \GH, \GH           # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor           \T2, \GH, \GH          # second phase of the reduction complete
        #######################################################################
        vpxor           \T1, \GH, \GH          # the result is in GH


.endm
1914
.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6

        # Precompute HashKey^i (each <<1 mod poly) for i = 2..8 and store
        # them in the context at HashKey_2..HashKey_8, for the 8-way
        # parallel GHASH code.  Unlike the AVX-gen2 PRECOMPUTE_AVX, no
        # Karatsuba HashKey_i_k values are stored: the AVX2 GHASH path
        # uses four carry-less multiplies per block and does not need them.
        # \HK holds HashKey<<1 mod poly on entry and is preserved;
        # \T1-\T6 are scratch.
        vmovdqa  \HK, \T5
        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^2<<1 mod poly
        vmovdqu  \T5, HashKey_2(arg2)                       #  [HashKey_2] = HashKey^2<<1 mod poly

        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^3<<1 mod poly
        vmovdqu  \T5, HashKey_3(arg2)

        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^4<<1 mod poly
        vmovdqu  \T5, HashKey_4(arg2)

        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^5<<1 mod poly
        vmovdqu  \T5, HashKey_5(arg2)

        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^6<<1 mod poly
        vmovdqu  \T5, HashKey_6(arg2)

        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^7<<1 mod poly
        vmovdqu  \T5, HashKey_7(arg2)

        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^8<<1 mod poly
        vmovdqu  \T5, HashKey_8(arg2)

.endm
1941
## if a = number of total plaintext bytes
## b = floor(a/16)
## num_initial_blocks = b mod 8
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered
## arg1, arg3, arg4, r14 are used as a pointer only, not modified
1948
########################################################################
# INITIAL_BLOCKS_AVX2:
# Encrypt the first \num_initial_blocks (0..7) counter blocks, XOR with
# the input, write the result, and fold each ciphertext block into the
# running GHASH.  If at least 128 bytes remain (%r13), additionally
# prepare and encrypt 8 counter blocks (\XMM1-\XMM8) so the main loop
# can proceed 8 blocks at a time.
# reg_i / reg_j expand to xmm registers selected by the i/j counters via
# setreg (defined elsewhere in this file).
# arg1 = expanded AES key, arg3 = output, arg4 = input, %r11 = data
# offset (advanced here), \REP = number of vaesenc rounds.
########################################################################
.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
	i = (8-\num_initial_blocks)
	setreg
	vmovdqu AadHash(arg2), reg_i            # reg_i = current GHASH state

	# start AES for num_initial_blocks blocks
	vmovdqu CurCount(arg2), \CTR

	# build the counter blocks (byte-swapped for AES)
	i = (9-\num_initial_blocks)
	setreg
.rep \num_initial_blocks
                vpaddd  ONE(%rip), \CTR, \CTR   # INCR Y0
                vmovdqa \CTR, reg_i
                vpshufb SHUF_MASK(%rip), reg_i, reg_i     # perform a 16Byte swap
	i = (i+1)
	setreg
.endr

	# round 0: whitening with the first round key
	vmovdqa  (arg1), \T_key
	i = (9-\num_initial_blocks)
	setreg
.rep \num_initial_blocks
                vpxor   \T_key, reg_i, reg_i
	i = (i+1)
	setreg
.endr

	# rounds 1..REP
	j = 1
	setreg
.rep \REP
	vmovdqa  16*j(arg1), \T_key
	i = (9-\num_initial_blocks)
	setreg
.rep \num_initial_blocks
        vaesenc \T_key, reg_i, reg_i
	i = (i+1)
	setreg
.endr

	j = (j+1)
	setreg
.endr

	# final round
	vmovdqa  16*j(arg1), \T_key
	i = (9-\num_initial_blocks)
	setreg
.rep \num_initial_blocks
        vaesenclast      \T_key, reg_i, reg_i
	i = (i+1)
	setreg
.endr

	# XOR keystream with input, write output; for DEC keep the
	# ciphertext (the GHASH input) in reg_i
	i = (9-\num_initial_blocks)
	setreg
.rep \num_initial_blocks
                vmovdqu (arg4, %r11), \T1
                vpxor   \T1, reg_i, reg_i
                vmovdqu reg_i, (arg3 , %r11)           # write back ciphertext for
						       # num_initial_blocks blocks
                add     $16, %r11
.if  \ENC_DEC == DEC
                vmovdqa \T1, reg_i
.endif
                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
	i = (i+1)
	setreg
.endr

	# fold each block into the GHASH state, chaining the result forward
	i = (8-\num_initial_blocks)
	j = (9-\num_initial_blocks)
	setreg

.rep \num_initial_blocks
        vpxor    reg_i, reg_j, reg_j
        GHASH_MUL_AVX2       reg_j, \T2, \T1, \T3, \T4, \T5, \T6  # apply GHASH on num_initial_blocks blocks
	i = (i+1)
	j = (j+1)
	setreg
.endr
        # XMM8 has the combined result here

        vmovdqa  \XMM8, TMP1(%rsp)
        vmovdqa  \XMM8, \T3

        cmp     $128, %r13
        jl      _initial_blocks_done\@                  # no need for precomputed constants

###############################################################################
# At least 128 bytes remain: prepare and encrypt 8 counter blocks
# (XMM1-XMM8) ahead of the 8-way parallel main loop.
                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
                vmovdqa  \CTR, \XMM1
                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap

                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
                vmovdqa  \CTR, \XMM2
                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap

                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
                vmovdqa  \CTR, \XMM3
                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap

                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
                vmovdqa  \CTR, \XMM4
                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap

                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
                vmovdqa  \CTR, \XMM5
                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap

                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
                vmovdqa  \CTR, \XMM6
                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap

                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
                vmovdqa  \CTR, \XMM7
                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap

                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
                vmovdqa  \CTR, \XMM8
                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap

                # round 0 (whitening)
                vmovdqa  (arg1), \T_key
                vpxor    \T_key, \XMM1, \XMM1
                vpxor    \T_key, \XMM2, \XMM2
                vpxor    \T_key, \XMM3, \XMM3
                vpxor    \T_key, \XMM4, \XMM4
                vpxor    \T_key, \XMM5, \XMM5
                vpxor    \T_key, \XMM6, \XMM6
                vpxor    \T_key, \XMM7, \XMM7
                vpxor    \T_key, \XMM8, \XMM8

		i = 1
		setreg
.rep    \REP       # do REP rounds
                vmovdqa  16*i(arg1), \T_key
                vaesenc  \T_key, \XMM1, \XMM1
                vaesenc  \T_key, \XMM2, \XMM2
                vaesenc  \T_key, \XMM3, \XMM3
                vaesenc  \T_key, \XMM4, \XMM4
                vaesenc  \T_key, \XMM5, \XMM5
                vaesenc  \T_key, \XMM6, \XMM6
                vaesenc  \T_key, \XMM7, \XMM7
                vaesenc  \T_key, \XMM8, \XMM8
		i = (i+1)
		setreg
.endr

                # final round
                vmovdqa  16*i(arg1), \T_key
                vaesenclast  \T_key, \XMM1, \XMM1
                vaesenclast  \T_key, \XMM2, \XMM2
                vaesenclast  \T_key, \XMM3, \XMM3
                vaesenclast  \T_key, \XMM4, \XMM4
                vaesenclast  \T_key, \XMM5, \XMM5
                vaesenclast  \T_key, \XMM6, \XMM6
                vaesenclast  \T_key, \XMM7, \XMM7
                vaesenclast  \T_key, \XMM8, \XMM8

                # XOR with input / write output; for DEC keep the
                # ciphertext in XMMi for the upcoming GHASH
                vmovdqu  (arg4, %r11), \T1
                vpxor    \T1, \XMM1, \XMM1
                vmovdqu  \XMM1, (arg3 , %r11)
                .if   \ENC_DEC == DEC
                vmovdqa  \T1, \XMM1
                .endif

                vmovdqu  16*1(arg4, %r11), \T1
                vpxor    \T1, \XMM2, \XMM2
                vmovdqu  \XMM2, 16*1(arg3 , %r11)
                .if   \ENC_DEC == DEC
                vmovdqa  \T1, \XMM2
                .endif

                vmovdqu  16*2(arg4, %r11), \T1
                vpxor    \T1, \XMM3, \XMM3
                vmovdqu  \XMM3, 16*2(arg3 , %r11)
                .if   \ENC_DEC == DEC
                vmovdqa  \T1, \XMM3
                .endif

                vmovdqu  16*3(arg4, %r11), \T1
                vpxor    \T1, \XMM4, \XMM4
                vmovdqu  \XMM4, 16*3(arg3 , %r11)
                .if   \ENC_DEC == DEC
                vmovdqa  \T1, \XMM4
                .endif

                vmovdqu  16*4(arg4, %r11), \T1
                vpxor    \T1, \XMM5, \XMM5
                vmovdqu  \XMM5, 16*4(arg3 , %r11)
                .if   \ENC_DEC == DEC
                vmovdqa  \T1, \XMM5
                .endif

                vmovdqu  16*5(arg4, %r11), \T1
                vpxor    \T1, \XMM6, \XMM6
                vmovdqu  \XMM6, 16*5(arg3 , %r11)
                .if   \ENC_DEC == DEC
                vmovdqa  \T1, \XMM6
                .endif

                vmovdqu  16*6(arg4, %r11), \T1
                vpxor    \T1, \XMM7, \XMM7
                vmovdqu  \XMM7, 16*6(arg3 , %r11)
                .if   \ENC_DEC == DEC
                vmovdqa  \T1, \XMM7
                .endif

                vmovdqu  16*7(arg4, %r11), \T1
                vpxor    \T1, \XMM8, \XMM8
                vmovdqu  \XMM8, 16*7(arg3 , %r11)
                .if   \ENC_DEC == DEC
                vmovdqa  \T1, \XMM8
                .endif

                add     $128, %r11

                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with
							   # the corresponding ciphertext
                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap

###############################################################################

_initial_blocks_done\@:


.endm
2184
2185
2186
2187# encrypt 8 blocks at a time
2188# ghash the 8 previously encrypted ciphertext blocks
2189# arg1, arg3, arg4 are used as pointers only, not modified
2190# r11 is the data offset value
2191.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
2192
2193        vmovdqa \XMM1, \T2
2194        vmovdqa \XMM2, TMP2(%rsp)
2195        vmovdqa \XMM3, TMP3(%rsp)
2196        vmovdqa \XMM4, TMP4(%rsp)
2197        vmovdqa \XMM5, TMP5(%rsp)
2198        vmovdqa \XMM6, TMP6(%rsp)
2199        vmovdqa \XMM7, TMP7(%rsp)
2200        vmovdqa \XMM8, TMP8(%rsp)
2201
2202.if \loop_idx == in_order
2203                vpaddd  ONE(%rip), \CTR, \XMM1            # INCR CNT
2204                vpaddd  ONE(%rip), \XMM1, \XMM2
2205                vpaddd  ONE(%rip), \XMM2, \XMM3
2206                vpaddd  ONE(%rip), \XMM3, \XMM4
2207                vpaddd  ONE(%rip), \XMM4, \XMM5
2208                vpaddd  ONE(%rip), \XMM5, \XMM6
2209                vpaddd  ONE(%rip), \XMM6, \XMM7
2210                vpaddd  ONE(%rip), \XMM7, \XMM8
2211                vmovdqa \XMM8, \CTR
2212
2213                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
2214                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
2215                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
2216                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
2217                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
2218                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
2219                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
2220                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
2221.else
2222                vpaddd  ONEf(%rip), \CTR, \XMM1            # INCR CNT
2223                vpaddd  ONEf(%rip), \XMM1, \XMM2
2224                vpaddd  ONEf(%rip), \XMM2, \XMM3
2225                vpaddd  ONEf(%rip), \XMM3, \XMM4
2226                vpaddd  ONEf(%rip), \XMM4, \XMM5
2227                vpaddd  ONEf(%rip), \XMM5, \XMM6
2228                vpaddd  ONEf(%rip), \XMM6, \XMM7
2229                vpaddd  ONEf(%rip), \XMM7, \XMM8
2230                vmovdqa \XMM8, \CTR
2231.endif
2232
2233
2234        #######################################################################
2235
2236                vmovdqu (arg1), \T1
2237                vpxor   \T1, \XMM1, \XMM1
2238                vpxor   \T1, \XMM2, \XMM2
2239                vpxor   \T1, \XMM3, \XMM3
2240                vpxor   \T1, \XMM4, \XMM4
2241                vpxor   \T1, \XMM5, \XMM5
2242                vpxor   \T1, \XMM6, \XMM6
2243                vpxor   \T1, \XMM7, \XMM7
2244                vpxor   \T1, \XMM8, \XMM8
2245
2246        #######################################################################
2247
2248
2249
2250
2251
2252                vmovdqu 16*1(arg1), \T1
2253                vaesenc \T1, \XMM1, \XMM1
2254                vaesenc \T1, \XMM2, \XMM2
2255                vaesenc \T1, \XMM3, \XMM3
2256                vaesenc \T1, \XMM4, \XMM4
2257                vaesenc \T1, \XMM5, \XMM5
2258                vaesenc \T1, \XMM6, \XMM6
2259                vaesenc \T1, \XMM7, \XMM7
2260                vaesenc \T1, \XMM8, \XMM8
2261
2262                vmovdqu 16*2(arg1), \T1
2263                vaesenc \T1, \XMM1, \XMM1
2264                vaesenc \T1, \XMM2, \XMM2
2265                vaesenc \T1, \XMM3, \XMM3
2266                vaesenc \T1, \XMM4, \XMM4
2267                vaesenc \T1, \XMM5, \XMM5
2268                vaesenc \T1, \XMM6, \XMM6
2269                vaesenc \T1, \XMM7, \XMM7
2270                vaesenc \T1, \XMM8, \XMM8
2271
2272
2273        #######################################################################
2274
2275        vmovdqu         HashKey_8(arg2), \T5
2276        vpclmulqdq      $0x11, \T5, \T2, \T4              # T4 = a1*b1
2277        vpclmulqdq      $0x00, \T5, \T2, \T7              # T7 = a0*b0
2278        vpclmulqdq      $0x01, \T5, \T2, \T6              # T6 = a1*b0
2279        vpclmulqdq      $0x10, \T5, \T2, \T5              # T5 = a0*b1
2280        vpxor           \T5, \T6, \T6
2281
2282                vmovdqu 16*3(arg1), \T1
2283                vaesenc \T1, \XMM1, \XMM1
2284                vaesenc \T1, \XMM2, \XMM2
2285                vaesenc \T1, \XMM3, \XMM3
2286                vaesenc \T1, \XMM4, \XMM4
2287                vaesenc \T1, \XMM5, \XMM5
2288                vaesenc \T1, \XMM6, \XMM6
2289                vaesenc \T1, \XMM7, \XMM7
2290                vaesenc \T1, \XMM8, \XMM8
2291
2292        vmovdqa         TMP2(%rsp), \T1
2293        vmovdqu         HashKey_7(arg2), \T5
2294        vpclmulqdq      $0x11, \T5, \T1, \T3
2295        vpxor           \T3, \T4, \T4
2296
2297        vpclmulqdq      $0x00, \T5, \T1, \T3
2298        vpxor           \T3, \T7, \T7
2299
2300        vpclmulqdq      $0x01, \T5, \T1, \T3
2301        vpxor           \T3, \T6, \T6
2302
2303        vpclmulqdq      $0x10, \T5, \T1, \T3
2304        vpxor           \T3, \T6, \T6
2305
2306                vmovdqu 16*4(arg1), \T1
2307                vaesenc \T1, \XMM1, \XMM1
2308                vaesenc \T1, \XMM2, \XMM2
2309                vaesenc \T1, \XMM3, \XMM3
2310                vaesenc \T1, \XMM4, \XMM4
2311                vaesenc \T1, \XMM5, \XMM5
2312                vaesenc \T1, \XMM6, \XMM6
2313                vaesenc \T1, \XMM7, \XMM7
2314                vaesenc \T1, \XMM8, \XMM8
2315
2316        #######################################################################
2317
2318        vmovdqa         TMP3(%rsp), \T1
2319        vmovdqu         HashKey_6(arg2), \T5
2320        vpclmulqdq      $0x11, \T5, \T1, \T3
2321        vpxor           \T3, \T4, \T4
2322
2323        vpclmulqdq      $0x00, \T5, \T1, \T3
2324        vpxor           \T3, \T7, \T7
2325
2326        vpclmulqdq      $0x01, \T5, \T1, \T3
2327        vpxor           \T3, \T6, \T6
2328
2329        vpclmulqdq      $0x10, \T5, \T1, \T3
2330        vpxor           \T3, \T6, \T6
2331
2332                vmovdqu 16*5(arg1), \T1
2333                vaesenc \T1, \XMM1, \XMM1
2334                vaesenc \T1, \XMM2, \XMM2
2335                vaesenc \T1, \XMM3, \XMM3
2336                vaesenc \T1, \XMM4, \XMM4
2337                vaesenc \T1, \XMM5, \XMM5
2338                vaesenc \T1, \XMM6, \XMM6
2339                vaesenc \T1, \XMM7, \XMM7
2340                vaesenc \T1, \XMM8, \XMM8
2341
2342        vmovdqa         TMP4(%rsp), \T1
2343        vmovdqu         HashKey_5(arg2), \T5
2344        vpclmulqdq      $0x11, \T5, \T1, \T3
2345        vpxor           \T3, \T4, \T4
2346
2347        vpclmulqdq      $0x00, \T5, \T1, \T3
2348        vpxor           \T3, \T7, \T7
2349
2350        vpclmulqdq      $0x01, \T5, \T1, \T3
2351        vpxor           \T3, \T6, \T6
2352
2353        vpclmulqdq      $0x10, \T5, \T1, \T3
2354        vpxor           \T3, \T6, \T6
2355
2356                vmovdqu 16*6(arg1), \T1
2357                vaesenc \T1, \XMM1, \XMM1
2358                vaesenc \T1, \XMM2, \XMM2
2359                vaesenc \T1, \XMM3, \XMM3
2360                vaesenc \T1, \XMM4, \XMM4
2361                vaesenc \T1, \XMM5, \XMM5
2362                vaesenc \T1, \XMM6, \XMM6
2363                vaesenc \T1, \XMM7, \XMM7
2364                vaesenc \T1, \XMM8, \XMM8
2365
2366
2367        vmovdqa         TMP5(%rsp), \T1
2368        vmovdqu         HashKey_4(arg2), \T5
2369        vpclmulqdq      $0x11, \T5, \T1, \T3
2370        vpxor           \T3, \T4, \T4
2371
2372        vpclmulqdq      $0x00, \T5, \T1, \T3
2373        vpxor           \T3, \T7, \T7
2374
2375        vpclmulqdq      $0x01, \T5, \T1, \T3
2376        vpxor           \T3, \T6, \T6
2377
2378        vpclmulqdq      $0x10, \T5, \T1, \T3
2379        vpxor           \T3, \T6, \T6
2380
2381                vmovdqu 16*7(arg1), \T1
2382                vaesenc \T1, \XMM1, \XMM1
2383                vaesenc \T1, \XMM2, \XMM2
2384                vaesenc \T1, \XMM3, \XMM3
2385                vaesenc \T1, \XMM4, \XMM4
2386                vaesenc \T1, \XMM5, \XMM5
2387                vaesenc \T1, \XMM6, \XMM6
2388                vaesenc \T1, \XMM7, \XMM7
2389                vaesenc \T1, \XMM8, \XMM8
2390
2391        vmovdqa         TMP6(%rsp), \T1
2392        vmovdqu         HashKey_3(arg2), \T5
2393        vpclmulqdq      $0x11, \T5, \T1, \T3
2394        vpxor           \T3, \T4, \T4
2395
2396        vpclmulqdq      $0x00, \T5, \T1, \T3
2397        vpxor           \T3, \T7, \T7
2398
2399        vpclmulqdq      $0x01, \T5, \T1, \T3
2400        vpxor           \T3, \T6, \T6
2401
2402        vpclmulqdq      $0x10, \T5, \T1, \T3
2403        vpxor           \T3, \T6, \T6
2404
2405                vmovdqu 16*8(arg1), \T1
2406                vaesenc \T1, \XMM1, \XMM1
2407                vaesenc \T1, \XMM2, \XMM2
2408                vaesenc \T1, \XMM3, \XMM3
2409                vaesenc \T1, \XMM4, \XMM4
2410                vaesenc \T1, \XMM5, \XMM5
2411                vaesenc \T1, \XMM6, \XMM6
2412                vaesenc \T1, \XMM7, \XMM7
2413                vaesenc \T1, \XMM8, \XMM8
2414
2415        vmovdqa         TMP7(%rsp), \T1
2416        vmovdqu         HashKey_2(arg2), \T5
2417        vpclmulqdq      $0x11, \T5, \T1, \T3
2418        vpxor           \T3, \T4, \T4
2419
2420        vpclmulqdq      $0x00, \T5, \T1, \T3
2421        vpxor           \T3, \T7, \T7
2422
2423        vpclmulqdq      $0x01, \T5, \T1, \T3
2424        vpxor           \T3, \T6, \T6
2425
2426        vpclmulqdq      $0x10, \T5, \T1, \T3
2427        vpxor           \T3, \T6, \T6
2428
2429
2430        #######################################################################
2431
2432                vmovdqu 16*9(arg1), \T5
2433                vaesenc \T5, \XMM1, \XMM1
2434                vaesenc \T5, \XMM2, \XMM2
2435                vaesenc \T5, \XMM3, \XMM3
2436                vaesenc \T5, \XMM4, \XMM4
2437                vaesenc \T5, \XMM5, \XMM5
2438                vaesenc \T5, \XMM6, \XMM6
2439                vaesenc \T5, \XMM7, \XMM7
2440                vaesenc \T5, \XMM8, \XMM8
2441
2442        vmovdqa         TMP8(%rsp), \T1
2443        vmovdqu         HashKey(arg2), \T5
2444
2445        vpclmulqdq      $0x00, \T5, \T1, \T3
2446        vpxor           \T3, \T7, \T7
2447
2448        vpclmulqdq      $0x01, \T5, \T1, \T3
2449        vpxor           \T3, \T6, \T6
2450
2451        vpclmulqdq      $0x10, \T5, \T1, \T3
2452        vpxor           \T3, \T6, \T6
2453
2454        vpclmulqdq      $0x11, \T5, \T1, \T3
2455        vpxor           \T3, \T4, \T1
2456
2457
2458                vmovdqu 16*10(arg1), \T5
2459
2460        i = 11
2461        setreg
2462.rep (\REP-9)
2463        vaesenc \T5, \XMM1, \XMM1
2464        vaesenc \T5, \XMM2, \XMM2
2465        vaesenc \T5, \XMM3, \XMM3
2466        vaesenc \T5, \XMM4, \XMM4
2467        vaesenc \T5, \XMM5, \XMM5
2468        vaesenc \T5, \XMM6, \XMM6
2469        vaesenc \T5, \XMM7, \XMM7
2470        vaesenc \T5, \XMM8, \XMM8
2471
2472        vmovdqu 16*i(arg1), \T5
2473        i = i + 1
2474        setreg
2475.endr
2476
2477	i = 0
2478	j = 1
2479	setreg
2480.rep 8
2481		vpxor	16*i(arg4, %r11), \T5, \T2
2482                .if \ENC_DEC == ENC
2483                vaesenclast     \T2, reg_j, reg_j
2484                .else
2485                vaesenclast     \T2, reg_j, \T3
2486                vmovdqu 16*i(arg4, %r11), reg_j
2487                vmovdqu \T3, 16*i(arg3, %r11)
2488                .endif
2489	i = (i+1)
2490	j = (j+1)
2491	setreg
2492.endr
2493	#######################################################################
2494
2495
2496	vpslldq	$8, \T6, \T3				# shift-L T3 2 DWs
2497	vpsrldq	$8, \T6, \T6				# shift-R T2 2 DWs
2498	vpxor	\T3, \T7, \T7
2499	vpxor	\T6, \T1, \T1				# accumulate the results in T1:T7
2500
2501
2502
2503	#######################################################################
2504	#first phase of the reduction
2505	vmovdqa         POLY2(%rip), \T3
2506
2507	vpclmulqdq	$0x01, \T7, \T3, \T2
2508	vpslldq		$8, \T2, \T2			# shift-L xmm2 2 DWs
2509
2510	vpxor		\T2, \T7, \T7			# first phase of the reduction complete
2511	#######################################################################
2512                .if \ENC_DEC == ENC
2513		vmovdqu	 \XMM1,	16*0(arg3,%r11)		# Write to the Ciphertext buffer
2514		vmovdqu	 \XMM2,	16*1(arg3,%r11)		# Write to the Ciphertext buffer
2515		vmovdqu	 \XMM3,	16*2(arg3,%r11)		# Write to the Ciphertext buffer
2516		vmovdqu	 \XMM4,	16*3(arg3,%r11)		# Write to the Ciphertext buffer
2517		vmovdqu	 \XMM5,	16*4(arg3,%r11)		# Write to the Ciphertext buffer
2518		vmovdqu	 \XMM6,	16*5(arg3,%r11)		# Write to the Ciphertext buffer
2519		vmovdqu	 \XMM7,	16*6(arg3,%r11)		# Write to the Ciphertext buffer
2520		vmovdqu	 \XMM8,	16*7(arg3,%r11)		# Write to the Ciphertext buffer
2521                .endif
2522
2523	#######################################################################
2524	#second phase of the reduction
2525	vpclmulqdq	$0x00, \T7, \T3, \T2
2526	vpsrldq		$4, \T2, \T2			# shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2527
2528	vpclmulqdq	$0x10, \T7, \T3, \T4
2529	vpslldq		$4, \T4, \T4			# shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
2530
2531	vpxor		\T2, \T4, \T4			# second phase of the reduction complete
2532	#######################################################################
2533	vpxor		\T4, \T1, \T1			# the result is in T1
2534
2535		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap
2536		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap
2537		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap
2538		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap
2539		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap
2540		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap
2541		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap
2542		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap
2543
2544
2545	vpxor	\T1, \XMM1, \XMM1
2546
2547
2548
2549.endm
2550
2551
# GHASH the last 8 ciphertext blocks.
.macro  GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8

        ## Karatsuba Method
        ##
        ## GHASH the 8 blocks held in \XMM1..\XMM8 against HashKey^8..HashKey^1
        ## (read from arg2) and reduce the 256-bit product modulo the GHASH
        ## polynomial.  Each 128x128-bit carry-less multiply is split
        ## Karatsuba-style into three VPCLMULQDQs:
        ##      high = Ah*Bh            accumulated in \T6
        ##      low  = Al*Bl            accumulated in \T7
        ##      mid  = (Ah^Al)*(Bh^Bl)  accumulated in \XMM1
        ## On exit the reduced 128-bit result is in \T6.
        ## \T2..\T5 are scratch; \T1 is not referenced by this macro.

        vmovdqu         HashKey_8(arg2), \T5

        # Karatsuba prep: \T2 = Ah^Al of block 1, \T3 = Bh^Bl of HashKey^8
        # (vpshufd with 0b01001110 swaps the two 64-bit halves)
        vpshufd         $0b01001110, \XMM1, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM1, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM1, \T6          # \T6 = high product
        vpclmulqdq      $0x00, \T5, \XMM1, \T7          # \T7 = low product

        vpclmulqdq      $0x00, \T3, \T2, \XMM1          # \XMM1 = middle product

        ######################
        # block 2 x HashKey^7

        vmovdqu         HashKey_7(arg2), \T5
        vpshufd         $0b01001110, \XMM2, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM2, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM2, \T4
        vpxor           \T4, \T6, \T6                   # accumulate high

        vpclmulqdq      $0x00, \T5, \XMM2, \T4
        vpxor           \T4, \T7, \T7                   # accumulate low

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1               # accumulate middle

        ######################
        # block 3 x HashKey^6

        vmovdqu         HashKey_6(arg2), \T5
        vpshufd         $0b01001110, \XMM3, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM3, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM3, \T4
        vpxor           \T4, \T6, \T6                   # accumulate high

        vpclmulqdq      $0x00, \T5, \XMM3, \T4
        vpxor           \T4, \T7, \T7                   # accumulate low

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1               # accumulate middle

        ######################
        # block 4 x HashKey^5

        vmovdqu         HashKey_5(arg2), \T5
        vpshufd         $0b01001110, \XMM4, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM4, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM4, \T4
        vpxor           \T4, \T6, \T6                   # accumulate high

        vpclmulqdq      $0x00, \T5, \XMM4, \T4
        vpxor           \T4, \T7, \T7                   # accumulate low

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1               # accumulate middle

        ######################
        # block 5 x HashKey^4

        vmovdqu         HashKey_4(arg2), \T5
        vpshufd         $0b01001110, \XMM5, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM5, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM5, \T4
        vpxor           \T4, \T6, \T6                   # accumulate high

        vpclmulqdq      $0x00, \T5, \XMM5, \T4
        vpxor           \T4, \T7, \T7                   # accumulate low

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1               # accumulate middle

        ######################
        # block 6 x HashKey^3

        vmovdqu         HashKey_3(arg2), \T5
        vpshufd         $0b01001110, \XMM6, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM6, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM6, \T4
        vpxor           \T4, \T6, \T6                   # accumulate high

        vpclmulqdq      $0x00, \T5, \XMM6, \T4
        vpxor           \T4, \T7, \T7                   # accumulate low

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1               # accumulate middle

        ######################
        # block 7 x HashKey^2

        vmovdqu         HashKey_2(arg2), \T5
        vpshufd         $0b01001110, \XMM7, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM7, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM7, \T4
        vpxor           \T4, \T6, \T6                   # accumulate high

        vpclmulqdq      $0x00, \T5, \XMM7, \T4
        vpxor           \T4, \T7, \T7                   # accumulate low

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1               # accumulate middle

        ######################
        # block 8 x HashKey^1

        vmovdqu         HashKey(arg2), \T5
        vpshufd         $0b01001110, \XMM8, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM8, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM8, \T4
        vpxor           \T4, \T6, \T6                   # accumulate high

        vpclmulqdq      $0x00, \T5, \XMM8, \T4
        vpxor           \T4, \T7, \T7                   # accumulate low

        vpclmulqdq      $0x00, \T3, \T2, \T2

        # Karatsuba fix-up: middle term = mid ^ high ^ low
        vpxor           \T2, \XMM1, \XMM1
        vpxor           \T6, \XMM1, \XMM1
        vpxor           \T7, \XMM1, \T2                 # \T2 = Karatsuba middle term




        # fold the 128-bit middle term into the 256-bit product <T6:T7>
        vpslldq $8, \T2, \T4
        vpsrldq $8, \T2, \T2

        vpxor   \T4, \T7, \T7
        vpxor   \T2, \T6, \T6                      # <T6:T7> holds the result of the
						   # accumulated carry-less multiplications

        #######################################################################
        #first phase of the reduction
        vmovdqa         POLY2(%rip), \T3           # GHASH reduction polynomial constant

        vpclmulqdq      $0x01, \T7, \T3, \T2
        vpslldq         $8, \T2, \T2               # shift-L xmm2 2 DWs

        vpxor           \T2, \T7, \T7              # first phase of the reduction complete
        #######################################################################


        #second phase of the reduction
        vpclmulqdq      $0x00, \T7, \T3, \T2
        vpsrldq         $4, \T2, \T2               # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      $0x10, \T7, \T3, \T4
        vpslldq         $4, \T4, \T4               # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor           \T2, \T4, \T4              # second phase of the reduction complete
        #######################################################################
        vpxor           \T4, \T6, \T6              # the result is in T6
.endm
2729
2730
2731
2732#############################################################
2733#void   aesni_gcm_init_avx_gen4
2734#        (gcm_data     *my_ctx_data,
2735#         gcm_context_data *data,
2736#        u8      *iv, /* Pre-counter block j0: 4 byte salt
2737#			(from Security Association) concatenated with 8 byte
2738#			Initialisation Vector (from IPSec ESP Payload)
2739#			concatenated with 0x00000001. 16-byte aligned pointer. */
2740#        u8     *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
2741#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
2742#        u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
2743#############################################################
SYM_FUNC_START(aesni_gcm_init_avx_gen4)
        FUNC_SAVE
        # Shared INIT macro, parameterised with the AVX2 (gen4) GHASH
        # multiply and HashKey-power precompute primitives.  Consumes the
        # iv/hash_subkey/aad arguments described in the header comment above.
        INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
        FUNC_RESTORE
        ret
SYM_FUNC_END(aesni_gcm_init_avx_gen4)
2750
2751###############################################################################
#void   aesni_gcm_enc_update_avx_gen4(
2753#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
2754#        gcm_context_data *data,
2755#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
2756#        const   u8 *in, /* Plaintext input */
2757#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
2758###############################################################################
SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4)
        # Encrypt a stretch of plaintext and update the GHASH state.
        # Dispatch on the AES key size: 16/24/32 bytes -> 9/11/13 rounds.
        FUNC_SAVE
        mov     keysize, %eax
        cmp     $16, %eax
        je      .Lenc_update4_key128
        cmp     $32, %eax
        je      .Lenc_update4_key256
        # neither 128 nor 256 bit, so this must be AES-192 (11 rounds)
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
        FUNC_RESTORE
        ret
.Lenc_update4_key128:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
        FUNC_RESTORE
        ret
.Lenc_update4_key256:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
        FUNC_RESTORE
        ret
SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4)
2779
2780###############################################################################
2781#void   aesni_gcm_dec_update_avx_gen4(
2782#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
2783#        gcm_context_data *data,
2784#        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
2785#        const   u8 *in, /* Ciphertext input */
#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
2787###############################################################################
SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4)
        # Decrypt a stretch of ciphertext and update the GHASH state.
        # Dispatch on the AES key size: 16/24/32 bytes -> 9/11/13 rounds.
        FUNC_SAVE
        mov     keysize, %eax
        cmp     $16, %eax
        je      .Ldec_update4_key128
        cmp     $32, %eax
        je      .Ldec_update4_key256
        # neither 128 nor 256 bit, so this must be AES-192 (11 rounds)
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
        FUNC_RESTORE
        ret
.Ldec_update4_key128:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
        FUNC_RESTORE
        ret
.Ldec_update4_key256:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
        FUNC_RESTORE
        ret
SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4)
2808
2809###############################################################################
2810#void   aesni_gcm_finalize_avx_gen4(
2811#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
2812#        gcm_context_data *data,
2813#        u8      *auth_tag, /* Authenticated Tag output. */
2814#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
2815#                              Valid values are 16 (most likely), 12 or 8. */
2816###############################################################################
SYM_FUNC_START(aesni_gcm_finalize_avx_gen4)
        # Produce the authentication tag into arg3 (auth_tag, length arg4).
        # Dispatch on the AES key size: 16/24/32 bytes -> 9/11/13 rounds.
        FUNC_SAVE
        mov     keysize, %eax
        cmp     $16, %eax
        je      .Lfinalize4_key128
        cmp     $32, %eax
        je      .Lfinalize4_key256
        # neither 128 nor 256 bit, so this must be AES-192 (11 rounds)
        GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
        FUNC_RESTORE
        ret
.Lfinalize4_key128:
        GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
        FUNC_RESTORE
        ret
.Lfinalize4_key256:
        GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
        FUNC_RESTORE
        ret
SYM_FUNC_END(aesni_gcm_finalize_avx_gen4)
2837