1########################################################################
2# Copyright (c) 2013, Intel Corporation
3#
4# This software is available to you under a choice of one of two
5# licenses.  You may choose to be licensed under the terms of the GNU
6# General Public License (GPL) Version 2, available from the file
7# COPYING in the main directory of this source tree, or the
8# OpenIB.org BSD license below:
9#
10# Redistribution and use in source and binary forms, with or without
11# modification, are permitted provided that the following conditions are
12# met:
13#
14# * Redistributions of source code must retain the above copyright
15#   notice, this list of conditions and the following disclaimer.
16#
17# * Redistributions in binary form must reproduce the above copyright
18#   notice, this list of conditions and the following disclaimer in the
19#   documentation and/or other materials provided with the
20#   distribution.
21#
22# * Neither the name of the Intel Corporation nor the names of its
23#   contributors may be used to endorse or promote products derived from
24#   this software without specific prior written permission.
25#
26#
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
28# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
31# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38########################################################################
39##
40## Authors:
41##	Erdinc Ozturk <erdinc.ozturk@intel.com>
42##	Vinodh Gopal <vinodh.gopal@intel.com>
43##	James Guilford <james.guilford@intel.com>
44##	Tim Chen <tim.c.chen@linux.intel.com>
45##
46## References:
47##       This code was derived and highly optimized from the code described in paper:
48##               Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation
49##			on Intel Architecture Processors. August, 2010
##       The details of the implementation are explained in:
51##               Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode
52##			on Intel Architecture Processors. October, 2012.
53##
54## Assumptions:
55##
56##
57##
58## iv:
59##       0                   1                   2                   3
60##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
61##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
62##       |                             Salt  (From the SA)               |
63##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
64##       |                     Initialization Vector                     |
65##       |         (This is the sequence number from IPSec header)       |
66##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
67##       |                              0x1                              |
68##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
69##
70##
71##
72## AAD:
73##       AAD padded to 128 bits with 0
74##       for example, assume AAD is a u32 vector
75##
76##       if AAD is 8 bytes:
##       AAD[3] = {A0, A1};
78##       padded AAD in xmm register = {A1 A0 0 0}
79##
80##       0                   1                   2                   3
81##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
82##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
83##       |                               SPI (A1)                        |
84##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
85##       |                     32-bit Sequence Number (A0)               |
86##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
87##       |                              0x0                              |
88##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
89##
90##                                       AAD Format with 32-bit Sequence Number
91##
92##       if AAD is 12 bytes:
##       AAD[3] = {A0, A1, A2};
94##       padded AAD in xmm register = {A2 A1 A0 0}
95##
96##       0                   1                   2                   3
97##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
98##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
99##       |                               SPI (A2)                        |
100##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
101##       |                 64-bit Extended Sequence Number {A1,A0}       |
102##       |                                                               |
103##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
104##       |                              0x0                              |
105##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
106##
107##        AAD Format with 64-bit Extended Sequence Number
108##
109##
110## aadLen:
111##       from the definition of the spec, aadLen can only be 8 or 12 bytes.
112##	 The code additionally supports aadLen of length 16 bytes.
113##
114## TLen:
115##       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
116##
117## poly = x^128 + x^127 + x^126 + x^121 + 1
## Throughout the code, one-tab and two-tab indentation is used: one tab
## marks the GHASH part, two tabs mark the AES part.
120##
121
122#include <linux/linkage.h>
123
# constants in mergeable sections, linker can reorder and merge
#
# POLY is ANDed into the correction term when INIT derives
# HashKey<<1 mod poly from the raw hash key.
.section	.rodata.cst16.POLY, "aM", @progbits, 16
.align 16
POLY:            .octa     0xC2000000000000000000000000000001

# POLY2 is not referenced in this part of the file.
# NOTE(review): presumably a reduction constant for another GHASH
# variant defined further down - confirm.
.section	.rodata.cst16.POLY2, "aM", @progbits, 16
.align 16
POLY2:           .octa     0xC20000000000000000000001C2000000

# TWOONE is the vpcmpeqd comparison pattern INIT uses to turn the bit
# shifted out of the hash key into a mask for POLY.
.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
TWOONE:          .octa     0x00000001000000000000000000000001

# SHUF_MASK is a vpshufb control that reverses the order of all 16 bytes
# of a register (destination byte 0 takes source byte 15, and so on).
.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
SHUF_MASK:       .octa     0x000102030405060708090A0B0C0D0E0F

# ONE is added with vpaddd to step the counter block by one.
.section	.rodata.cst16.ONE, "aM", @progbits, 16
.align 16
ONE:             .octa     0x00000000000000000000000000000001

# ONEf holds a 1 in the most significant byte.
# NOTE(review): not referenced in this part of the file; presumably the
# counter increment for a byte-swapped counter block - confirm.
.section	.rodata.cst16.ONEf, "aM", @progbits, 16
.align 16
ONEf:            .octa     0x01000000000000000000000000000000

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
#
# The three 16-byte values form one 48-byte table that is read at
# variable byte offsets: a load from SHIFT_MASK+n yields a vpshufb
# control that shifts a register right by n bytes (the 0xff entries zero
# the vacated bytes), and a load from ALL_F+n yields a byte mask with
# (16-n) bytes of 0xff followed by n bytes of zero. This is why the
# section is not mergeable.
.section	.rodata, "a", @progbits
.align 16
SHIFT_MASK:      .octa     0x0f0e0d0c0b0a09080706050403020100
ALL_F:           .octa     0xffffffffffffffffffffffffffffffff
                 .octa     0x00000000000000000000000000000000

# aad_shift_arr holds one vpshufb control per possible length (0 to 16)
# of the final partial AAD block. CALC_AAD_HASH indexes it with
# (remaining AAD bytes * 16); entry n moves the n valid bytes to the low
# end of the register and zeroes the rest.
.section .rodata
.align 16
.type aad_shift_arr, @object
.size aad_shift_arr, 272
aad_shift_arr:
        .octa     0xffffffffffffffffffffffffffffffff
        .octa     0xffffffffffffffffffffffffffffff0C
        .octa     0xffffffffffffffffffffffffffff0D0C
        .octa     0xffffffffffffffffffffffffff0E0D0C
        .octa     0xffffffffffffffffffffffff0F0E0D0C
        .octa     0xffffffffffffffffffffff0C0B0A0908
        .octa     0xffffffffffffffffffff0D0C0B0A0908
        .octa     0xffffffffffffffffff0E0D0C0B0A0908
        .octa     0xffffffffffffffff0F0E0D0C0B0A0908
        .octa     0xffffffffffffff0C0B0A090807060504
        .octa     0xffffffffffff0D0C0B0A090807060504
        .octa     0xffffffffff0E0D0C0B0A090807060504
        .octa     0xffffffff0F0E0D0C0B0A090807060504
        .octa     0xffffff0C0B0A09080706050403020100
        .octa     0xffff0D0C0B0A09080706050403020100
        .octa     0xff0E0D0C0B0A09080706050403020100
        .octa     0x0F0E0D0C0B0A09080706050403020100
179
180
181.text
182
183
# Byte offsets of the fields of the GCM context that the macros below
# address through arg2.
# AadHash: running GHASH value (16 bytes).
#define AadHash 16*0
# AadLen: AAD length in bytes (8 bytes).
#define AadLen 16*1
# InLen: total number of data bytes processed so far (8 bytes).
#define InLen (16*1)+8
# PBlockEncKey: encrypted counter block of a pending partial block.
#define PBlockEncKey 16*2
# OrigIV: the initial counter block, encrypted again for the tag.
#define OrigIV 16*3
# CurCount: current counter block.
#define CurCount 16*4
# PBlockLen: number of bytes held in the pending partial block.
#define PBlockLen 16*5

# Offsets of the hash key powers inside the same context.
HashKey        = 16*6   # store HashKey <<1 mod poly here
HashKey_2      = 16*7   # store HashKey^2 <<1 mod poly here
HashKey_3      = 16*8   # store HashKey^3 <<1 mod poly here
HashKey_4      = 16*9   # store HashKey^4 <<1 mod poly here
HashKey_5      = 16*10   # store HashKey^5 <<1 mod poly here
HashKey_6      = 16*11   # store HashKey^6 <<1 mod poly here
HashKey_7      = 16*12   # store HashKey^7 <<1 mod poly here
HashKey_8      = 16*13   # store HashKey^8 <<1 mod poly here
HashKey_k      = 16*14   # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
HashKey_2_k    = 16*15   # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
HashKey_3_k    = 16*16   # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
HashKey_4_k    = 16*17   # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
HashKey_5_k    = 16*18   # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
HashKey_6_k    = 16*19   # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
HashKey_7_k    = 16*20   # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
HashKey_8_k    = 16*21   # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)

# Aliases for the six integer argument registers of the System V AMD64
# calling convention, in argument order.
#define arg1 %rdi
#define arg2 %rsi
#define arg3 %rdx
#define arg4 %rcx
#define arg5 %r8
#define arg6 %r9
# Memory operand 2*15*16 bytes past arg1.
# NOTE(review): presumably the key length field that follows the two
# round-key arrays of the AES context - confirm against the C struct.
#define keysize 2*15*16(arg1)

# Assembler-time counters. Macros assign them while unrolling and call
# setreg to refresh the reg_i / reg_j register aliases.
i = 0
j = 0

# Values compared against at assembly time: out_order / in_order are
# handed to the 8-block helper macro as its last-but-one argument, and
# DEC / ENC select the direction in the .if blocks of the macros below.
out_order = 0
in_order = 1
DEC = 0
ENC = 1
224
# define_reg r n: bind the assembler symbol reg_<r> to register %xmm<n>.
.macro define_reg r n
reg_\r = %xmm\n
.endm

# setreg: re-bind reg_i and reg_j to the xmm registers numbered by the
# current values of the assembler symbols i and j. Alternate macro mode
# is switched on only for the two calls so that the percent-prefixed
# argument is evaluated as an expression and passed as a number.
.macro setreg
.altmacro
define_reg i %i
define_reg j %j
.noaltmacro
.endm
235
# Offsets of the 16-byte scratch slots in the stack area reserved by
# FUNC_SAVE.
# NOTE(review): the slots are presumably addressed relative to the
# aligned %rsp by macros outside this part of the file - confirm.
TMP1 =   16*0    # Temporary storage for AAD
TMP2 =   16*1    # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
TMP3 =   16*2    # Temporary storage for AES State 3
TMP4 =   16*3    # Temporary storage for AES State 4
TMP5 =   16*4    # Temporary storage for AES State 5
TMP6 =   16*5    # Temporary storage for AES State 6
TMP7 =   16*6    # Temporary storage for AES State 7
TMP8 =   16*7    # Temporary storage for AES State 8

# Total size in bytes of the scratch area (eight 16-byte slots); this is
# the amount FUNC_SAVE subtracts from %rsp.
VARIABLE_OFFSET = 16*8
246
247################################
248# Utility Macros
249################################
250
# FUNC_SAVE: common prologue.
# Pushes r12, r13, r15 and rbp, records the resulting stack pointer in
# rbp, then reserves VARIABLE_OFFSET bytes of scratch space and rounds
# rsp down to a 64-byte boundary. FUNC_RESTORE undoes all of this.
.macro FUNC_SAVE
	push	%r12
	push	%r13
	push	%r15
	push	%rbp

	mov	%rsp, %rbp		# rbp = stack pointer before alignment

	sub	$VARIABLE_OFFSET, %rsp	# room for the TMP1..TMP8 slots
	and	$-64, %rsp		# 64-byte align the scratch area
.endm
262
# FUNC_RESTORE: common epilogue, the mirror image of FUNC_SAVE.
# Drops the aligned scratch area by reloading rsp from rbp and pops the
# saved registers in reverse push order.
.macro FUNC_RESTORE
	leave				# rsp = rbp, then pop rbp

	pop	%r15
	pop	%r13
	pop	%r12
.endm
271
# ENCRYPT_SINGLE_BLOCK: AES-encrypt one 16-byte block in place.
#   REP  - number of vaesenc rounds to unroll between the initial key
#          XOR and the final vaesenclast
#   XMM0 - xmm register holding the block on entry and the result on exit
# Round keys are read from consecutive 16-byte slots starting at (arg1).
# Side effect: the assembler symbol i is left at REP+1 and the reg_i /
# reg_j aliases are refreshed through setreg.
.macro ENCRYPT_SINGLE_BLOCK REP XMM0
		vpxor		(arg1), \XMM0, \XMM0		# round 0: whitening key
		i = 1
		setreg
.rep \REP
		vaesenc		16*i(arg1), \XMM0, \XMM0	# middle round i
		i = i + 1
		setreg
.endr
		vaesenclast	16*i(arg1), \XMM0, \XMM0	# last round, key REP+1
.endm
284
# GCM_ENC_DEC: encrypt or decrypt one chunk of data and fold the
# ciphertext into the running GHASH kept in the context.
#
# Register arguments as used below:
#   arg1 - AES round keys (read by ENCRYPT_SINGLE_BLOCK)
#   arg2 - GCM context (AadHash, InLen, CurCount, PBlock*, HashKey)
#   arg3 - output buffer
#   arg4 - input buffer
#   arg5 - number of input bytes
# Macro parameters:
#   INITIAL_BLOCKS, GHASH_8_ENCRYPT_8_PARALLEL, GHASH_LAST_8, GHASH_MUL
#           - names of the helper macros to expand
#   ENC_DEC - ENC or DEC, selects the direction at assembly time
#   REP     - vaesenc round count handed on to the helpers
#
# Flow: finish a partial block left by an earlier call, process
# (blocks mod 8) leading blocks, loop over groups of 8 blocks, then
# encrypt a trailing partial block and remember it in the context.
#
# combined for GCM encrypt and decrypt functions
# clobbering all xmm registers
# clobbering r10, r11, r12, r13, r15, rax
.macro  GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
        vmovdqu AadHash(arg2), %xmm8
        vmovdqu  HashKey(arg2), %xmm13      # xmm13 = HashKey
        add arg5, InLen(arg2)               # account for this chunk in the total length

        # initialize the data pointer offset as zero
        xor     %r11d, %r11d

        # complete a partial block carried over from the previous call;
        # r11 comes back as the number of input bytes consumed by that
        PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
        sub %r11, arg5                      # arg5 = bytes still to process

        mov     arg5, %r13                  # save the number of bytes of plaintext/ciphertext
        and     $-16, %r13                  # r13 = r13 - (r13 mod 16)

        # r12 = (number of whole blocks) mod 8 = blocks to handle before
        # the 8-blocks-at-a-time loop
        mov     %r13, %r12
        shr     $4, %r12
        and     $7, %r12
        jz      _initial_num_blocks_is_0\@

        cmp     $7, %r12
        je      _initial_num_blocks_is_7\@
        cmp     $6, %r12
        je      _initial_num_blocks_is_6\@
        cmp     $5, %r12
        je      _initial_num_blocks_is_5\@
        cmp     $4, %r12
        je      _initial_num_blocks_is_4\@
        cmp     $3, %r12
        je      _initial_num_blocks_is_3\@
        cmp     $2, %r12
        je      _initial_num_blocks_is_2\@

        jmp     _initial_num_blocks_is_1\@

        # one expansion of the helper per possible count; each arm then
        # removes the bytes it handled from r13
_initial_num_blocks_is_7\@:
        \INITIAL_BLOCKS  \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*7, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_6\@:
        \INITIAL_BLOCKS  \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*6, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_5\@:
        \INITIAL_BLOCKS  \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*5, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_4\@:
        \INITIAL_BLOCKS  \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*4, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_3\@:
        \INITIAL_BLOCKS  \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*3, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_2\@:
        \INITIAL_BLOCKS  \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*2, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_1\@:
        \INITIAL_BLOCKS  \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*1, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_0\@:
        \INITIAL_BLOCKS  \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC


_initial_blocks_encrypted\@:
        # r13 is now a multiple of 128: the bytes left for whole groups
        # of 8 blocks
        test    %r13, %r13
        je      _zero_cipher_left\@

        sub     $128, %r13
        je      _eight_cipher_left\@




        # r15d tracks the lowest byte of the counter held in xmm9
        vmovd   %xmm9, %r15d
        and     $255, %r15d
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9


_encrypt_by_8_new\@:
        # a low counter byte above 247 would wrap when 8 is added, so
        # that case takes the in_order variant at _encrypt_by_8
        cmp     $(255-8), %r15d
        jg      _encrypt_by_8\@



        add     $8, %r15b
        \GHASH_8_ENCRYPT_8_PARALLEL      \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
        add     $128, %r11
        sub     $128, %r13
        jne     _encrypt_by_8_new\@

        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        jmp     _eight_cipher_left\@

_encrypt_by_8\@:
        # same step with xmm9 byte-swapped before and after the helper,
        # which is expanded with in_order instead of out_order
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        add     $8, %r15b
        \GHASH_8_ENCRYPT_8_PARALLEL      \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        add     $128, %r11
        sub     $128, %r13
        jne     _encrypt_by_8_new\@

        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9




_eight_cipher_left\@:
        # hash the last group of 8 blocks that is still in xmm1-xmm8
        \GHASH_LAST_8    %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8


_zero_cipher_left\@:
        # write the running hash and the counter back to the context
        vmovdqu %xmm14, AadHash(arg2)
        vmovdqu %xmm9, CurCount(arg2)

        # check for 0 length
        mov     arg5, %r13
        and     $15, %r13                            # r13 = (arg5 mod 16)

        je      _multiple_of_16_bytes\@

        # handle the last <16 Byte block separately

        mov %r13, PBlockLen(arg2)                    # remember the pending byte count

        vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
        vmovdqu %xmm9, CurCount(arg2)
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9

        ENCRYPT_SINGLE_BLOCK    \REP, %xmm9                # E(K, Yn)
        vmovdqu %xmm9, PBlockEncKey(arg2)            # kept so a later call can finish the block

        # with fewer than 16 bytes in this chunk a 16-byte load would
        # read outside the buffer, so gather the bytes one by one
        cmp $16, arg5
        jge _large_enough_update\@

        lea (arg4,%r11,1), %r10
        mov %r13, %r12

        READ_PARTIAL_BLOCK %r10 %r12 %xmm1

        lea     SHIFT_MASK+16(%rip), %r12
        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
						     # able to shift 16-r13 bytes (r13 is the
	# number of bytes in plaintext mod 16)

        jmp _final_ghash_mul\@

_large_enough_update\@:
        # load the 16 bytes that end at the end of the input
        sub $16, %r11
        add %r13, %r11

        # receive the last <16 Byte block
        vmovdqu	(arg4, %r11, 1), %xmm1

        # restore r11 to the offset of the partial block
        sub	%r13, %r11
        add	$16, %r11

        lea	SHIFT_MASK+16(%rip), %r12
        # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
        # (r13 is the number of bytes in plaintext mod 16)
        sub	%r13, %r12
        # get the appropriate shuffle mask
        vmovdqu	(%r12), %xmm2
        # shift right 16-r13 bytes
        vpshufb  %xmm2, %xmm1, %xmm1

_final_ghash_mul\@:
        # xmm1 = input bytes in the low r13 bytes, xmm9 = E(K, Yn).
        # The ciphertext is XORed into the hash here; the multiply for
        # this block is left to whoever completes the block later.
        .if  \ENC_DEC ==  DEC
        vmovdqa %xmm1, %xmm2                         # keep the ciphertext for hashing
        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
						     # mask out top 16-r13 bytes of xmm9
        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
        vpand   %xmm1, %xmm2, %xmm2
        vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
        vpxor   %xmm2, %xmm14, %xmm14

        vmovdqu %xmm14, AadHash(arg2)
        .else
        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
						     # mask out top 16-r13 bytes of xmm9
        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        vpxor   %xmm9, %xmm14, %xmm14

        vmovdqu %xmm14, AadHash(arg2)
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # shuffle xmm9 back to output as ciphertext
        .endif


        #############################
        # output r13 Bytes
        vmovq   %xmm9, %rax
        cmp     $8, %r13
        jle     _less_than_8_bytes_left\@

        # more than 8 bytes: store the low quadword in one go
        mov     %rax, (arg3 , %r11)
        add     $8, %r11
        vpsrldq $8, %xmm9, %xmm9
        vmovq   %xmm9, %rax
        sub     $8, %r13

_less_than_8_bytes_left\@:
        # store the remaining bytes one at a time from the low end of rax
        movb    %al, (arg3 , %r11)
        add     $1, %r11
        shr     $8, %rax
        sub     $1, %r13
        jne     _less_than_8_bytes_left\@
        #############################

_multiple_of_16_bytes\@:
.endm
511
512
# GCM_COMPLETE Finishes update of tag of last partial block
#
# Folds a pending partial block and the len(A)||len(C) block into the
# GHASH state, encrypts the original counter block and XORs the two to
# form the tag, then writes AUTH_TAG_LEN bytes of it.
#
# Macro parameters:
#   GHASH_MUL    - name of the GF(2^128) multiply macro to expand
#   REP          - vaesenc round count for ENCRYPT_SINGLE_BLOCK
#   AUTH_TAG     - operand holding the destination pointer for the tag
#   AUTH_TAG_LEN - operand holding the tag length in bytes
# Reads AadHash, HashKey, PBlockLen, AadLen, InLen and OrigIV from the
# context at arg2.
#
# Exactly AUTH_TAG_LEN bytes are stored for every length from 0 to 16,
# so a short destination buffer is never overrun.
#
# Output: Authorization Tag (AUTH_TAG)
# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
        vmovdqu AadHash(arg2), %xmm14
        vmovdqu HashKey(arg2), %xmm13

        mov     PBlockLen(arg2), %r12
        test    %r12, %r12
        je      _partial_done\@

	#GHASH computation for the last <16 Byte block
        \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6

_partial_done\@:
        mov     AadLen(arg2), %r12                   # r12 = aadLen (number of bytes)
        shl     $3, %r12                             # convert into number of bits
        vmovq   %r12, %xmm15                         # len(A) as a full 64-bit value

        mov     InLen(arg2), %r12
        shl     $3, %r12                             # len(C) in bits
        vmovq   %r12, %xmm1
        vpslldq $8, %xmm15, %xmm15                   # xmm15 = len(A)|| 0x0000000000000000
        vpxor   %xmm1, %xmm15, %xmm15                # xmm15 = len(A)||len(C)

        vpxor   %xmm15, %xmm14, %xmm14
        \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6    # final GHASH computation
        vpshufb SHUF_MASK(%rip), %xmm14, %xmm14      # perform a 16Byte swap

        vmovdqu OrigIV(arg2), %xmm9

        ENCRYPT_SINGLE_BLOCK    \REP, %xmm9          # E(K, Y0)

        vpxor   %xmm14, %xmm9, %xmm9                 # xmm9 = tag

_return_T\@:
        mov     \AUTH_TAG, %r10                      # r10 = authTag
        mov     \AUTH_TAG_LEN, %r11                  # r11 = auth_tag_len

        test    %r11, %r11                           # nothing to store for an empty tag
        je      _return_T_done\@

        cmp     $16, %r11
        je      _T_16\@

        cmp     $8, %r11
        jb      _T_4\@

_T_8\@:
        # at least 8 bytes wanted: store the low quadword
        vmovq   %xmm9, %rax
        mov     %rax, (%r10)
        add     $8, %r10
        sub     $8, %r11
        vpsrldq $8, %xmm9, %xmm9
        test    %r11, %r11
        je      _return_T_done\@
_T_4\@:
        # a 4-byte store is only safe when at least 4 bytes remain
        cmp     $4, %r11
        jb      _T_123\@
        vmovd   %xmm9, %eax
        mov     %eax, (%r10)
        add     $4, %r10
        sub     $4, %r11
        vpsrldq $4, %xmm9, %xmm9
        test    %r11, %r11
        je      _return_T_done\@
_T_123\@:
        # one to three bytes remain
        vmovd   %xmm9, %eax
        cmp     $2, %r11
        jb      _T_1\@
        mov     %ax, (%r10)
        cmp     $2, %r11
        je      _return_T_done\@
        add     $2, %r10
        sar     $16, %eax                            # bring the third byte into al
_T_1\@:
        mov     %al, (%r10)
        jmp     _return_T_done\@

_T_16\@:
        vmovdqu %xmm9, (%r10)

_return_T_done\@:
.endm
594
# CALC_AAD_HASH: GHASH the additional authenticated data and store the
# result in the AadHash field of the context at arg2.
#
# Macro parameters:
#   GHASH_MUL  - name of the GF(2^128) multiply macro to expand
#   AAD        - operand holding the AAD pointer
#   AADLEN     - operand holding the AAD length in bytes
#   T2         - xmm register holding the hash key handed to GHASH_MUL
#   T1, T3-T6  - xmm scratch registers handed to GHASH_MUL
#   T7         - xmm scratch; holds the final hash on exit
#   T8         - xmm scratch; running hash over the full blocks
#
# Only bytes inside [AAD, AAD+AADLEN) are read: whole blocks are loaded
# 16 bytes at a time and a shorter final block is gathered with
# READ_PARTIAL_BLOCK, which zero-pads it. All data references are
# RIP-relative.
#
# Clobbers rax, r10, r11
.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8

	mov     \AAD, %r10                      # r10 = AAD
	mov     \AADLEN, %r11                   # r11 = AAD bytes still to hash

	vpxor   \T8, \T8, \T8
	vpxor   \T7, \T7, \T7
	cmp     $16, %r11
	jb      _get_AAD_rest\@
_get_AAD_blocks\@:
	vmovdqu (%r10), \T7
	vpshufb SHUF_MASK(%rip), \T7, \T7       # byte-reflect the block
	vpxor   \T7, \T8, \T8
	\GHASH_MUL       \T8, \T2, \T1, \T3, \T4, \T5, \T6
	add     $16, %r10
	sub     $16, %r11
	cmp     $16, %r11
	jae     _get_AAD_blocks\@

_get_AAD_rest\@:
	vmovdqu \T8, \T7                        # result so far (zero when no full block)
	test    %r11, %r11
	je      _get_AAD_done\@

	/* read the last <16B of AAD byte-exactly; the unused high
	bytes of the register come back as zero */
	READ_PARTIAL_BLOCK %r10, %r11, \T7
	vpshufb SHUF_MASK(%rip), \T7, \T7
	vpxor   \T8, \T7, \T7
	\GHASH_MUL       \T7, \T2, \T1, \T3, \T4, \T5, \T6

_get_AAD_done\@:
        vmovdqu \T7, AadHash(arg2)
.endm
662
# INIT: reset the GCM context for a new message, derive the shifted hash
# key, hash the AAD and expand the remaining hash key material.
#
# Register arguments as used below:
#   arg2 - GCM context
#   arg3 - pointer to the 16-byte initial counter block
#   arg4 - pointer to the 16-byte hash subkey
#   arg5 - AAD pointer
#   arg6 - AAD length in bytes
# Macro parameters:
#   GHASH_MUL  - name of the multiply macro handed to CALC_AAD_HASH
#   PRECOMPUTE - name of the macro expanded last with xmm6 = HashKey<<1
#                mod poly
#
# Clobbers r11, xmm0-xmm7 plus whatever CALC_AAD_HASH and PRECOMPUTE
# clobber.
.macro INIT GHASH_MUL PRECOMPUTE
        mov     arg6, %r11
        mov     %r11, AadLen(arg2)          # ctx_data.aad_length = aad_length
        xor     %r11d, %r11d
        mov     %r11, InLen(arg2)           # ctx_data.in_length = 0

        mov     %r11, PBlockLen(arg2)       # ctx_data.partial_block_length = 0
        mov     %r11, PBlockEncKey(arg2)    # ctx_data.partial_block_enc_key = 0
        mov     %r11, PBlockEncKey+8(arg2)  # ... both halves of the 16-byte slot

        vmovdqu (arg3), %xmm0
        vmovdqu %xmm0, OrigIV(arg2)         # ctx_data.orig_IV = iv

        vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
        vmovdqu %xmm0, CurCount(arg2)       # ctx_data.current_counter = iv

        vmovdqu  (arg4), %xmm6              # xmm6 = HashKey

        vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
        vmovdqa  %xmm6, %xmm2
        vpsllq   $1, %xmm6, %xmm6           # shift each quadword left by one
        vpsrlq   $63, %xmm2, %xmm2          # bits shifted out of each quadword
        vmovdqa  %xmm2, %xmm1
        vpslldq  $8, %xmm2, %xmm2           # carry from the low into the high quadword
        vpsrldq  $8, %xmm1, %xmm1           # xmm1 = bit shifted out of the top
        vpor     %xmm2, %xmm6, %xmm6
        #reduction
        vpshufd  $0b00100100, %xmm1, %xmm2
        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
        vpand    POLY(%rip), %xmm2, %xmm2   # POLY when a bit was shifted out, else zero
        vpxor    %xmm2, %xmm6, %xmm6        # xmm6 holds the HashKey<<1 mod poly
        #######################################################################
        vmovdqu  %xmm6, HashKey(arg2)       # store HashKey<<1 mod poly

        CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0

        \PRECOMPUTE  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
.endm
701
702
# READ_PARTIAL_BLOCK: gather DLEN bytes starting at DPTR into the low
# bytes of XMMDst, where 0 < DLEN < 16. The remaining high bytes of
# XMMDst are zero and no byte outside [DPTR, DPTR+DLEN) is read.
#   DPTR   - register holding the source address
#   DLEN   - register holding the byte count; destroyed
#   XMMDst - destination xmm register
# Clobbers %rax, DLEN
.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
	vpxor	\XMMDst, \XMMDst, \XMMDst

	cmp	$8, \DLEN
	jl	_rpb_under_8_\@

	# eight or more bytes: the low quadword comes from one load
	mov	(\DPTR), %rax
	vpinsrq	$0, %rax, \XMMDst, \XMMDst
	sub	$8, \DLEN
	jz	_rpb_finished_\@

	# collect the bytes above offset 8, highest address first, so the
	# shifts leave them in memory order inside rax
	xor	%eax, %eax
_rpb_high_byte_\@:
	shl	$8, %rax
	movb	7(\DPTR, \DLEN, 1), %al
	dec	\DLEN
	jnz	_rpb_high_byte_\@
	vpinsrq	$1, %rax, \XMMDst, \XMMDst
	jmp	_rpb_finished_\@

_rpb_under_8_\@:
	# fewer than eight bytes: same byte loop, result goes to the low
	# quadword
	xor	%eax, %eax
_rpb_low_byte_\@:
	shl	$8, %rax
	movb	-1(\DPTR, \DLEN, 1), %al
	dec	\DLEN
	jnz	_rpb_low_byte_\@
	vpinsrq	$0, %rax, \XMMDst, \XMMDst
_rpb_finished_\@:
.endm
733
# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
# between update calls.
#
# When the context at arg2 holds a partial block from an earlier call
# (PBlockLen != 0), the new input is used to extend or complete it: the
# bytes are XORed with the saved E(K, Yn), the ciphertext is folded into
# the hash, and once 16 bytes are complete the hash is multiplied and
# PBlockLen is reset to zero.
#
# Macro parameters:
#   GHASH_MUL      - name of the GF(2^128) multiply macro to expand
#   CYPH_PLAIN_OUT - register holding the output pointer
#   PLAIN_CYPH_IN  - register holding the input pointer
#   PLAIN_CYPH_LEN - register holding the input length in bytes
#   DATA_OFFSET    - register holding the current offset into both
#                    buffers; advanced by the number of bytes written
#   AAD_HASH       - xmm register holding the running hash
#   ENC_DEC        - ENC or DEC
#
# Nothing is done, and nothing is clobbered beyond r13, when there is no
# pending partial block or when the input is empty.
# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
        AAD_HASH ENC_DEC
        mov	PBlockLen(arg2), %r13
        test	%r13, %r13
        je	_partial_block_done_\@	# Leave Macro if no partial blocks
        # An empty input cannot extend the block, and the byte loops
        # below would underflow their counters, so leave here as well.
        test	\PLAIN_CYPH_LEN, \PLAIN_CYPH_LEN
        je	_partial_block_done_\@
        # Read in input data without over reading
        cmp	$16, \PLAIN_CYPH_LEN
        jl	_fewer_than_16_bytes_\@
        vmovdqu	(\PLAIN_CYPH_IN), %xmm1	# 16 bytes or more available, just fill xmm
        jmp	_data_read_\@

_fewer_than_16_bytes_\@:
        lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
        mov	\PLAIN_CYPH_LEN, %r12
        READ_PARTIAL_BLOCK %r10 %r12 %xmm1

        mov	PBlockLen(arg2), %r13

_data_read_\@:				# Finished reading in data

        vmovdqu	PBlockEncKey(arg2), %xmm9
        vmovdqu	HashKey(arg2), %xmm13

        lea	SHIFT_MASK(%rip), %r12

        # adjust the shuffle mask pointer to be able to shift r13 bytes
        # (r13 is the number of bytes already held in the partial block)
        add	%r13, %r12
        vmovdqu	(%r12), %xmm2		# get the appropriate shuffle mask
        vpshufb	%xmm2, %xmm9, %xmm9	# shift right r13 bytes

.if  \ENC_DEC ==  DEC
        vmovdqa	%xmm1, %xmm3		# keep the ciphertext for hashing
        vpxor	%xmm1, %xmm9, %xmm9	# Cyphertext XOR E(K, Yn)

        mov	\PLAIN_CYPH_LEN, %r10
        add	%r13, %r10
        # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
        sub	$16, %r10
        # Determine whether the partial block is not being filled and
        # shift mask accordingly
        jge	_no_extra_mask_1_\@
        sub	%r10, %r12
_no_extra_mask_1_\@:

        vmovdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
        # get the appropriate mask to keep only the bytes consumed now
        vpand	%xmm1, %xmm9, %xmm9	# clear the bytes of xmm9 beyond them

        vpand	%xmm1, %xmm3, %xmm3
        vmovdqa	SHUF_MASK(%rip), %xmm10
        vpshufb	%xmm10, %xmm3, %xmm3
        vpshufb	%xmm2, %xmm3, %xmm3
        vpxor	%xmm3, \AAD_HASH, \AAD_HASH

        test	%r10, %r10
        jl	_partial_incomplete_1_\@

        # GHASH computation for the last <16 Byte block
        \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
        xor	%eax,%eax

        mov	%rax, PBlockLen(arg2)	# block complete, nothing pending
        jmp	_dec_done_\@
_partial_incomplete_1_\@:
        add	\PLAIN_CYPH_LEN, PBlockLen(arg2)
_dec_done_\@:
        vmovdqu	\AAD_HASH, AadHash(arg2)
.else
        vpxor	%xmm1, %xmm9, %xmm9	# Plaintext XOR E(K, Yn)

        mov	\PLAIN_CYPH_LEN, %r10
        add	%r13, %r10
        # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
        sub	$16, %r10
        # Determine whether the partial block is not being filled and
        # shift mask accordingly
        jge	_no_extra_mask_2_\@
        sub	%r10, %r12
_no_extra_mask_2_\@:

        vmovdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
        # get the appropriate mask to keep only the bytes consumed now
        vpand	%xmm1, %xmm9, %xmm9

        vmovdqa	SHUF_MASK(%rip), %xmm1
        vpshufb	%xmm1, %xmm9, %xmm9
        vpshufb	%xmm2, %xmm9, %xmm9
        vpxor	%xmm9, \AAD_HASH, \AAD_HASH

        test	%r10, %r10
        jl	_partial_incomplete_2_\@

        # GHASH computation for the last <16 Byte block
        \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
        xor	%eax,%eax

        mov	%rax, PBlockLen(arg2)	# block complete, nothing pending
        jmp	_encode_done_\@
_partial_incomplete_2_\@:
        add	\PLAIN_CYPH_LEN, PBlockLen(arg2)
_encode_done_\@:
        vmovdqu	\AAD_HASH, AadHash(arg2)

        vmovdqa	SHUF_MASK(%rip), %xmm10	# xmm10 was used as scratch by the multiply
        # shuffle xmm9 back to output as ciphertext
        vpshufb	%xmm10, %xmm9, %xmm9
        vpshufb	%xmm2, %xmm9, %xmm9
.endif
        # output encrypted Bytes
        test	%r10, %r10
        jl	_partial_fill_\@
        mov	%r13, %r12
        mov	$16, %r13
        # Set r13 to be the number of bytes to write out
        sub	%r12, %r13
        jmp	_count_set_\@
_partial_fill_\@:
        mov	\PLAIN_CYPH_LEN, %r13
_count_set_\@:
        vmovdqa	%xmm9, %xmm0
        vmovq	%xmm0, %rax
        cmp	$8, %r13
        jle	_less_than_8_bytes_left_\@

        # more than 8 bytes: store the low quadword in one go
        mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
        add	$8, \DATA_OFFSET
        vpsrldq	$8, %xmm0, %xmm0
        vmovq	%xmm0, %rax
        sub	$8, %r13
_less_than_8_bytes_left_\@:
        movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
        add	$1, \DATA_OFFSET
        shr	$8, %rax
        sub	$1, %r13
        jne	_less_than_8_bytes_left_\@
_partial_block_done_\@:
.endm # PARTIAL_BLOCK
877
878###############################################################################
879# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
880# Input: A and B (128-bits each, bit-reflected)
881# Output: C = A*B*x mod poly, (i.e. >>1 )
882# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
883# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
884###############################################################################
.macro  GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
        # Carry-less multiply GH by HK and reduce the 256-bit product back to
        # 128 bits, leaving the result in GH.
        #   GH      in/out  multiplicand, replaced by the reduced product
        #   HK      in      multiplier, only read
        #   T1-T5   scratch, overwritten
        # The seven operands must all be different registers.

        # ---- 128 x 128 -> 256 bit product, one Karatsuba step ----
        vpclmulqdq      $0x11, \HK, \GH, \T1            # T1 = a1*b1

        vpshufd         $0x4e, \GH, \T2                 # T2 = GH with its qwords swapped
        vpxor           \GH, \T2, \T2                   # T2 = (a1+a0)
        vpshufd         $0x4e, \HK, \T3                 # T3 = HK with its qwords swapped
        vpxor           \HK, \T3, \T3                   # T3 = (b1+b0)

        vpclmulqdq      $0x00, \HK, \GH, \GH            # GH = a0*b0
        vpclmulqdq      $0x00, \T3, \T2, \T2            # T2 = (a1+a0)*(b1+b0)
        vpxor           \GH, \T2, \T2
        vpxor           \T1, \T2, \T2                   # T2 = a0*b1+a1*b0

        vpslldq         $8, \T2, \T3                    # T3 = middle term moved up 8 bytes
        vpxor           \T3, \GH, \GH
        vpsrldq         $8, \T2, \T2                    # T2 = middle term moved down 8 bytes
        vpxor           \T2, \T1, \T1                   # <T1:GH> = GH x HK

        # ---- reduction, first phase: per-dword left shifts of the low half ----
        vpslld          $31, \GH, \T2                   # GH << 31 in each dword
        vpslld          $30, \GH, \T3                   # GH << 30 in each dword
        vpxor           \T3, \T2, \T2
        vpslld          $25, \GH, \T4                   # GH << 25 in each dword
        vpxor           \T4, \T2, \T2                   # T2 = xor of the three shifted copies

        vpsrldq         $4, \T2, \T5                    # T5 = T2 moved down one dword, kept for phase two
        vpslldq         $12, \T2, \T2                   # T2 moved up three dwords
        vpxor           \T2, \GH, \GH                   # first phase of the reduction complete

        # ---- reduction, second phase: per-dword right shifts ----
        vpsrld          $1, \GH, \T2                    # GH >> 1 in each dword
        vpsrld          $2, \GH, \T3                    # GH >> 2 in each dword
        vpxor           \T3, \T2, \T2
        vpsrld          $7, \GH, \T4                    # GH >> 7 in each dword
        vpxor           \T4, \T2, \T2                   # T2 = xor of the three shifted copies

        vpxor           \T5, \T2, \T2                   # bring in the dword saved in phase one
        vpxor           \T2, \GH, \GH
        vpxor           \T1, \GH, \GH                   # fold in the high half: the result is in GH

.endm
930
.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
        # Fill the hash-key table in the context addressed by arg2.
        #   HK      in      hash key (HashKey<<1 mod poly), only read
        #   T1-T6   scratch, overwritten
        # Stored values:
        #   HashKey_n(arg2)    n = 2..8   HashKey^n<<1 mod poly
        #   HashKey_k(arg2)               high qword xor low qword of HK, in both halves
        #   HashKey_n_k(arg2)  n = 2..8   the same xor of halves for HashKey_n
        # T5 carries the running power; each GHASH_MUL_AVX multiplies it by HK.

        # HashKey_k: xor of the two halves of the key itself
        vpshufd  $0x4e, \HK, \T1
        vpxor    \HK, \T1, \T1
        vmovdqu  \T1, HashKey_k(arg2)

        vmovdqa  \HK, \T5                                # T5 = HashKey^1, start of the power chain

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^2<<1 mod poly
        vpshufd  $0x4e, \T5, \T1
        vmovdqu  \T5, HashKey_2(arg2)                    #  [HashKey_2] = HashKey^2<<1 mod poly
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_2_k(arg2)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^3<<1 mod poly
        vpshufd  $0x4e, \T5, \T1
        vmovdqu  \T5, HashKey_3(arg2)
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_3_k(arg2)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^4<<1 mod poly
        vpshufd  $0x4e, \T5, \T1
        vmovdqu  \T5, HashKey_4(arg2)
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_4_k(arg2)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^5<<1 mod poly
        vpshufd  $0x4e, \T5, \T1
        vmovdqu  \T5, HashKey_5(arg2)
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_5_k(arg2)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^6<<1 mod poly
        vpshufd  $0x4e, \T5, \T1
        vmovdqu  \T5, HashKey_6(arg2)
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_6_k(arg2)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^7<<1 mod poly
        vpshufd  $0x4e, \T5, \T1
        vmovdqu  \T5, HashKey_7(arg2)
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_7_k(arg2)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^8<<1 mod poly
        vpshufd  $0x4e, \T5, \T1
        vmovdqu  \T5, HashKey_8(arg2)
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_8_k(arg2)

.endm
983
## if a = number of total plaintext bytes
## b = floor(a/16)
## num_initial_blocks = b mod 8
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered
## arg1, arg2, arg3, arg4 are used as pointers only, not modified
990
.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
        # Handle the first num_initial_blocks 16-byte blocks and then, when
        # r13 is at least 128, encrypt/decrypt the next eight blocks so that
        # they are ready to be hashed.
        #
        # Inputs:
        #   REP                 number of vaesenc rounds between the initial
        #                       key xor and the final vaesenclast
        #   num_initial_blocks  assemble-time count of leading blocks
        #   arg1                AES round keys, 16 bytes per round
        #   arg2                context; AadHash and CurCount are read
        #   arg3, arg4          output and input buffers, both indexed by r11
        #   T2                  hash key operand passed to GHASH_MUL_AVX; it is
        #                       never loaded here, so the caller must set it
        #   r13                 compared (signed) against 128 to pick the exit
        #   ENC_DEC             ENC or DEC; for DEC the input block, not the
        #                       output block, is what gets hashed
        # Outputs:
        #   r11                 advanced by 16 per initial block, plus 128 on
        #                       the eight-block path
        #   CTR                 last counter value used
        #   TMP1(rsp) and T3    hash value after the initial blocks
        #   XMM1-XMM8           eight-block path only: byte-swapped blocks with
        #                       the hash value xored into XMM1
        # Scratch: T1, T3-T6, T_key, and the assemble-time symbols i and j.
        #
        # reg_i and reg_j are rebound by setreg (defined elsewhere in this
        # file) after every change of i or j; presumably they name the xmm
        # register with that number, so the blocks occupy the registers
        # numbered 9-num_initial_blocks up to 8.
        #
        # Round keys are fetched with vmovdqu, as the eight-block loop macro
        # does, so a key schedule that is not 16-byte aligned cannot fault
        # here.  TMP1(rsp) keeps vmovdqa and relies on an aligned stack frame.

        i = (8-\num_initial_blocks)
        setreg
        vmovdqu AadHash(arg2), reg_i                    # current hash value

        # start AES for num_initial_blocks blocks
        vmovdqu CurCount(arg2), \CTR

        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vpaddd  ONE(%rip), \CTR, \CTR                   # INCR Y0
        vmovdqa \CTR, reg_i
        vpshufb SHUF_MASK(%rip), reg_i, reg_i           # perform a 16Byte swap
        i = (i+1)
        setreg
.endr

        # round 0: xor every block with the first round key
        vmovdqu (arg1), \T_key
        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vpxor   \T_key, reg_i, reg_i
        i = (i+1)
        setreg
.endr

        # rounds 1..REP, one round key (index j) applied to all blocks
        j = 1
        setreg
.rep \REP
        vmovdqu 16*j(arg1), \T_key
        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vaesenc \T_key, reg_i, reg_i
        i = (i+1)
        setreg
.endr

        j = (j+1)
        setreg
.endr

        # final round
        vmovdqu 16*j(arg1), \T_key
        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vaesenclast     \T_key, reg_i, reg_i
        i = (i+1)
        setreg
.endr

        # xor the keystream with the input and emit the result
        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vmovdqu (arg4, %r11), \T1
        vpxor   \T1, reg_i, reg_i
        vmovdqu reg_i, (arg3 , %r11)                    # write back ciphertext for num_initial_blocks blocks
        add     $16, %r11
.if  \ENC_DEC == DEC
        vmovdqa \T1, reg_i                              # decrypting: hash the input block instead
.endif
        vpshufb SHUF_MASK(%rip), reg_i, reg_i           # prepare ciphertext for GHASH computations
        i = (i+1)
        setreg
.endr


        # chain the hash: reg_i is the value so far, reg_j the next block
        i = (8-\num_initial_blocks)
        j = (9-\num_initial_blocks)
        setreg

.rep \num_initial_blocks
        vpxor   reg_i, reg_j, reg_j
        GHASH_MUL_AVX   reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
        i = (i+1)
        j = (j+1)
        setreg
.endr
        # XMM8 has the combined result here

        vmovdqa \XMM8, TMP1(%rsp)
        vmovdqa \XMM8, \T3

        cmp     $128, %r13
        jl      _initial_blocks_done\@                  # fewer than 128 bytes: skip the eight-block path

###############################################################################
# Eight-block path: build eight counter blocks, encrypt them, xor with 128
# bytes of input and leave the byte-swapped blocks in XMM1-XMM8.
        vpaddd  ONE(%rip), \CTR, \CTR                   # INCR Y0
        vmovdqa \CTR, \XMM1
        vpshufb SHUF_MASK(%rip), \XMM1, \XMM1           # perform a 16Byte swap

        vpaddd  ONE(%rip), \CTR, \CTR                   # INCR Y0
        vmovdqa \CTR, \XMM2
        vpshufb SHUF_MASK(%rip), \XMM2, \XMM2           # perform a 16Byte swap

        vpaddd  ONE(%rip), \CTR, \CTR                   # INCR Y0
        vmovdqa \CTR, \XMM3
        vpshufb SHUF_MASK(%rip), \XMM3, \XMM3           # perform a 16Byte swap

        vpaddd  ONE(%rip), \CTR, \CTR                   # INCR Y0
        vmovdqa \CTR, \XMM4
        vpshufb SHUF_MASK(%rip), \XMM4, \XMM4           # perform a 16Byte swap

        vpaddd  ONE(%rip), \CTR, \CTR                   # INCR Y0
        vmovdqa \CTR, \XMM5
        vpshufb SHUF_MASK(%rip), \XMM5, \XMM5           # perform a 16Byte swap

        vpaddd  ONE(%rip), \CTR, \CTR                   # INCR Y0
        vmovdqa \CTR, \XMM6
        vpshufb SHUF_MASK(%rip), \XMM6, \XMM6           # perform a 16Byte swap

        vpaddd  ONE(%rip), \CTR, \CTR                   # INCR Y0
        vmovdqa \CTR, \XMM7
        vpshufb SHUF_MASK(%rip), \XMM7, \XMM7           # perform a 16Byte swap

        vpaddd  ONE(%rip), \CTR, \CTR                   # INCR Y0
        vmovdqa \CTR, \XMM8
        vpshufb SHUF_MASK(%rip), \XMM8, \XMM8           # perform a 16Byte swap

        # round 0
        vmovdqu (arg1), \T_key
        vpxor   \T_key, \XMM1, \XMM1
        vpxor   \T_key, \XMM2, \XMM2
        vpxor   \T_key, \XMM3, \XMM3
        vpxor   \T_key, \XMM4, \XMM4
        vpxor   \T_key, \XMM5, \XMM5
        vpxor   \T_key, \XMM6, \XMM6
        vpxor   \T_key, \XMM7, \XMM7
        vpxor   \T_key, \XMM8, \XMM8

        i = 1
        setreg
.rep    \REP       # do REP rounds
        vmovdqu 16*i(arg1), \T_key
        vaesenc \T_key, \XMM1, \XMM1
        vaesenc \T_key, \XMM2, \XMM2
        vaesenc \T_key, \XMM3, \XMM3
        vaesenc \T_key, \XMM4, \XMM4
        vaesenc \T_key, \XMM5, \XMM5
        vaesenc \T_key, \XMM6, \XMM6
        vaesenc \T_key, \XMM7, \XMM7
        vaesenc \T_key, \XMM8, \XMM8
        i = (i+1)
        setreg
.endr

        # final round
        vmovdqu 16*i(arg1), \T_key
        vaesenclast     \T_key, \XMM1, \XMM1
        vaesenclast     \T_key, \XMM2, \XMM2
        vaesenclast     \T_key, \XMM3, \XMM3
        vaesenclast     \T_key, \XMM4, \XMM4
        vaesenclast     \T_key, \XMM5, \XMM5
        vaesenclast     \T_key, \XMM6, \XMM6
        vaesenclast     \T_key, \XMM7, \XMM7
        vaesenclast     \T_key, \XMM8, \XMM8

        # xor with the input, store the output; when decrypting keep the
        # input block in the register because that is what gets hashed
        vmovdqu (arg4, %r11), \T1
        vpxor   \T1, \XMM1, \XMM1
        vmovdqu \XMM1, (arg3 , %r11)
        .if   \ENC_DEC == DEC
        vmovdqa \T1, \XMM1
        .endif

        vmovdqu 16*1(arg4, %r11), \T1
        vpxor   \T1, \XMM2, \XMM2
        vmovdqu \XMM2, 16*1(arg3 , %r11)
        .if   \ENC_DEC == DEC
        vmovdqa \T1, \XMM2
        .endif

        vmovdqu 16*2(arg4, %r11), \T1
        vpxor   \T1, \XMM3, \XMM3
        vmovdqu \XMM3, 16*2(arg3 , %r11)
        .if   \ENC_DEC == DEC
        vmovdqa \T1, \XMM3
        .endif

        vmovdqu 16*3(arg4, %r11), \T1
        vpxor   \T1, \XMM4, \XMM4
        vmovdqu \XMM4, 16*3(arg3 , %r11)
        .if   \ENC_DEC == DEC
        vmovdqa \T1, \XMM4
        .endif

        vmovdqu 16*4(arg4, %r11), \T1
        vpxor   \T1, \XMM5, \XMM5
        vmovdqu \XMM5, 16*4(arg3 , %r11)
        .if   \ENC_DEC == DEC
        vmovdqa \T1, \XMM5
        .endif

        vmovdqu 16*5(arg4, %r11), \T1
        vpxor   \T1, \XMM6, \XMM6
        vmovdqu \XMM6, 16*5(arg3 , %r11)
        .if   \ENC_DEC == DEC
        vmovdqa \T1, \XMM6
        .endif

        vmovdqu 16*6(arg4, %r11), \T1
        vpxor   \T1, \XMM7, \XMM7
        vmovdqu \XMM7, 16*6(arg3 , %r11)
        .if   \ENC_DEC == DEC
        vmovdqa \T1, \XMM7
        .endif

        vmovdqu 16*7(arg4, %r11), \T1
        vpxor   \T1, \XMM8, \XMM8
        vmovdqu \XMM8, 16*7(arg3 , %r11)
        .if   \ENC_DEC == DEC
        vmovdqa \T1, \XMM8
        .endif

        add     $128, %r11

        vpshufb SHUF_MASK(%rip), \XMM1, \XMM1           # perform a 16Byte swap
        vpxor   TMP1(%rsp), \XMM1, \XMM1                # combine GHASHed value with the corresponding ciphertext
        vpshufb SHUF_MASK(%rip), \XMM2, \XMM2           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM3, \XMM3           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM4, \XMM4           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM5, \XMM5           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM6, \XMM6           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM7, \XMM7           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM8, \XMM8           # perform a 16Byte swap

###############################################################################

_initial_blocks_done\@:

.endm
1221
1222# encrypt 8 blocks at a time
1223# ghash the 8 previously encrypted ciphertext blocks
1224# arg1, arg2, arg3, arg4 are used as pointers only, not modified
1225# r11 is the data offset value
.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
        # One pass of the eight-block loop: hash the eight blocks currently
        # held in XMM1-XMM8 while encrypting eight fresh counter blocks.
        #
        # Inputs:
        #   REP             AES round count parameter; rounds 1-9 are written
        #                   out, REP-9 more are generated by the assembler
        #   XMM1-XMM8       blocks to hash (multiplied by HashKey_8 down to
        #                   HashKey respectively)
        #   CTR             last counter value used
        #   loop_idx        in_order: add ONE, then byte-swap each counter;
        #                   otherwise: add ONEf with no swap (ONEf is defined
        #                   elsewhere; presumably the increment for a counter
        #                   kept in swapped form - verify)
        #   arg1            AES round keys
        #   arg2            context holding the HashKey_n / HashKey_n_k table
        #   arg3, arg4      output and input buffers, indexed by r11
        #   ENC_DEC         ENC or DEC
        # Outputs:
        #   128 bytes written at arg3 + r11 (r11 itself is not advanced here)
        #   CTR             advanced by eight
        #   XMM1-XMM8       byte-swapped blocks to hash next (for DEC these are
        #                   the input blocks), with the new hash value xored
        #                   into XMM1
        # Scratch: T1-T7, stack slots TMP2-TMP8, assemble-time symbols i and j.
        #
        # Accumulators while hashing: T4 = sum of high products (a1*b1),
        # T7 = sum of low products (a0*b0), T6 = sum of Karatsuba middle
        # products.  AES rounds and hash multiplies are deliberately
        # interleaved; the instruction order is kept as written.

        # park the blocks to be hashed so XMM1-XMM8 can take new counters
        vmovdqa \XMM1, \T2
        vmovdqa \XMM2, TMP2(%rsp)
        vmovdqa \XMM3, TMP3(%rsp)
        vmovdqa \XMM4, TMP4(%rsp)
        vmovdqa \XMM5, TMP5(%rsp)
        vmovdqa \XMM6, TMP6(%rsp)
        vmovdqa \XMM7, TMP7(%rsp)
        vmovdqa \XMM8, TMP8(%rsp)

.if \loop_idx == in_order
        vpaddd  ONE(%rip), \CTR, \XMM1                  # INCR CNT
        vpaddd  ONE(%rip), \XMM1, \XMM2
        vpaddd  ONE(%rip), \XMM2, \XMM3
        vpaddd  ONE(%rip), \XMM3, \XMM4
        vpaddd  ONE(%rip), \XMM4, \XMM5
        vpaddd  ONE(%rip), \XMM5, \XMM6
        vpaddd  ONE(%rip), \XMM6, \XMM7
        vpaddd  ONE(%rip), \XMM7, \XMM8
        vmovdqa \XMM8, \CTR

        vpshufb SHUF_MASK(%rip), \XMM1, \XMM1           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM2, \XMM2           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM3, \XMM3           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM4, \XMM4           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM5, \XMM5           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM6, \XMM6           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM7, \XMM7           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM8, \XMM8           # perform a 16Byte swap
.else
        vpaddd  ONEf(%rip), \CTR, \XMM1                 # INCR CNT
        vpaddd  ONEf(%rip), \XMM1, \XMM2
        vpaddd  ONEf(%rip), \XMM2, \XMM3
        vpaddd  ONEf(%rip), \XMM3, \XMM4
        vpaddd  ONEf(%rip), \XMM4, \XMM5
        vpaddd  ONEf(%rip), \XMM5, \XMM6
        vpaddd  ONEf(%rip), \XMM6, \XMM7
        vpaddd  ONEf(%rip), \XMM7, \XMM8
        vmovdqa \XMM8, \CTR
.endif

        #######################################################################
        # AES round 0: xor with the first round key
        vmovdqu (arg1), \T1
        vpxor   \T1, \XMM1, \XMM1
        vpxor   \T1, \XMM2, \XMM2
        vpxor   \T1, \XMM3, \XMM3
        vpxor   \T1, \XMM4, \XMM4
        vpxor   \T1, \XMM5, \XMM5
        vpxor   \T1, \XMM6, \XMM6
        vpxor   \T1, \XMM7, \XMM7
        vpxor   \T1, \XMM8, \XMM8

        #######################################################################
        # AES round 1
        vmovdqu 16*1(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        # AES round 2
        vmovdqu 16*2(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        #######################################################################
        # GHASH: first block (saved in T2) times HashKey_8; this starts the
        # three accumulators T4, T7 and T6
        vmovdqu         HashKey_8(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T2, \T4            # T4 = a1*b1
        vpclmulqdq      $0x00, \T5, \T2, \T7            # T7 = a0*b0

        vpshufd         $0x4e, \T2, \T6
        vpxor           \T2, \T6, \T6                   # T6 = (a1+a0) in both qwords

        vmovdqu         HashKey_8_k(arg2), \T5
        vpclmulqdq      $0x00, \T5, \T6, \T6            # T6 = (a1+a0)*(b1+b0)

        # AES round 3
        vmovdqu 16*3(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        # GHASH: second block (TMP2) times HashKey_7
        vmovdqa         TMP2(%rsp), \T1
        vmovdqu         HashKey_7(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4                   # accumulate high product
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7                   # accumulate low product

        vpshufd         $0x4e, \T1, \T3
        vpxor           \T1, \T3, \T3
        vmovdqu         HashKey_7_k(arg2), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor           \T3, \T6, \T6                   # accumulate middle product

        # AES round 4
        vmovdqu 16*4(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        #######################################################################
        # GHASH: third block (TMP3) times HashKey_6
        vmovdqa         TMP3(%rsp), \T1
        vmovdqu         HashKey_6(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpshufd         $0x4e, \T1, \T3
        vpxor           \T1, \T3, \T3
        vmovdqu         HashKey_6_k(arg2), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor           \T3, \T6, \T6

        # AES round 5
        vmovdqu 16*5(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        # GHASH: fourth block (TMP4) times HashKey_5
        vmovdqa         TMP4(%rsp), \T1
        vmovdqu         HashKey_5(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpshufd         $0x4e, \T1, \T3
        vpxor           \T1, \T3, \T3
        vmovdqu         HashKey_5_k(arg2), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor           \T3, \T6, \T6

        # AES round 6
        vmovdqu 16*6(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        # GHASH: fifth block (TMP5) times HashKey_4
        vmovdqa         TMP5(%rsp), \T1
        vmovdqu         HashKey_4(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpshufd         $0x4e, \T1, \T3
        vpxor           \T1, \T3, \T3
        vmovdqu         HashKey_4_k(arg2), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor           \T3, \T6, \T6

        # AES round 7
        vmovdqu 16*7(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        # GHASH: sixth block (TMP6) times HashKey_3
        vmovdqa         TMP6(%rsp), \T1
        vmovdqu         HashKey_3(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpshufd         $0x4e, \T1, \T3
        vpxor           \T1, \T3, \T3
        vmovdqu         HashKey_3_k(arg2), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor           \T3, \T6, \T6

        # AES round 8
        vmovdqu 16*8(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        # GHASH: seventh block (TMP7) times HashKey_2
        vmovdqa         TMP7(%rsp), \T1
        vmovdqu         HashKey_2(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpshufd         $0x4e, \T1, \T3
        vpxor           \T1, \T3, \T3
        vmovdqu         HashKey_2_k(arg2), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor           \T3, \T6, \T6

        #######################################################################
        # AES round 9 (the round key travels in T5 from here on)
        vmovdqu 16*9(arg1), \T5
        vaesenc \T5, \XMM1, \XMM1
        vaesenc \T5, \XMM2, \XMM2
        vaesenc \T5, \XMM3, \XMM3
        vaesenc \T5, \XMM4, \XMM4
        vaesenc \T5, \XMM5, \XMM5
        vaesenc \T5, \XMM6, \XMM6
        vaesenc \T5, \XMM7, \XMM7
        vaesenc \T5, \XMM8, \XMM8

        # GHASH: eighth block (TMP8) times HashKey
        vmovdqa         TMP8(%rsp), \T1
        vmovdqu         HashKey(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpshufd         $0x4e, \T1, \T3
        vpxor           \T1, \T3, \T3
        vmovdqu         HashKey_k(arg2), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor           \T3, \T6, \T6

        # Karatsuba fix-up: remove the high and low sums from the middle sum
        vpxor           \T4, \T6, \T6
        vpxor           \T7, \T6, \T6

        # Remaining AES rounds: T5 always holds the next key to apply; after
        # the loop it holds the final round key.
        vmovdqu 16*10(arg1), \T5

        i = 11
        setreg
.rep (\REP-9)

        vaesenc \T5, \XMM1, \XMM1
        vaesenc \T5, \XMM2, \XMM2
        vaesenc \T5, \XMM3, \XMM3
        vaesenc \T5, \XMM4, \XMM4
        vaesenc \T5, \XMM5, \XMM5
        vaesenc \T5, \XMM6, \XMM6
        vaesenc \T5, \XMM7, \XMM7
        vaesenc \T5, \XMM8, \XMM8

        vmovdqu 16*i(arg1), \T5
        i = (i+1)
        setreg
.endr

        # Final round with the input block pre-xored into the round key, so
        # vaesenclast yields the output block directly.
        i = 0
        j = 1
        setreg
.rep 8
        vpxor   16*i(arg4, %r11), \T5, \T2
        .if \ENC_DEC == ENC
        vaesenclast     \T2, reg_j, reg_j
        .else
        vaesenclast     \T2, reg_j, \T3
        vmovdqu 16*i(arg4, %r11), reg_j                 # keep the input block for hashing
        vmovdqu \T3, 16*i(arg3, %r11)
        .endif
        i = (i+1)
        j = (j+1)
        setreg
.endr
        #######################################################################
        # spread the middle sum across the two halves of the product

        vpslldq $8, \T6, \T3                            # T3 = T6 moved up 2 DWs
        vpsrldq $8, \T6, \T6                            # T6 moved down 2 DWs
        vpxor   \T3, \T7, \T7
        vpxor   \T4, \T6, \T6                           # accumulate the results in T6:T7

        #######################################################################
        #first phase of the reduction
        #######################################################################
        vpslld  $31, \T7, \T2                           # T7 << 31 in each dword
        vpslld  $30, \T7, \T3                           # T7 << 30 in each dword
        vpslld  $25, \T7, \T4                           # T7 << 25 in each dword

        vpxor   \T3, \T2, \T2                           # xor the shifted versions
        vpxor   \T4, \T2, \T2

        vpsrldq $4, \T2, \T1                            # T1 = T2 moved down 1 DW

        vpslldq $12, \T2, \T2                           # T2 moved up 3 DWs
        vpxor   \T2, \T7, \T7                           # first phase of the reduction complete
        #######################################################################
        .if \ENC_DEC == ENC
        vmovdqu \XMM1, 16*0(arg3,%r11)                  # Write to the Ciphertext buffer
        vmovdqu \XMM2, 16*1(arg3,%r11)                  # Write to the Ciphertext buffer
        vmovdqu \XMM3, 16*2(arg3,%r11)                  # Write to the Ciphertext buffer
        vmovdqu \XMM4, 16*3(arg3,%r11)                  # Write to the Ciphertext buffer
        vmovdqu \XMM5, 16*4(arg3,%r11)                  # Write to the Ciphertext buffer
        vmovdqu \XMM6, 16*5(arg3,%r11)                  # Write to the Ciphertext buffer
        vmovdqu \XMM7, 16*6(arg3,%r11)                  # Write to the Ciphertext buffer
        vmovdqu \XMM8, 16*7(arg3,%r11)                  # Write to the Ciphertext buffer
        .endif

        #######################################################################
        #second phase of the reduction
        vpsrld  $1, \T7, \T2                            # T7 >> 1 in each dword
        vpsrld  $2, \T7, \T3                            # T7 >> 2 in each dword
        vpsrld  $7, \T7, \T4                            # T7 >> 7 in each dword
        vpxor   \T3, \T2, \T2                           # xor the shifted versions
        vpxor   \T4, \T2, \T2

        vpxor   \T1, \T2, \T2
        vpxor   \T2, \T7, \T7
        vpxor   \T7, \T6, \T6                           # the result is in T6
        #######################################################################

        vpshufb SHUF_MASK(%rip), \XMM1, \XMM1           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM2, \XMM2           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM3, \XMM3           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM4, \XMM4           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM5, \XMM5           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM6, \XMM6           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM7, \XMM7           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM8, \XMM8           # perform a 16Byte swap

        vpxor   \T6, \XMM1, \XMM1                       # fold the new hash value into the first block

.endm
1590
1591
# GHASH the last 8 ciphertext blocks.
1593.macro  GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
1594
1595        ## Karatsuba Method
1596
1597
1598        vpshufd         $0b01001110, \XMM1, \T2
1599        vpxor           \XMM1, \T2, \T2
1600        vmovdqu         HashKey_8(arg2), \T5
1601        vpclmulqdq      $0x11, \T5, \XMM1, \T6
1602        vpclmulqdq      $0x00, \T5, \XMM1, \T7
1603
1604        vmovdqu         HashKey_8_k(arg2), \T3
1605        vpclmulqdq      $0x00, \T3, \T2, \XMM1
1606
1607        ######################
1608
1609        vpshufd         $0b01001110, \XMM2, \T2
1610        vpxor           \XMM2, \T2, \T2
1611        vmovdqu         HashKey_7(arg2), \T5
1612        vpclmulqdq      $0x11, \T5, \XMM2, \T4
1613        vpxor           \T4, \T6, \T6
1614
1615        vpclmulqdq      $0x00, \T5, \XMM2, \T4
1616        vpxor           \T4, \T7, \T7
1617
1618        vmovdqu         HashKey_7_k(arg2), \T3
1619        vpclmulqdq      $0x00, \T3, \T2, \T2
1620        vpxor           \T2, \XMM1, \XMM1
1621
1622        ######################
1623
1624        vpshufd         $0b01001110, \XMM3, \T2
1625        vpxor           \XMM3, \T2, \T2
1626        vmovdqu         HashKey_6(arg2), \T5
1627        vpclmulqdq      $0x11, \T5, \XMM3, \T4
1628        vpxor           \T4, \T6, \T6
1629
1630        vpclmulqdq      $0x00, \T5, \XMM3, \T4
1631        vpxor           \T4, \T7, \T7
1632
1633        vmovdqu         HashKey_6_k(arg2), \T3
1634        vpclmulqdq      $0x00, \T3, \T2, \T2
1635        vpxor           \T2, \XMM1, \XMM1
1636
1637        ######################
1638
1639        vpshufd         $0b01001110, \XMM4, \T2
1640        vpxor           \XMM4, \T2, \T2
1641        vmovdqu         HashKey_5(arg2), \T5
1642        vpclmulqdq      $0x11, \T5, \XMM4, \T4
1643        vpxor           \T4, \T6, \T6
1644
1645        vpclmulqdq      $0x00, \T5, \XMM4, \T4
1646        vpxor           \T4, \T7, \T7
1647
1648        vmovdqu         HashKey_5_k(arg2), \T3
1649        vpclmulqdq      $0x00, \T3, \T2, \T2
1650        vpxor           \T2, \XMM1, \XMM1
1651
1652        ######################
1653
1654        vpshufd         $0b01001110, \XMM5, \T2
1655        vpxor           \XMM5, \T2, \T2
1656        vmovdqu         HashKey_4(arg2), \T5
1657        vpclmulqdq      $0x11, \T5, \XMM5, \T4
1658        vpxor           \T4, \T6, \T6
1659
1660        vpclmulqdq      $0x00, \T5, \XMM5, \T4
1661        vpxor           \T4, \T7, \T7
1662
1663        vmovdqu         HashKey_4_k(arg2), \T3
1664        vpclmulqdq      $0x00, \T3, \T2, \T2
1665        vpxor           \T2, \XMM1, \XMM1
1666
1667        ######################
1668
1669        vpshufd         $0b01001110, \XMM6, \T2
1670        vpxor           \XMM6, \T2, \T2
1671        vmovdqu         HashKey_3(arg2), \T5
1672        vpclmulqdq      $0x11, \T5, \XMM6, \T4
1673        vpxor           \T4, \T6, \T6
1674
1675        vpclmulqdq      $0x00, \T5, \XMM6, \T4
1676        vpxor           \T4, \T7, \T7
1677
1678        vmovdqu         HashKey_3_k(arg2), \T3
1679        vpclmulqdq      $0x00, \T3, \T2, \T2
1680        vpxor           \T2, \XMM1, \XMM1
1681
1682        ######################
1683
1684        vpshufd         $0b01001110, \XMM7, \T2
1685        vpxor           \XMM7, \T2, \T2
1686        vmovdqu         HashKey_2(arg2), \T5
1687        vpclmulqdq      $0x11, \T5, \XMM7, \T4
1688        vpxor           \T4, \T6, \T6
1689
1690        vpclmulqdq      $0x00, \T5, \XMM7, \T4
1691        vpxor           \T4, \T7, \T7
1692
1693        vmovdqu         HashKey_2_k(arg2), \T3
1694        vpclmulqdq      $0x00, \T3, \T2, \T2
1695        vpxor           \T2, \XMM1, \XMM1
1696
1697        ######################
1698
1699        vpshufd         $0b01001110, \XMM8, \T2
1700        vpxor           \XMM8, \T2, \T2
1701        vmovdqu         HashKey(arg2), \T5
1702        vpclmulqdq      $0x11, \T5, \XMM8, \T4
1703        vpxor           \T4, \T6, \T6
1704
1705        vpclmulqdq      $0x00, \T5, \XMM8, \T4
1706        vpxor           \T4, \T7, \T7
1707
1708        vmovdqu         HashKey_k(arg2), \T3
1709        vpclmulqdq      $0x00, \T3, \T2, \T2
1710
1711        vpxor           \T2, \XMM1, \XMM1
1712        vpxor           \T6, \XMM1, \XMM1
1713        vpxor           \T7, \XMM1, \T2
1714
1715
1716
1717
1718        vpslldq $8, \T2, \T4
1719        vpsrldq $8, \T2, \T2
1720
1721        vpxor   \T4, \T7, \T7
1722        vpxor   \T2, \T6, \T6   # <T6:T7> holds the result of
1723				# the accumulated carry-less multiplications
1724
1725        #######################################################################
1726        #first phase of the reduction
1727        vpslld  $31, \T7, \T2   # packed right shifting << 31
1728        vpslld  $30, \T7, \T3   # packed right shifting shift << 30
1729        vpslld  $25, \T7, \T4   # packed right shifting shift << 25
1730
1731        vpxor   \T3, \T2, \T2   # xor the shifted versions
1732        vpxor   \T4, \T2, \T2
1733
1734        vpsrldq $4, \T2, \T1    # shift-R T1 1 DW
1735
1736        vpslldq $12, \T2, \T2   # shift-L T2 3 DWs
1737        vpxor   \T2, \T7, \T7   # first phase of the reduction complete
1738        #######################################################################
1739
1740
1741        #second phase of the reduction
1742        vpsrld  $1, \T7, \T2    # packed left shifting >> 1
1743        vpsrld  $2, \T7, \T3    # packed left shifting >> 2
1744        vpsrld  $7, \T7, \T4    # packed left shifting >> 7
1745        vpxor   \T3, \T2, \T2   # xor the shifted versions
1746        vpxor   \T4, \T2, \T2
1747
1748        vpxor   \T1, \T2, \T2
1749        vpxor   \T2, \T7, \T7
1750        vpxor   \T7, \T6, \T6   # the result is in T6
1751
1752.endm
1753
1754#############################################################
1755#void   aesni_gcm_init_avx_gen2
1756#        (gcm_data     *my_ctx_data,
1757#         gcm_context_data *data,
1758#        u8     *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
1759#        u8      *iv, /* Pre-counter block j0: 4 byte salt
1760#			(from Security Association) concatenated with 8 byte
1761#			Initialisation Vector (from IPSec ESP Payload)
1762#			concatenated with 0x00000001. 16-byte aligned pointer. */
1763#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
1764#        u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#
# Entry point for the gen2 (AVX) GCM initialisation. The body is the INIT
# macro, instantiated with the gen2 GHASH multiply (GHASH_MUL_AVX) and the
# gen2 hash-key precompute (PRECOMPUTE_AVX) macros, bracketed by
# FUNC_SAVE / FUNC_RESTORE.
# NOTE(review): INIT, FUNC_SAVE and FUNC_RESTORE are defined outside this
# section; the argument order listed above is taken from the original
# comment and should be confirmed against the C prototype.
1765#############################################################
1766SYM_FUNC_START(aesni_gcm_init_avx_gen2)
1767        FUNC_SAVE
1768        INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
1769        FUNC_RESTORE
1770        ret
1771SYM_FUNC_END(aesni_gcm_init_avx_gen2)
1772
1773###############################################################################
1774#void   aesni_gcm_enc_update_avx_gen2(
1775#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
1776#        gcm_context_data *data,
1777#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
1778#        const   u8 *in, /* Plaintext input */
1779#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
#
# Dispatches on keysize and expands GCM_ENC_DEC (ENC variant) once per
# supported key length. Each expansion is followed by its own
# FUNC_RESTORE / ret, so the three paths never rejoin.
# The last macro argument is 9, 11 or 13 for keysize 16, any other value, or
# 32 respectively.
# NOTE(review): that number is presumably the count of vaesenc rounds issued
# before vaesenclast (the REP parameter of the block macros) - confirm
# against GCM_ENC_DEC, which is defined outside this section.
1780###############################################################################
1781SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2)
1782        FUNC_SAVE
1783        mov     keysize, %eax                   # keysize operand is defined outside this section
1784        cmp     $32, %eax                       # 32 selects the key_256 path
1785        je      key_256_enc_update
1786        cmp     $16, %eax                       # 16 selects the key_128 path
1787        je      key_128_enc_update
1788        # must be 192 (fall-through: any keysize other than 16 or 32 lands here)
1789        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
1790        FUNC_RESTORE
1791        ret
1792key_128_enc_update:
1793        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
1794        FUNC_RESTORE
1795        ret
1796key_256_enc_update:
1797        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
1798        FUNC_RESTORE
1799        ret
1800SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2)
1801
1802###############################################################################
1803#void   aesni_gcm_dec_update_avx_gen2(
1804#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
1805#        gcm_context_data *data,
1806#        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
1807#        const   u8 *in, /* Ciphertext input */
1808#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
#
# Mirror image of the encrypt entry point: dispatches on keysize and expands
# GCM_ENC_DEC with the DEC selector once per supported key length. Each
# expansion is followed by its own FUNC_RESTORE / ret.
# The last macro argument is 9, 11 or 13 for keysize 16, any other value, or
# 32 respectively.
# NOTE(review): that number is presumably the vaesenc round count (REP) -
# confirm against GCM_ENC_DEC, which is defined outside this section.
1809###############################################################################
1810SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2)
1811        FUNC_SAVE
1812        mov     keysize,%eax                    # keysize operand is defined outside this section
1813        cmp     $32, %eax                       # 32 selects the key_256 path
1814        je      key_256_dec_update
1815        cmp     $16, %eax                       # 16 selects the key_128 path
1816        je      key_128_dec_update
1817        # must be 192 (fall-through: any keysize other than 16 or 32 lands here)
1818        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
1819        FUNC_RESTORE
1820        ret
1821key_128_dec_update:
1822        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
1823        FUNC_RESTORE
1824        ret
1825key_256_dec_update:
1826        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
1827        FUNC_RESTORE
1828        ret
1829SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2)
1830
1831###############################################################################
1832#void   aesni_gcm_finalize_avx_gen2(
1833#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
1834#        gcm_context_data *data,
1835#        u8      *auth_tag, /* Authenticated Tag output. */
1836#        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
1837#				Valid values are 16 (most likely), 12 or 8. */
#
# Dispatches on keysize and expands GCM_COMPLETE with GHASH_MUL_AVX, a
# per-key-size number (9, 11 or 13) and arg3 / arg4. Per the prototype above
# those two arguments are auth_tag and auth_tag_len. Each expansion is
# followed by its own FUNC_RESTORE / ret.
# NOTE(review): GCM_COMPLETE is defined outside this section; the numeric
# argument is presumably the vaesenc round count - confirm there.
1838###############################################################################
1839SYM_FUNC_START(aesni_gcm_finalize_avx_gen2)
1840        FUNC_SAVE
1841        mov	keysize,%eax                    # keysize operand is defined outside this section
1842        cmp     $32, %eax                       # 32 selects the key_256 path
1843        je      key_256_finalize
1844        cmp     $16, %eax                       # 16 selects the key_128 path
1845        je      key_128_finalize
1846        # must be 192 (fall-through: any keysize other than 16 or 32 lands here)
1847        GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
1848        FUNC_RESTORE
1849        ret
1850key_128_finalize:
1851        GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
1852        FUNC_RESTORE
1853        ret
1854key_256_finalize:
1855        GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
1856        FUNC_RESTORE
1857        ret
1858SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
1859
1860###############################################################################
1861# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
1862# Input: A and B (128-bits each, bit-reflected)
1863# Output: C = A*B*x mod poly, (i.e. >>1 )
1864# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1865# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
#
# Arguments:
#   GH      - multiplicand (a1:a0); overwritten with the reduced product
#   HK      - multiplier (b1:b0); only read
#   T1..T3  - scratch registers, clobbered
#   T4, T5  - declared in the signature but not referenced in the body
#
# Structure: four 64x64 carry-less multiplies build the 256-bit product in
# <T1:GH>, then a two-phase reduction using the constant at POLY2 folds it
# back to 128 bits. POLY2 is read with an aligned load (vmovdqa).
1866###############################################################################
1867.macro  GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1868
1869        vpclmulqdq      $0x11,\HK,\GH,\T1      # T1 = a1*b1
1870        vpclmulqdq      $0x00,\HK,\GH,\T2      # T2 = a0*b0
1871        vpclmulqdq      $0x01,\HK,\GH,\T3      # T3 = a1*b0
1872        vpclmulqdq      $0x10,\HK,\GH,\GH      # GH = a0*b1
1873        vpxor           \T3, \GH, \GH          # GH = a1*b0 + a0*b1 (middle term)
1874
1875
1876        vpsrldq         $8 , \GH, \T3          # shift-R GH 2 DWs
1877        vpslldq         $8 , \GH, \GH          # shift-L GH 2 DWs
1878
1879        vpxor           \T3, \T1, \T1          # T1 = upper 128 bits of the product
1880        vpxor           \T2, \GH, \GH          # GH = lower 128 bits of the product
1881
1882        #######################################################################
1883        #first phase of the reduction
1884        vmovdqa         POLY2(%rip), \T3
1885
1886        vpclmulqdq      $0x01, \GH, \T3, \T2   # T2 = POLY2 (high qword) * GH (low qword)
1887        vpslldq         $8, \T2, \T2           # shift-L T2 2 DWs
1888
1889        vpxor           \T2, \GH, \GH          # first phase of the reduction complete
1890        #######################################################################
1891        #second phase of the reduction
1892        vpclmulqdq      $0x00, \GH, \T3, \T2   # T2 = POLY2 (low qword) * GH (low qword)
1893        vpsrldq         $4, \T2, \T2           # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1894
1895        vpclmulqdq      $0x10, \GH, \T3, \GH   # GH = POLY2 (low qword) * GH (high qword)
1896        vpslldq         $4, \GH, \GH           # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
1897
1898        vpxor           \T2, \GH, \GH          # second phase of the reduction complete
1899        #######################################################################
1900        vpxor           \T1, \GH, \GH          # the result is in GH
1901
1902
1903.endm
1904
1905.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
1906
1907        # Computes successive powers of the hash key and stores them in the
        # HashKey_2 .. HashKey_8 slots addressed off arg2.
        #
        # HK is only read. T5 is the running power: it starts as a copy of HK
        # and is multiplied by HK seven times through GHASH_MUL_AVX2, with a
        # store after each multiply.
        # Given the argument order below, GHASH_MUL_AVX2 clobbers T1, T3 and
        # T4; T6 and T2 are forwarded to its unused T4 / T5 parameters.
        # This macro stores no HashKey_i_k values (the XOR of the low and
        # high halves of HashKey_i); only the plain powers are written.
1908        vmovdqa  \HK, \T5
1909        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^2<<1 mod poly
1910        vmovdqu  \T5, HashKey_2(arg2)                       #  [HashKey_2] = HashKey^2<<1 mod poly
1911
1912        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^3<<1 mod poly
1913        vmovdqu  \T5, HashKey_3(arg2)                       #  [HashKey_3] = HashKey^3<<1 mod poly
1914
1915        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^4<<1 mod poly
1916        vmovdqu  \T5, HashKey_4(arg2)                       #  [HashKey_4] = HashKey^4<<1 mod poly
1917
1918        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^5<<1 mod poly
1919        vmovdqu  \T5, HashKey_5(arg2)                       #  [HashKey_5] = HashKey^5<<1 mod poly
1920
1921        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^6<<1 mod poly
1922        vmovdqu  \T5, HashKey_6(arg2)                       #  [HashKey_6] = HashKey^6<<1 mod poly
1923
1924        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^7<<1 mod poly
1925        vmovdqu  \T5, HashKey_7(arg2)                       #  [HashKey_7] = HashKey^7<<1 mod poly
1926
1927        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^8<<1 mod poly
1928        vmovdqu  \T5, HashKey_8(arg2)                       #  [HashKey_8] = HashKey^8<<1 mod poly
1929
1930.endm
1931
1932## if a = number of total plaintext bytes
1933## b = floor(a/16)
1934## num_initial_blocks = b mod 4
1935## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
1936## r10, r11, r12, rax are clobbered
1937## arg1, arg2, arg3, arg4 are used as pointers only, not modified
##
## NOTE(review): the modulus in the formula above cannot be checked from this
## macro; confirm it against the caller that picks num_initial_blocks.
##
## Parameters, as used by the body below:
##   REP                - number of vaesenc rounds issued before vaesenclast
##   num_initial_blocks - how many single blocks are processed up front
##   T1, T3, T4         - scratch (T1 holds each loaded input block)
##   T2                 - passed as HK to GHASH_MUL_AVX2 and never written
##                        here, so the caller must preload it
##   T5, T6             - only forwarded to GHASH_MUL_AVX2
##   CTR                - counter block, loaded from CurCount(arg2)
##   XMM1..XMM8         - receive the eight blocks of the 8-block batch
##   T_key              - holds the round key currently read from (arg1)
##   ENC_DEC            - ENC or DEC; with DEC the input block, not the
##                        output block, is what gets hashed
##   VER                - not referenced in the body
## %r11 is the byte offset into the input (arg4) and output (arg3) buffers.
## It advances by 16 per initial block and by 128 after the 8-block batch.
## reg_i / reg_j are register aliases refreshed by setreg from the assembler
## symbols i and j (setreg is defined outside this section).
1938
1939.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1940	i = (8-\num_initial_blocks)
1941	setreg
1942	vmovdqu AadHash(arg2), reg_i            # running hash goes in the register just below the first data block
1943
1944	# start AES for num_initial_blocks blocks
1945	vmovdqu CurCount(arg2), \CTR
1946
1947	i = (9-\num_initial_blocks)
1948	setreg
1949.rep \num_initial_blocks
1950                vpaddd  ONE(%rip), \CTR, \CTR   # INCR Y0
1951                vmovdqa \CTR, reg_i
1952                vpshufb SHUF_MASK(%rip), reg_i, reg_i     # perform a 16Byte swap
1953	i = (i+1)
1954	setreg
1955.endr
1956
1957	vmovdqa  (arg1), \T_key                 # round key 0
1958	i = (9-\num_initial_blocks)
1959	setreg
1960.rep \num_initial_blocks
1961                vpxor   \T_key, reg_i, reg_i    # initial AddRoundKey
1962	i = (i+1)
1963	setreg
1964.endr
1965
1966	j = 1
1967	setreg
1968.rep \REP
1969	vmovdqa  16*j(arg1), \T_key             # round key j
1970	i = (9-\num_initial_blocks)
1971	setreg
1972.rep \num_initial_blocks
1973        vaesenc \T_key, reg_i, reg_i
1974	i = (i+1)
1975	setreg
1976.endr
1977
1978	j = (j+1)
1979	setreg
1980.endr
1981
1982
1983	vmovdqa  16*j(arg1), \T_key             # j == REP+1 here: the final round key
1984	i = (9-\num_initial_blocks)
1985	setreg
1986.rep \num_initial_blocks
1987        vaesenclast      \T_key, reg_i, reg_i
1988	i = (i+1)
1989	setreg
1990.endr
1991
1992	i = (9-\num_initial_blocks)
1993	setreg
1994.rep \num_initial_blocks
1995                vmovdqu (arg4, %r11), \T1              # T1 = input block
1996                vpxor   \T1, reg_i, reg_i              # keystream XOR input
1997                vmovdqu reg_i, (arg3 , %r11)           # write back ciphertext for
1998						       # num_initial_blocks blocks
1999                add     $16, %r11
2000.if  \ENC_DEC == DEC
2001                vmovdqa \T1, reg_i                     # when decrypting, hash the input block instead
2002.endif
2003                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
2004	i = (i+1)
2005	setreg
2006.endr
2007
2008
2009	i = (8-\num_initial_blocks)
2010	j = (9-\num_initial_blocks)
2011	setreg
2012
2013.rep \num_initial_blocks
2014        vpxor    reg_i, reg_j, reg_j                   # fold the running hash into the next block
2015        GHASH_MUL_AVX2       reg_j, \T2, \T1, \T3, \T4, \T5, \T6  # apply GHASH on num_initial_blocks blocks
2016	i = (i+1)
2017	j = (j+1)
2018	setreg
2019.endr
2020        # XMM8 has the combined result here
2021
2022        vmovdqa  \XMM8, TMP1(%rsp)                      # saved; XORed into XMM1 after the 8-block batch below
2023        vmovdqa  \XMM8, \T3
2024
2025        cmp     $128, %r13                              # NOTE(review): %r13 is presumably the remaining byte count - confirm
2026        jl      _initial_blocks_done\@                  # no need for precomputed constants
2027
2028###############################################################################
2029# Build the next eight counter blocks in XMM1..XMM8, encrypt them and XOR them with 128 bytes of input
2030                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2031                vmovdqa  \CTR, \XMM1
2032                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
2033
2034                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2035                vmovdqa  \CTR, \XMM2
2036                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
2037
2038                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2039                vmovdqa  \CTR, \XMM3
2040                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
2041
2042                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2043                vmovdqa  \CTR, \XMM4
2044                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
2045
2046                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2047                vmovdqa  \CTR, \XMM5
2048                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
2049
2050                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2051                vmovdqa  \CTR, \XMM6
2052                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
2053
2054                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2055                vmovdqa  \CTR, \XMM7
2056                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
2057
2058                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2059                vmovdqa  \CTR, \XMM8
2060                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
2061
2062                vmovdqa  (arg1), \T_key                 # round key 0
2063                vpxor    \T_key, \XMM1, \XMM1
2064                vpxor    \T_key, \XMM2, \XMM2
2065                vpxor    \T_key, \XMM3, \XMM3
2066                vpxor    \T_key, \XMM4, \XMM4
2067                vpxor    \T_key, \XMM5, \XMM5
2068                vpxor    \T_key, \XMM6, \XMM6
2069                vpxor    \T_key, \XMM7, \XMM7
2070                vpxor    \T_key, \XMM8, \XMM8
2071
2072		i = 1
2073		setreg
2074.rep    \REP       # do REP rounds
2075                vmovdqa  16*i(arg1), \T_key
2076                vaesenc  \T_key, \XMM1, \XMM1
2077                vaesenc  \T_key, \XMM2, \XMM2
2078                vaesenc  \T_key, \XMM3, \XMM3
2079                vaesenc  \T_key, \XMM4, \XMM4
2080                vaesenc  \T_key, \XMM5, \XMM5
2081                vaesenc  \T_key, \XMM6, \XMM6
2082                vaesenc  \T_key, \XMM7, \XMM7
2083                vaesenc  \T_key, \XMM8, \XMM8
2084		i = (i+1)
2085		setreg
2086.endr
2087
2088
2089                vmovdqa  16*i(arg1), \T_key             # i == REP+1 here: the final round key
2090                vaesenclast  \T_key, \XMM1, \XMM1
2091                vaesenclast  \T_key, \XMM2, \XMM2
2092                vaesenclast  \T_key, \XMM3, \XMM3
2093                vaesenclast  \T_key, \XMM4, \XMM4
2094                vaesenclast  \T_key, \XMM5, \XMM5
2095                vaesenclast  \T_key, \XMM6, \XMM6
2096                vaesenclast  \T_key, \XMM7, \XMM7
2097                vaesenclast  \T_key, \XMM8, \XMM8
2098
2099                vmovdqu  (arg4, %r11), \T1              # same load / XOR / store pattern for each of the 8 blocks
2100                vpxor    \T1, \XMM1, \XMM1
2101                vmovdqu  \XMM1, (arg3 , %r11)
2102                .if   \ENC_DEC == DEC
2103                vmovdqa  \T1, \XMM1                     # when decrypting, keep the input block for hashing
2104                .endif
2105
2106                vmovdqu  16*1(arg4, %r11), \T1
2107                vpxor    \T1, \XMM2, \XMM2
2108                vmovdqu  \XMM2, 16*1(arg3 , %r11)
2109                .if   \ENC_DEC == DEC
2110                vmovdqa  \T1, \XMM2
2111                .endif
2112
2113                vmovdqu  16*2(arg4, %r11), \T1
2114                vpxor    \T1, \XMM3, \XMM3
2115                vmovdqu  \XMM3, 16*2(arg3 , %r11)
2116                .if   \ENC_DEC == DEC
2117                vmovdqa  \T1, \XMM3
2118                .endif
2119
2120                vmovdqu  16*3(arg4, %r11), \T1
2121                vpxor    \T1, \XMM4, \XMM4
2122                vmovdqu  \XMM4, 16*3(arg3 , %r11)
2123                .if   \ENC_DEC == DEC
2124                vmovdqa  \T1, \XMM4
2125                .endif
2126
2127                vmovdqu  16*4(arg4, %r11), \T1
2128                vpxor    \T1, \XMM5, \XMM5
2129                vmovdqu  \XMM5, 16*4(arg3 , %r11)
2130                .if   \ENC_DEC == DEC
2131                vmovdqa  \T1, \XMM5
2132                .endif
2133
2134                vmovdqu  16*5(arg4, %r11), \T1
2135                vpxor    \T1, \XMM6, \XMM6
2136                vmovdqu  \XMM6, 16*5(arg3 , %r11)
2137                .if   \ENC_DEC == DEC
2138                vmovdqa  \T1, \XMM6
2139                .endif
2140
2141                vmovdqu  16*6(arg4, %r11), \T1
2142                vpxor    \T1, \XMM7, \XMM7
2143                vmovdqu  \XMM7, 16*6(arg3 , %r11)
2144                .if   \ENC_DEC == DEC
2145                vmovdqa  \T1, \XMM7
2146                .endif
2147
2148                vmovdqu  16*7(arg4, %r11), \T1
2149                vpxor    \T1, \XMM8, \XMM8
2150                vmovdqu  \XMM8, 16*7(arg3 , %r11)
2151                .if   \ENC_DEC == DEC
2152                vmovdqa  \T1, \XMM8
2153                .endif
2154
2155                add     $128, %r11                         # advance the data offset past the 8 blocks
2156
2157                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
2158                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with
2159							   # the corresponding ciphertext
2160                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
2161                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
2162                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
2163                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
2164                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
2165                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
2166                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
2167
2168###############################################################################
2169
2170_initial_blocks_done\@:
2171
2172
2173.endm
2174
2175
2176
2177# encrypt 8 blocks at a time
2178# ghash the 8 previously encrypted ciphertext blocks
2179# arg1, arg2, arg3, arg4 are used as pointers only, not modified
2180# r11 is the data offset value
2181.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
2182
2183        vmovdqa \XMM1, \T2
2184        vmovdqa \XMM2, TMP2(%rsp)
2185        vmovdqa \XMM3, TMP3(%rsp)
2186        vmovdqa \XMM4, TMP4(%rsp)
2187        vmovdqa \XMM5, TMP5(%rsp)
2188        vmovdqa \XMM6, TMP6(%rsp)
2189        vmovdqa \XMM7, TMP7(%rsp)
2190        vmovdqa \XMM8, TMP8(%rsp)
2191
2192.if \loop_idx == in_order
2193                vpaddd  ONE(%rip), \CTR, \XMM1            # INCR CNT
2194                vpaddd  ONE(%rip), \XMM1, \XMM2
2195                vpaddd  ONE(%rip), \XMM2, \XMM3
2196                vpaddd  ONE(%rip), \XMM3, \XMM4
2197                vpaddd  ONE(%rip), \XMM4, \XMM5
2198                vpaddd  ONE(%rip), \XMM5, \XMM6
2199                vpaddd  ONE(%rip), \XMM6, \XMM7
2200                vpaddd  ONE(%rip), \XMM7, \XMM8
2201                vmovdqa \XMM8, \CTR
2202
2203                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
2204                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
2205                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
2206                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
2207                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
2208                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
2209                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
2210                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
2211.else
2212                vpaddd  ONEf(%rip), \CTR, \XMM1            # INCR CNT
2213                vpaddd  ONEf(%rip), \XMM1, \XMM2
2214                vpaddd  ONEf(%rip), \XMM2, \XMM3
2215                vpaddd  ONEf(%rip), \XMM3, \XMM4
2216                vpaddd  ONEf(%rip), \XMM4, \XMM5
2217                vpaddd  ONEf(%rip), \XMM5, \XMM6
2218                vpaddd  ONEf(%rip), \XMM6, \XMM7
2219                vpaddd  ONEf(%rip), \XMM7, \XMM8
2220                vmovdqa \XMM8, \CTR
2221.endif
2222
2223
2224        #######################################################################
2225
2226                vmovdqu (arg1), \T1
2227                vpxor   \T1, \XMM1, \XMM1
2228                vpxor   \T1, \XMM2, \XMM2
2229                vpxor   \T1, \XMM3, \XMM3
2230                vpxor   \T1, \XMM4, \XMM4
2231                vpxor   \T1, \XMM5, \XMM5
2232                vpxor   \T1, \XMM6, \XMM6
2233                vpxor   \T1, \XMM7, \XMM7
2234                vpxor   \T1, \XMM8, \XMM8
2235
2236        #######################################################################
2237
2238
2239
2240
2241
2242                vmovdqu 16*1(arg1), \T1
2243                vaesenc \T1, \XMM1, \XMM1
2244                vaesenc \T1, \XMM2, \XMM2
2245                vaesenc \T1, \XMM3, \XMM3
2246                vaesenc \T1, \XMM4, \XMM4
2247                vaesenc \T1, \XMM5, \XMM5
2248                vaesenc \T1, \XMM6, \XMM6
2249                vaesenc \T1, \XMM7, \XMM7
2250                vaesenc \T1, \XMM8, \XMM8
2251
2252                vmovdqu 16*2(arg1), \T1
2253                vaesenc \T1, \XMM1, \XMM1
2254                vaesenc \T1, \XMM2, \XMM2
2255                vaesenc \T1, \XMM3, \XMM3
2256                vaesenc \T1, \XMM4, \XMM4
2257                vaesenc \T1, \XMM5, \XMM5
2258                vaesenc \T1, \XMM6, \XMM6
2259                vaesenc \T1, \XMM7, \XMM7
2260                vaesenc \T1, \XMM8, \XMM8
2261
2262
2263        #######################################################################
2264
2265        vmovdqu         HashKey_8(arg2), \T5
2266        vpclmulqdq      $0x11, \T5, \T2, \T4              # T4 = a1*b1
2267        vpclmulqdq      $0x00, \T5, \T2, \T7              # T7 = a0*b0
2268        vpclmulqdq      $0x01, \T5, \T2, \T6              # T6 = a1*b0
2269        vpclmulqdq      $0x10, \T5, \T2, \T5              # T5 = a0*b1
2270        vpxor           \T5, \T6, \T6
2271
2272                vmovdqu 16*3(arg1), \T1
2273                vaesenc \T1, \XMM1, \XMM1
2274                vaesenc \T1, \XMM2, \XMM2
2275                vaesenc \T1, \XMM3, \XMM3
2276                vaesenc \T1, \XMM4, \XMM4
2277                vaesenc \T1, \XMM5, \XMM5
2278                vaesenc \T1, \XMM6, \XMM6
2279                vaesenc \T1, \XMM7, \XMM7
2280                vaesenc \T1, \XMM8, \XMM8
2281
2282        vmovdqa         TMP2(%rsp), \T1
2283        vmovdqu         HashKey_7(arg2), \T5
2284        vpclmulqdq      $0x11, \T5, \T1, \T3
2285        vpxor           \T3, \T4, \T4
2286
2287        vpclmulqdq      $0x00, \T5, \T1, \T3
2288        vpxor           \T3, \T7, \T7
2289
2290        vpclmulqdq      $0x01, \T5, \T1, \T3
2291        vpxor           \T3, \T6, \T6
2292
2293        vpclmulqdq      $0x10, \T5, \T1, \T3
2294        vpxor           \T3, \T6, \T6
2295
2296                vmovdqu 16*4(arg1), \T1
2297                vaesenc \T1, \XMM1, \XMM1
2298                vaesenc \T1, \XMM2, \XMM2
2299                vaesenc \T1, \XMM3, \XMM3
2300                vaesenc \T1, \XMM4, \XMM4
2301                vaesenc \T1, \XMM5, \XMM5
2302                vaesenc \T1, \XMM6, \XMM6
2303                vaesenc \T1, \XMM7, \XMM7
2304                vaesenc \T1, \XMM8, \XMM8
2305
2306        #######################################################################
2307
2308        vmovdqa         TMP3(%rsp), \T1
2309        vmovdqu         HashKey_6(arg2), \T5
2310        vpclmulqdq      $0x11, \T5, \T1, \T3
2311        vpxor           \T3, \T4, \T4
2312
2313        vpclmulqdq      $0x00, \T5, \T1, \T3
2314        vpxor           \T3, \T7, \T7
2315
2316        vpclmulqdq      $0x01, \T5, \T1, \T3
2317        vpxor           \T3, \T6, \T6
2318
2319        vpclmulqdq      $0x10, \T5, \T1, \T3
2320        vpxor           \T3, \T6, \T6
2321
2322                vmovdqu 16*5(arg1), \T1
2323                vaesenc \T1, \XMM1, \XMM1
2324                vaesenc \T1, \XMM2, \XMM2
2325                vaesenc \T1, \XMM3, \XMM3
2326                vaesenc \T1, \XMM4, \XMM4
2327                vaesenc \T1, \XMM5, \XMM5
2328                vaesenc \T1, \XMM6, \XMM6
2329                vaesenc \T1, \XMM7, \XMM7
2330                vaesenc \T1, \XMM8, \XMM8
2331
2332        vmovdqa         TMP4(%rsp), \T1
2333        vmovdqu         HashKey_5(arg2), \T5
2334        vpclmulqdq      $0x11, \T5, \T1, \T3
2335        vpxor           \T3, \T4, \T4
2336
2337        vpclmulqdq      $0x00, \T5, \T1, \T3
2338        vpxor           \T3, \T7, \T7
2339
2340        vpclmulqdq      $0x01, \T5, \T1, \T3
2341        vpxor           \T3, \T6, \T6
2342
2343        vpclmulqdq      $0x10, \T5, \T1, \T3
2344        vpxor           \T3, \T6, \T6
2345
2346                vmovdqu 16*6(arg1), \T1
2347                vaesenc \T1, \XMM1, \XMM1
2348                vaesenc \T1, \XMM2, \XMM2
2349                vaesenc \T1, \XMM3, \XMM3
2350                vaesenc \T1, \XMM4, \XMM4
2351                vaesenc \T1, \XMM5, \XMM5
2352                vaesenc \T1, \XMM6, \XMM6
2353                vaesenc \T1, \XMM7, \XMM7
2354                vaesenc \T1, \XMM8, \XMM8
2355
2356
2357        vmovdqa         TMP5(%rsp), \T1
2358        vmovdqu         HashKey_4(arg2), \T5
2359        vpclmulqdq      $0x11, \T5, \T1, \T3
2360        vpxor           \T3, \T4, \T4
2361
2362        vpclmulqdq      $0x00, \T5, \T1, \T3
2363        vpxor           \T3, \T7, \T7
2364
2365        vpclmulqdq      $0x01, \T5, \T1, \T3
2366        vpxor           \T3, \T6, \T6
2367
2368        vpclmulqdq      $0x10, \T5, \T1, \T3
2369        vpxor           \T3, \T6, \T6
2370
2371                vmovdqu 16*7(arg1), \T1
2372                vaesenc \T1, \XMM1, \XMM1
2373                vaesenc \T1, \XMM2, \XMM2
2374                vaesenc \T1, \XMM3, \XMM3
2375                vaesenc \T1, \XMM4, \XMM4
2376                vaesenc \T1, \XMM5, \XMM5
2377                vaesenc \T1, \XMM6, \XMM6
2378                vaesenc \T1, \XMM7, \XMM7
2379                vaesenc \T1, \XMM8, \XMM8
2380
2381        vmovdqa         TMP6(%rsp), \T1
2382        vmovdqu         HashKey_3(arg2), \T5
2383        vpclmulqdq      $0x11, \T5, \T1, \T3
2384        vpxor           \T3, \T4, \T4
2385
2386        vpclmulqdq      $0x00, \T5, \T1, \T3
2387        vpxor           \T3, \T7, \T7
2388
2389        vpclmulqdq      $0x01, \T5, \T1, \T3
2390        vpxor           \T3, \T6, \T6
2391
2392        vpclmulqdq      $0x10, \T5, \T1, \T3
2393        vpxor           \T3, \T6, \T6
2394
2395                vmovdqu 16*8(arg1), \T1
2396                vaesenc \T1, \XMM1, \XMM1
2397                vaesenc \T1, \XMM2, \XMM2
2398                vaesenc \T1, \XMM3, \XMM3
2399                vaesenc \T1, \XMM4, \XMM4
2400                vaesenc \T1, \XMM5, \XMM5
2401                vaesenc \T1, \XMM6, \XMM6
2402                vaesenc \T1, \XMM7, \XMM7
2403                vaesenc \T1, \XMM8, \XMM8
2404
2405        vmovdqa         TMP7(%rsp), \T1
2406        vmovdqu         HashKey_2(arg2), \T5
2407        vpclmulqdq      $0x11, \T5, \T1, \T3
2408        vpxor           \T3, \T4, \T4
2409
2410        vpclmulqdq      $0x00, \T5, \T1, \T3
2411        vpxor           \T3, \T7, \T7
2412
2413        vpclmulqdq      $0x01, \T5, \T1, \T3
2414        vpxor           \T3, \T6, \T6
2415
2416        vpclmulqdq      $0x10, \T5, \T1, \T3
2417        vpxor           \T3, \T6, \T6
2418
2419
2420        #######################################################################
2421
2422                vmovdqu 16*9(arg1), \T5
2423                vaesenc \T5, \XMM1, \XMM1
2424                vaesenc \T5, \XMM2, \XMM2
2425                vaesenc \T5, \XMM3, \XMM3
2426                vaesenc \T5, \XMM4, \XMM4
2427                vaesenc \T5, \XMM5, \XMM5
2428                vaesenc \T5, \XMM6, \XMM6
2429                vaesenc \T5, \XMM7, \XMM7
2430                vaesenc \T5, \XMM8, \XMM8
2431
2432        vmovdqa         TMP8(%rsp), \T1
2433        vmovdqu         HashKey(arg2), \T5
2434
2435        vpclmulqdq      $0x00, \T5, \T1, \T3
2436        vpxor           \T3, \T7, \T7
2437
2438        vpclmulqdq      $0x01, \T5, \T1, \T3
2439        vpxor           \T3, \T6, \T6
2440
2441        vpclmulqdq      $0x10, \T5, \T1, \T3
2442        vpxor           \T3, \T6, \T6
2443
2444        vpclmulqdq      $0x11, \T5, \T1, \T3
2445        vpxor           \T3, \T4, \T1
2446
2447
2448                vmovdqu 16*10(arg1), \T5
2449
2450        i = 11
2451        setreg
2452.rep (\REP-9)
2453        vaesenc \T5, \XMM1, \XMM1
2454        vaesenc \T5, \XMM2, \XMM2
2455        vaesenc \T5, \XMM3, \XMM3
2456        vaesenc \T5, \XMM4, \XMM4
2457        vaesenc \T5, \XMM5, \XMM5
2458        vaesenc \T5, \XMM6, \XMM6
2459        vaesenc \T5, \XMM7, \XMM7
2460        vaesenc \T5, \XMM8, \XMM8
2461
2462        vmovdqu 16*i(arg1), \T5
2463        i = i + 1
2464        setreg
2465.endr
2466
2467	i = 0
2468	j = 1
2469	setreg
2470.rep 8
2471		vpxor	16*i(arg4, %r11), \T5, \T2
2472                .if \ENC_DEC == ENC
2473                vaesenclast     \T2, reg_j, reg_j
2474                .else
2475                vaesenclast     \T2, reg_j, \T3
2476                vmovdqu 16*i(arg4, %r11), reg_j
2477                vmovdqu \T3, 16*i(arg3, %r11)
2478                .endif
2479	i = (i+1)
2480	j = (j+1)
2481	setreg
2482.endr
2483	#######################################################################
2484
2485
2486	vpslldq	$8, \T6, \T3				# shift-L T3 2 DWs
2487	vpsrldq	$8, \T6, \T6				# shift-R T2 2 DWs
2488	vpxor	\T3, \T7, \T7
2489	vpxor	\T6, \T1, \T1				# accumulate the results in T1:T7
2490
2491
2492
2493	#######################################################################
2494	#first phase of the reduction
2495	vmovdqa         POLY2(%rip), \T3
2496
2497	vpclmulqdq	$0x01, \T7, \T3, \T2
2498	vpslldq		$8, \T2, \T2			# shift-L xmm2 2 DWs
2499
2500	vpxor		\T2, \T7, \T7			# first phase of the reduction complete
2501	#######################################################################
2502                .if \ENC_DEC == ENC
2503		vmovdqu	 \XMM1,	16*0(arg3,%r11)		# Write to the Ciphertext buffer
2504		vmovdqu	 \XMM2,	16*1(arg3,%r11)		# Write to the Ciphertext buffer
2505		vmovdqu	 \XMM3,	16*2(arg3,%r11)		# Write to the Ciphertext buffer
2506		vmovdqu	 \XMM4,	16*3(arg3,%r11)		# Write to the Ciphertext buffer
2507		vmovdqu	 \XMM5,	16*4(arg3,%r11)		# Write to the Ciphertext buffer
2508		vmovdqu	 \XMM6,	16*5(arg3,%r11)		# Write to the Ciphertext buffer
2509		vmovdqu	 \XMM7,	16*6(arg3,%r11)		# Write to the Ciphertext buffer
2510		vmovdqu	 \XMM8,	16*7(arg3,%r11)		# Write to the Ciphertext buffer
2511                .endif
2512
2513	#######################################################################
2514	#second phase of the reduction
2515	vpclmulqdq	$0x00, \T7, \T3, \T2
2516	vpsrldq		$4, \T2, \T2			# shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2517
2518	vpclmulqdq	$0x10, \T7, \T3, \T4
2519	vpslldq		$4, \T4, \T4			# shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
2520
2521	vpxor		\T2, \T4, \T4			# second phase of the reduction complete
2522	#######################################################################
2523	vpxor		\T4, \T1, \T1			# the result is in T1
2524
2525		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap
2526		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap
2527		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap
2528		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap
2529		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap
2530		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap
2531		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap
2532		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap
2533
2534
2535	vpxor	\T1, \XMM1, \XMM1
2536
2537
2538
2539.endm
2540
2541
# GHASH_LAST_8_AVX2: GHASH the last 8 ciphertext blocks.
#
# Multiplies XMM1..XMM8 by HashKey_8..HashKey respectively (XMM1 pairs
# with HashKey_8, XMM8 with HashKey), XORs the eight products together
# and reduces the double-width sum with the POLY2 constant.
#
# Each product uses the Karatsuba method, i.e. three carry-less
# multiplies per block:
#       high x high              accumulated in T6
#       low  x low               accumulated in T7
#       (hi^lo) x (hi^lo)        accumulated in XMM1
#
# Inputs:   XMM1..XMM8  the eight 128-bit blocks to hash
#           HashKey_8 .. HashKey are loaded relative to arg2
# Output:   T6          the reduced 128-bit result
# Clobbers: T2, T3, T4, T5, T7 and XMM1 (XMM1 is reused as the
#           middle-term accumulator once block 1 has been consumed).
#           XMM2..XMM8 are only read.
# Note:     T1 is part of the parameter list but is not referenced by
#           this macro.
.macro  GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8

        ## Karatsuba Method

        ######################
        # block 1 x HashKey_8: initialises the three accumulators

        vmovdqu         HashKey_8(arg2), \T5

        vpshufd         $0b01001110, \XMM1, \T2    # T2 = block with its 64-bit halves swapped
        vpshufd         $0b01001110, \T5, \T3      # T3 = key with its 64-bit halves swapped
        vpxor           \XMM1, \T2, \T2            # T2 = hi ^ lo of the block
        vpxor           \T5, \T3, \T3              # T3 = hi ^ lo of the key

        vpclmulqdq      $0x11, \T5, \XMM1, \T6     # T6 = high x high
        vpclmulqdq      $0x00, \T5, \XMM1, \T7     # T7 = low x low

        vpclmulqdq      $0x00, \T3, \T2, \XMM1     # XMM1 = middle product; block 1 is consumed here

        ######################
        # block 2 x HashKey_7

        vmovdqu         HashKey_7(arg2), \T5
        vpshufd         $0b01001110, \XMM2, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM2, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM2, \T4
        vpxor           \T4, \T6, \T6              # high accumulator

        vpclmulqdq      $0x00, \T5, \XMM2, \T4
        vpxor           \T4, \T7, \T7              # low accumulator

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1          # middle accumulator

        ######################
        # block 3 x HashKey_6

        vmovdqu         HashKey_6(arg2), \T5
        vpshufd         $0b01001110, \XMM3, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM3, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM3, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM3, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################
        # block 4 x HashKey_5

        vmovdqu         HashKey_5(arg2), \T5
        vpshufd         $0b01001110, \XMM4, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM4, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM4, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM4, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################
        # block 5 x HashKey_4

        vmovdqu         HashKey_4(arg2), \T5
        vpshufd         $0b01001110, \XMM5, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM5, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM5, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM5, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################
        # block 6 x HashKey_3

        vmovdqu         HashKey_3(arg2), \T5
        vpshufd         $0b01001110, \XMM6, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM6, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM6, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM6, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################
        # block 7 x HashKey_2

        vmovdqu         HashKey_2(arg2), \T5
        vpshufd         $0b01001110, \XMM7, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM7, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM7, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM7, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################
        # block 8 x HashKey

        vmovdqu         HashKey(arg2), \T5
        vpshufd         $0b01001110, \XMM8, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM8, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM8, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM8, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        ######################
        # Karatsuba fix-up: middle term = middle ^ high ^ low, left in T2

        vpxor           \T2, \XMM1, \XMM1
        vpxor           \T6, \XMM1, \XMM1
        vpxor           \T7, \XMM1, \T2

        # Fold the middle term across the two halves of the wide result

        vpslldq $8, \T2, \T4                       # T4 = middle term shifted left 8 bytes
        vpsrldq $8, \T2, \T2                       # T2 = middle term shifted right 8 bytes

        vpxor   \T4, \T7, \T7
        vpxor   \T2, \T6, \T6                      # <T6:T7> holds the result of the
                                                   # accumulated carry-less multiplications

        #######################################################################
        #first phase of the reduction
        vmovdqa         POLY2(%rip), \T3

        vpclmulqdq      $0x01, \T7, \T3, \T2
        vpslldq         $8, \T2, \T2               # shift-L T2 2 DWs

        vpxor           \T2, \T7, \T7              # first phase of the reduction complete
        #######################################################################


        #second phase of the reduction
        vpclmulqdq      $0x00, \T7, \T3, \T2
        vpsrldq         $4, \T2, \T2               # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      $0x10, \T7, \T3, \T4
        vpslldq         $4, \T4, \T4               # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor           \T2, \T4, \T4              # second phase of the reduction complete
        #######################################################################
        vpxor           \T4, \T6, \T6              # the result is in T6
.endm
2719
2720
2721
#############################################################
#void   aesni_gcm_init_avx_gen4
#        (gcm_data     *my_ctx_data,
#         gcm_context_data *data,
#        u8      *iv, /* Pre-counter block j0: 4 byte salt
#			(from Security Association) concatenated with 8 byte
#			Initialisation Vector (from IPSec ESP Payload)
#			concatenated with 0x00000001. 16-byte aligned pointer. */
#        u8     *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
#        u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#
# The body is the INIT macro instantiated with the AVX2 GHASH multiply
# (GHASH_MUL_AVX2) and hash-key precompute (PRECOMPUTE_AVX2) macros,
# bracketed by FUNC_SAVE / FUNC_RESTORE. All four are defined elsewhere
# in this file.
#############################################################
SYM_FUNC_START(aesni_gcm_init_avx_gen4)
        FUNC_SAVE
        INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
        FUNC_RESTORE
        ret
SYM_FUNC_END(aesni_gcm_init_avx_gen4)
2740
###############################################################################
#void   aesni_gcm_enc_update_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
#        const   u8 *in, /* Plaintext input */
#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
#
# Dispatches on keysize: 16 and 32 select dedicated paths, any other
# value falls through to the 192-bit path. Each path instantiates
# GCM_ENC_DEC in ENC mode with a different final argument (9 for 16,
# 11 for the fall-through, 13 for 32), then restores and returns.
# The branch targets are .L local labels so that they stay out of the
# object symbol table.
###############################################################################
SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4)
        FUNC_SAVE
        mov     keysize,%eax
        cmp     $32, %eax
        je      .Lkey_256_enc_update4
        cmp     $16, %eax
        je      .Lkey_128_enc_update4
        # neither 16 nor 32: must be 192
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
        FUNC_RESTORE
        ret
.Lkey_128_enc_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
        FUNC_RESTORE
        ret
.Lkey_256_enc_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
        FUNC_RESTORE
        ret
SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4)
2769
###############################################################################
#void   aesni_gcm_dec_update_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
#        const   u8 *in, /* Ciphertext input */
#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
#
# Dispatches on keysize: 16 and 32 select dedicated paths, any other
# value falls through to the 192-bit path. Each path instantiates
# GCM_ENC_DEC in DEC mode with a different final argument (9 for 16,
# 11 for the fall-through, 13 for 32), then restores and returns.
# The branch targets are .L local labels so that they stay out of the
# object symbol table.
###############################################################################
SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4)
        FUNC_SAVE
        mov     keysize,%eax
        cmp     $32, %eax
        je      .Lkey_256_dec_update4
        cmp     $16, %eax
        je      .Lkey_128_dec_update4
        # neither 16 nor 32: must be 192
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
        FUNC_RESTORE
        ret
.Lkey_128_dec_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
        FUNC_RESTORE
        ret
.Lkey_256_dec_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
        FUNC_RESTORE
        ret
SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4)
2798
###############################################################################
#void   aesni_gcm_finalize_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
#                              Valid values are 16 (most likely), 12 or 8. */
#
# Dispatches on keysize: 16 and 32 select dedicated paths, any other
# value falls through to the 192-bit path. Each path instantiates
# GCM_COMPLETE with GHASH_MUL_AVX2, a per-key-size second argument
# (9 for 16, 11 for the fall-through, 13 for 32) and arg3 / arg4,
# then restores and returns.
# The branch targets are .L local labels so that they stay out of the
# object symbol table.
###############################################################################
SYM_FUNC_START(aesni_gcm_finalize_avx_gen4)
        FUNC_SAVE
        mov     keysize,%eax
        cmp     $32, %eax
        je      .Lkey_256_finalize4
        cmp     $16, %eax
        je      .Lkey_128_finalize4
        # neither 16 nor 32: must be 192
        GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
        FUNC_RESTORE
        ret
.Lkey_128_finalize4:
        GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
        FUNC_RESTORE
        ret
.Lkey_256_finalize4:
        GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
        FUNC_RESTORE
        ret
SYM_FUNC_END(aesni_gcm_finalize_avx_gen4)
2827