/*
 * Fast AES implementation for SPE instruction set (PPC)
 *
 * This code makes use of the SPE SIMD instruction set as defined in
 * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
 * Implementation is based on optimization guide notes from
 * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
 *
 * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 *
 */

#include <asm/ppc_asm.h>
#include "aes-spe-regs.h"

/*
 * EAD(in, bpos): build a table entry address in rT0.
 *
 * Rotates "in" and inserts one of its bytes into bits 20-27 of rT0 (mask
 * 0x00000ff0); all other bits of rT0 are kept. The byte therefore ends up
 * multiplied by 16. bpos counts bytes of "in" from the least significant
 * one: bpos 0 picks bits 7..0, bpos 3 picks bits 31..24.
 * NOTE(review): presumably the kept upper bits of rT0 hold the base of a
 * 4 KB aligned table with 16 byte entries - confirm against the caller.
 */
#define	EAD(in, bpos) \
	rlwimi		rT0,in,28-((bpos+3)%4)*8,20,27;

/*
 * DAD(in, bpos): build a byte table address in rT1.
 *
 * Rotates "in" and inserts one of its bytes into bits 24-31 of rT1 (mask
 * 0x000000ff); all other bits of rT1 are kept, so the byte is used as an
 * unscaled offset. bpos counts bytes of "in" from the least significant
 * one: bpos 0 picks bits 7..0, bpos 3 picks bits 31..24.
 * NOTE(review): presumably the kept upper bits of rT1 hold the base of a
 * 256 byte aligned decryption byte table - confirm against the caller.
 */
#define DAD(in, bpos) \
	rlwimi		rT1,in,24-((bpos+3)%4)*8,24,31;

/*
 * LWH(out, off): load the word at off(rT0) with the SPE splat load, which
 * writes it to both 32 bit halves of the 64 bit register "out". Used to
 * provide the high half; a later LWL on the same register supplies the low
 * half.
 */
#define LWH(out, off) \
	evlwwsplat	out,off(rT0);	/* load word high		*/

/*
 * LWL(out, off): load the word at off(rT0) into "out" with a plain lwz.
 * Paired with LWH, this supplies the low half of the register.
 */
#define LWL(out, off) \
	lwz		out,off(rT0);	/* load word low		*/

/*
 * LBZ(out, tab, off): load the single byte at off(tab), zero extended, into
 * "out". "tab" is the address register (rT0 or rT1 in this file).
 */
#define LBZ(out, tab, off) \
	lbz		out,off(tab);	/* load byte			*/

/*
 * LAH(out, in, bpos, off): EAD followed by LWH - select byte bpos of "in"
 * as table index in rT0, then splat-load the word at off(rT0) into "out".
 * Modifies rT0.
 */
#define LAH(out, in, bpos, off) \
	EAD(in, bpos)			/* calc addr + load word high	*/ \
	LWH(out, off)

/*
 * LAL(out, in, bpos, off): EAD followed by LWL - select byte bpos of "in"
 * as table index in rT0, then load the word at off(rT0) into "out" with
 * lwz. Modifies rT0.
 */
#define LAL(out, in, bpos, off) \
	EAD(in, bpos)			/* calc addr + load word low	*/ \
	LWL(out, off)

/*
 * LAE(out, in, bpos): EAD followed by a byte load from offset 8 of the
 * addressed table entry (8(rT0)) into "out". Modifies rT0.
 */
#define LAE(out, in, bpos) \
	EAD(in, bpos)			/* calc addr + load enc byte	*/ \
	LBZ(out, rT0, 8)

/*
 * LBE(out): byte load from 8(rT0) into "out", using the address that a
 * preceding EAD left in rT0. Same load as LAE without the address step.
 */
#define LBE(out) \
	LBZ(out, rT0, 8)		/* load enc byte		*/

/*
 * LAD(out, in, bpos): DAD followed by a byte load from 0(rT1) into "out".
 * Modifies rT1.
 */
#define LAD(out, in, bpos) \
	DAD(in, bpos)			/* calc addr + load dec byte	*/ \
	LBZ(out, rT1, 0)

/*
 * LBD(out): byte load from 0(rT1) into "out", using the address that a
 * preceding DAD left in rT1. Same load as LAD without the address step.
 */
#define LBD(out) \
	LBZ(out, rT1, 0)		/* load dec byte		*/

/*
 * ppc_encrypt_block: The central encryption function for a single 16 bytes
 * block. It does no stack handling or register saving to support fast calls
 * via bl/blr. It expects that caller has pre-xored input data with first
 * 4 words of encryption key into rD0-rD3. Pointer/counter registers must
 * have also been set up before (rT0, rKP, CTR). Output is stored in rD0-rD3
 * and rW0-rW3 and caller must execute a final xor on the output registers.
 * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing.
 *
 * Layout of the code below:
 *  - Three table loads are issued ahead of the loop. Each table pass inside
 *    the loop ends by issuing the same three loads for the pass that follows.
 *  - The loop body contains two table passes (key words read from 16/24(rKP)
 *    and 32/40(rKP)), then advances rKP by 32. bdnz decrements CTR and
 *    repeats the body while CTR is non-zero.
 *  - One more table pass follows the loop (key words from 16/24(rKP)).
 *  - The closing pass loads single bytes from offset 8 of the table entries
 *    (LBE/LAE) and packs four of them into each of rW0-rW3 with rlwimi,
 *    while rD0-rD3 are loaded from 32, 36, 40 and 44(rKP).
 *
 * rT0 is modified by every EAD based macro; rKP is advanced by the loop.
 */
_GLOBAL(ppc_encrypt_block)
	/* loads issued in advance for the first table pass */
	LAH(rW4, rD1, 2, 4)
	LAH(rW6, rD0, 3, 0)
	LAH(rW3, rD0, 1, 8)
ppc_encrypt_block_loop:
	/* table pass, key words from 16(rKP) and 24(rKP) */
	LAH(rW0, rD3, 0, 12)
	LAL(rW0, rD0, 0, 12)
	LAH(rW1, rD1, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAL(rW3, rD1, 1, 8)
	LAL(rW4, rD2, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAH(rW5, rD3, 2, 4)
	LAL(rW5, rD0, 2, 4)
	LAH(rW7, rD2, 3, 0)
	evldw		rD1,16(rKP)
	EAD(rD3, 3)
	evxor		rW2,rW2,rW4
	LWL(rW7, 0)
	evxor		rW2,rW2,rW6
	EAD(rD2, 0)
	evxor		rD1,rD1,rW2
	LWL(rW1, 12)
	evxor		rD1,rD1,rW0
	evldw		rD3,24(rKP)
	evmergehi	rD0,rD0,rD1
	EAD(rD1, 2)
	evxor		rW3,rW3,rW5
	LWH(rW4, 4)
	evxor		rW3,rW3,rW7
	EAD(rD0, 3)
	evxor		rD3,rD3,rW3
	LWH(rW6, 0)
	evxor		rD3,rD3,rW1
	EAD(rD0, 1)
	evmergehi	rD2,rD2,rD3
	LWH(rW3, 8)
	/* table pass, key words from 32(rKP) and 40(rKP) */
	LAH(rW0, rD3, 0, 12)
	LAL(rW0, rD0, 0, 12)
	LAH(rW1, rD1, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAL(rW3, rD1, 1, 8)
	LAL(rW4, rD2, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAH(rW5, rD3, 2, 4)
	LAL(rW5, rD0, 2, 4)
	LAH(rW7, rD2, 3, 0)
	evldw		rD1,32(rKP)
	EAD(rD3, 3)
	evxor		rW2,rW2,rW4
	LWL(rW7, 0)
	evxor		rW2,rW2,rW6
	EAD(rD2, 0)
	evxor		rD1,rD1,rW2
	LWL(rW1, 12)
	evxor		rD1,rD1,rW0
	evldw		rD3,40(rKP)
	evmergehi	rD0,rD0,rD1
	EAD(rD1, 2)
	evxor		rW3,rW3,rW5
	LWH(rW4, 4)
	evxor		rW3,rW3,rW7
	EAD(rD0, 3)
	evxor		rD3,rD3,rW3
	LWH(rW6, 0)
	evxor		rD3,rD3,rW1
	EAD(rD0, 1)
	evmergehi	rD2,rD2,rD3
	LWH(rW3, 8)
	/* step over the 32 bytes of key material used by this iteration */
	addi		rKP,rKP,32
	bdnz		ppc_encrypt_block_loop
	/* table pass after the loop, key words from 16(rKP) and 24(rKP) */
	LAH(rW0, rD3, 0, 12)
	LAL(rW0, rD0, 0, 12)
	LAH(rW1, rD1, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAL(rW3, rD1, 1, 8)
	LAL(rW4, rD2, 2, 4)
	LAH(rW5, rD3, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAL(rW5, rD0, 2, 4)
	LAH(rW7, rD2, 3, 0)
	evldw		rD1,16(rKP)
	EAD(rD3, 3)
	evxor		rW2,rW2,rW4
	LWL(rW7, 0)
	evxor		rW2,rW2,rW6
	EAD(rD2, 0)
	evxor		rD1,rD1,rW2
	LWL(rW1, 12)
	evxor		rD1,rD1,rW0
	evldw		rD3,24(rKP)
	evmergehi	rD0,rD0,rD1
	/* from here on the interleaved loads are byte loads (LBE) */
	EAD(rD1, 0)
	evxor		rW3,rW3,rW5
	LBE(rW2)
	evxor		rW3,rW3,rW7
	EAD(rD0, 1)
	evxor		rD3,rD3,rW3
	LBE(rW6)
	evxor		rD3,rD3,rW1
	EAD(rD0, 0)
	evmergehi	rD2,rD2,rD3
	LBE(rW1)
	/* closing pass: byte lookups packed into rW0-rW3 */
	LAE(rW0, rD3, 0)
	LAE(rW1, rD0, 0)
	LAE(rW4, rD2, 1)
	LAE(rW5, rD3, 1)
	LAE(rW3, rD2, 0)
	LAE(rW7, rD1, 1)
	rlwimi		rW0,rW4,8,16,23
	rlwimi		rW1,rW5,8,16,23
	LAE(rW4, rD1, 2)
	LAE(rW5, rD2, 2)
	rlwimi		rW2,rW6,8,16,23
	rlwimi		rW3,rW7,8,16,23
	LAE(rW6, rD3, 2)
	LAE(rW7, rD0, 2)
	rlwimi		rW0,rW4,16,8,15
	rlwimi		rW1,rW5,16,8,15
	LAE(rW4, rD0, 3)
	LAE(rW5, rD1, 3)
	rlwimi		rW2,rW6,16,8,15
	/* rDn is reloaded from 32..44(rKP) once its last lookup is issued */
	lwz		rD0,32(rKP)
	rlwimi		rW3,rW7,16,8,15
	lwz		rD1,36(rKP)
	LAE(rW6, rD2, 3)
	LAE(rW7, rD3, 3)
	rlwimi		rW0,rW4,24,0,7
	lwz		rD2,40(rKP)
	rlwimi		rW1,rW5,24,0,7
	lwz		rD3,44(rKP)
	rlwimi		rW2,rW6,24,0,7
	rlwimi		rW3,rW7,24,0,7
	blr

/*
 * ppc_decrypt_block: The central decryption function for a single 16 bytes
 * block. It does no stack handling or register saving to support fast calls
 * via bl/blr. It expects that caller has pre-xored input data with first
 * 4 words of encryption key into rD0-rD3. Pointer/counter registers must
 * have also been set up before (rT0, rT1, rKP, CTR). Output is stored in
 * rD0-rD3 and rW0-rW3 and caller must execute a final xor on the output
 * registers. All working registers rD0-rD3 & rW0-rW7 are overwritten during
 * processing.
 *
 * NOTE(review): "encryption key" above is kept from the original text;
 * presumably the key material that rKP points to for decryption is meant -
 * confirm against the caller.
 *
 * Layout of the code below:
 *  - Three table loads are issued ahead of the loop. Each table pass inside
 *    the loop ends by issuing the same three loads for the pass that follows.
 *  - The loop body contains two table passes (key words read from 16/24(rKP)
 *    and 32/40(rKP)), then advances rKP by 32. bdnz decrements CTR and
 *    repeats the body while CTR is non-zero.
 *  - One more table pass follows the loop (key words from 16/24(rKP)).
 *  - The closing pass loads single bytes through rT1 (DAD/LBD/LAD) and packs
 *    four of them into each of rW0-rW3 with rlwimi, while rD0-rD3 are
 *    loaded from 32, 36, 40 and 44(rKP).
 *
 * rT0 is modified by every EAD based macro, rT1 by every DAD based macro;
 * only their inserted index bits change. rKP is advanced by the loop.
 */
_GLOBAL(ppc_decrypt_block)
	/* loads issued in advance for the first table pass */
	LAH(rW0, rD1, 0, 12)
	LAH(rW6, rD0, 3, 0)
	LAH(rW3, rD0, 1, 8)
ppc_decrypt_block_loop:
	/* table pass, key words from 16(rKP) and 24(rKP) */
	LAH(rW1, rD3, 0, 12)
	LAL(rW0, rD2, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAH(rW4, rD3, 2, 4)
	LAL(rW4, rD0, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAH(rW5, rD1, 2, 4)
	LAH(rW7, rD2, 3, 0)
	LAL(rW7, rD3, 3, 0)
	LAL(rW3, rD1, 1, 8)
	evldw		rD1,16(rKP)
	EAD(rD0, 0)
	evxor		rW4,rW4,rW6
	LWL(rW1, 12)
	evxor		rW0,rW0,rW4
	EAD(rD2, 2)
	evxor		rW0,rW0,rW2
	LWL(rW5, 4)
	evxor		rD1,rD1,rW0
	evldw		rD3,24(rKP)
	evmergehi	rD0,rD0,rD1
	EAD(rD1, 0)
	evxor		rW3,rW3,rW7
	LWH(rW0, 12)
	evxor		rW3,rW3,rW1
	EAD(rD0, 3)
	evxor		rD3,rD3,rW3
	LWH(rW6, 0)
	evxor		rD3,rD3,rW5
	EAD(rD0, 1)
	evmergehi	rD2,rD2,rD3
	LWH(rW3, 8)
	/* table pass, key words from 32(rKP) and 40(rKP) */
	LAH(rW1, rD3, 0, 12)
	LAL(rW0, rD2, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAH(rW4, rD3, 2, 4)
	LAL(rW4, rD0, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAH(rW5, rD1, 2, 4)
	LAH(rW7, rD2, 3, 0)
	LAL(rW7, rD3, 3, 0)
	LAL(rW3, rD1, 1, 8)
	evldw		rD1,32(rKP)
	EAD(rD0, 0)
	evxor		rW4,rW4,rW6
	LWL(rW1, 12)
	evxor		rW0,rW0,rW4
	EAD(rD2, 2)
	evxor		rW0,rW0,rW2
	LWL(rW5, 4)
	evxor		rD1,rD1,rW0
	evldw		rD3,40(rKP)
	evmergehi	rD0,rD0,rD1
	EAD(rD1, 0)
	evxor		rW3,rW3,rW7
	LWH(rW0, 12)
	evxor		rW3,rW3,rW1
	EAD(rD0, 3)
	evxor		rD3,rD3,rW3
	LWH(rW6, 0)
	evxor		rD3,rD3,rW5
	EAD(rD0, 1)
	evmergehi	rD2,rD2,rD3
	LWH(rW3, 8)
	/* step over the 32 bytes of key material used by this iteration */
	addi		rKP,rKP,32
	bdnz		ppc_decrypt_block_loop
	/* table pass after the loop, key words from 16(rKP) and 24(rKP) */
	LAH(rW1, rD3, 0, 12)
	LAL(rW0, rD2, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAH(rW4, rD3, 2, 4)
	LAL(rW4, rD0, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAH(rW5, rD1, 2, 4)
	LAH(rW7, rD2, 3, 0)
	LAL(rW7, rD3, 3, 0)
	LAL(rW3, rD1, 1, 8)
	evldw		rD1,16(rKP)
	EAD(rD0, 0)
	evxor		rW4,rW4,rW6
	LWL(rW1, 12)
	evxor		rW0,rW0,rW4
	EAD(rD2, 2)
	evxor		rW0,rW0,rW2
	LWL(rW5, 4)
	evxor		rD1,rD1,rW0
	evldw		rD3,24(rKP)
	evmergehi	rD0,rD0,rD1
	/* from here on the interleaved loads are byte loads through rT1 */
	DAD(rD1, 0)
	evxor		rW3,rW3,rW7
	LBD(rW0)
	evxor		rW3,rW3,rW1
	DAD(rD0, 1)
	evxor		rD3,rD3,rW3
	LBD(rW6)
	evxor		rD3,rD3,rW5
	DAD(rD0, 0)
	evmergehi	rD2,rD2,rD3
	LBD(rW3)
	/* closing pass: byte lookups packed into rW0-rW3 */
	LAD(rW2, rD3, 0)
	LAD(rW1, rD2, 0)
	LAD(rW4, rD2, 1)
	LAD(rW5, rD3, 1)
	LAD(rW7, rD1, 1)
	rlwimi		rW0,rW4,8,16,23
	rlwimi		rW1,rW5,8,16,23
	LAD(rW4, rD3, 2)
	LAD(rW5, rD0, 2)
	rlwimi		rW2,rW6,8,16,23
	rlwimi		rW3,rW7,8,16,23
	LAD(rW6, rD1, 2)
	LAD(rW7, rD2, 2)
	rlwimi		rW0,rW4,16,8,15
	rlwimi		rW1,rW5,16,8,15
	LAD(rW4, rD0, 3)
	LAD(rW5, rD1, 3)
	rlwimi		rW2,rW6,16,8,15
	/* rDn is reloaded from 32..44(rKP) once its last lookup is issued */
	lwz		rD0,32(rKP)
	rlwimi		rW3,rW7,16,8,15
	lwz		rD1,36(rKP)
	LAD(rW6, rD2, 3)
	LAD(rW7, rD3, 3)
	rlwimi		rW0,rW4,24,0,7
	lwz		rD2,40(rKP)
	rlwimi		rW1,rW5,24,0,7
	lwz		rD3,44(rKP)
	rlwimi		rW2,rW6,24,0,7
	rlwimi		rW3,rW7,24,0,7
	blr