/*
 * Fast AES implementation for SPE instruction set (PPC)
 *
 * This code makes use of the SPE SIMD instruction set as defined in
 * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
 * Implementation is based on optimization guide notes from
 * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
 *
 * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 *
 */

#include <asm/ppc_asm.h>
#include "aes-spe-regs.h"

/*
 * Address calculation helpers.
 *
 * EAD: insert byte number bpos of the 32 bit value in "in" (bpos 0 is the
 * least significant byte, bpos 3 the most significant one) into bits 20-27
 * of rT0. The byte therefore ends up multiplied by 16 inside the address,
 * all other bits of rT0 are kept.
 * NOTE(review): presumably rT0 points to a lookup table with 16 byte entries
 * that is aligned so that bits 20-27 form the entry index - confirm against
 * the table definition.
 *
 * DAD: same byte selection, but the byte is inserted unscaled into bits
 * 24-31 of rT1, i.e. it indexes a table with 1 byte entries.
 */
#define EAD(in, bpos) \
	rlwimi		rT0,in,28-((bpos+3)%4)*8,20,27;

#define DAD(in, bpos) \
	rlwimi		rT1,in,24-((bpos+3)%4)*8,24,31;

/*
 * Plain loads relative to the address last computed by EAD (rT0) or from an
 * arbitrary table register (LBZ). "off" selects the word inside the table
 * entry addressed by rT0 (0, 4, 8 or 12 are used below).
 * NOTE(review): LWH is always followed by an LWL to the same register before
 * the value is consumed; this presumably relies on evlwwsplat filling the
 * upper half and lwz replacing only the lower half - confirm with the SPE
 * manual.
 */
#define LWH(out, off) \
	evlwwsplat	out,off(rT0);	/* load word high		*/

#define LWL(out, off) \
	lwz		out,off(rT0);	/* load word low		*/

#define LBZ(out, tab, off) \
	lbz		out,off(tab);	/* load byte			*/

/*
 * Combined helpers: LAx = calculate the address and load, LBx = load from
 * the address that has already been calculated. The encryption byte is read
 * at offset 8 of the entry addressed by rT0, the decryption byte at offset 0
 * of the address in rT1.
 */
#define LAH(out, in, bpos, off) \
	EAD(in, bpos)			/* calc addr + load word high	*/ \
	LWH(out, off)

#define LAL(out, in, bpos, off) \
	EAD(in, bpos)			/* calc addr + load word low	*/ \
	LWL(out, off)

#define LAE(out, in, bpos) \
	EAD(in, bpos)			/* calc addr + load enc byte	*/ \
	LBZ(out, rT0, 8)

#define LBE(out) \
	LBZ(out, rT0, 8)		/* load enc byte		*/

#define LAD(out, in, bpos) \
	DAD(in, bpos)			/* calc addr + load dec byte	*/ \
	LBZ(out, rT1, 0)

#define LBD(out) \
	LBZ(out, rT1, 0)		/* load dec byte		*/

/*
 * ppc_encrypt_block: The central encryption function for a single 16 bytes
 * block. It does no stack handling or register saving to support fast calls
 * via bl/blr. It expects that caller has pre-xored input data with first
 * 4 words of encryption key into rD0-rD3. Pointer/counter registers must
 * have also been set up before (rT0, rKP, CTR). Output is stored in rD0-rD3
 * and rW0-rW3 and caller must execute a final xor on the output registers.
 * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing.
 *
 * Structure: every pass through the loop consumes 32 bytes of key material
 * (offsets 16-47 relative to rKP) in two unrolled, identical halves and then
 * advances rKP by 32; CTR gives the number of passes. After the loop one
 * more table based round (key at 16/24(rKP)) and a final round built from
 * single byte lookups follow. On return rW0-rW3 hold the bytes assembled by
 * the final round and rD0-rD3 the four words loaded from 32-44(rKP).
 *
 */
_GLOBAL(ppc_encrypt_block)
	/* first three lookups of the upcoming round (repeated at round end) */
	LAH(rW4, rD1, 2, 4)
	LAH(rW6, rD0, 3, 0)
	LAH(rW3, rD0, 1, 8)
ppc_encrypt_block_loop:
	/* first half: remaining table lookups */
	LAH(rW0, rD3, 0, 12)
	LAL(rW0, rD0, 0, 12)
	LAH(rW1, rD1, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAL(rW3, rD1, 1, 8)
	LAL(rW4, rD2, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAH(rW5, rD3, 2, 4)
	LAL(rW5, rD0, 2, 4)
	LAH(rW7, rD2, 3, 0)
	/* load key, xor it with the lookups, start lookups of next round */
	evldw		rD1,16(rKP)
	EAD(rD3, 3)
	evxor		rW2,rW2,rW4
	LWL(rW7, 0)
	evxor		rW2,rW2,rW6
	EAD(rD2, 0)
	evxor		rD1,rD1,rW2
	LWL(rW1, 12)
	evxor		rD1,rD1,rW0
	evldw		rD3,24(rKP)
	evmergehi	rD0,rD0,rD1
	EAD(rD1, 2)
	evxor		rW3,rW3,rW5
	LWH(rW4, 4)
	evxor		rW3,rW3,rW7
	EAD(rD0, 3)
	evxor		rD3,rD3,rW3
	LWH(rW6, 0)
	evxor		rD3,rD3,rW1
	EAD(rD0, 1)
	evmergehi	rD2,rD2,rD3
	LWH(rW3, 8)
	/* second half: same sequence with key at 32/40(rKP) */
	LAH(rW0, rD3, 0, 12)
	LAL(rW0, rD0, 0, 12)
	LAH(rW1, rD1, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAL(rW3, rD1, 1, 8)
	LAL(rW4, rD2, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAH(rW5, rD3, 2, 4)
	LAL(rW5, rD0, 2, 4)
	LAH(rW7, rD2, 3, 0)
	evldw		rD1,32(rKP)
	EAD(rD3, 3)
	evxor		rW2,rW2,rW4
	LWL(rW7, 0)
	evxor		rW2,rW2,rW6
	EAD(rD2, 0)
	evxor		rD1,rD1,rW2
	LWL(rW1, 12)
	evxor		rD1,rD1,rW0
	evldw		rD3,40(rKP)
	evmergehi	rD0,rD0,rD1
	EAD(rD1, 2)
	evxor		rW3,rW3,rW5
	LWH(rW4, 4)
	evxor		rW3,rW3,rW7
	EAD(rD0, 3)
	evxor		rD3,rD3,rW3
	LWH(rW6, 0)
	evxor		rD3,rD3,rW1
	EAD(rD0, 1)
	evmergehi	rD2,rD2,rD3
	LWH(rW3, 8)
	/* advance key pointer by the 32 bytes just used, loop while CTR != 0 */
	addi		rKP,rKP,32
	bdnz		ppc_encrypt_block_loop
	/* last table based round, key at 16/24(rKP) */
	LAH(rW0, rD3, 0, 12)
	LAL(rW0, rD0, 0, 12)
	LAH(rW1, rD1, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAL(rW3, rD1, 1, 8)
	LAL(rW4, rD2, 2, 4)
	LAH(rW5, rD3, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAL(rW5, rD0, 2, 4)
	LAH(rW7, rD2, 3, 0)
	evldw		rD1,16(rKP)
	EAD(rD3, 3)
	evxor		rW2,rW2,rW4
	LWL(rW7, 0)
	evxor		rW2,rW2,rW6
	EAD(rD2, 0)
	evxor		rD1,rD1,rW2
	LWL(rW1, 12)
	evxor		rD1,rD1,rW0
	evldw		rD3,24(rKP)
	evmergehi	rD0,rD0,rD1
	/* from here on the interleaved lookups are byte loads of final round */
	EAD(rD1, 0)
	evxor		rW3,rW3,rW5
	LBE(rW2)
	evxor		rW3,rW3,rW7
	EAD(rD0, 1)
	evxor		rD3,rD3,rW3
	LBE(rW6)
	evxor		rD3,rD3,rW1
	EAD(rD0, 0)
	evmergehi	rD2,rD2,rD3
	LBE(rW1)
	/*
	 * Final round: rW0-rW3 receive byte 0, the bytes for positions 1-3
	 * are loaded into rW4-rW7 and merged in with rlwimi (shifted left by
	 * 8, 16 and 24 bits).
	 */
	LAE(rW0, rD3, 0)
	/*
	 * NOTE(review): rW1 has just been loaded from the same rD0 byte by
	 * the EAD/LBE pair above, this reload looks redundant - confirm.
	 */
	LAE(rW1, rD0, 0)
	LAE(rW4, rD2, 1)
	LAE(rW5, rD3, 1)
	LAE(rW3, rD2, 0)
	LAE(rW7, rD1, 1)
	rlwimi		rW0,rW4,8,16,23
	rlwimi		rW1,rW5,8,16,23
	LAE(rW4, rD1, 2)
	LAE(rW5, rD2, 2)
	rlwimi		rW2,rW6,8,16,23
	rlwimi		rW3,rW7,8,16,23
	LAE(rW6, rD3, 2)
	LAE(rW7, rD0, 2)
	rlwimi		rW0,rW4,16,8,15
	rlwimi		rW1,rW5,16,8,15
	LAE(rW4, rD0, 3)
	LAE(rW5, rD1, 3)
	rlwimi		rW2,rW6,16,8,15
	/* rD0-rD3 are free now: fetch key words for the caller's final xor */
	lwz		rD0,32(rKP)
	rlwimi		rW3,rW7,16,8,15
	lwz		rD1,36(rKP)
	LAE(rW6, rD2, 3)
	LAE(rW7, rD3, 3)
	rlwimi		rW0,rW4,24,0,7
	lwz		rD2,40(rKP)
	rlwimi		rW1,rW5,24,0,7
	lwz		rD3,44(rKP)
	rlwimi		rW2,rW6,24,0,7
	rlwimi		rW3,rW7,24,0,7
	blr

/*
 * ppc_decrypt_block: The central decryption function for a single 16 bytes
 * block. It does no stack handling or register saving to support fast calls
 * via bl/blr. It expects that caller has pre-xored input data with first
 * 4 words of the key into rD0-rD3. Pointer/counter registers must
 * have also been set up before (rT0, rT1, rKP, CTR); rT1 is needed for the
 * byte lookups of the final round. Output is stored in rD0-rD3
 * and rW0-rW3 and caller must execute a final xor on the output registers.
 * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing.
 *
 * Structure: identical to ppc_encrypt_block (two unrolled halves per loop
 * pass, rKP advanced by 32 per pass, one more table based round and a final
 * byte lookup round), but with a different pairing of input bytes and the
 * final round reading its bytes via rT1 instead of rT0.
 *
 */
_GLOBAL(ppc_decrypt_block)
	/* first three lookups of the upcoming round (repeated at round end) */
	LAH(rW0, rD1, 0, 12)
	LAH(rW6, rD0, 3, 0)
	LAH(rW3, rD0, 1, 8)
ppc_decrypt_block_loop:
	/* first half: remaining table lookups */
	LAH(rW1, rD3, 0, 12)
	LAL(rW0, rD2, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAH(rW4, rD3, 2, 4)
	LAL(rW4, rD0, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAH(rW5, rD1, 2, 4)
	LAH(rW7, rD2, 3, 0)
	LAL(rW7, rD3, 3, 0)
	LAL(rW3, rD1, 1, 8)
	/* load key, xor it with the lookups, start lookups of next round */
	evldw		rD1,16(rKP)
	EAD(rD0, 0)
	evxor		rW4,rW4,rW6
	LWL(rW1, 12)
	evxor		rW0,rW0,rW4
	EAD(rD2, 2)
	evxor		rW0,rW0,rW2
	LWL(rW5, 4)
	evxor		rD1,rD1,rW0
	evldw		rD3,24(rKP)
	evmergehi	rD0,rD0,rD1
	EAD(rD1, 0)
	evxor		rW3,rW3,rW7
	LWH(rW0, 12)
	evxor		rW3,rW3,rW1
	EAD(rD0, 3)
	evxor		rD3,rD3,rW3
	LWH(rW6, 0)
	evxor		rD3,rD3,rW5
	EAD(rD0, 1)
	evmergehi	rD2,rD2,rD3
	LWH(rW3, 8)
	/* second half: same sequence with key at 32/40(rKP) */
	LAH(rW1, rD3, 0, 12)
	LAL(rW0, rD2, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAH(rW4, rD3, 2, 4)
	LAL(rW4, rD0, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAH(rW5, rD1, 2, 4)
	LAH(rW7, rD2, 3, 0)
	LAL(rW7, rD3, 3, 0)
	LAL(rW3, rD1, 1, 8)
	evldw		rD1,32(rKP)
	EAD(rD0, 0)
	evxor		rW4,rW4,rW6
	LWL(rW1, 12)
	evxor		rW0,rW0,rW4
	EAD(rD2, 2)
	evxor		rW0,rW0,rW2
	LWL(rW5, 4)
	evxor		rD1,rD1,rW0
	evldw		rD3,40(rKP)
	evmergehi	rD0,rD0,rD1
	EAD(rD1, 0)
	evxor		rW3,rW3,rW7
	LWH(rW0, 12)
	evxor		rW3,rW3,rW1
	EAD(rD0, 3)
	evxor		rD3,rD3,rW3
	LWH(rW6, 0)
	evxor		rD3,rD3,rW5
	EAD(rD0, 1)
	evmergehi	rD2,rD2,rD3
	LWH(rW3, 8)
	/* advance key pointer by the 32 bytes just used, loop while CTR != 0 */
	addi		rKP,rKP,32
	bdnz		ppc_decrypt_block_loop
	/* last table based round, key at 16/24(rKP) */
	LAH(rW1, rD3, 0, 12)
	LAL(rW0, rD2, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAH(rW4, rD3, 2, 4)
	LAL(rW4, rD0, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAH(rW5, rD1, 2, 4)
	LAH(rW7, rD2, 3, 0)
	LAL(rW7, rD3, 3, 0)
	LAL(rW3, rD1, 1, 8)
	evldw		rD1,16(rKP)
	EAD(rD0, 0)
	evxor		rW4,rW4,rW6
	LWL(rW1, 12)
	evxor		rW0,rW0,rW4
	EAD(rD2, 2)
	evxor		rW0,rW0,rW2
	LWL(rW5, 4)
	evxor		rD1,rD1,rW0
	evldw		rD3,24(rKP)
	evmergehi	rD0,rD0,rD1
	/* from here on the interleaved lookups are byte loads via rT1 */
	DAD(rD1, 0)
	evxor		rW3,rW3,rW7
	LBD(rW0)
	evxor		rW3,rW3,rW1
	DAD(rD0, 1)
	evxor		rD3,rD3,rW3
	LBD(rW6)
	evxor		rD3,rD3,rW5
	DAD(rD0, 0)
	evmergehi	rD2,rD2,rD3
	LBD(rW3)
	/*
	 * Final round: rW0-rW3 receive byte 0, the bytes for positions 1-3
	 * are loaded into rW4-rW7 and merged in with rlwimi (shifted left by
	 * 8, 16 and 24 bits).
	 */
	LAD(rW2, rD3, 0)
	LAD(rW1, rD2, 0)
	LAD(rW4, rD2, 1)
	LAD(rW5, rD3, 1)
	LAD(rW7, rD1, 1)
	rlwimi		rW0,rW4,8,16,23
	rlwimi		rW1,rW5,8,16,23
	LAD(rW4, rD3, 2)
	LAD(rW5, rD0, 2)
	rlwimi		rW2,rW6,8,16,23
	rlwimi		rW3,rW7,8,16,23
	LAD(rW6, rD1, 2)
	LAD(rW7, rD2, 2)
	rlwimi		rW0,rW4,16,8,15
	rlwimi		rW1,rW5,16,8,15
	LAD(rW4, rD0, 3)
	LAD(rW5, rD1, 3)
	rlwimi		rW2,rW6,16,8,15
	/* rD0-rD3 are free now: fetch key words for the caller's final xor */
	lwz		rD0,32(rKP)
	rlwimi		rW3,rW7,16,8,15
	lwz		rD1,36(rKP)
	LAD(rW6, rD2, 3)
	LAD(rW7, rD3, 3)
	rlwimi		rW0,rW4,24,0,7
	lwz		rD2,40(rKP)
	rlwimi		rW1,rW5,24,0,7
	lwz		rD3,44(rKP)
	rlwimi		rW2,rW6,24,0,7
	rlwimi		rW3,rW7,24,0,7
	blr