1 #include <stdio.h> 2 #include <stdint.h> 3 #include <stdlib.h> 4 #include <string.h> 5 6 typedef void (*testfn)(void); 7 8 typedef struct { 9 uint64_t q0, q1, q2, q3; 10 } __attribute__((aligned(32))) v4di; 11 12 typedef struct { 13 uint64_t mm[8]; 14 v4di ymm[16]; 15 uint64_t r[16]; 16 uint64_t flags; 17 uint32_t ff; 18 uint64_t pad; 19 v4di mem[4]; 20 v4di mem0[4]; 21 } reg_state; 22 23 typedef struct { 24 int n; 25 testfn fn; 26 const char *s; 27 reg_state *init; 28 } TestDef; 29 30 reg_state initI; 31 reg_state initF32; 32 reg_state initF64; 33 34 static void dump_ymm(const char *name, int n, const v4di *r, int ff) 35 { 36 printf("%s%d = %016lx %016lx %016lx %016lx\n", 37 name, n, r->q3, r->q2, r->q1, r->q0); 38 if (ff == 64) { 39 double v[4]; 40 memcpy(v, r, sizeof(v)); 41 printf(" %16g %16g %16g %16g\n", 42 v[3], v[2], v[1], v[0]); 43 } else if (ff == 32) { 44 float v[8]; 45 memcpy(v, r, sizeof(v)); 46 printf(" %8g %8g %8g %8g %8g %8g %8g %8g\n", 47 v[7], v[6], v[5], v[4], v[3], v[2], v[1], v[0]); 48 } 49 } 50 51 static void dump_regs(reg_state *s) 52 { 53 int i; 54 55 for (i = 0; i < 16; i++) { 56 dump_ymm("ymm", i, &s->ymm[i], 0); 57 } 58 for (i = 0; i < 4; i++) { 59 dump_ymm("mem", i, &s->mem0[i], 0); 60 } 61 } 62 63 static void compare_state(const reg_state *a, const reg_state *b) 64 { 65 int i; 66 for (i = 0; i < 8; i++) { 67 if (a->mm[i] != b->mm[i]) { 68 printf("MM%d = %016lx\n", i, b->mm[i]); 69 } 70 } 71 for (i = 0; i < 16; i++) { 72 if (a->r[i] != b->r[i]) { 73 printf("r%d = %016lx\n", i, b->r[i]); 74 } 75 } 76 for (i = 0; i < 16; i++) { 77 if (memcmp(&a->ymm[i], &b->ymm[i], 32)) { 78 dump_ymm("ymm", i, &b->ymm[i], a->ff); 79 } 80 } 81 for (i = 0; i < 4; i++) { 82 if (memcmp(&a->mem0[i], &a->mem[i], 32)) { 83 dump_ymm("mem", i, &a->mem[i], a->ff); 84 } 85 } 86 if (a->flags != b->flags) { 87 printf("FLAGS = %016lx\n", b->flags); 88 } 89 } 90 91 #define LOADMM(r, o) "movq " #r ", " #o "[%0]\n\t" 92 #define LOADYMM(r, o) "vmovdqa " #r ", " #o "[%0]\n\t" 93 #define STOREMM(r, o) "movq " #o "[%1], " #r "\n\t" 94 #define STOREYMM(r, o) "vmovdqa " #o "[%1], " #r "\n\t" 95 #define MMREG(F) \ 96 F(mm0, 0x00) \ 97 F(mm1, 0x08) \ 98 F(mm2, 0x10) \ 99 F(mm3, 0x18) \ 100 F(mm4, 0x20) \ 101 F(mm5, 0x28) \ 102 F(mm6, 0x30) \ 103 F(mm7, 0x38) 104 #define YMMREG(F) \ 105 F(ymm0, 0x040) \ 106 F(ymm1, 0x060) \ 107 F(ymm2, 0x080) \ 108 F(ymm3, 0x0a0) \ 109 F(ymm4, 0x0c0) \ 110 F(ymm5, 0x0e0) \ 111 F(ymm6, 0x100) \ 112 F(ymm7, 0x120) \ 113 F(ymm8, 0x140) \ 114 F(ymm9, 0x160) \ 115 F(ymm10, 0x180) \ 116 F(ymm11, 0x1a0) \ 117 F(ymm12, 0x1c0) \ 118 F(ymm13, 0x1e0) \ 119 F(ymm14, 0x200) \ 120 F(ymm15, 0x220) 121 #define LOADREG(r, o) "mov " #r ", " #o "[rax]\n\t" 122 #define STOREREG(r, o) "mov " #o "[rax], " #r "\n\t" 123 #define REG(F) \ 124 F(rbx, 0x248) \ 125 F(rcx, 0x250) \ 126 F(rdx, 0x258) \ 127 F(rsi, 0x260) \ 128 F(rdi, 0x268) \ 129 F(r8, 0x280) \ 130 F(r9, 0x288) \ 131 F(r10, 0x290) \ 132 F(r11, 0x298) \ 133 F(r12, 0x2a0) \ 134 F(r13, 0x2a8) \ 135 F(r14, 0x2b0) \ 136 F(r15, 0x2b8) \ 137 138 static void run_test(const TestDef *t) 139 { 140 reg_state result; 141 reg_state *init = t->init; 142 memcpy(init->mem, init->mem0, sizeof(init->mem)); 143 printf("%5d %s\n", t->n, t->s); 144 asm volatile( 145 MMREG(LOADMM) 146 YMMREG(LOADYMM) 147 "sub rsp, 128\n\t" 148 "push rax\n\t" 149 "push rbx\n\t" 150 "push rcx\n\t" 151 "push rdx\n\t" 152 "push %1\n\t" 153 "push %2\n\t" 154 "mov rax, %0\n\t" 155 "pushf\n\t" 156 "pop rbx\n\t" 157 "shr rbx, 8\n\t" 158 "shl rbx, 8\n\t" 159 "mov rcx, 0x2c0[rax]\n\t" 160 "and rcx, 0xff\n\t" 161 "or rbx, rcx\n\t" 162 "push rbx\n\t" 163 "popf\n\t" 164 REG(LOADREG) 165 "mov rax, 0x240[rax]\n\t" 166 "call [rsp]\n\t" 167 "mov [rsp], rax\n\t" 168 "mov rax, 8[rsp]\n\t" 169 REG(STOREREG) 170 "mov rbx, [rsp]\n\t" 171 "mov 0x240[rax], rbx\n\t" 172 "mov rbx, 0\n\t" 173 "mov 0x270[rax], rbx\n\t" 174 "mov 0x278[rax], rbx\n\t" 175 "pushf\n\t" 176 "pop rbx\n\t" 177 "and rbx, 0xff\n\t" 178 "mov 0x2c0[rax], rbx\n\t" 179 "add rsp, 16\n\t" 180 "pop rdx\n\t" 181 "pop rcx\n\t" 182 "pop rbx\n\t" 183 "pop rax\n\t" 184 "add rsp, 128\n\t" 185 MMREG(STOREMM) 186 YMMREG(STOREYMM) 187 : : "r"(init), "r"(&result), "r"(t->fn) 188 : "memory", "cc", 189 "rsi", "rdi", 190 "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", 191 "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", 192 "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", 193 "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", 194 "ymm12", "ymm13", "ymm14", "ymm15" 195 ); 196 compare_state(init, &result); 197 } 198 199 #define TEST(n, cmd, type) \ 200 static void __attribute__((naked)) test_##n(void) \ 201 { \ 202 asm volatile(cmd); \ 203 asm volatile("ret"); \ 204 } 205 #include "test-avx.h" 206 207 208 static const TestDef test_table[] = { 209 #define TEST(n, cmd, type) {n, test_##n, cmd, &init##type}, 210 #include "test-avx.h" 211 {-1, NULL, "", NULL} 212 }; 213 214 static void run_all(void) 215 { 216 const TestDef *t; 217 for (t = test_table; t->fn; t++) { 218 run_test(t); 219 } 220 } 221 222 #define ARRAY_LEN(x) (sizeof(x) / sizeof(x[0])) 223 224 float val_f32[] = {2.0, -1.0, 4.8, 0.8, 3, -42.0, 5e6, 7.5, 8.3}; 225 double val_f64[] = {2.0, -1.0, 4.8, 0.8, 3, -42.0, 5e6, 7.5}; 226 v4di val_i64[] = { 227 {0x3d6b3b6a9e4118f2lu, 0x355ae76d2774d78clu, 228 0xac3ff76c4daa4b28lu, 0xe7fabd204cb54083lu}, 229 {0xd851c54a56bf1f29lu, 0x4a84d1d50bf4c4fflu, 230 0x56621e553d52b56clu, 0xd0069553da8f584alu}, 231 {0x5826475e2c5fd799lu, 0xfd32edc01243f5e9lu, 232 0x738ba2c66d3fe126lu, 0x5707219c6e6c26b4lu}, 233 }; 234 235 v4di deadbeef = {0xa5a5a5a5deadbeefull, 0xa5a5a5a5deadbeefull, 236 0xa5a5a5a5deadbeefull, 0xa5a5a5a5deadbeefull}; 237 v4di indexq = {0x000000000000001full, 0x000000000000008full, 238 0xffffffffffffffffull, 0xffffffffffffff5full}; 239 v4di indexd = {0x00000002000000efull, 0xfffffff500000010ull, 240 0x0000000afffffff0ull, 0x000000000000000eull}; 241 242 v4di gather_mem[0x20]; 243 244 void init_f32reg(v4di *r) 245 { 246 static int n; 247 float v[8]; 248 int i; 249 for (i = 0; i < 8; i++) { 250 v[i] = val_f32[n++]; 251 if (n == ARRAY_LEN(val_f32)) { 252 n = 0; 253 } 254 } 255 memcpy(r, v, sizeof(*r)); 256 } 257 258 void init_f64reg(v4di *r) 259 { 260 static int n; 261 double v[4]; 262 int i; 263 for (i = 0; i < 4; i++) { 264 v[i] = val_f64[n++]; 265 if (n == ARRAY_LEN(val_f64)) { 266 n = 0; 267 } 268 } 269 memcpy(r, v, sizeof(*r)); 270 } 271 272 void init_intreg(v4di *r) 273 { 274 static uint64_t mask; 275 static int n; 276 277 r->q0 = val_i64[n].q0 ^ mask; 278 r->q1 = val_i64[n].q1 ^ mask; 279 r->q2 = val_i64[n].q2 ^ mask; 280 r->q3 = val_i64[n].q3 ^ mask; 281 n++; 282 if (n == ARRAY_LEN(val_i64)) { 283 n = 0; 284 mask *= 0x104C11DB7; 285 } 286 } 287 288 static void init_all(reg_state *s) 289 { 290 int i; 291 292 s->r[3] = (uint64_t)&s->mem[0]; /* rdx */ 293 s->r[4] = (uint64_t)&gather_mem[ARRAY_LEN(gather_mem) / 2]; /* rsi */ 294 s->r[5] = (uint64_t)&s->mem[2]; /* rdi */ 295 s->flags = 2; 296 for (i = 0; i < 16; i++) { 297 s->ymm[i] = deadbeef; 298 } 299 s->ymm[13] = indexd; 300 s->ymm[14] = indexq; 301 for (i = 0; i < 4; i++) { 302 s->mem0[i] = deadbeef; 303 } 304 } 305 306 int main(int argc, char *argv[]) 307 { 308 int i; 309 310 init_all(&initI); 311 init_intreg(&initI.ymm[10]); 312 init_intreg(&initI.ymm[11]); 313 init_intreg(&initI.ymm[12]); 314 init_intreg(&initI.mem0[1]); 315 printf("Int:\n"); 316 dump_regs(&initI); 317 318 init_all(&initF32); 319 init_f32reg(&initF32.ymm[10]); 320 init_f32reg(&initF32.ymm[11]); 321 init_f32reg(&initF32.ymm[12]); 322 init_f32reg(&initF32.mem0[1]); 323 initF32.ff = 32; 324 printf("F32:\n"); 325 dump_regs(&initF32); 326 327 init_all(&initF64); 328 init_f64reg(&initF64.ymm[10]); 329 init_f64reg(&initF64.ymm[11]); 330 init_f64reg(&initF64.ymm[12]); 331 init_f64reg(&initF64.mem0[1]); 332 initF64.ff = 64; 333 printf("F64:\n"); 334 dump_regs(&initF64); 335 336 for (i = 0; i < ARRAY_LEN(gather_mem); i++) { 337 init_intreg(&gather_mem[i]); 338 } 339 340 if (argc > 1) { 341 int n = atoi(argv[1]); 342 run_test(&test_table[n]); 343 } else { 344 run_all(); 345 } 346 return 0; 347 } 348