1 /* 2 * SME outer product, 1 x 1. 3 * SPDX-License-Identifier: GPL-2.0-or-later 4 */ 5 6 #include <stdio.h> 7 8 extern void foo(float *dst); 9 10 asm( 11 " .arch_extension sme\n" 12 " .type foo, @function\n" 13 "foo:\n" 14 " stp x29, x30, [sp, -80]!\n" 15 " mov x29, sp\n" 16 " stp d8, d9, [sp, 16]\n" 17 " stp d10, d11, [sp, 32]\n" 18 " stp d12, d13, [sp, 48]\n" 19 " stp d14, d15, [sp, 64]\n" 20 " smstart\n" 21 " ptrue p0.s, vl4\n" 22 " fmov z0.s, #1.0\n" 23 /* 24 * An outer product of a vector of 1.0 by itself should be a matrix of 1.0. 25 * Note that we are using tile 1 here (za1.s) rather than tile 0. 26 */ 27 " zero {za}\n" 28 " fmopa za1.s, p0/m, p0/m, z0.s, z0.s\n" 29 /* 30 * Read the first 4x4 sub-matrix of elements from tile 1: 31 * Note that za1h should be interchangeable here. 32 */ 33 " mov w12, #0\n" 34 " mova z0.s, p0/m, za1v.s[w12, #0]\n" 35 " mova z1.s, p0/m, za1v.s[w12, #1]\n" 36 " mova z2.s, p0/m, za1v.s[w12, #2]\n" 37 " mova z3.s, p0/m, za1v.s[w12, #3]\n" 38 /* 39 * And store them to the input pointer (dst in the C code): 40 */ 41 " st1w {z0.s}, p0, [x0]\n" 42 " add x0, x0, #16\n" 43 " st1w {z1.s}, p0, [x0]\n" 44 " add x0, x0, #16\n" 45 " st1w {z2.s}, p0, [x0]\n" 46 " add x0, x0, #16\n" 47 " st1w {z3.s}, p0, [x0]\n" 48 " smstop\n" 49 " ldp d8, d9, [sp, 16]\n" 50 " ldp d10, d11, [sp, 32]\n" 51 " ldp d12, d13, [sp, 48]\n" 52 " ldp d14, d15, [sp, 64]\n" 53 " ldp x29, x30, [sp], 80\n" 54 " ret\n" 55 " .size foo, . - foo" 56 ); 57 58 int main() 59 { 60 float dst[16]; 61 int i, j; 62 63 foo(dst); 64 65 for (i = 0; i < 16; i++) { 66 if (dst[i] != 1.0f) { 67 break; 68 } 69 } 70 71 if (i == 16) { 72 return 0; /* success */ 73 } 74 75 /* failure */ 76 for (i = 0; i < 4; ++i) { 77 for (j = 0; j < 4; ++j) { 78 printf("%f ", (double)dst[i * 4 + j]); 79 } 80 printf("\n"); 81 } 82 return 1; 83 } 84