xref: /openbmc/linux/arch/sparc/kernel/visemul.c (revision f43dc23d5ea91fca257be02138a255f02d98e806)
1 /* visemul.c: Emulation of VIS instructions.
2  *
3  * Copyright (C) 2006 David S. Miller (davem@davemloft.net)
4  */
5 #include <linux/kernel.h>
6 #include <linux/errno.h>
7 #include <linux/thread_info.h>
8 #include <linux/perf_event.h>
9 
10 #include <asm/ptrace.h>
11 #include <asm/pstate.h>
12 #include <asm/system.h>
13 #include <asm/fpumacro.h>
14 #include <asm/uaccess.h>
15 
/* OPF field encodings of the VIS instructions emulated in this file.
 * Each comment gives the 9-bit binary encoding followed by a short
 * description of the operation.
 */

/* 000111011 - pack four 16-bit values  */
#define FPACK16_OPF	0x03b

/* 000111010 - pack two 32-bit values  */
#define FPACK32_OPF	0x03a

/* 000111101 - pack two 32-bit values into two 16-bit values  */
#define FPACKFIX_OPF	0x03d

/* 001001101 - expand four 8-bit values to 16 bits each  */
#define FEXPAND_OPF	0x04d

/* 001001011 - merge (byte-interleave) two 32-bit values  */
#define FPMERGE_OPF	0x04b

/* 000110001 - 8-by-16-bit partitioned product  */
#define FMUL8x16_OPF	0x031

/* 000110011 - 8-by-16-bit partitioned product, upper alpha  */
#define FMUL8x16AU_OPF	0x033

/* 000110101 - 8-by-16-bit partitioned product, lower alpha  */
#define FMUL8x16AL_OPF	0x035

/* 000110110 - partitioned product using the upper 8 bits of rs1  */
#define FMUL8SUx16_OPF	0x036

/* 000110111 - partitioned product using the lower 8 bits of rs1  */
#define FMUL8ULx16_OPF	0x037

/* 000111000 - double-width partitioned product, upper 8 bits of rs1  */
#define FMULD8SUx16_OPF	0x038

/* 000111001 - double-width partitioned product, lower 8 bits of rs1  */
#define FMULD8ULx16_OPF	0x039

/* 000101000 - compare four 16-bit values; set rd bit if src1 > src2  */
#define FCMPGT16_OPF	0x028

/* 000101100 - compare two 32-bit values; set rd bit if src1 > src2  */
#define FCMPGT32_OPF	0x02c

/* 000100000 - compare four 16-bit values; set rd bit if src1 <= src2  */
#define FCMPLE16_OPF	0x020

/* 000100100 - compare two 32-bit values; set rd bit if src1 <= src2  */
#define FCMPLE32_OPF	0x024

/* 000100010 - compare four 16-bit values; set rd bit if src1 != src2  */
#define FCMPNE16_OPF	0x022

/* 000100110 - compare two 32-bit values; set rd bit if src1 != src2  */
#define FCMPNE32_OPF	0x026

/* 000101010 - compare four 16-bit values; set rd bit if src1 == src2  */
#define FCMPEQ16_OPF	0x02a

/* 000101110 - compare two 32-bit values; set rd bit if src1 == src2  */
#define FCMPEQ32_OPF	0x02e

/* 000000000 - edge mask for eight 8-bit elements  */
#define EDGE8_OPF	0x000

/* 000000001 - edge mask for eight 8-bit elements, no CC  */
#define EDGE8N_OPF	0x001

/* 000000010 - edge mask for eight 8-bit elements, little-endian  */
#define EDGE8L_OPF	0x002

/* 000000011 - edge mask for eight 8-bit elements, little-endian, no CC  */
#define EDGE8LN_OPF	0x003

/* 000000100 - edge mask for four 16-bit elements  */
#define EDGE16_OPF	0x004

/* 000000101 - edge mask for four 16-bit elements, no CC  */
#define EDGE16N_OPF	0x005

/* 000000110 - edge mask for four 16-bit elements, little-endian  */
#define EDGE16L_OPF	0x006

/* 000000111 - edge mask for four 16-bit elements, little-endian, no CC  */
#define EDGE16LN_OPF	0x007

/* 000001000 - edge mask for two 32-bit elements  */
#define EDGE32_OPF	0x008

/* 000001001 - edge mask for two 32-bit elements, no CC  */
#define EDGE32N_OPF	0x009

/* 000001010 - edge mask for two 32-bit elements, little-endian  */
#define EDGE32L_OPF	0x00a

/* 000001011 - edge mask for two 32-bit elements, little-endian, no CC  */
#define EDGE32LN_OPF	0x00b

/* 000111110 - accumulated distance between eight 8-bit components  */
#define PDIST_OPF	0x03e

/* 000010000 - convert 8-bit 3-D address to blocked byte address  */
#define ARRAY8_OPF	0x010

/* 000010010 - convert 16-bit 3-D address to blocked byte address  */
#define ARRAY16_OPF	0x012

/* 000010100 - convert 32-bit 3-D address to blocked byte address  */
#define ARRAY32_OPF	0x014

/* 000011001 - set the GSR.MASK field in preparation for a BSHUFFLE  */
#define BMASK_OPF	0x019

/* 001001100 - permute bytes as specified by GSR.MASK  */
#define BSHUFFLE_OPF	0x04c
131 
/* Position and in-place mask of the 9-bit OPF field in an instruction.  */
#define VIS_OPF_SHIFT	5
#define VIS_OPF_MASK	(0x1ff << VIS_OPF_SHIFT)

/* Extract the 5-bit rs1, rs2 and rd register numbers of an instruction.  */
#define RS1(insn)	(((insn) >> 14) & 0x1f)
#define RS2(insn)	((insn) & 0x1f)
#define RD(insn)	(((insn) >> 25) & 0x1f)
138 
/* Flush the register windows when at least one of the three register
 * numbers refers to a register numbered 16 or above.
 *
 * @rs1, @rs2, @rd: register numbers taken from the instruction.
 * @from_kernel: non-zero selects the plain "flushw" instruction,
 *               zero selects flushw_user().
 */
static inline void maybe_flush_windows(unsigned int rs1, unsigned int rs2,
				       unsigned int rd, int from_kernel)
{
	/* All operands below 16: nothing lives in a register window.  */
	if (rs1 < 16 && rs2 < 16 && rd < 16)
		return;

	if (from_kernel)
		__asm__ __volatile__("flushw");
	else
		flushw_user();
}
149 
150 static unsigned long fetch_reg(unsigned int reg, struct pt_regs *regs)
151 {
152 	unsigned long value;
153 
154 	if (reg < 16)
155 		return (!reg ? 0 : regs->u_regs[reg]);
156 	if (regs->tstate & TSTATE_PRIV) {
157 		struct reg_window *win;
158 		win = (struct reg_window *)(regs->u_regs[UREG_FP] + STACK_BIAS);
159 		value = win->locals[reg - 16];
160 	} else if (test_thread_flag(TIF_32BIT)) {
161 		struct reg_window32 __user *win32;
162 		win32 = (struct reg_window32 __user *)((unsigned long)((u32)regs->u_regs[UREG_FP]));
163 		get_user(value, &win32->locals[reg - 16]);
164 	} else {
165 		struct reg_window __user *win;
166 		win = (struct reg_window __user *)(regs->u_regs[UREG_FP] + STACK_BIAS);
167 		get_user(value, &win->locals[reg - 16]);
168 	}
169 	return value;
170 }
171 
172 static inline unsigned long __user *__fetch_reg_addr_user(unsigned int reg,
173 							  struct pt_regs *regs)
174 {
175 	BUG_ON(reg < 16);
176 	BUG_ON(regs->tstate & TSTATE_PRIV);
177 
178 	if (test_thread_flag(TIF_32BIT)) {
179 		struct reg_window32 __user *win32;
180 		win32 = (struct reg_window32 __user *)((unsigned long)((u32)regs->u_regs[UREG_FP]));
181 		return (unsigned long __user *)&win32->locals[reg - 16];
182 	} else {
183 		struct reg_window __user *win;
184 		win = (struct reg_window __user *)(regs->u_regs[UREG_FP] + STACK_BIAS);
185 		return &win->locals[reg - 16];
186 	}
187 }
188 
189 static inline unsigned long *__fetch_reg_addr_kern(unsigned int reg,
190 						   struct pt_regs *regs)
191 {
192 	BUG_ON(reg >= 16);
193 	BUG_ON(regs->tstate & TSTATE_PRIV);
194 
195 	return &regs->u_regs[reg];
196 }
197 
198 static void store_reg(struct pt_regs *regs, unsigned long val, unsigned long rd)
199 {
200 	if (rd < 16) {
201 		unsigned long *rd_kern = __fetch_reg_addr_kern(rd, regs);
202 
203 		*rd_kern = val;
204 	} else {
205 		unsigned long __user *rd_user = __fetch_reg_addr_user(rd, regs);
206 
207 		if (test_thread_flag(TIF_32BIT))
208 			__put_user((u32)val, (u32 __user *)rd_user);
209 		else
210 			__put_user(val, rd_user);
211 	}
212 }
213 
214 static inline unsigned long fpd_regval(struct fpustate *f,
215 				       unsigned int insn_regnum)
216 {
217 	insn_regnum = (((insn_regnum & 1) << 5) |
218 		       (insn_regnum & 0x1e));
219 
220 	return *(unsigned long *) &f->regs[insn_regnum];
221 }
222 
223 static inline unsigned long *fpd_regaddr(struct fpustate *f,
224 					 unsigned int insn_regnum)
225 {
226 	insn_regnum = (((insn_regnum & 1) << 5) |
227 		       (insn_regnum & 0x1e));
228 
229 	return (unsigned long *) &f->regs[insn_regnum];
230 }
231 
232 static inline unsigned int fps_regval(struct fpustate *f,
233 				      unsigned int insn_regnum)
234 {
235 	return f->regs[insn_regnum];
236 }
237 
238 static inline unsigned int *fps_regaddr(struct fpustate *f,
239 					unsigned int insn_regnum)
240 {
241 	return &f->regs[insn_regnum];
242 }
243 
244 struct edge_tab {
245 	u16 left, right;
246 };
247 static struct edge_tab edge8_tab[8] = {
248 	{ 0xff, 0x80 },
249 	{ 0x7f, 0xc0 },
250 	{ 0x3f, 0xe0 },
251 	{ 0x1f, 0xf0 },
252 	{ 0x0f, 0xf8 },
253 	{ 0x07, 0xfc },
254 	{ 0x03, 0xfe },
255 	{ 0x01, 0xff },
256 };
257 static struct edge_tab edge8_tab_l[8] = {
258 	{ 0xff, 0x01 },
259 	{ 0xfe, 0x03 },
260 	{ 0xfc, 0x07 },
261 	{ 0xf8, 0x0f },
262 	{ 0xf0, 0x1f },
263 	{ 0xe0, 0x3f },
264 	{ 0xc0, 0x7f },
265 	{ 0x80, 0xff },
266 };
267 static struct edge_tab edge16_tab[4] = {
268 	{ 0xf, 0x8 },
269 	{ 0x7, 0xc },
270 	{ 0x3, 0xe },
271 	{ 0x1, 0xf },
272 };
273 static struct edge_tab edge16_tab_l[4] = {
274 	{ 0xf, 0x1 },
275 	{ 0xe, 0x3 },
276 	{ 0xc, 0x7 },
277 	{ 0x8, 0xf },
278 };
279 static struct edge_tab edge32_tab[2] = {
280 	{ 0x3, 0x2 },
281 	{ 0x1, 0x3 },
282 };
283 static struct edge_tab edge32_tab_l[2] = {
284 	{ 0x3, 0x1 },
285 	{ 0x2, 0x3 },
286 };
287 
288 static void edge(struct pt_regs *regs, unsigned int insn, unsigned int opf)
289 {
290 	unsigned long orig_rs1, rs1, orig_rs2, rs2, rd_val;
291 	u16 left, right;
292 
293 	maybe_flush_windows(RS1(insn), RS2(insn), RD(insn), 0);
294 	orig_rs1 = rs1 = fetch_reg(RS1(insn), regs);
295 	orig_rs2 = rs2 = fetch_reg(RS2(insn), regs);
296 
297 	if (test_thread_flag(TIF_32BIT)) {
298 		rs1 = rs1 & 0xffffffff;
299 		rs2 = rs2 & 0xffffffff;
300 	}
301 	switch (opf) {
302 	default:
303 	case EDGE8_OPF:
304 	case EDGE8N_OPF:
305 		left = edge8_tab[rs1 & 0x7].left;
306 		right = edge8_tab[rs2 & 0x7].right;
307 		break;
308 	case EDGE8L_OPF:
309 	case EDGE8LN_OPF:
310 		left = edge8_tab_l[rs1 & 0x7].left;
311 		right = edge8_tab_l[rs2 & 0x7].right;
312 		break;
313 
314 	case EDGE16_OPF:
315 	case EDGE16N_OPF:
316 		left = edge16_tab[(rs1 >> 1) & 0x3].left;
317 		right = edge16_tab[(rs2 >> 1) & 0x3].right;
318 		break;
319 
320 	case EDGE16L_OPF:
321 	case EDGE16LN_OPF:
322 		left = edge16_tab_l[(rs1 >> 1) & 0x3].left;
323 		right = edge16_tab_l[(rs2 >> 1) & 0x3].right;
324 		break;
325 
326 	case EDGE32_OPF:
327 	case EDGE32N_OPF:
328 		left = edge32_tab[(rs1 >> 2) & 0x1].left;
329 		right = edge32_tab[(rs2 >> 2) & 0x1].right;
330 		break;
331 
332 	case EDGE32L_OPF:
333 	case EDGE32LN_OPF:
334 		left = edge32_tab_l[(rs1 >> 2) & 0x1].left;
335 		right = edge32_tab_l[(rs2 >> 2) & 0x1].right;
336 		break;
337 	};
338 
339 	if ((rs1 & ~0x7UL) == (rs2 & ~0x7UL))
340 		rd_val = right & left;
341 	else
342 		rd_val = left;
343 
344 	store_reg(regs, rd_val, RD(insn));
345 
346 	switch (opf) {
347 	case EDGE8_OPF:
348 	case EDGE8L_OPF:
349 	case EDGE16_OPF:
350 	case EDGE16L_OPF:
351 	case EDGE32_OPF:
352 	case EDGE32L_OPF: {
353 		unsigned long ccr, tstate;
354 
355 		__asm__ __volatile__("subcc	%1, %2, %%g0\n\t"
356 				     "rd	%%ccr, %0"
357 				     : "=r" (ccr)
358 				     : "r" (orig_rs1), "r" (orig_rs2)
359 				     : "cc");
360 		tstate = regs->tstate & ~(TSTATE_XCC | TSTATE_ICC);
361 		regs->tstate = tstate | (ccr << 32UL);
362 	}
363 	};
364 }
365 
366 static void array(struct pt_regs *regs, unsigned int insn, unsigned int opf)
367 {
368 	unsigned long rs1, rs2, rd_val;
369 	unsigned int bits, bits_mask;
370 
371 	maybe_flush_windows(RS1(insn), RS2(insn), RD(insn), 0);
372 	rs1 = fetch_reg(RS1(insn), regs);
373 	rs2 = fetch_reg(RS2(insn), regs);
374 
375 	bits = (rs2 > 5 ? 5 : rs2);
376 	bits_mask = (1UL << bits) - 1UL;
377 
378 	rd_val = ((((rs1 >> 11) & 0x3) <<  0) |
379 		  (((rs1 >> 33) & 0x3) <<  2) |
380 		  (((rs1 >> 55) & 0x1) <<  4) |
381 		  (((rs1 >> 13) & 0xf) <<  5) |
382 		  (((rs1 >> 35) & 0xf) <<  9) |
383 		  (((rs1 >> 56) & 0xf) << 13) |
384 		  (((rs1 >> 17) & bits_mask) << 17) |
385 		  (((rs1 >> 39) & bits_mask) << (17 + bits)) |
386 		  (((rs1 >> 60) & 0xf)       << (17 + (2*bits))));
387 
388 	switch (opf) {
389 	case ARRAY16_OPF:
390 		rd_val <<= 1;
391 		break;
392 
393 	case ARRAY32_OPF:
394 		rd_val <<= 2;
395 	};
396 
397 	store_reg(regs, rd_val, RD(insn));
398 }
399 
400 static void bmask(struct pt_regs *regs, unsigned int insn)
401 {
402 	unsigned long rs1, rs2, rd_val, gsr;
403 
404 	maybe_flush_windows(RS1(insn), RS2(insn), RD(insn), 0);
405 	rs1 = fetch_reg(RS1(insn), regs);
406 	rs2 = fetch_reg(RS2(insn), regs);
407 	rd_val = rs1 + rs2;
408 
409 	store_reg(regs, rd_val, RD(insn));
410 
411 	gsr = current_thread_info()->gsr[0] & 0xffffffff;
412 	gsr |= rd_val << 32UL;
413 	current_thread_info()->gsr[0] = gsr;
414 }
415 
416 static void bshuffle(struct pt_regs *regs, unsigned int insn)
417 {
418 	struct fpustate *f = FPUSTATE;
419 	unsigned long rs1, rs2, rd_val;
420 	unsigned long bmask, i;
421 
422 	bmask = current_thread_info()->gsr[0] >> 32UL;
423 
424 	rs1 = fpd_regval(f, RS1(insn));
425 	rs2 = fpd_regval(f, RS2(insn));
426 
427 	rd_val = 0UL;
428 	for (i = 0; i < 8; i++) {
429 		unsigned long which = (bmask >> (i * 4)) & 0xf;
430 		unsigned long byte;
431 
432 		if (which < 8)
433 			byte = (rs1 >> (which * 8)) & 0xff;
434 		else
435 			byte = (rs2 >> ((which-8)*8)) & 0xff;
436 		rd_val |= (byte << (i * 8));
437 	}
438 
439 	*fpd_regaddr(f, RD(insn)) = rd_val;
440 }
441 
442 static void pdist(struct pt_regs *regs, unsigned int insn)
443 {
444 	struct fpustate *f = FPUSTATE;
445 	unsigned long rs1, rs2, *rd, rd_val;
446 	unsigned long i;
447 
448 	rs1 = fpd_regval(f, RS1(insn));
449 	rs2 = fpd_regval(f, RS2(insn));
450 	rd = fpd_regaddr(f, RD(insn));
451 
452 	rd_val = *rd;
453 
454 	for (i = 0; i < 8; i++) {
455 		s16 s1, s2;
456 
457 		s1 = (rs1 >> (56 - (i * 8))) & 0xff;
458 		s2 = (rs2 >> (56 - (i * 8))) & 0xff;
459 
460 		/* Absolute value of difference. */
461 		s1 -= s2;
462 		if (s1 < 0)
463 			s1 = ~s1 + 1;
464 
465 		rd_val += s1;
466 	}
467 
468 	*rd = rd_val;
469 }
470 
471 static void pformat(struct pt_regs *regs, unsigned int insn, unsigned int opf)
472 {
473 	struct fpustate *f = FPUSTATE;
474 	unsigned long rs1, rs2, gsr, scale, rd_val;
475 
476 	gsr = current_thread_info()->gsr[0];
477 	scale = (gsr >> 3) & (opf == FPACK16_OPF ? 0xf : 0x1f);
478 	switch (opf) {
479 	case FPACK16_OPF: {
480 		unsigned long byte;
481 
482 		rs2 = fpd_regval(f, RS2(insn));
483 		rd_val = 0;
484 		for (byte = 0; byte < 4; byte++) {
485 			unsigned int val;
486 			s16 src = (rs2 >> (byte * 16UL)) & 0xffffUL;
487 			int scaled = src << scale;
488 			int from_fixed = scaled >> 7;
489 
490 			val = ((from_fixed < 0) ?
491 			       0 :
492 			       (from_fixed > 255) ?
493 			       255 : from_fixed);
494 
495 			rd_val |= (val << (8 * byte));
496 		}
497 		*fps_regaddr(f, RD(insn)) = rd_val;
498 		break;
499 	}
500 
501 	case FPACK32_OPF: {
502 		unsigned long word;
503 
504 		rs1 = fpd_regval(f, RS1(insn));
505 		rs2 = fpd_regval(f, RS2(insn));
506 		rd_val = (rs1 << 8) & ~(0x000000ff000000ffUL);
507 		for (word = 0; word < 2; word++) {
508 			unsigned long val;
509 			s32 src = (rs2 >> (word * 32UL));
510 			s64 scaled = src << scale;
511 			s64 from_fixed = scaled >> 23;
512 
513 			val = ((from_fixed < 0) ?
514 			       0 :
515 			       (from_fixed > 255) ?
516 			       255 : from_fixed);
517 
518 			rd_val |= (val << (32 * word));
519 		}
520 		*fpd_regaddr(f, RD(insn)) = rd_val;
521 		break;
522 	}
523 
524 	case FPACKFIX_OPF: {
525 		unsigned long word;
526 
527 		rs2 = fpd_regval(f, RS2(insn));
528 
529 		rd_val = 0;
530 		for (word = 0; word < 2; word++) {
531 			long val;
532 			s32 src = (rs2 >> (word * 32UL));
533 			s64 scaled = src << scale;
534 			s64 from_fixed = scaled >> 16;
535 
536 			val = ((from_fixed < -32768) ?
537 			       -32768 :
538 			       (from_fixed > 32767) ?
539 			       32767 : from_fixed);
540 
541 			rd_val |= ((val & 0xffff) << (word * 16));
542 		}
543 		*fps_regaddr(f, RD(insn)) = rd_val;
544 		break;
545 	}
546 
547 	case FEXPAND_OPF: {
548 		unsigned long byte;
549 
550 		rs2 = fps_regval(f, RS2(insn));
551 
552 		rd_val = 0;
553 		for (byte = 0; byte < 4; byte++) {
554 			unsigned long val;
555 			u8 src = (rs2 >> (byte * 8)) & 0xff;
556 
557 			val = src << 4;
558 
559 			rd_val |= (val << (byte * 16));
560 		}
561 		*fpd_regaddr(f, RD(insn)) = rd_val;
562 		break;
563 	}
564 
565 	case FPMERGE_OPF: {
566 		rs1 = fps_regval(f, RS1(insn));
567 		rs2 = fps_regval(f, RS2(insn));
568 
569 		rd_val = (((rs2 & 0x000000ff) <<  0) |
570 			  ((rs1 & 0x000000ff) <<  8) |
571 			  ((rs2 & 0x0000ff00) <<  8) |
572 			  ((rs1 & 0x0000ff00) << 16) |
573 			  ((rs2 & 0x00ff0000) << 16) |
574 			  ((rs1 & 0x00ff0000) << 24) |
575 			  ((rs2 & 0xff000000) << 24) |
576 			  ((rs1 & 0xff000000) << 32));
577 		*fpd_regaddr(f, RD(insn)) = rd_val;
578 		break;
579 	}
580 	};
581 }
582 
583 static void pmul(struct pt_regs *regs, unsigned int insn, unsigned int opf)
584 {
585 	struct fpustate *f = FPUSTATE;
586 	unsigned long rs1, rs2, rd_val;
587 
588 	switch (opf) {
589 	case FMUL8x16_OPF: {
590 		unsigned long byte;
591 
592 		rs1 = fps_regval(f, RS1(insn));
593 		rs2 = fpd_regval(f, RS2(insn));
594 
595 		rd_val = 0;
596 		for (byte = 0; byte < 4; byte++) {
597 			u16 src1 = (rs1 >> (byte *  8)) & 0x00ff;
598 			s16 src2 = (rs2 >> (byte * 16)) & 0xffff;
599 			u32 prod = src1 * src2;
600 			u16 scaled = ((prod & 0x00ffff00) >> 8);
601 
602 			/* Round up.  */
603 			if (prod & 0x80)
604 				scaled++;
605 			rd_val |= ((scaled & 0xffffUL) << (byte * 16UL));
606 		}
607 
608 		*fpd_regaddr(f, RD(insn)) = rd_val;
609 		break;
610 	}
611 
612 	case FMUL8x16AU_OPF:
613 	case FMUL8x16AL_OPF: {
614 		unsigned long byte;
615 		s16 src2;
616 
617 		rs1 = fps_regval(f, RS1(insn));
618 		rs2 = fps_regval(f, RS2(insn));
619 
620 		rd_val = 0;
621 		src2 = rs2 >> (opf == FMUL8x16AU_OPF ? 16 : 0);
622 		for (byte = 0; byte < 4; byte++) {
623 			u16 src1 = (rs1 >> (byte * 8)) & 0x00ff;
624 			u32 prod = src1 * src2;
625 			u16 scaled = ((prod & 0x00ffff00) >> 8);
626 
627 			/* Round up.  */
628 			if (prod & 0x80)
629 				scaled++;
630 			rd_val |= ((scaled & 0xffffUL) << (byte * 16UL));
631 		}
632 
633 		*fpd_regaddr(f, RD(insn)) = rd_val;
634 		break;
635 	}
636 
637 	case FMUL8SUx16_OPF:
638 	case FMUL8ULx16_OPF: {
639 		unsigned long byte, ushift;
640 
641 		rs1 = fpd_regval(f, RS1(insn));
642 		rs2 = fpd_regval(f, RS2(insn));
643 
644 		rd_val = 0;
645 		ushift = (opf == FMUL8SUx16_OPF) ? 8 : 0;
646 		for (byte = 0; byte < 4; byte++) {
647 			u16 src1;
648 			s16 src2;
649 			u32 prod;
650 			u16 scaled;
651 
652 			src1 = ((rs1 >> ((16 * byte) + ushift)) & 0x00ff);
653 			src2 = ((rs2 >> (16 * byte)) & 0xffff);
654 			prod = src1 * src2;
655 			scaled = ((prod & 0x00ffff00) >> 8);
656 
657 			/* Round up.  */
658 			if (prod & 0x80)
659 				scaled++;
660 			rd_val |= ((scaled & 0xffffUL) << (byte * 16UL));
661 		}
662 
663 		*fpd_regaddr(f, RD(insn)) = rd_val;
664 		break;
665 	}
666 
667 	case FMULD8SUx16_OPF:
668 	case FMULD8ULx16_OPF: {
669 		unsigned long byte, ushift;
670 
671 		rs1 = fps_regval(f, RS1(insn));
672 		rs2 = fps_regval(f, RS2(insn));
673 
674 		rd_val = 0;
675 		ushift = (opf == FMULD8SUx16_OPF) ? 8 : 0;
676 		for (byte = 0; byte < 2; byte++) {
677 			u16 src1;
678 			s16 src2;
679 			u32 prod;
680 			u16 scaled;
681 
682 			src1 = ((rs1 >> ((16 * byte) + ushift)) & 0x00ff);
683 			src2 = ((rs2 >> (16 * byte)) & 0xffff);
684 			prod = src1 * src2;
685 			scaled = ((prod & 0x00ffff00) >> 8);
686 
687 			/* Round up.  */
688 			if (prod & 0x80)
689 				scaled++;
690 			rd_val |= ((scaled & 0xffffUL) <<
691 				   ((byte * 32UL) + 7UL));
692 		}
693 		*fpd_regaddr(f, RD(insn)) = rd_val;
694 		break;
695 	}
696 	};
697 }
698 
699 static void pcmp(struct pt_regs *regs, unsigned int insn, unsigned int opf)
700 {
701 	struct fpustate *f = FPUSTATE;
702 	unsigned long rs1, rs2, rd_val, i;
703 
704 	rs1 = fpd_regval(f, RS1(insn));
705 	rs2 = fpd_regval(f, RS2(insn));
706 
707 	rd_val = 0;
708 
709 	switch (opf) {
710 	case FCMPGT16_OPF:
711 		for (i = 0; i < 4; i++) {
712 			s16 a = (rs1 >> (i * 16)) & 0xffff;
713 			s16 b = (rs2 >> (i * 16)) & 0xffff;
714 
715 			if (a > b)
716 				rd_val |= 1 << i;
717 		}
718 		break;
719 
720 	case FCMPGT32_OPF:
721 		for (i = 0; i < 2; i++) {
722 			s32 a = (rs1 >> (i * 32)) & 0xffff;
723 			s32 b = (rs2 >> (i * 32)) & 0xffff;
724 
725 			if (a > b)
726 				rd_val |= 1 << i;
727 		}
728 		break;
729 
730 	case FCMPLE16_OPF:
731 		for (i = 0; i < 4; i++) {
732 			s16 a = (rs1 >> (i * 16)) & 0xffff;
733 			s16 b = (rs2 >> (i * 16)) & 0xffff;
734 
735 			if (a <= b)
736 				rd_val |= 1 << i;
737 		}
738 		break;
739 
740 	case FCMPLE32_OPF:
741 		for (i = 0; i < 2; i++) {
742 			s32 a = (rs1 >> (i * 32)) & 0xffff;
743 			s32 b = (rs2 >> (i * 32)) & 0xffff;
744 
745 			if (a <= b)
746 				rd_val |= 1 << i;
747 		}
748 		break;
749 
750 	case FCMPNE16_OPF:
751 		for (i = 0; i < 4; i++) {
752 			s16 a = (rs1 >> (i * 16)) & 0xffff;
753 			s16 b = (rs2 >> (i * 16)) & 0xffff;
754 
755 			if (a != b)
756 				rd_val |= 1 << i;
757 		}
758 		break;
759 
760 	case FCMPNE32_OPF:
761 		for (i = 0; i < 2; i++) {
762 			s32 a = (rs1 >> (i * 32)) & 0xffff;
763 			s32 b = (rs2 >> (i * 32)) & 0xffff;
764 
765 			if (a != b)
766 				rd_val |= 1 << i;
767 		}
768 		break;
769 
770 	case FCMPEQ16_OPF:
771 		for (i = 0; i < 4; i++) {
772 			s16 a = (rs1 >> (i * 16)) & 0xffff;
773 			s16 b = (rs2 >> (i * 16)) & 0xffff;
774 
775 			if (a == b)
776 				rd_val |= 1 << i;
777 		}
778 		break;
779 
780 	case FCMPEQ32_OPF:
781 		for (i = 0; i < 2; i++) {
782 			s32 a = (rs1 >> (i * 32)) & 0xffff;
783 			s32 b = (rs2 >> (i * 32)) & 0xffff;
784 
785 			if (a == b)
786 				rd_val |= 1 << i;
787 		}
788 		break;
789 	};
790 
791 	maybe_flush_windows(0, 0, RD(insn), 0);
792 	store_reg(regs, rd_val, RD(insn));
793 }
794 
795 /* Emulate the VIS instructions which are not implemented in
796  * hardware on Niagara.
797  */
798 int vis_emul(struct pt_regs *regs, unsigned int insn)
799 {
800 	unsigned long pc = regs->tpc;
801 	unsigned int opf;
802 
803 	BUG_ON(regs->tstate & TSTATE_PRIV);
804 
805 	perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, 0, regs, 0);
806 
807 	if (test_thread_flag(TIF_32BIT))
808 		pc = (u32)pc;
809 
810 	if (get_user(insn, (u32 __user *) pc))
811 		return -EFAULT;
812 
813 	save_and_clear_fpu();
814 
815 	opf = (insn & VIS_OPF_MASK) >> VIS_OPF_SHIFT;
816 	switch (opf) {
817 	default:
818 		return -EINVAL;
819 
820 	/* Pixel Formatting Instructions.  */
821 	case FPACK16_OPF:
822 	case FPACK32_OPF:
823 	case FPACKFIX_OPF:
824 	case FEXPAND_OPF:
825 	case FPMERGE_OPF:
826 		pformat(regs, insn, opf);
827 		break;
828 
829 	/* Partitioned Multiply Instructions  */
830 	case FMUL8x16_OPF:
831 	case FMUL8x16AU_OPF:
832 	case FMUL8x16AL_OPF:
833 	case FMUL8SUx16_OPF:
834 	case FMUL8ULx16_OPF:
835 	case FMULD8SUx16_OPF:
836 	case FMULD8ULx16_OPF:
837 		pmul(regs, insn, opf);
838 		break;
839 
840 	/* Pixel Compare Instructions  */
841 	case FCMPGT16_OPF:
842 	case FCMPGT32_OPF:
843 	case FCMPLE16_OPF:
844 	case FCMPLE32_OPF:
845 	case FCMPNE16_OPF:
846 	case FCMPNE32_OPF:
847 	case FCMPEQ16_OPF:
848 	case FCMPEQ32_OPF:
849 		pcmp(regs, insn, opf);
850 		break;
851 
852 	/* Edge Handling Instructions  */
853 	case EDGE8_OPF:
854 	case EDGE8N_OPF:
855 	case EDGE8L_OPF:
856 	case EDGE8LN_OPF:
857 	case EDGE16_OPF:
858 	case EDGE16N_OPF:
859 	case EDGE16L_OPF:
860 	case EDGE16LN_OPF:
861 	case EDGE32_OPF:
862 	case EDGE32N_OPF:
863 	case EDGE32L_OPF:
864 	case EDGE32LN_OPF:
865 		edge(regs, insn, opf);
866 		break;
867 
868 	/* Pixel Component Distance  */
869 	case PDIST_OPF:
870 		pdist(regs, insn);
871 		break;
872 
873 	/* Three-Dimensional Array Addressing Instructions  */
874 	case ARRAY8_OPF:
875 	case ARRAY16_OPF:
876 	case ARRAY32_OPF:
877 		array(regs, insn, opf);
878 		break;
879 
880 	/* Byte Mask and Shuffle Instructions  */
881 	case BMASK_OPF:
882 		bmask(regs, insn);
883 		break;
884 
885 	case BSHUFFLE_OPF:
886 		bshuffle(regs, insn);
887 		break;
888 	};
889 
890 	regs->tpc = regs->tnpc;
891 	regs->tnpc += 4;
892 	return 0;
893 }
894