1 /*
2  * SN Platform GRU Driver
3  *
4  *              KERNEL SERVICES THAT USE THE GRU
5  *
6  *  Copyright (c) 2008 Silicon Graphics, Inc.  All Rights Reserved.
7  *
8  *  This program is free software; you can redistribute it and/or modify
9  *  it under the terms of the GNU General Public License as published by
10  *  the Free Software Foundation; either version 2 of the License, or
11  *  (at your option) any later version.
12  *
13  *  This program is distributed in the hope that it will be useful,
14  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  *  GNU General Public License for more details.
17  *
18  *  You should have received a copy of the GNU General Public License
19  *  along with this program; if not, write to the Free Software
20  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
21  */
22 
23 #include <linux/kernel.h>
24 #include <linux/errno.h>
25 #include <linux/slab.h>
26 #include <linux/mm.h>
27 #include <linux/spinlock.h>
28 #include <linux/device.h>
29 #include <linux/miscdevice.h>
30 #include <linux/proc_fs.h>
31 #include <linux/interrupt.h>
32 #include <linux/uaccess.h>
33 #include <linux/delay.h>
34 #include "gru.h"
35 #include "grulib.h"
36 #include "grutables.h"
37 #include "grukservices.h"
38 #include "gru_instructions.h"
39 #include <asm/uv/uv_hub.h>
40 
41 /*
42  * Kernel GRU Usage
43  *
44  * The following is an interim algorithm for management of kernel GRU
45  * resources. This will likely be replaced when we better understand the
46  * kernel/user requirements.
47  *
 * Blade percpu resources reserved for kernel use. These resources are
 * reserved whenever the kernel context for the blade is loaded. Note
 * that the kernel context is not guaranteed to be always available. It is
 * loaded on demand & can be stolen by a user if user demand exceeds the
 * kernel demand. The kernel can always reload the kernel context, but
 * a SLEEP may be required to do so.
54  *
55  * Async Overview:
56  *
57  * 	Each blade has one "kernel context" that owns GRU kernel resources
58  * 	located on the blade. Kernel drivers use GRU resources in this context
59  * 	for sending messages, zeroing memory, etc.
60  *
61  * 	The kernel context is dynamically loaded on demand. If it is not in
62  * 	use by the kernel, the kernel context can be unloaded & given to a user.
63  * 	The kernel context will be reloaded when needed. This may require that
64  * 	a context be stolen from a user.
65  * 		NOTE: frequent unloading/reloading of the kernel context is
66  * 		expensive. We are depending on batch schedulers, cpusets, sane
67  * 		drivers or some other mechanism to prevent the need for frequent
68  *	 	stealing/reloading.
69  *
70  * 	The kernel context consists of two parts:
71  * 		- 1 CB & a few DSRs that are reserved for each cpu on the blade.
72  * 		  Each cpu has it's own private resources & does not share them
73  * 		  with other cpus. These resources are used serially, ie,
74  * 		  locked, used & unlocked  on each call to a function in
75  * 		  grukservices.
76  * 		  	(Now that we have dynamic loading of kernel contexts, I
77  * 		  	 may rethink this & allow sharing between cpus....)
78  *
79  *		- Additional resources can be reserved long term & used directly
80  *		  by UV drivers located in the kernel. Drivers using these GRU
81  *		  resources can use asynchronous GRU instructions that send
82  *		  interrupts on completion.
83  *		  	- these resources must be explicitly locked/unlocked
84  *		  	- locked resources prevent (obviously) the kernel
85  *		  	  context from being unloaded.
86  *			- drivers using these resource directly issue their own
87  *			  GRU instruction and must wait/check completion.
88  *
89  * 		  When these resources are reserved, the caller can optionally
90  * 		  associate a wait_queue with the resources and use asynchronous
91  * 		  GRU instructions. When an async GRU instruction completes, the
92  * 		  driver will do a wakeup on the event.
93  *
94  */
95 
96 
97 #define ASYNC_HAN_TO_BID(h)	((h) - 1)
98 #define ASYNC_BID_TO_HAN(b)	((b) + 1)
99 #define ASYNC_HAN_TO_BS(h)	gru_base[ASYNC_HAN_TO_BID(h)]
100 #define KCB_TO_GID(cb)		((cb - gru_start_vaddr) /		\
101 					(GRU_SIZE * GRU_CHIPLETS_PER_BLADE))
102 #define KCB_TO_BS(cb)		gru_base[KCB_TO_GID(cb)]
103 
104 #define GRU_NUM_KERNEL_CBR	1
105 #define GRU_NUM_KERNEL_DSR_BYTES 256
106 #define GRU_NUM_KERNEL_DSR_CL	(GRU_NUM_KERNEL_DSR_BYTES /		\
107 					GRU_CACHE_LINE_BYTES)
108 
109 /* GRU instruction attributes for all instructions */
110 #define IMA			IMA_CB_DELAY
111 
112 /* GRU cacheline size is always 64 bytes - even on arches with 128 byte lines */
113 #define __gru_cacheline_aligned__                               \
114 	__attribute__((__aligned__(GRU_CACHE_LINE_BYTES)))
115 
116 #define MAGIC	0x1234567887654321UL
117 
118 /* Default retry count for GRU errors on kernel instructions */
119 #define EXCEPTION_RETRY_LIMIT	3
120 
121 /* Status of message queue sections */
122 #define MQS_EMPTY		0
123 #define MQS_FULL		1
124 #define MQS_NOOP		2
125 
126 /*----------------- RESOURCE MANAGEMENT -------------------------------------*/
127 /* optimized for x86_64 */
128 struct message_queue {
129 	union gru_mesqhead	head __gru_cacheline_aligned__;	/* CL 0 */
130 	int			qlines;				/* DW 1 */
131 	long 			hstatus[2];
132 	void 			*next __gru_cacheline_aligned__;/* CL 1 */
133 	void 			*limit;
134 	void 			*start;
135 	void 			*start2;
136 	char			data ____cacheline_aligned;	/* CL 2 */
137 };
138 
139 /* First word in every message - used by mesq interface */
140 struct message_header {
141 	char	present;
142 	char	present2;
143 	char 	lines;
144 	char	fill;
145 };
146 
147 #define HSTATUS(mq, h)	((mq) + offsetof(struct message_queue, hstatus[h]))
148 
149 /*
150  * Reload the blade's kernel context into a GRU chiplet. Called holding
151  * the bs_kgts_sema for READ. Will steal user contexts if necessary.
152  */
153 static void gru_load_kernel_context(struct gru_blade_state *bs, int blade_id)
154 {
155 	struct gru_state *gru;
156 	struct gru_thread_state *kgts;
157 	void *vaddr;
158 	int ctxnum, ncpus;
159 
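	/*
	 * Upgrade the READ lock held by the caller to a WRITE lock. Another
	 * cpu may load the kernel context in the window between the up_read
	 * & down_write; the checks below handle that case.
	 */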
160 	up_read(&bs->bs_kgts_sema);
161 	down_write(&bs->bs_kgts_sema);
162 
163 	if (!bs->bs_kgts)
164 		bs->bs_kgts = gru_alloc_gts(NULL, 0, 0, 0, 0);
165 	kgts = bs->bs_kgts;
166 
167 	if (!kgts->ts_gru) {
168 		STAT(load_kernel_context);
169 		ncpus = uv_blade_nr_possible_cpus(blade_id);
170 		kgts->ts_cbr_au_count = GRU_CB_COUNT_TO_AU(
171 			GRU_NUM_KERNEL_CBR * ncpus + bs->bs_async_cbrs);
172 		kgts->ts_dsr_au_count = GRU_DS_BYTES_TO_AU(
173 			GRU_NUM_KERNEL_DSR_BYTES * ncpus +
174 				bs->bs_async_dsr_bytes);
175 		while (!gru_assign_gru_context(kgts, blade_id)) {
176 			msleep(1);
177 			gru_steal_context(kgts, blade_id);
178 		}
179 		gru_load_context(kgts);
180 		gru = bs->bs_kgts->ts_gru;
181 		vaddr = gru->gs_gru_base_vaddr;
182 		ctxnum = kgts->ts_ctxnum;
183 		bs->kernel_cb = get_gseg_base_address_cb(vaddr, ctxnum, 0);
184 		bs->kernel_dsr = get_gseg_base_address_ds(vaddr, ctxnum, 0);
185 	}
186 	downgrade_write(&bs->bs_kgts_sema);
187 }
188 
189 /*
190  * Free all kernel contexts that are not currently in use.
 *   Returns 0 if all were freed, else the number of in-use contexts.
192  */
193 static int gru_free_kernel_contexts(void)
194 {
195 	struct gru_blade_state *bs;
196 	struct gru_thread_state *kgts;
197 	int bid, ret = 0;
198 
199 	for (bid = 0; bid < GRU_MAX_BLADES; bid++) {
200 		bs = gru_base[bid];
201 		if (!bs)
202 			continue;
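		/* A failed trylock means the kernel context is in use */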
203 		if (down_write_trylock(&bs->bs_kgts_sema)) {
204 			kgts = bs->bs_kgts;
205 			if (kgts && kgts->ts_gru)
206 				gru_unload_context(kgts, 0);
207 			kfree(kgts);
208 			bs->bs_kgts = NULL;
209 			up_write(&bs->bs_kgts_sema);
210 		} else {
211 			ret++;
212 		}
213 	}
214 	return ret;
215 }
216 
217 /*
218  * Lock & load the kernel context for the specified blade.
219  */
220 static struct gru_blade_state *gru_lock_kernel_context(int blade_id)
221 {
222 	struct gru_blade_state *bs;
223 
224 	STAT(lock_kernel_context);
225 	bs = gru_base[blade_id];
226 
227 	down_read(&bs->bs_kgts_sema);
228 	if (!bs->bs_kgts || !bs->bs_kgts->ts_gru)
229 		gru_load_kernel_context(bs, blade_id);
	return bs;
}
233 
234 /*
235  * Unlock the kernel context for the specified blade. Context is not
236  * unloaded but may be stolen before next use.
237  */
238 static void gru_unlock_kernel_context(int blade_id)
239 {
240 	struct gru_blade_state *bs;
241 
242 	bs = gru_base[blade_id];
243 	up_read(&bs->bs_kgts_sema);
244 	STAT(unlock_kernel_context);
245 }
246 
247 /*
248  * Reserve & get pointers to the DSR/CBRs reserved for the current cpu.
249  * 	- returns with preemption disabled
250  */
251 static int gru_get_cpu_resources(int dsr_bytes, void **cb, void **dsr)
252 {
253 	struct gru_blade_state *bs;
254 	int lcpu;
255 
256 	BUG_ON(dsr_bytes > GRU_NUM_KERNEL_DSR_BYTES);
257 	preempt_disable();
258 	bs = gru_lock_kernel_context(uv_numa_blade_id());
259 	lcpu = uv_blade_processor_id();
260 	*cb = bs->kernel_cb + lcpu * GRU_HANDLE_STRIDE;
261 	*dsr = bs->kernel_dsr + lcpu * GRU_NUM_KERNEL_DSR_BYTES;
262 	return 0;
263 }
264 
265 /*
 * Free the current cpu's reserved DSR/CBR resources & reenable preemption.
267  */
268 static void gru_free_cpu_resources(void *cb, void *dsr)
269 {
270 	gru_unlock_kernel_context(uv_numa_blade_id());
271 	preempt_enable();
272 }
273 
274 /*
275  * Reserve GRU resources to be used asynchronously.
276  *   Note: currently supports only 1 reservation per blade.
277  *
278  * 	input:
279  * 		blade_id  - blade on which resources should be reserved
280  * 		cbrs	  - number of CBRs
281  * 		dsr_bytes - number of DSR bytes needed
282  *	output:
283  *		handle to identify resource
284  *		(0 = async resources already reserved)
285  */
286 unsigned long gru_reserve_async_resources(int blade_id, int cbrs, int dsr_bytes,
287 			struct completion *cmp)
288 {
289 	struct gru_blade_state *bs;
290 	struct gru_thread_state *kgts;
291 	int ret = 0;
292 
293 	bs = gru_base[blade_id];
294 
295 	down_write(&bs->bs_kgts_sema);
296 
297 	/* Verify no resources already reserved */
298 	if (bs->bs_async_dsr_bytes + bs->bs_async_cbrs)
299 		goto done;
300 	bs->bs_async_dsr_bytes = dsr_bytes;
301 	bs->bs_async_cbrs = cbrs;
302 	bs->bs_async_wq = cmp;
303 	kgts = bs->bs_kgts;
304 
305 	/* Resources changed. Unload context if already loaded */
306 	if (kgts && kgts->ts_gru)
307 		gru_unload_context(kgts, 0);
308 	ret = ASYNC_BID_TO_HAN(blade_id);
309 
310 done:
311 	up_write(&bs->bs_kgts_sema);
312 	return ret;
313 }
314 
315 /*
316  * Release async resources previously reserved.
317  *
318  *	input:
319  *		han - handle to identify resources
320  */
321 void gru_release_async_resources(unsigned long han)
322 {
323 	struct gru_blade_state *bs = ASYNC_HAN_TO_BS(han);
324 
325 	down_write(&bs->bs_kgts_sema);
326 	bs->bs_async_dsr_bytes = 0;
327 	bs->bs_async_cbrs = 0;
328 	bs->bs_async_wq = NULL;
329 	up_write(&bs->bs_kgts_sema);
330 }
331 
332 /*
333  * Wait for async GRU instructions to complete.
334  *
335  *	input:
336  *		han - handle to identify resources
337  */
338 void gru_wait_async_cbr(unsigned long han)
339 {
340 	struct gru_blade_state *bs = ASYNC_HAN_TO_BS(han);
341 
342 	wait_for_completion(bs->bs_async_wq);
343 	mb();
344 }
345 
346 /*
 * Lock previously reserved async GRU resources
348  *
349  *	input:
350  *		han - handle to identify resources
351  *	output:
352  *		cb  - pointer to first CBR
353  *		dsr - pointer to first DSR
354  */
void gru_lock_async_resource(unsigned long han, void **cb, void **dsr)
356 {
357 	struct gru_blade_state *bs = ASYNC_HAN_TO_BS(han);
358 	int blade_id = ASYNC_HAN_TO_BID(han);
359 	int ncpus;
360 
361 	gru_lock_kernel_context(blade_id);
362 	ncpus = uv_blade_nr_possible_cpus(blade_id);
363 	if (cb)
364 		*cb = bs->kernel_cb + ncpus * GRU_HANDLE_STRIDE;
365 	if (dsr)
366 		*dsr = bs->kernel_dsr + ncpus * GRU_NUM_KERNEL_DSR_BYTES;
367 }
368 
369 /*
 * Unlock previously reserved async GRU resources
371  *
372  *	input:
373  *		han - handle to identify resources
374  */
375 void gru_unlock_async_resource(unsigned long han)
376 {
377 	int blade_id = ASYNC_HAN_TO_BID(han);
378 
379 	gru_unlock_kernel_context(blade_id);
380 }
381 
382 /*----------------------------------------------------------------------*/
383 int gru_get_cb_exception_detail(void *cb,
384 		struct control_block_extended_exc_detail *excdet)
385 {
386 	struct gru_control_block_extended *cbe;
387 	struct gru_blade_state *bs;
388 	int cbrnum;
389 
390 	bs = KCB_TO_BS(cb);
391 	cbrnum = thread_cbr_number(bs->bs_kgts, get_cb_number(cb));
392 	cbe = get_cbe(GRUBASE(cb), cbrnum);
393 	gru_flush_cache(cbe);	/* CBE not coherent */
394 	excdet->opc = cbe->opccpy;
395 	excdet->exopc = cbe->exopccpy;
396 	excdet->ecause = cbe->ecause;
397 	excdet->exceptdet0 = cbe->idef1upd;
398 	excdet->exceptdet1 = cbe->idef3upd;
399 	gru_flush_cache(cbe);
400 	return 0;
401 }
402 
403 char *gru_get_cb_exception_detail_str(int ret, void *cb,
404 				      char *buf, int size)
405 {
406 	struct gru_control_block_status *gen = (void *)cb;
407 	struct control_block_extended_exc_detail excdet;
408 
409 	if (ret > 0 && gen->istatus == CBS_EXCEPTION) {
410 		gru_get_cb_exception_detail(cb, &excdet);
		snprintf(buf, size,
			"GRU exception: cb %p, opc %d, exopc %d, ecause 0x%x, "
			"excdet0 0x%lx, excdet1 0x%x",
			gen, excdet.opc, excdet.exopc, excdet.ecause,
			excdet.exceptdet0, excdet.exceptdet1);
416 	} else {
417 		snprintf(buf, size, "No exception");
418 	}
419 	return buf;
420 }
421 
422 static int gru_wait_idle_or_exception(struct gru_control_block_status *gen)
423 {
424 	while (gen->istatus >= CBS_ACTIVE) {
425 		cpu_relax();
426 		barrier();
427 	}
428 	return gen->istatus;
429 }
430 
431 static int gru_retry_exception(void *cb)
432 {
433 	struct gru_control_block_status *gen = (void *)cb;
434 	struct control_block_extended_exc_detail excdet;
435 	int retry = EXCEPTION_RETRY_LIMIT;
436 
437 	while (1)  {
438 		if (gru_wait_idle_or_exception(gen) == CBS_IDLE)
439 			return CBS_IDLE;
440 		if (gru_get_cb_message_queue_substatus(cb))
441 			return CBS_EXCEPTION;
442 		gru_get_cb_exception_detail(cb, &excdet);
443 		if ((excdet.ecause & ~EXCEPTION_RETRY_BITS) ||
444 				(excdet.cbrexecstatus & CBR_EXS_ABORT_OCC))
445 			break;
446 		if (retry-- == 0)
447 			break;
448 		gen->icmd = 1;
449 		gru_flush_cache(gen);
450 	}
451 	return CBS_EXCEPTION;
452 }
453 
454 int gru_check_status_proc(void *cb)
455 {
456 	struct gru_control_block_status *gen = (void *)cb;
457 	int ret;
458 
459 	ret = gen->istatus;
460 	if (ret != CBS_EXCEPTION)
461 		return ret;
	return gru_retry_exception(cb);
}
465 
466 int gru_wait_proc(void *cb)
467 {
468 	struct gru_control_block_status *gen = (void *)cb;
469 	int ret;
470 
471 	ret = gru_wait_idle_or_exception(gen);
472 	if (ret == CBS_EXCEPTION)
473 		ret = gru_retry_exception(cb);
474 
475 	return ret;
476 }
477 
478 void gru_abort(int ret, void *cb, char *str)
479 {
480 	char buf[GRU_EXC_STR_SIZE];
481 
482 	panic("GRU FATAL ERROR: %s - %s\n", str,
483 	      gru_get_cb_exception_detail_str(ret, cb, buf, sizeof(buf)));
484 }
485 
486 void gru_wait_abort_proc(void *cb)
487 {
488 	int ret;
489 
490 	ret = gru_wait_proc(cb);
491 	if (ret)
492 		gru_abort(ret, cb, "gru_wait_abort");
493 }
494 
495 
496 /*------------------------------ MESSAGE QUEUES -----------------------------*/
497 
/* Internal status codes. These are NOT returned to the user. */
499 #define MQIE_AGAIN		-1	/* try again */
500 
501 
502 /*
503  * Save/restore the "present" flag that is in the second line of 2-line
504  * messages
505  */
506 static inline int get_present2(void *p)
507 {
508 	struct message_header *mhdr = p + GRU_CACHE_LINE_BYTES;
509 	return mhdr->present;
510 }
511 
512 static inline void restore_present2(void *p, int val)
513 {
514 	struct message_header *mhdr = p + GRU_CACHE_LINE_BYTES;
515 	mhdr->present = val;
516 }
517 
518 /*
519  * Create a message queue.
520  * 	qlines - message queue size in cache lines. Includes 2-line header.
521  */
522 int gru_create_message_queue(struct gru_message_queue_desc *mqd,
523 		void *p, unsigned int bytes, int nasid, int vector, int apicid)
524 {
525 	struct message_queue *mq = p;
526 	unsigned int qlines;
527 
528 	qlines = bytes / GRU_CACHE_LINE_BYTES - 2;
529 	memset(mq, 0, bytes);
530 	mq->start = &mq->data;
531 	mq->start2 = &mq->data + (qlines / 2 - 1) * GRU_CACHE_LINE_BYTES;
532 	mq->next = &mq->data;
533 	mq->limit = &mq->data + (qlines - 2) * GRU_CACHE_LINE_BYTES;
534 	mq->qlines = qlines;
535 	mq->hstatus[0] = 0;
536 	mq->hstatus[1] = 1;
537 	mq->head = gru_mesq_head(2, qlines / 2 + 1);
538 	mqd->mq = mq;
539 	mqd->mq_gpa = uv_gpa(mq);
540 	mqd->qlines = qlines;
541 	mqd->interrupt_pnode = UV_NASID_TO_PNODE(nasid);
542 	mqd->interrupt_vector = vector;
543 	mqd->interrupt_apicid = apicid;
544 	return 0;
545 }
546 EXPORT_SYMBOL_GPL(gru_create_message_queue);
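
/*
 * Illustrative sketch only, not part of the driver (the "example_" name is
 * hypothetical): a single whole page is cacheline aligned, physically
 * contiguous & does not cross a page boundary, so it trivially satisfies
 * the queue memory requirements (compare quicktest1() below). The page is
 * assumed to be freed by the caller, via mqd->mq, when the queue is
 * torn down.
 */
static int __maybe_unused example_create_mq(struct gru_message_queue_desc *mqd)
{
	void *p = (void *)__get_free_page(GFP_KERNEL);

	if (!p)
		return -ENOMEM;
	/* No cross-partition interrupt: nasid/vector/apicid are all 0 */
	return gru_create_message_queue(mqd, p, PAGE_SIZE, 0, 0, 0);
}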
547 
548 /*
549  * Send a NOOP message to a message queue
550  * 	Returns:
551  * 		 0 - if queue is full after the send. This is the normal case
552  * 		     but various races can change this.
553  *		-1 - if mesq sent successfully but queue not full
554  *		>0 - unexpected error. MQE_xxx returned
555  */
556 static int send_noop_message(void *cb, struct gru_message_queue_desc *mqd,
557 				void *mesg)
558 {
559 	const struct message_header noop_header = {
560 					.present = MQS_NOOP, .lines = 1};
561 	unsigned long m;
562 	int substatus, ret;
563 	struct message_header save_mhdr, *mhdr = mesg;
564 
565 	STAT(mesq_noop);
566 	save_mhdr = *mhdr;
567 	*mhdr = noop_header;
568 	gru_mesq(cb, mqd->mq_gpa, gru_get_tri(mhdr), 1, IMA);
569 	ret = gru_wait(cb);
570 
571 	if (ret) {
572 		substatus = gru_get_cb_message_queue_substatus(cb);
573 		switch (substatus) {
574 		case CBSS_NO_ERROR:
575 			STAT(mesq_noop_unexpected_error);
576 			ret = MQE_UNEXPECTED_CB_ERR;
577 			break;
578 		case CBSS_LB_OVERFLOWED:
579 			STAT(mesq_noop_lb_overflow);
580 			ret = MQE_CONGESTION;
581 			break;
582 		case CBSS_QLIMIT_REACHED:
583 			STAT(mesq_noop_qlimit_reached);
584 			ret = 0;
585 			break;
586 		case CBSS_AMO_NACKED:
587 			STAT(mesq_noop_amo_nacked);
588 			ret = MQE_CONGESTION;
589 			break;
590 		case CBSS_PUT_NACKED:
591 			STAT(mesq_noop_put_nacked);
592 			m = mqd->mq_gpa + (gru_get_amo_value_head(cb) << 6);
593 			gru_vstore(cb, m, gru_get_tri(mesg), XTYPE_CL, 1, 1,
594 						IMA);
595 			if (gru_wait(cb) == CBS_IDLE)
596 				ret = MQIE_AGAIN;
597 			else
598 				ret = MQE_UNEXPECTED_CB_ERR;
599 			break;
600 		case CBSS_PAGE_OVERFLOW:
601 		default:
602 			BUG();
603 		}
604 	}
605 	*mhdr = save_mhdr;
606 	return ret;
607 }
608 
609 /*
 * Handle a gru_mesq failure due to the message queue being full.
611  */
612 static int send_message_queue_full(void *cb, struct gru_message_queue_desc *mqd,
613 				void *mesg, int lines)
614 {
615 	union gru_mesqhead mqh;
616 	unsigned int limit, head;
617 	unsigned long avalue;
618 	int half, qlines;
619 
620 	/* Determine if switching to first/second half of q */
621 	avalue = gru_get_amo_value(cb);
622 	head = gru_get_amo_value_head(cb);
623 	limit = gru_get_amo_value_limit(cb);
624 
625 	qlines = mqd->qlines;
626 	half = (limit != qlines);
627 
628 	if (half)
629 		mqh = gru_mesq_head(qlines / 2 + 1, qlines);
630 	else
631 		mqh = gru_mesq_head(2, qlines / 2 + 1);
632 
633 	/* Try to get lock for switching head pointer */
634 	gru_gamir(cb, EOP_IR_CLR, HSTATUS(mqd->mq_gpa, half), XTYPE_DW, IMA);
635 	if (gru_wait(cb) != CBS_IDLE)
636 		goto cberr;
637 	if (!gru_get_amo_value(cb)) {
638 		STAT(mesq_qf_locked);
639 		return MQE_QUEUE_FULL;
640 	}
641 
	/* Got the lock. Send optional NOOP if queue not full. */
643 	if (head != limit) {
644 		if (send_noop_message(cb, mqd, mesg)) {
645 			gru_gamir(cb, EOP_IR_INC, HSTATUS(mqd->mq_gpa, half),
646 					XTYPE_DW, IMA);
647 			if (gru_wait(cb) != CBS_IDLE)
648 				goto cberr;
649 			STAT(mesq_qf_noop_not_full);
650 			return MQIE_AGAIN;
651 		}
652 		avalue++;
653 	}
654 
	/* Then flip the queue head to the other half of the queue. */
656 	gru_gamer(cb, EOP_ERR_CSWAP, mqd->mq_gpa, XTYPE_DW, mqh.val, avalue,
657 							IMA);
658 	if (gru_wait(cb) != CBS_IDLE)
659 		goto cberr;
660 
	/* If the queue head swap was not successful, clear the hstatus lock */
662 	if (gru_get_amo_value(cb) != avalue) {
663 		STAT(mesq_qf_switch_head_failed);
664 		gru_gamir(cb, EOP_IR_INC, HSTATUS(mqd->mq_gpa, half), XTYPE_DW,
665 							IMA);
666 		if (gru_wait(cb) != CBS_IDLE)
667 			goto cberr;
668 	}
669 	return MQIE_AGAIN;
670 cberr:
671 	STAT(mesq_qf_unexpected_error);
672 	return MQE_UNEXPECTED_CB_ERR;
673 }
674 
675 /*
676  * Send a cross-partition interrupt to the SSI that contains the target
677  * message queue. Normally, the interrupt is automatically delivered by hardware
678  * but some error conditions require explicit delivery.
679  */
680 static void send_message_queue_interrupt(struct gru_message_queue_desc *mqd)
681 {
682 	if (mqd->interrupt_vector)
683 		uv_hub_send_ipi(mqd->interrupt_pnode, mqd->interrupt_apicid,
684 				mqd->interrupt_vector);
685 }
686 
687 /*
 * Handle a PUT failure. Note: if message was a 2-line message, one of the
 * lines might have been successfully written. Before resending the
 * message, "present" must be cleared in BOTH lines to prevent the receiver
 * from prematurely seeing a full message.
692  */
693 static int send_message_put_nacked(void *cb, struct gru_message_queue_desc *mqd,
694 			void *mesg, int lines)
695 {
696 	unsigned long m;
697 
698 	m = mqd->mq_gpa + (gru_get_amo_value_head(cb) << 6);
699 	if (lines == 2) {
700 		gru_vset(cb, m, 0, XTYPE_CL, lines, 1, IMA);
701 		if (gru_wait(cb) != CBS_IDLE)
702 			return MQE_UNEXPECTED_CB_ERR;
703 	}
704 	gru_vstore(cb, m, gru_get_tri(mesg), XTYPE_CL, lines, 1, IMA);
705 	if (gru_wait(cb) != CBS_IDLE)
706 		return MQE_UNEXPECTED_CB_ERR;
707 	send_message_queue_interrupt(mqd);
708 	return MQE_OK;
709 }
710 
711 /*
712  * Handle a gru_mesq failure. Some of these failures are software recoverable
713  * or retryable.
714  */
715 static int send_message_failure(void *cb, struct gru_message_queue_desc *mqd,
716 				void *mesg, int lines)
717 {
718 	int substatus, ret = 0;
719 
720 	substatus = gru_get_cb_message_queue_substatus(cb);
721 	switch (substatus) {
722 	case CBSS_NO_ERROR:
723 		STAT(mesq_send_unexpected_error);
724 		ret = MQE_UNEXPECTED_CB_ERR;
725 		break;
726 	case CBSS_LB_OVERFLOWED:
727 		STAT(mesq_send_lb_overflow);
728 		ret = MQE_CONGESTION;
729 		break;
730 	case CBSS_QLIMIT_REACHED:
731 		STAT(mesq_send_qlimit_reached);
732 		ret = send_message_queue_full(cb, mqd, mesg, lines);
733 		break;
734 	case CBSS_AMO_NACKED:
735 		STAT(mesq_send_amo_nacked);
736 		ret = MQE_CONGESTION;
737 		break;
738 	case CBSS_PUT_NACKED:
739 		STAT(mesq_send_put_nacked);
740 		ret = send_message_put_nacked(cb, mqd, mesg, lines);
741 		break;
742 	default:
743 		BUG();
744 	}
745 	return ret;
746 }
747 
748 /*
749  * Send a message to a message queue
750  * 	mqd	message queue descriptor
751  * 	mesg	message. ust be vaddr within a GSEG
752  * 	bytes	message size (<= 2 CL)
753  */
754 int gru_send_message_gpa(struct gru_message_queue_desc *mqd, void *mesg,
755 				unsigned int bytes)
756 {
757 	struct message_header *mhdr;
758 	void *cb;
759 	void *dsr;
760 	int istatus, clines, ret;
761 
762 	STAT(mesq_send);
763 	BUG_ON(bytes < sizeof(int) || bytes > 2 * GRU_CACHE_LINE_BYTES);
764 
765 	clines = DIV_ROUND_UP(bytes, GRU_CACHE_LINE_BYTES);
766 	if (gru_get_cpu_resources(bytes, &cb, &dsr))
767 		return MQE_BUG_NO_RESOURCES;
768 	memcpy(dsr, mesg, bytes);
769 	mhdr = dsr;
770 	mhdr->present = MQS_FULL;
771 	mhdr->lines = clines;
772 	if (clines == 2) {
773 		mhdr->present2 = get_present2(mhdr);
774 		restore_present2(mhdr, MQS_FULL);
775 	}
776 
777 	do {
778 		ret = MQE_OK;
779 		gru_mesq(cb, mqd->mq_gpa, gru_get_tri(mhdr), clines, IMA);
780 		istatus = gru_wait(cb);
781 		if (istatus != CBS_IDLE)
782 			ret = send_message_failure(cb, mqd, dsr, clines);
783 	} while (ret == MQIE_AGAIN);
784 	gru_free_cpu_resources(cb, dsr);
785 
786 	if (ret)
787 		STAT(mesq_send_failed);
788 	return ret;
789 }
790 EXPORT_SYMBOL_GPL(gru_send_message_gpa);
791 
792 /*
793  * Advance the receive pointer for the queue to the next message.
794  */
795 void gru_free_message(struct gru_message_queue_desc *mqd, void *mesg)
796 {
797 	struct message_queue *mq = mqd->mq;
798 	struct message_header *mhdr = mq->next;
799 	void *next, *pnext;
800 	int half = -1;
801 	int lines = mhdr->lines;
802 
803 	if (lines == 2)
804 		restore_present2(mhdr, MQS_EMPTY);
805 	mhdr->present = MQS_EMPTY;
806 
807 	pnext = mq->next;
808 	next = pnext + GRU_CACHE_LINE_BYTES * lines;
809 	if (next == mq->limit) {
810 		next = mq->start;
811 		half = 1;
812 	} else if (pnext < mq->start2 && next >= mq->start2) {
813 		half = 0;
814 	}
815 
816 	if (half >= 0)
817 		mq->hstatus[half] = 1;
818 	mq->next = next;
819 }
820 EXPORT_SYMBOL_GPL(gru_free_message);
821 
822 /*
 * Get next message from message queue. Return NULL if no message
 * present. Caller must call gru_free_message() to move to the next
 * message.
 * 	mqd	message queue descriptor
826  */
827 void *gru_get_next_message(struct gru_message_queue_desc *mqd)
828 {
829 	struct message_queue *mq = mqd->mq;
830 	struct message_header *mhdr = mq->next;
831 	int present = mhdr->present;
832 
	STAT(mesq_receive);
	/* skip NOOP messages */
835 	while (present == MQS_NOOP) {
836 		gru_free_message(mqd, mhdr);
837 		mhdr = mq->next;
838 		present = mhdr->present;
839 	}
840 
841 	/* Wait for both halves of 2 line messages */
842 	if (present == MQS_FULL && mhdr->lines == 2 &&
843 				get_present2(mhdr) == MQS_EMPTY)
844 		present = MQS_EMPTY;
845 
846 	if (!present) {
847 		STAT(mesq_receive_none);
848 		return NULL;
849 	}
850 
851 	if (mhdr->lines == 2)
852 		restore_present2(mhdr, mhdr->present2);
853 
854 	return mhdr;
855 }
856 EXPORT_SYMBOL_GPL(gru_get_next_message);
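
/*
 * Illustrative sketch only, not part of the driver ("example_" name is
 * hypothetical): a minimal send & poll-receive cycle on a queue created
 * with gru_create_message_queue(). A real user could instead drive the
 * receive side from the message queue interrupt rather than polling.
 */
static int __maybe_unused example_mq_echo(struct gru_message_queue_desc *mqd)
{
	char mesg[GRU_CACHE_LINE_BYTES];
	void *m;
	int ret;

	memset(mesg, 0, sizeof(mesg));
	do {
		ret = gru_send_message_gpa(mqd, mesg, sizeof(mesg));
	} while (ret == MQE_CONGESTION);	/* retry transient NACKs */
	if (ret)
		return -EIO;

	/* Receive side: poll for the message, then free its queue slot */
	while ((m = gru_get_next_message(mqd)) == NULL)
		cpu_relax();
	gru_free_message(mqd, m);
	return 0;
}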
857 
858 /* ---------------------- GRU DATA COPY FUNCTIONS ---------------------------*/
859 
860 /*
861  * Copy a block of data using the GRU resources
862  */
863 int gru_copy_gpa(unsigned long dest_gpa, unsigned long src_gpa,
864 				unsigned int bytes)
865 {
866 	void *cb;
867 	void *dsr;
868 	int ret;
869 
870 	STAT(copy_gpa);
871 	if (gru_get_cpu_resources(GRU_NUM_KERNEL_DSR_BYTES, &cb, &dsr))
872 		return MQE_BUG_NO_RESOURCES;
873 	gru_bcopy(cb, src_gpa, dest_gpa, gru_get_tri(dsr),
874 		  XTYPE_B, bytes, GRU_NUM_KERNEL_DSR_CL, IMA);
875 	ret = gru_wait(cb);
876 	gru_free_cpu_resources(cb, dsr);
877 	return ret;
878 }
879 EXPORT_SYMBOL_GPL(gru_copy_gpa);
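
/*
 * Illustrative sketch only, not part of the driver ("example_" name is
 * hypothetical): copying between two physically contiguous kernel buffers
 * (e.g. kmalloc'ed) by converting their addresses with uv_gpa().
 */
static int __maybe_unused example_copy(void *dst, void *src, unsigned int bytes)
{
	return gru_copy_gpa(uv_gpa(dst), uv_gpa(src), bytes);
}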
880 
881 /* ------------------- KERNEL QUICKTESTS RUN AT STARTUP ----------------*/
882 /* 	Temp - will delete after we gain confidence in the GRU		*/
883 
884 static int quicktest0(unsigned long arg)
885 {
886 	unsigned long word0;
887 	unsigned long word1;
888 	void *cb;
889 	void *dsr;
890 	unsigned long *p;
891 	int ret = -EIO;
892 
893 	if (gru_get_cpu_resources(GRU_CACHE_LINE_BYTES, &cb, &dsr))
894 		return MQE_BUG_NO_RESOURCES;
895 	p = dsr;
896 	word0 = MAGIC;
897 	word1 = 0;
898 
899 	gru_vload(cb, uv_gpa(&word0), gru_get_tri(dsr), XTYPE_DW, 1, 1, IMA);
900 	if (gru_wait(cb) != CBS_IDLE) {
901 		printk(KERN_DEBUG "GRU quicktest0: CBR failure 1\n");
902 		goto done;
903 	}
904 
905 	if (*p != MAGIC) {
906 		printk(KERN_DEBUG "GRU: quicktest0 bad magic 0x%lx\n", *p);
907 		goto done;
908 	}
909 	gru_vstore(cb, uv_gpa(&word1), gru_get_tri(dsr), XTYPE_DW, 1, 1, IMA);
910 	if (gru_wait(cb) != CBS_IDLE) {
911 		printk(KERN_DEBUG "GRU quicktest0: CBR failure 2\n");
912 		goto done;
913 	}
914 
915 	if (word0 != word1 || word1 != MAGIC) {
916 		printk(KERN_DEBUG
917 		       "GRU quicktest0 err: found 0x%lx, expected 0x%lx\n",
918 		     word1, MAGIC);
919 		goto done;
920 	}
921 	ret = 0;
922 
923 done:
924 	gru_free_cpu_resources(cb, dsr);
925 	return ret;
926 }
927 
#define ALIGNUP(p, q)	((void *)(((unsigned long)(p) + (q) - 1) & ~((q) - 1)))
929 
930 static int quicktest1(unsigned long arg)
931 {
932 	struct gru_message_queue_desc mqd;
933 	void *p, *mq;
934 	unsigned long *dw;
935 	int i, ret = -EIO;
936 	char mes[GRU_CACHE_LINE_BYTES], *m;
937 
	/* Need a 1K cacheline-aligned area that does not cross a page boundary */
	p = kmalloc(4096, GFP_KERNEL);
940 	if (p == NULL)
941 		return -ENOMEM;
942 	mq = ALIGNUP(p, 1024);
943 	memset(mes, 0xee, sizeof(mes));
944 	dw = mq;
945 
946 	gru_create_message_queue(&mqd, mq, 8 * GRU_CACHE_LINE_BYTES, 0, 0, 0);
947 	for (i = 0; i < 6; i++) {
948 		mes[8] = i;
949 		do {
950 			ret = gru_send_message_gpa(&mqd, mes, sizeof(mes));
951 		} while (ret == MQE_CONGESTION);
952 		if (ret)
953 			break;
954 	}
955 	if (ret != MQE_QUEUE_FULL || i != 4)
956 		goto done;
957 
958 	for (i = 0; i < 6; i++) {
959 		m = gru_get_next_message(&mqd);
960 		if (!m || m[8] != i)
961 			break;
962 		gru_free_message(&mqd, m);
963 	}
964 	ret = (i == 4) ? 0 : -EIO;
965 
966 done:
967 	kfree(p);
968 	return ret;
969 }
970 
971 static int quicktest2(unsigned long arg)
972 {
973 	static DECLARE_COMPLETION(cmp);
974 	unsigned long han;
975 	int blade_id = 0;
976 	int numcb = 4;
977 	int ret = 0;
978 	unsigned long *buf;
979 	void *cb0, *cb;
980 	int i, k, istatus, bytes;
981 
982 	bytes = numcb * 4 * 8;
983 	buf = kmalloc(bytes, GFP_KERNEL);
984 	if (!buf)
985 		return -ENOMEM;
986 
987 	ret = -EBUSY;
988 	han = gru_reserve_async_resources(blade_id, numcb, 0, &cmp);
989 	if (!han)
990 		goto done;
991 
992 	gru_lock_async_resource(han, &cb0, NULL);
993 	memset(buf, 0xee, bytes);
994 	for (i = 0; i < numcb; i++)
995 		gru_vset(cb0 + i * GRU_HANDLE_STRIDE, uv_gpa(&buf[i * 4]), 0,
996 				XTYPE_DW, 4, 1, IMA_INTERRUPT);
997 
998 	ret = 0;
999 	for (k = 0; k < numcb; k++) {
1000 		gru_wait_async_cbr(han);
1001 		for (i = 0; i < numcb; i++) {
1002 			cb = cb0 + i * GRU_HANDLE_STRIDE;
1003 			istatus = gru_check_status(cb);
1004 			if (istatus == CBS_ACTIVE)
1005 				continue;
1006 			if (istatus == CBS_EXCEPTION)
1007 				ret = -EFAULT;
			else if (buf[4 * i] || buf[4 * i + 1] ||
					buf[4 * i + 2] || buf[4 * i + 3])
1010 				ret = -EIO;
1011 		}
1012 	}
1013 	BUG_ON(cmp.done);
1014 
1015 	gru_unlock_async_resource(han);
1016 	gru_release_async_resources(han);
1017 done:
1018 	kfree(buf);
1019 	return ret;
1020 }
1021 
1022 /*
1023  * Debugging only. User hook for various kernel tests
1024  * of driver & gru.
1025  */
1026 int gru_ktest(unsigned long arg)
1027 {
1028 	int ret = -EINVAL;
1029 
1030 	switch (arg & 0xff) {
1031 	case 0:
1032 		ret = quicktest0(arg);
1033 		break;
1034 	case 1:
1035 		ret = quicktest1(arg);
1036 		break;
1037 	case 2:
1038 		ret = quicktest2(arg);
1039 		break;
1040 	case 99:
1041 		ret = gru_free_kernel_contexts();
1042 		break;
1043 	}
	return ret;
}
1047 
1048 int gru_kservices_init(void)
1049 {
1050 	return 0;
1051 }
1052 
1053 void gru_kservices_exit(void)
1054 {
1055 	if (gru_free_kernel_contexts())
1056 		BUG();
1057 }
1058 
1059