xref: /openbmc/linux/arch/sparc/kernel/ldc.c (revision 5d331b7f)
1 // SPDX-License-Identifier: GPL-2.0
2 /* ldc.c: Logical Domain Channel link-layer protocol driver.
3  *
4  * Copyright (C) 2007, 2008 David S. Miller <davem@davemloft.net>
5  */
6 
7 #include <linux/kernel.h>
8 #include <linux/export.h>
9 #include <linux/slab.h>
10 #include <linux/spinlock.h>
11 #include <linux/delay.h>
12 #include <linux/errno.h>
13 #include <linux/string.h>
14 #include <linux/scatterlist.h>
15 #include <linux/interrupt.h>
16 #include <linux/list.h>
17 #include <linux/init.h>
18 #include <linux/bitmap.h>
19 #include <asm/iommu-common.h>
20 
21 #include <asm/hypervisor.h>
22 #include <asm/iommu.h>
23 #include <asm/page.h>
24 #include <asm/ldc.h>
25 #include <asm/mdesc.h>
26 
/* Driver identification; PFX is prepended to this file's log messages. */
#define DRV_MODULE_NAME		"ldc"
#define PFX			DRV_MODULE_NAME ": "
#define DRV_MODULE_VERSION	"1.1"
#define DRV_MODULE_RELDATE	"July 22, 2008"

/* The top four bits of a cookie hold its page-size code. */
#define COOKIE_PGSZ_CODE	0xf000000000000000ULL
#define COOKIE_PGSZ_CODE_SHIFT	60ULL

/* Version banner assembled from the identification macros above. */
static char version[] =
	DRV_MODULE_NAME ".c:v"
	DRV_MODULE_VERSION
	" (" DRV_MODULE_RELDATE ")\n";
38 
39 /* Packet header layout for unreliable and reliable mode frames.
40  * When in RAW mode, packets are simply straight 64-byte payloads
41  * with no headers.
42  */
43 struct ldc_packet {
44 	u8			type;
45 #define LDC_CTRL		0x01
46 #define LDC_DATA		0x02
47 #define LDC_ERR			0x10
48 
49 	u8			stype;
50 #define LDC_INFO		0x01
51 #define LDC_ACK			0x02
52 #define LDC_NACK		0x04
53 
54 	u8			ctrl;
55 #define LDC_VERS		0x01 /* Link Version		*/
56 #define LDC_RTS			0x02 /* Request To Send		*/
57 #define LDC_RTR			0x03 /* Ready To Receive	*/
58 #define LDC_RDX			0x04 /* Ready for Data eXchange	*/
59 #define LDC_CTRL_MSK		0x0f
60 
61 	u8			env;
62 #define LDC_LEN			0x3f
63 #define LDC_FRAG_MASK		0xc0
64 #define LDC_START		0x40
65 #define LDC_STOP		0x80
66 
67 	u32			seqid;
68 
69 	union {
70 		u8		u_data[LDC_PACKET_SIZE - 8];
71 		struct {
72 			u32	pad;
73 			u32	ackid;
74 			u8	r_data[LDC_PACKET_SIZE - 8 - 8];
75 		} r;
76 	} u;
77 };
78 
79 struct ldc_version {
80 	u16 major;
81 	u16 minor;
82 };
83 
84 /* Ordered from largest major to lowest.  */
85 static struct ldc_version ver_arr[] = {
86 	{ .major = 1, .minor = 0 },
87 };
88 
/* LDC_DEFAULT_MTU is applied by ldc_alloc() when the caller's config
 * leaves mtu at zero; LDC_DEFAULT_NUM_ENTRIES sizes both queues so
 * that each one occupies exactly one page.
 */
#define LDC_DEFAULT_MTU			(4 * LDC_PACKET_SIZE)
#define LDC_DEFAULT_NUM_ENTRIES		(PAGE_SIZE / LDC_PACKET_SIZE)

struct ldc_channel;

/* Mode-specific write/read entry points; ldc_alloc() selects one of
 * the three tables below according to the configured channel mode.
 */
struct ldc_mode_ops {
	int (*write)(struct ldc_channel *lp, const void *buf,
		     unsigned int size);
	int (*read)(struct ldc_channel *lp, void *buf,
		    unsigned int size);
};

static const struct ldc_mode_ops raw_ops;
static const struct ldc_mode_ops nonraw_ops;
static const struct ldc_mode_ops stream_ops;

/* ldc_alloc() refuses to create channels while this is zero. */
int ldom_domaining_enabled;
104 
105 struct ldc_iommu {
106 	/* Protects ldc_unmap.  */
107 	spinlock_t			lock;
108 	struct ldc_mtable_entry		*page_table;
109 	struct iommu_map_table		iommu_map_table;
110 };
111 
112 struct ldc_channel {
113 	/* Protects all operations that depend upon channel state.  */
114 	spinlock_t			lock;
115 
116 	unsigned long			id;
117 
118 	u8				*mssbuf;
119 	u32				mssbuf_len;
120 	u32				mssbuf_off;
121 
122 	struct ldc_packet		*tx_base;
123 	unsigned long			tx_head;
124 	unsigned long			tx_tail;
125 	unsigned long			tx_num_entries;
126 	unsigned long			tx_ra;
127 
128 	unsigned long			tx_acked;
129 
130 	struct ldc_packet		*rx_base;
131 	unsigned long			rx_head;
132 	unsigned long			rx_tail;
133 	unsigned long			rx_num_entries;
134 	unsigned long			rx_ra;
135 
136 	u32				rcv_nxt;
137 	u32				snd_nxt;
138 
139 	unsigned long			chan_state;
140 
141 	struct ldc_channel_config	cfg;
142 	void				*event_arg;
143 
144 	const struct ldc_mode_ops	*mops;
145 
146 	struct ldc_iommu		iommu;
147 
148 	struct ldc_version		ver;
149 
150 	u8				hs_state;
151 #define LDC_HS_CLOSED			0x00
152 #define LDC_HS_OPEN			0x01
153 #define LDC_HS_GOTVERS			0x02
154 #define LDC_HS_SENTRTR			0x03
155 #define LDC_HS_GOTRTR			0x04
156 #define LDC_HS_COMPLETE			0x10
157 
158 	u8				flags;
159 #define LDC_FLAG_ALLOCED_QUEUES		0x01
160 #define LDC_FLAG_REGISTERED_QUEUES	0x02
161 #define LDC_FLAG_REGISTERED_IRQS	0x04
162 #define LDC_FLAG_RESET			0x10
163 
164 	u8				mss;
165 	u8				state;
166 
167 #define LDC_IRQ_NAME_MAX		32
168 	char				rx_irq_name[LDC_IRQ_NAME_MAX];
169 	char				tx_irq_name[LDC_IRQ_NAME_MAX];
170 
171 	struct hlist_head		mh_list;
172 
173 	struct hlist_node		list;
174 };
175 
/* Emit a debug message when the LDC_DEBUG_<TYPE> bit is set in the
 * channel's config.  Expects a 'struct ldc_channel *lp' to be in scope
 * at the call site.
 */
#define ldcdbg(TYPE, f, a...)						\
do {									\
	if (lp->cfg.debug & LDC_DEBUG_##TYPE)				\
		printk(KERN_INFO PFX "ID[%lu] " f, lp->id, ## a);	\
} while (0)

/* Abort the channel, passing the calling function's name as the reason. */
#define	LDC_ABORT(lp)	ldc_abort((lp), __func__)
182 
183 static const char *state_to_str(u8 state)
184 {
185 	switch (state) {
186 	case LDC_STATE_INVALID:
187 		return "INVALID";
188 	case LDC_STATE_INIT:
189 		return "INIT";
190 	case LDC_STATE_BOUND:
191 		return "BOUND";
192 	case LDC_STATE_READY:
193 		return "READY";
194 	case LDC_STATE_CONNECTED:
195 		return "CONNECTED";
196 	default:
197 		return "<UNKNOWN>";
198 	}
199 }
200 
201 static unsigned long __advance(unsigned long off, unsigned long num_entries)
202 {
203 	off += LDC_PACKET_SIZE;
204 	if (off == (num_entries * LDC_PACKET_SIZE))
205 		off = 0;
206 
207 	return off;
208 }
209 
210 static unsigned long rx_advance(struct ldc_channel *lp, unsigned long off)
211 {
212 	return __advance(off, lp->rx_num_entries);
213 }
214 
215 static unsigned long tx_advance(struct ldc_channel *lp, unsigned long off)
216 {
217 	return __advance(off, lp->tx_num_entries);
218 }
219 
220 static struct ldc_packet *handshake_get_tx_packet(struct ldc_channel *lp,
221 						  unsigned long *new_tail)
222 {
223 	struct ldc_packet *p;
224 	unsigned long t;
225 
226 	t = tx_advance(lp, lp->tx_tail);
227 	if (t == lp->tx_head)
228 		return NULL;
229 
230 	*new_tail = t;
231 
232 	p = lp->tx_base;
233 	return p + (lp->tx_tail / LDC_PACKET_SIZE);
234 }
235 
236 /* When we are in reliable or stream mode, have to track the next packet
237  * we haven't gotten an ACK for in the TX queue using tx_acked.  We have
238  * to be careful not to stomp over the queue past that point.  During
239  * the handshake, we don't have TX data packets pending in the queue
240  * and that's why handshake_get_tx_packet() need not be mindful of
241  * lp->tx_acked.
242  */
243 static unsigned long head_for_data(struct ldc_channel *lp)
244 {
245 	if (lp->cfg.mode == LDC_MODE_STREAM)
246 		return lp->tx_acked;
247 	return lp->tx_head;
248 }
249 
250 static int tx_has_space_for(struct ldc_channel *lp, unsigned int size)
251 {
252 	unsigned long limit, tail, new_tail, diff;
253 	unsigned int mss;
254 
255 	limit = head_for_data(lp);
256 	tail = lp->tx_tail;
257 	new_tail = tx_advance(lp, tail);
258 	if (new_tail == limit)
259 		return 0;
260 
261 	if (limit > new_tail)
262 		diff = limit - new_tail;
263 	else
264 		diff = (limit +
265 			((lp->tx_num_entries * LDC_PACKET_SIZE) - new_tail));
266 	diff /= LDC_PACKET_SIZE;
267 	mss = lp->mss;
268 
269 	if (diff * mss < size)
270 		return 0;
271 
272 	return 1;
273 }
274 
275 static struct ldc_packet *data_get_tx_packet(struct ldc_channel *lp,
276 					     unsigned long *new_tail)
277 {
278 	struct ldc_packet *p;
279 	unsigned long h, t;
280 
281 	h = head_for_data(lp);
282 	t = tx_advance(lp, lp->tx_tail);
283 	if (t == h)
284 		return NULL;
285 
286 	*new_tail = t;
287 
288 	p = lp->tx_base;
289 	return p + (lp->tx_tail / LDC_PACKET_SIZE);
290 }
291 
292 static int set_tx_tail(struct ldc_channel *lp, unsigned long tail)
293 {
294 	unsigned long orig_tail = lp->tx_tail;
295 	int limit = 1000;
296 
297 	lp->tx_tail = tail;
298 	while (limit-- > 0) {
299 		unsigned long err;
300 
301 		err = sun4v_ldc_tx_set_qtail(lp->id, tail);
302 		if (!err)
303 			return 0;
304 
305 		if (err != HV_EWOULDBLOCK) {
306 			lp->tx_tail = orig_tail;
307 			return -EINVAL;
308 		}
309 		udelay(1);
310 	}
311 
312 	lp->tx_tail = orig_tail;
313 	return -EBUSY;
314 }
315 
316 /* This just updates the head value in the hypervisor using
317  * a polling loop with a timeout.  The caller takes care of
318  * upating software state representing the head change, if any.
319  */
320 static int __set_rx_head(struct ldc_channel *lp, unsigned long head)
321 {
322 	int limit = 1000;
323 
324 	while (limit-- > 0) {
325 		unsigned long err;
326 
327 		err = sun4v_ldc_rx_set_qhead(lp->id, head);
328 		if (!err)
329 			return 0;
330 
331 		if (err != HV_EWOULDBLOCK)
332 			return -EINVAL;
333 
334 		udelay(1);
335 	}
336 
337 	return -EBUSY;
338 }
339 
340 static int send_tx_packet(struct ldc_channel *lp,
341 			  struct ldc_packet *p,
342 			  unsigned long new_tail)
343 {
344 	BUG_ON(p != (lp->tx_base + (lp->tx_tail / LDC_PACKET_SIZE)));
345 
346 	return set_tx_tail(lp, new_tail);
347 }
348 
349 static struct ldc_packet *handshake_compose_ctrl(struct ldc_channel *lp,
350 						 u8 stype, u8 ctrl,
351 						 void *data, int dlen,
352 						 unsigned long *new_tail)
353 {
354 	struct ldc_packet *p = handshake_get_tx_packet(lp, new_tail);
355 
356 	if (p) {
357 		memset(p, 0, sizeof(*p));
358 		p->type = LDC_CTRL;
359 		p->stype = stype;
360 		p->ctrl = ctrl;
361 		if (data)
362 			memcpy(p->u.u_data, data, dlen);
363 	}
364 	return p;
365 }
366 
367 static int start_handshake(struct ldc_channel *lp)
368 {
369 	struct ldc_packet *p;
370 	struct ldc_version *ver;
371 	unsigned long new_tail;
372 
373 	ver = &ver_arr[0];
374 
375 	ldcdbg(HS, "SEND VER INFO maj[%u] min[%u]\n",
376 	       ver->major, ver->minor);
377 
378 	p = handshake_compose_ctrl(lp, LDC_INFO, LDC_VERS,
379 				   ver, sizeof(*ver), &new_tail);
380 	if (p) {
381 		int err = send_tx_packet(lp, p, new_tail);
382 		if (!err)
383 			lp->flags &= ~LDC_FLAG_RESET;
384 		return err;
385 	}
386 	return -EBUSY;
387 }
388 
389 static int send_version_nack(struct ldc_channel *lp,
390 			     u16 major, u16 minor)
391 {
392 	struct ldc_packet *p;
393 	struct ldc_version ver;
394 	unsigned long new_tail;
395 
396 	ver.major = major;
397 	ver.minor = minor;
398 
399 	p = handshake_compose_ctrl(lp, LDC_NACK, LDC_VERS,
400 				   &ver, sizeof(ver), &new_tail);
401 	if (p) {
402 		ldcdbg(HS, "SEND VER NACK maj[%u] min[%u]\n",
403 		       ver.major, ver.minor);
404 
405 		return send_tx_packet(lp, p, new_tail);
406 	}
407 	return -EBUSY;
408 }
409 
410 static int send_version_ack(struct ldc_channel *lp,
411 			    struct ldc_version *vp)
412 {
413 	struct ldc_packet *p;
414 	unsigned long new_tail;
415 
416 	p = handshake_compose_ctrl(lp, LDC_ACK, LDC_VERS,
417 				   vp, sizeof(*vp), &new_tail);
418 	if (p) {
419 		ldcdbg(HS, "SEND VER ACK maj[%u] min[%u]\n",
420 		       vp->major, vp->minor);
421 
422 		return send_tx_packet(lp, p, new_tail);
423 	}
424 	return -EBUSY;
425 }
426 
427 static int send_rts(struct ldc_channel *lp)
428 {
429 	struct ldc_packet *p;
430 	unsigned long new_tail;
431 
432 	p = handshake_compose_ctrl(lp, LDC_INFO, LDC_RTS, NULL, 0,
433 				   &new_tail);
434 	if (p) {
435 		p->env = lp->cfg.mode;
436 		p->seqid = 0;
437 		lp->rcv_nxt = 0;
438 
439 		ldcdbg(HS, "SEND RTS env[0x%x] seqid[0x%x]\n",
440 		       p->env, p->seqid);
441 
442 		return send_tx_packet(lp, p, new_tail);
443 	}
444 	return -EBUSY;
445 }
446 
447 static int send_rtr(struct ldc_channel *lp)
448 {
449 	struct ldc_packet *p;
450 	unsigned long new_tail;
451 
452 	p = handshake_compose_ctrl(lp, LDC_INFO, LDC_RTR, NULL, 0,
453 				   &new_tail);
454 	if (p) {
455 		p->env = lp->cfg.mode;
456 		p->seqid = 0;
457 
458 		ldcdbg(HS, "SEND RTR env[0x%x] seqid[0x%x]\n",
459 		       p->env, p->seqid);
460 
461 		return send_tx_packet(lp, p, new_tail);
462 	}
463 	return -EBUSY;
464 }
465 
466 static int send_rdx(struct ldc_channel *lp)
467 {
468 	struct ldc_packet *p;
469 	unsigned long new_tail;
470 
471 	p = handshake_compose_ctrl(lp, LDC_INFO, LDC_RDX, NULL, 0,
472 				   &new_tail);
473 	if (p) {
474 		p->env = 0;
475 		p->seqid = ++lp->snd_nxt;
476 		p->u.r.ackid = lp->rcv_nxt;
477 
478 		ldcdbg(HS, "SEND RDX env[0x%x] seqid[0x%x] ackid[0x%x]\n",
479 		       p->env, p->seqid, p->u.r.ackid);
480 
481 		return send_tx_packet(lp, p, new_tail);
482 	}
483 	return -EBUSY;
484 }
485 
486 static int send_data_nack(struct ldc_channel *lp, struct ldc_packet *data_pkt)
487 {
488 	struct ldc_packet *p;
489 	unsigned long new_tail;
490 	int err;
491 
492 	p = data_get_tx_packet(lp, &new_tail);
493 	if (!p)
494 		return -EBUSY;
495 	memset(p, 0, sizeof(*p));
496 	p->type = data_pkt->type;
497 	p->stype = LDC_NACK;
498 	p->ctrl = data_pkt->ctrl & LDC_CTRL_MSK;
499 	p->seqid = lp->snd_nxt + 1;
500 	p->u.r.ackid = lp->rcv_nxt;
501 
502 	ldcdbg(HS, "SEND DATA NACK type[0x%x] ctl[0x%x] seq[0x%x] ack[0x%x]\n",
503 	       p->type, p->ctrl, p->seqid, p->u.r.ackid);
504 
505 	err = send_tx_packet(lp, p, new_tail);
506 	if (!err)
507 		lp->snd_nxt++;
508 
509 	return err;
510 }
511 
512 static int ldc_abort(struct ldc_channel *lp, const char *msg)
513 {
514 	unsigned long hv_err;
515 
516 	ldcdbg(STATE, "ABORT[%s]\n", msg);
517 	ldc_print(lp);
518 
519 	/* We report but do not act upon the hypervisor errors because
520 	 * there really isn't much we can do if they fail at this point.
521 	 */
522 	hv_err = sun4v_ldc_tx_qconf(lp->id, lp->tx_ra, lp->tx_num_entries);
523 	if (hv_err)
524 		printk(KERN_ERR PFX "ldc_abort: "
525 		       "sun4v_ldc_tx_qconf(%lx,%lx,%lx) failed, err=%lu\n",
526 		       lp->id, lp->tx_ra, lp->tx_num_entries, hv_err);
527 
528 	hv_err = sun4v_ldc_tx_get_state(lp->id,
529 					&lp->tx_head,
530 					&lp->tx_tail,
531 					&lp->chan_state);
532 	if (hv_err)
533 		printk(KERN_ERR PFX "ldc_abort: "
534 		       "sun4v_ldc_tx_get_state(%lx,...) failed, err=%lu\n",
535 		       lp->id, hv_err);
536 
537 	hv_err = sun4v_ldc_rx_qconf(lp->id, lp->rx_ra, lp->rx_num_entries);
538 	if (hv_err)
539 		printk(KERN_ERR PFX "ldc_abort: "
540 		       "sun4v_ldc_rx_qconf(%lx,%lx,%lx) failed, err=%lu\n",
541 		       lp->id, lp->rx_ra, lp->rx_num_entries, hv_err);
542 
543 	/* Refetch the RX queue state as well, because we could be invoked
544 	 * here in the queue processing context.
545 	 */
546 	hv_err = sun4v_ldc_rx_get_state(lp->id,
547 					&lp->rx_head,
548 					&lp->rx_tail,
549 					&lp->chan_state);
550 	if (hv_err)
551 		printk(KERN_ERR PFX "ldc_abort: "
552 		       "sun4v_ldc_rx_get_state(%lx,...) failed, err=%lu\n",
553 		       lp->id, hv_err);
554 
555 	return -ECONNRESET;
556 }
557 
558 static struct ldc_version *find_by_major(u16 major)
559 {
560 	struct ldc_version *ret = NULL;
561 	int i;
562 
563 	for (i = 0; i < ARRAY_SIZE(ver_arr); i++) {
564 		struct ldc_version *v = &ver_arr[i];
565 		if (v->major <= major) {
566 			ret = v;
567 			break;
568 		}
569 	}
570 	return ret;
571 }
572 
573 static int process_ver_info(struct ldc_channel *lp, struct ldc_version *vp)
574 {
575 	struct ldc_version *vap;
576 	int err;
577 
578 	ldcdbg(HS, "GOT VERSION INFO major[%x] minor[%x]\n",
579 	       vp->major, vp->minor);
580 
581 	if (lp->hs_state == LDC_HS_GOTVERS) {
582 		lp->hs_state = LDC_HS_OPEN;
583 		memset(&lp->ver, 0, sizeof(lp->ver));
584 	}
585 
586 	vap = find_by_major(vp->major);
587 	if (!vap) {
588 		err = send_version_nack(lp, 0, 0);
589 	} else if (vap->major != vp->major) {
590 		err = send_version_nack(lp, vap->major, vap->minor);
591 	} else {
592 		struct ldc_version ver = *vp;
593 		if (ver.minor > vap->minor)
594 			ver.minor = vap->minor;
595 		err = send_version_ack(lp, &ver);
596 		if (!err) {
597 			lp->ver = ver;
598 			lp->hs_state = LDC_HS_GOTVERS;
599 		}
600 	}
601 	if (err)
602 		return LDC_ABORT(lp);
603 
604 	return 0;
605 }
606 
607 static int process_ver_ack(struct ldc_channel *lp, struct ldc_version *vp)
608 {
609 	ldcdbg(HS, "GOT VERSION ACK major[%x] minor[%x]\n",
610 	       vp->major, vp->minor);
611 
612 	if (lp->hs_state == LDC_HS_GOTVERS) {
613 		if (lp->ver.major != vp->major ||
614 		    lp->ver.minor != vp->minor)
615 			return LDC_ABORT(lp);
616 	} else {
617 		lp->ver = *vp;
618 		lp->hs_state = LDC_HS_GOTVERS;
619 	}
620 	if (send_rts(lp))
621 		return LDC_ABORT(lp);
622 	return 0;
623 }
624 
625 static int process_ver_nack(struct ldc_channel *lp, struct ldc_version *vp)
626 {
627 	struct ldc_version *vap;
628 	struct ldc_packet *p;
629 	unsigned long new_tail;
630 
631 	if (vp->major == 0 && vp->minor == 0)
632 		return LDC_ABORT(lp);
633 
634 	vap = find_by_major(vp->major);
635 	if (!vap)
636 		return LDC_ABORT(lp);
637 
638 	p = handshake_compose_ctrl(lp, LDC_INFO, LDC_VERS,
639 					   vap, sizeof(*vap),
640 					   &new_tail);
641 	if (!p)
642 		return LDC_ABORT(lp);
643 
644 	return send_tx_packet(lp, p, new_tail);
645 }
646 
647 static int process_version(struct ldc_channel *lp,
648 			   struct ldc_packet *p)
649 {
650 	struct ldc_version *vp;
651 
652 	vp = (struct ldc_version *) p->u.u_data;
653 
654 	switch (p->stype) {
655 	case LDC_INFO:
656 		return process_ver_info(lp, vp);
657 
658 	case LDC_ACK:
659 		return process_ver_ack(lp, vp);
660 
661 	case LDC_NACK:
662 		return process_ver_nack(lp, vp);
663 
664 	default:
665 		return LDC_ABORT(lp);
666 	}
667 }
668 
669 static int process_rts(struct ldc_channel *lp,
670 		       struct ldc_packet *p)
671 {
672 	ldcdbg(HS, "GOT RTS stype[%x] seqid[%x] env[%x]\n",
673 	       p->stype, p->seqid, p->env);
674 
675 	if (p->stype     != LDC_INFO	   ||
676 	    lp->hs_state != LDC_HS_GOTVERS ||
677 	    p->env       != lp->cfg.mode)
678 		return LDC_ABORT(lp);
679 
680 	lp->snd_nxt = p->seqid;
681 	lp->rcv_nxt = p->seqid;
682 	lp->hs_state = LDC_HS_SENTRTR;
683 	if (send_rtr(lp))
684 		return LDC_ABORT(lp);
685 
686 	return 0;
687 }
688 
689 static int process_rtr(struct ldc_channel *lp,
690 		       struct ldc_packet *p)
691 {
692 	ldcdbg(HS, "GOT RTR stype[%x] seqid[%x] env[%x]\n",
693 	       p->stype, p->seqid, p->env);
694 
695 	if (p->stype     != LDC_INFO ||
696 	    p->env       != lp->cfg.mode)
697 		return LDC_ABORT(lp);
698 
699 	lp->snd_nxt = p->seqid;
700 	lp->hs_state = LDC_HS_COMPLETE;
701 	ldc_set_state(lp, LDC_STATE_CONNECTED);
702 	send_rdx(lp);
703 
704 	return LDC_EVENT_UP;
705 }
706 
707 static int rx_seq_ok(struct ldc_channel *lp, u32 seqid)
708 {
709 	return lp->rcv_nxt + 1 == seqid;
710 }
711 
712 static int process_rdx(struct ldc_channel *lp,
713 		       struct ldc_packet *p)
714 {
715 	ldcdbg(HS, "GOT RDX stype[%x] seqid[%x] env[%x] ackid[%x]\n",
716 	       p->stype, p->seqid, p->env, p->u.r.ackid);
717 
718 	if (p->stype != LDC_INFO ||
719 	    !(rx_seq_ok(lp, p->seqid)))
720 		return LDC_ABORT(lp);
721 
722 	lp->rcv_nxt = p->seqid;
723 
724 	lp->hs_state = LDC_HS_COMPLETE;
725 	ldc_set_state(lp, LDC_STATE_CONNECTED);
726 
727 	return LDC_EVENT_UP;
728 }
729 
730 static int process_control_frame(struct ldc_channel *lp,
731 				 struct ldc_packet *p)
732 {
733 	switch (p->ctrl) {
734 	case LDC_VERS:
735 		return process_version(lp, p);
736 
737 	case LDC_RTS:
738 		return process_rts(lp, p);
739 
740 	case LDC_RTR:
741 		return process_rtr(lp, p);
742 
743 	case LDC_RDX:
744 		return process_rdx(lp, p);
745 
746 	default:
747 		return LDC_ABORT(lp);
748 	}
749 }
750 
/* An error frame always aborts the channel; the packet contents are
 * not examined.
 */
static int process_error_frame(struct ldc_channel *lp,
			       struct ldc_packet *p)
{
	int ret = LDC_ABORT(lp);

	return ret;
}
756 
757 static int process_data_ack(struct ldc_channel *lp,
758 			    struct ldc_packet *ack)
759 {
760 	unsigned long head = lp->tx_acked;
761 	u32 ackid = ack->u.r.ackid;
762 
763 	while (1) {
764 		struct ldc_packet *p = lp->tx_base + (head / LDC_PACKET_SIZE);
765 
766 		head = tx_advance(lp, head);
767 
768 		if (p->seqid == ackid) {
769 			lp->tx_acked = head;
770 			return 0;
771 		}
772 		if (head == lp->tx_tail)
773 			return LDC_ABORT(lp);
774 	}
775 
776 	return 0;
777 }
778 
779 static void send_events(struct ldc_channel *lp, unsigned int event_mask)
780 {
781 	if (event_mask & LDC_EVENT_RESET)
782 		lp->cfg.event(lp->event_arg, LDC_EVENT_RESET);
783 	if (event_mask & LDC_EVENT_UP)
784 		lp->cfg.event(lp->event_arg, LDC_EVENT_UP);
785 	if (event_mask & LDC_EVENT_DATA_READY)
786 		lp->cfg.event(lp->event_arg, LDC_EVENT_DATA_READY);
787 }
788 
789 static irqreturn_t ldc_rx(int irq, void *dev_id)
790 {
791 	struct ldc_channel *lp = dev_id;
792 	unsigned long orig_state, flags;
793 	unsigned int event_mask;
794 
795 	spin_lock_irqsave(&lp->lock, flags);
796 
797 	orig_state = lp->chan_state;
798 
799 	/* We should probably check for hypervisor errors here and
800 	 * reset the LDC channel if we get one.
801 	 */
802 	sun4v_ldc_rx_get_state(lp->id,
803 			       &lp->rx_head,
804 			       &lp->rx_tail,
805 			       &lp->chan_state);
806 
807 	ldcdbg(RX, "RX state[0x%02lx:0x%02lx] head[0x%04lx] tail[0x%04lx]\n",
808 	       orig_state, lp->chan_state, lp->rx_head, lp->rx_tail);
809 
810 	event_mask = 0;
811 
812 	if (lp->cfg.mode == LDC_MODE_RAW &&
813 	    lp->chan_state == LDC_CHANNEL_UP) {
814 		lp->hs_state = LDC_HS_COMPLETE;
815 		ldc_set_state(lp, LDC_STATE_CONNECTED);
816 
817 		/*
818 		 * Generate an LDC_EVENT_UP event if the channel
819 		 * was not already up.
820 		 */
821 		if (orig_state != LDC_CHANNEL_UP) {
822 			event_mask |= LDC_EVENT_UP;
823 			orig_state = lp->chan_state;
824 		}
825 	}
826 
827 	/* If we are in reset state, flush the RX queue and ignore
828 	 * everything.
829 	 */
830 	if (lp->flags & LDC_FLAG_RESET) {
831 		(void) ldc_rx_reset(lp);
832 		goto out;
833 	}
834 
835 	/* Once we finish the handshake, we let the ldc_read()
836 	 * paths do all of the control frame and state management.
837 	 * Just trigger the callback.
838 	 */
839 	if (lp->hs_state == LDC_HS_COMPLETE) {
840 handshake_complete:
841 		if (lp->chan_state != orig_state) {
842 			unsigned int event = LDC_EVENT_RESET;
843 
844 			if (lp->chan_state == LDC_CHANNEL_UP)
845 				event = LDC_EVENT_UP;
846 
847 			event_mask |= event;
848 		}
849 		if (lp->rx_head != lp->rx_tail)
850 			event_mask |= LDC_EVENT_DATA_READY;
851 
852 		goto out;
853 	}
854 
855 	if (lp->chan_state != orig_state)
856 		goto out;
857 
858 	while (lp->rx_head != lp->rx_tail) {
859 		struct ldc_packet *p;
860 		unsigned long new;
861 		int err;
862 
863 		p = lp->rx_base + (lp->rx_head / LDC_PACKET_SIZE);
864 
865 		switch (p->type) {
866 		case LDC_CTRL:
867 			err = process_control_frame(lp, p);
868 			if (err > 0)
869 				event_mask |= err;
870 			break;
871 
872 		case LDC_DATA:
873 			event_mask |= LDC_EVENT_DATA_READY;
874 			err = 0;
875 			break;
876 
877 		case LDC_ERR:
878 			err = process_error_frame(lp, p);
879 			break;
880 
881 		default:
882 			err = LDC_ABORT(lp);
883 			break;
884 		}
885 
886 		if (err < 0)
887 			break;
888 
889 		new = lp->rx_head;
890 		new += LDC_PACKET_SIZE;
891 		if (new == (lp->rx_num_entries * LDC_PACKET_SIZE))
892 			new = 0;
893 		lp->rx_head = new;
894 
895 		err = __set_rx_head(lp, new);
896 		if (err < 0) {
897 			(void) LDC_ABORT(lp);
898 			break;
899 		}
900 		if (lp->hs_state == LDC_HS_COMPLETE)
901 			goto handshake_complete;
902 	}
903 
904 out:
905 	spin_unlock_irqrestore(&lp->lock, flags);
906 
907 	send_events(lp, event_mask);
908 
909 	return IRQ_HANDLED;
910 }
911 
912 static irqreturn_t ldc_tx(int irq, void *dev_id)
913 {
914 	struct ldc_channel *lp = dev_id;
915 	unsigned long flags, orig_state;
916 	unsigned int event_mask = 0;
917 
918 	spin_lock_irqsave(&lp->lock, flags);
919 
920 	orig_state = lp->chan_state;
921 
922 	/* We should probably check for hypervisor errors here and
923 	 * reset the LDC channel if we get one.
924 	 */
925 	sun4v_ldc_tx_get_state(lp->id,
926 			       &lp->tx_head,
927 			       &lp->tx_tail,
928 			       &lp->chan_state);
929 
930 	ldcdbg(TX, " TX state[0x%02lx:0x%02lx] head[0x%04lx] tail[0x%04lx]\n",
931 	       orig_state, lp->chan_state, lp->tx_head, lp->tx_tail);
932 
933 	if (lp->cfg.mode == LDC_MODE_RAW &&
934 	    lp->chan_state == LDC_CHANNEL_UP) {
935 		lp->hs_state = LDC_HS_COMPLETE;
936 		ldc_set_state(lp, LDC_STATE_CONNECTED);
937 
938 		/*
939 		 * Generate an LDC_EVENT_UP event if the channel
940 		 * was not already up.
941 		 */
942 		if (orig_state != LDC_CHANNEL_UP) {
943 			event_mask |= LDC_EVENT_UP;
944 			orig_state = lp->chan_state;
945 		}
946 	}
947 
948 	spin_unlock_irqrestore(&lp->lock, flags);
949 
950 	send_events(lp, event_mask);
951 
952 	return IRQ_HANDLED;
953 }
954 
/* List of all allocated channels.
 *
 * XXX ldc_alloc() and ldc_free() need to run under a mutex so that
 * XXX adding to and removing from ldc_channel_list is atomic.
 * XXX Without it the __ldc_channel_exists() check is pointless,
 * XXX because another thread can slip into ldc_alloc() and add a
 * XXX channel with the same ID.  ldc_channel_list also needs a
 * XXX spinlock of its own.
 */
static HLIST_HEAD(ldc_channel_list);
963 
964 static int __ldc_channel_exists(unsigned long id)
965 {
966 	struct ldc_channel *lp;
967 
968 	hlist_for_each_entry(lp, &ldc_channel_list, list) {
969 		if (lp->id == id)
970 			return 1;
971 	}
972 	return 0;
973 }
974 
975 static int alloc_queue(const char *name, unsigned long num_entries,
976 		       struct ldc_packet **base, unsigned long *ra)
977 {
978 	unsigned long size, order;
979 	void *q;
980 
981 	size = num_entries * LDC_PACKET_SIZE;
982 	order = get_order(size);
983 
984 	q = (void *) __get_free_pages(GFP_KERNEL, order);
985 	if (!q) {
986 		printk(KERN_ERR PFX "Alloc of %s queue failed with "
987 		       "size=%lu order=%lu\n", name, size, order);
988 		return -ENOMEM;
989 	}
990 
991 	memset(q, 0, PAGE_SIZE << order);
992 
993 	*base = q;
994 	*ra = __pa(q);
995 
996 	return 0;
997 }
998 
999 static void free_queue(unsigned long num_entries, struct ldc_packet *q)
1000 {
1001 	unsigned long size, order;
1002 
1003 	if (!q)
1004 		return;
1005 
1006 	size = num_entries * LDC_PACKET_SIZE;
1007 	order = get_order(size);
1008 
1009 	free_pages((unsigned long)q, order);
1010 }
1011 
1012 static unsigned long ldc_cookie_to_index(u64 cookie, void *arg)
1013 {
1014 	u64 szcode = cookie >> COOKIE_PGSZ_CODE_SHIFT;
1015 	/* struct ldc_iommu *ldc_iommu = (struct ldc_iommu *)arg; */
1016 
1017 	cookie &= ~COOKIE_PGSZ_CODE;
1018 
1019 	return (cookie >> (13ULL + (szcode * 3ULL)));
1020 }
1021 
1022 static void ldc_demap(struct ldc_iommu *iommu, unsigned long id, u64 cookie,
1023 		      unsigned long entry, unsigned long npages)
1024 {
1025 	struct ldc_mtable_entry *base;
1026 	unsigned long i, shift;
1027 
1028 	shift = (cookie >> COOKIE_PGSZ_CODE_SHIFT) * 3;
1029 	base = iommu->page_table + entry;
1030 	for (i = 0; i < npages; i++) {
1031 		if (base->cookie)
1032 			sun4v_ldc_revoke(id, cookie + (i << shift),
1033 					 base->cookie);
1034 		base->mte = 0;
1035 	}
1036 }
1037 
1038 /* XXX Make this configurable... XXX */
1039 #define LDC_IOTABLE_SIZE	(8 * 1024)
1040 
1041 static int ldc_iommu_init(const char *name, struct ldc_channel *lp)
1042 {
1043 	unsigned long sz, num_tsb_entries, tsbsize, order;
1044 	struct ldc_iommu *ldc_iommu = &lp->iommu;
1045 	struct iommu_map_table *iommu = &ldc_iommu->iommu_map_table;
1046 	struct ldc_mtable_entry *table;
1047 	unsigned long hv_err;
1048 	int err;
1049 
1050 	num_tsb_entries = LDC_IOTABLE_SIZE;
1051 	tsbsize = num_tsb_entries * sizeof(struct ldc_mtable_entry);
1052 	spin_lock_init(&ldc_iommu->lock);
1053 
1054 	sz = num_tsb_entries / 8;
1055 	sz = (sz + 7UL) & ~7UL;
1056 	iommu->map = kzalloc(sz, GFP_KERNEL);
1057 	if (!iommu->map) {
1058 		printk(KERN_ERR PFX "Alloc of arena map failed, sz=%lu\n", sz);
1059 		return -ENOMEM;
1060 	}
1061 	iommu_tbl_pool_init(iommu, num_tsb_entries, PAGE_SHIFT,
1062 			    NULL, false /* no large pool */,
1063 			    1 /* npools */,
1064 			    true /* skip span boundary check */);
1065 
1066 	order = get_order(tsbsize);
1067 
1068 	table = (struct ldc_mtable_entry *)
1069 		__get_free_pages(GFP_KERNEL, order);
1070 	err = -ENOMEM;
1071 	if (!table) {
1072 		printk(KERN_ERR PFX "Alloc of MTE table failed, "
1073 		       "size=%lu order=%lu\n", tsbsize, order);
1074 		goto out_free_map;
1075 	}
1076 
1077 	memset(table, 0, PAGE_SIZE << order);
1078 
1079 	ldc_iommu->page_table = table;
1080 
1081 	hv_err = sun4v_ldc_set_map_table(lp->id, __pa(table),
1082 					 num_tsb_entries);
1083 	err = -EINVAL;
1084 	if (hv_err)
1085 		goto out_free_table;
1086 
1087 	return 0;
1088 
1089 out_free_table:
1090 	free_pages((unsigned long) table, order);
1091 	ldc_iommu->page_table = NULL;
1092 
1093 out_free_map:
1094 	kfree(iommu->map);
1095 	iommu->map = NULL;
1096 
1097 	return err;
1098 }
1099 
1100 static void ldc_iommu_release(struct ldc_channel *lp)
1101 {
1102 	struct ldc_iommu *ldc_iommu = &lp->iommu;
1103 	struct iommu_map_table *iommu = &ldc_iommu->iommu_map_table;
1104 	unsigned long num_tsb_entries, tsbsize, order;
1105 
1106 	(void) sun4v_ldc_set_map_table(lp->id, 0, 0);
1107 
1108 	num_tsb_entries = iommu->poolsize * iommu->nr_pools;
1109 	tsbsize = num_tsb_entries * sizeof(struct ldc_mtable_entry);
1110 	order = get_order(tsbsize);
1111 
1112 	free_pages((unsigned long) ldc_iommu->page_table, order);
1113 	ldc_iommu->page_table = NULL;
1114 
1115 	kfree(iommu->map);
1116 	iommu->map = NULL;
1117 }
1118 
1119 struct ldc_channel *ldc_alloc(unsigned long id,
1120 			      const struct ldc_channel_config *cfgp,
1121 			      void *event_arg,
1122 			      const char *name)
1123 {
1124 	struct ldc_channel *lp;
1125 	const struct ldc_mode_ops *mops;
1126 	unsigned long dummy1, dummy2, hv_err;
1127 	u8 mss, *mssbuf;
1128 	int err;
1129 
1130 	err = -ENODEV;
1131 	if (!ldom_domaining_enabled)
1132 		goto out_err;
1133 
1134 	err = -EINVAL;
1135 	if (!cfgp)
1136 		goto out_err;
1137 	if (!name)
1138 		goto out_err;
1139 
1140 	switch (cfgp->mode) {
1141 	case LDC_MODE_RAW:
1142 		mops = &raw_ops;
1143 		mss = LDC_PACKET_SIZE;
1144 		break;
1145 
1146 	case LDC_MODE_UNRELIABLE:
1147 		mops = &nonraw_ops;
1148 		mss = LDC_PACKET_SIZE - 8;
1149 		break;
1150 
1151 	case LDC_MODE_STREAM:
1152 		mops = &stream_ops;
1153 		mss = LDC_PACKET_SIZE - 8 - 8;
1154 		break;
1155 
1156 	default:
1157 		goto out_err;
1158 	}
1159 
1160 	if (!cfgp->event || !event_arg || !cfgp->rx_irq || !cfgp->tx_irq)
1161 		goto out_err;
1162 
1163 	hv_err = sun4v_ldc_tx_qinfo(id, &dummy1, &dummy2);
1164 	err = -ENODEV;
1165 	if (hv_err == HV_ECHANNEL)
1166 		goto out_err;
1167 
1168 	err = -EEXIST;
1169 	if (__ldc_channel_exists(id))
1170 		goto out_err;
1171 
1172 	mssbuf = NULL;
1173 
1174 	lp = kzalloc(sizeof(*lp), GFP_KERNEL);
1175 	err = -ENOMEM;
1176 	if (!lp)
1177 		goto out_err;
1178 
1179 	spin_lock_init(&lp->lock);
1180 
1181 	lp->id = id;
1182 
1183 	err = ldc_iommu_init(name, lp);
1184 	if (err)
1185 		goto out_free_ldc;
1186 
1187 	lp->mops = mops;
1188 	lp->mss = mss;
1189 
1190 	lp->cfg = *cfgp;
1191 	if (!lp->cfg.mtu)
1192 		lp->cfg.mtu = LDC_DEFAULT_MTU;
1193 
1194 	if (lp->cfg.mode == LDC_MODE_STREAM) {
1195 		mssbuf = kzalloc(lp->cfg.mtu, GFP_KERNEL);
1196 		if (!mssbuf) {
1197 			err = -ENOMEM;
1198 			goto out_free_iommu;
1199 		}
1200 		lp->mssbuf = mssbuf;
1201 	}
1202 
1203 	lp->event_arg = event_arg;
1204 
1205 	/* XXX allow setting via ldc_channel_config to override defaults
1206 	 * XXX or use some formula based upon mtu
1207 	 */
1208 	lp->tx_num_entries = LDC_DEFAULT_NUM_ENTRIES;
1209 	lp->rx_num_entries = LDC_DEFAULT_NUM_ENTRIES;
1210 
1211 	err = alloc_queue("TX", lp->tx_num_entries,
1212 			  &lp->tx_base, &lp->tx_ra);
1213 	if (err)
1214 		goto out_free_mssbuf;
1215 
1216 	err = alloc_queue("RX", lp->rx_num_entries,
1217 			  &lp->rx_base, &lp->rx_ra);
1218 	if (err)
1219 		goto out_free_txq;
1220 
1221 	lp->flags |= LDC_FLAG_ALLOCED_QUEUES;
1222 
1223 	lp->hs_state = LDC_HS_CLOSED;
1224 	ldc_set_state(lp, LDC_STATE_INIT);
1225 
1226 	INIT_HLIST_NODE(&lp->list);
1227 	hlist_add_head(&lp->list, &ldc_channel_list);
1228 
1229 	INIT_HLIST_HEAD(&lp->mh_list);
1230 
1231 	snprintf(lp->rx_irq_name, LDC_IRQ_NAME_MAX, "%s RX", name);
1232 	snprintf(lp->tx_irq_name, LDC_IRQ_NAME_MAX, "%s TX", name);
1233 
1234 	err = request_irq(lp->cfg.rx_irq, ldc_rx, 0,
1235 			  lp->rx_irq_name, lp);
1236 	if (err)
1237 		goto out_free_txq;
1238 
1239 	err = request_irq(lp->cfg.tx_irq, ldc_tx, 0,
1240 			  lp->tx_irq_name, lp);
1241 	if (err) {
1242 		free_irq(lp->cfg.rx_irq, lp);
1243 		goto out_free_txq;
1244 	}
1245 
1246 	return lp;
1247 
1248 out_free_txq:
1249 	free_queue(lp->tx_num_entries, lp->tx_base);
1250 
1251 out_free_mssbuf:
1252 	kfree(mssbuf);
1253 
1254 out_free_iommu:
1255 	ldc_iommu_release(lp);
1256 
1257 out_free_ldc:
1258 	kfree(lp);
1259 
1260 out_err:
1261 	return ERR_PTR(err);
1262 }
1263 EXPORT_SYMBOL(ldc_alloc);
1264 
1265 void ldc_unbind(struct ldc_channel *lp)
1266 {
1267 	if (lp->flags & LDC_FLAG_REGISTERED_IRQS) {
1268 		free_irq(lp->cfg.rx_irq, lp);
1269 		free_irq(lp->cfg.tx_irq, lp);
1270 		lp->flags &= ~LDC_FLAG_REGISTERED_IRQS;
1271 	}
1272 
1273 	if (lp->flags & LDC_FLAG_REGISTERED_QUEUES) {
1274 		sun4v_ldc_tx_qconf(lp->id, 0, 0);
1275 		sun4v_ldc_rx_qconf(lp->id, 0, 0);
1276 		lp->flags &= ~LDC_FLAG_REGISTERED_QUEUES;
1277 	}
1278 	if (lp->flags & LDC_FLAG_ALLOCED_QUEUES) {
1279 		free_queue(lp->tx_num_entries, lp->tx_base);
1280 		free_queue(lp->rx_num_entries, lp->rx_base);
1281 		lp->flags &= ~LDC_FLAG_ALLOCED_QUEUES;
1282 	}
1283 
1284 	ldc_set_state(lp, LDC_STATE_INIT);
1285 }
1286 EXPORT_SYMBOL(ldc_unbind);
1287 
1288 void ldc_free(struct ldc_channel *lp)
1289 {
1290 	ldc_unbind(lp);
1291 	hlist_del(&lp->list);
1292 	kfree(lp->mssbuf);
1293 	ldc_iommu_release(lp);
1294 
1295 	kfree(lp);
1296 }
1297 EXPORT_SYMBOL(ldc_free);
1298 
1299 /* Bind the channel.  This registers the LDC queues with
1300  * the hypervisor and puts the channel into a pseudo-listening
1301  * state.  This does not initiate a handshake, ldc_connect() does
1302  * that.
1303  */
1304 int ldc_bind(struct ldc_channel *lp)
1305 {
1306 	unsigned long hv_err, flags;
1307 	int err = -EINVAL;
1308 
1309 	if (lp->state != LDC_STATE_INIT)
1310 		return -EINVAL;
1311 
1312 	spin_lock_irqsave(&lp->lock, flags);
1313 
1314 	enable_irq(lp->cfg.rx_irq);
1315 	enable_irq(lp->cfg.tx_irq);
1316 
1317 	lp->flags |= LDC_FLAG_REGISTERED_IRQS;
1318 
1319 	err = -ENODEV;
1320 	hv_err = sun4v_ldc_tx_qconf(lp->id, 0, 0);
1321 	if (hv_err)
1322 		goto out_free_irqs;
1323 
1324 	hv_err = sun4v_ldc_tx_qconf(lp->id, lp->tx_ra, lp->tx_num_entries);
1325 	if (hv_err)
1326 		goto out_free_irqs;
1327 
1328 	hv_err = sun4v_ldc_rx_qconf(lp->id, 0, 0);
1329 	if (hv_err)
1330 		goto out_unmap_tx;
1331 
1332 	hv_err = sun4v_ldc_rx_qconf(lp->id, lp->rx_ra, lp->rx_num_entries);
1333 	if (hv_err)
1334 		goto out_unmap_tx;
1335 
1336 	lp->flags |= LDC_FLAG_REGISTERED_QUEUES;
1337 
1338 	hv_err = sun4v_ldc_tx_get_state(lp->id,
1339 					&lp->tx_head,
1340 					&lp->tx_tail,
1341 					&lp->chan_state);
1342 	err = -EBUSY;
1343 	if (hv_err)
1344 		goto out_unmap_rx;
1345 
1346 	lp->tx_acked = lp->tx_head;
1347 
1348 	lp->hs_state = LDC_HS_OPEN;
1349 	ldc_set_state(lp, LDC_STATE_BOUND);
1350 
1351 	if (lp->cfg.mode == LDC_MODE_RAW) {
1352 		/*
1353 		 * There is no handshake in RAW mode, so handshake
1354 		 * is completed.
1355 		 */
1356 		lp->hs_state = LDC_HS_COMPLETE;
1357 	}
1358 
1359 	spin_unlock_irqrestore(&lp->lock, flags);
1360 
1361 	return 0;
1362 
1363 out_unmap_rx:
1364 	lp->flags &= ~LDC_FLAG_REGISTERED_QUEUES;
1365 	sun4v_ldc_rx_qconf(lp->id, 0, 0);
1366 
1367 out_unmap_tx:
1368 	sun4v_ldc_tx_qconf(lp->id, 0, 0);
1369 
1370 out_free_irqs:
1371 	lp->flags &= ~LDC_FLAG_REGISTERED_IRQS;
1372 	free_irq(lp->cfg.tx_irq, lp);
1373 	free_irq(lp->cfg.rx_irq, lp);
1374 
1375 	spin_unlock_irqrestore(&lp->lock, flags);
1376 
1377 	return err;
1378 }
1379 EXPORT_SYMBOL(ldc_bind);
1380 
1381 int ldc_connect(struct ldc_channel *lp)
1382 {
1383 	unsigned long flags;
1384 	int err;
1385 
1386 	if (lp->cfg.mode == LDC_MODE_RAW)
1387 		return -EINVAL;
1388 
1389 	spin_lock_irqsave(&lp->lock, flags);
1390 
1391 	if (!(lp->flags & LDC_FLAG_ALLOCED_QUEUES) ||
1392 	    !(lp->flags & LDC_FLAG_REGISTERED_QUEUES) ||
1393 	    lp->hs_state != LDC_HS_OPEN)
1394 		err = ((lp->hs_state > LDC_HS_OPEN) ? 0 : -EINVAL);
1395 	else
1396 		err = start_handshake(lp);
1397 
1398 	spin_unlock_irqrestore(&lp->lock, flags);
1399 
1400 	return err;
1401 }
1402 EXPORT_SYMBOL(ldc_connect);
1403 
1404 int ldc_disconnect(struct ldc_channel *lp)
1405 {
1406 	unsigned long hv_err, flags;
1407 	int err;
1408 
1409 	if (lp->cfg.mode == LDC_MODE_RAW)
1410 		return -EINVAL;
1411 
1412 	if (!(lp->flags & LDC_FLAG_ALLOCED_QUEUES) ||
1413 	    !(lp->flags & LDC_FLAG_REGISTERED_QUEUES))
1414 		return -EINVAL;
1415 
1416 	spin_lock_irqsave(&lp->lock, flags);
1417 
1418 	err = -ENODEV;
1419 	hv_err = sun4v_ldc_tx_qconf(lp->id, 0, 0);
1420 	if (hv_err)
1421 		goto out_err;
1422 
1423 	hv_err = sun4v_ldc_tx_qconf(lp->id, lp->tx_ra, lp->tx_num_entries);
1424 	if (hv_err)
1425 		goto out_err;
1426 
1427 	hv_err = sun4v_ldc_rx_qconf(lp->id, 0, 0);
1428 	if (hv_err)
1429 		goto out_err;
1430 
1431 	hv_err = sun4v_ldc_rx_qconf(lp->id, lp->rx_ra, lp->rx_num_entries);
1432 	if (hv_err)
1433 		goto out_err;
1434 
1435 	ldc_set_state(lp, LDC_STATE_BOUND);
1436 	lp->hs_state = LDC_HS_OPEN;
1437 	lp->flags |= LDC_FLAG_RESET;
1438 
1439 	spin_unlock_irqrestore(&lp->lock, flags);
1440 
1441 	return 0;
1442 
1443 out_err:
1444 	sun4v_ldc_tx_qconf(lp->id, 0, 0);
1445 	sun4v_ldc_rx_qconf(lp->id, 0, 0);
1446 	free_irq(lp->cfg.tx_irq, lp);
1447 	free_irq(lp->cfg.rx_irq, lp);
1448 	lp->flags &= ~(LDC_FLAG_REGISTERED_IRQS |
1449 		       LDC_FLAG_REGISTERED_QUEUES);
1450 	ldc_set_state(lp, LDC_STATE_INIT);
1451 
1452 	spin_unlock_irqrestore(&lp->lock, flags);
1453 
1454 	return err;
1455 }
1456 EXPORT_SYMBOL(ldc_disconnect);
1457 
1458 int ldc_state(struct ldc_channel *lp)
1459 {
1460 	return lp->state;
1461 }
1462 EXPORT_SYMBOL(ldc_state);
1463 
1464 void ldc_set_state(struct ldc_channel *lp, u8 state)
1465 {
1466 	ldcdbg(STATE, "STATE (%s) --> (%s)\n",
1467 	       state_to_str(lp->state),
1468 	       state_to_str(state));
1469 
1470 	lp->state = state;
1471 }
1472 EXPORT_SYMBOL(ldc_set_state);
1473 
1474 int ldc_mode(struct ldc_channel *lp)
1475 {
1476 	return lp->cfg.mode;
1477 }
1478 EXPORT_SYMBOL(ldc_mode);
1479 
1480 int ldc_rx_reset(struct ldc_channel *lp)
1481 {
1482 	return __set_rx_head(lp, lp->rx_tail);
1483 }
1484 EXPORT_SYMBOL(ldc_rx_reset);
1485 
1486 void __ldc_print(struct ldc_channel *lp, const char *caller)
1487 {
1488 	pr_info("%s: id=0x%lx flags=0x%x state=%s cstate=0x%lx hsstate=0x%x\n"
1489 		"\trx_h=0x%lx rx_t=0x%lx rx_n=%ld\n"
1490 		"\ttx_h=0x%lx tx_t=0x%lx tx_n=%ld\n"
1491 		"\trcv_nxt=%u snd_nxt=%u\n",
1492 		caller, lp->id, lp->flags, state_to_str(lp->state),
1493 		lp->chan_state, lp->hs_state,
1494 		lp->rx_head, lp->rx_tail, lp->rx_num_entries,
1495 		lp->tx_head, lp->tx_tail, lp->tx_num_entries,
1496 		lp->rcv_nxt, lp->snd_nxt);
1497 }
1498 EXPORT_SYMBOL(__ldc_print);
1499 
1500 static int write_raw(struct ldc_channel *lp, const void *buf, unsigned int size)
1501 {
1502 	struct ldc_packet *p;
1503 	unsigned long new_tail, hv_err;
1504 	int err;
1505 
1506 	hv_err = sun4v_ldc_tx_get_state(lp->id, &lp->tx_head, &lp->tx_tail,
1507 					&lp->chan_state);
1508 	if (unlikely(hv_err))
1509 		return -EBUSY;
1510 
1511 	if (unlikely(lp->chan_state != LDC_CHANNEL_UP))
1512 		return LDC_ABORT(lp);
1513 
1514 	if (size > LDC_PACKET_SIZE)
1515 		return -EMSGSIZE;
1516 
1517 	p = data_get_tx_packet(lp, &new_tail);
1518 	if (!p)
1519 		return -EAGAIN;
1520 
1521 	memcpy(p, buf, size);
1522 
1523 	err = send_tx_packet(lp, p, new_tail);
1524 	if (!err)
1525 		err = size;
1526 
1527 	return err;
1528 }
1529 
1530 static int read_raw(struct ldc_channel *lp, void *buf, unsigned int size)
1531 {
1532 	struct ldc_packet *p;
1533 	unsigned long hv_err, new;
1534 	int err;
1535 
1536 	if (size < LDC_PACKET_SIZE)
1537 		return -EINVAL;
1538 
1539 	hv_err = sun4v_ldc_rx_get_state(lp->id,
1540 					&lp->rx_head,
1541 					&lp->rx_tail,
1542 					&lp->chan_state);
1543 	if (hv_err)
1544 		return LDC_ABORT(lp);
1545 
1546 	if (lp->chan_state == LDC_CHANNEL_DOWN ||
1547 	    lp->chan_state == LDC_CHANNEL_RESETTING)
1548 		return -ECONNRESET;
1549 
1550 	if (lp->rx_head == lp->rx_tail)
1551 		return 0;
1552 
1553 	p = lp->rx_base + (lp->rx_head / LDC_PACKET_SIZE);
1554 	memcpy(buf, p, LDC_PACKET_SIZE);
1555 
1556 	new = rx_advance(lp, lp->rx_head);
1557 	lp->rx_head = new;
1558 
1559 	err = __set_rx_head(lp, new);
1560 	if (err < 0)
1561 		err = -ECONNRESET;
1562 	else
1563 		err = LDC_PACKET_SIZE;
1564 
1565 	return err;
1566 }
1567 
1568 static const struct ldc_mode_ops raw_ops = {
1569 	.write		=	write_raw,
1570 	.read		=	read_raw,
1571 };
1572 
1573 static int write_nonraw(struct ldc_channel *lp, const void *buf,
1574 			unsigned int size)
1575 {
1576 	unsigned long hv_err, tail;
1577 	unsigned int copied;
1578 	u32 seq;
1579 	int err;
1580 
1581 	hv_err = sun4v_ldc_tx_get_state(lp->id, &lp->tx_head, &lp->tx_tail,
1582 					&lp->chan_state);
1583 	if (unlikely(hv_err))
1584 		return -EBUSY;
1585 
1586 	if (unlikely(lp->chan_state != LDC_CHANNEL_UP))
1587 		return LDC_ABORT(lp);
1588 
1589 	if (!tx_has_space_for(lp, size))
1590 		return -EAGAIN;
1591 
1592 	seq = lp->snd_nxt;
1593 	copied = 0;
1594 	tail = lp->tx_tail;
1595 	while (copied < size) {
1596 		struct ldc_packet *p = lp->tx_base + (tail / LDC_PACKET_SIZE);
1597 		u8 *data = ((lp->cfg.mode == LDC_MODE_UNRELIABLE) ?
1598 			    p->u.u_data :
1599 			    p->u.r.r_data);
1600 		int data_len;
1601 
1602 		p->type = LDC_DATA;
1603 		p->stype = LDC_INFO;
1604 		p->ctrl = 0;
1605 
1606 		data_len = size - copied;
1607 		if (data_len > lp->mss)
1608 			data_len = lp->mss;
1609 
1610 		BUG_ON(data_len > LDC_LEN);
1611 
1612 		p->env = (data_len |
1613 			  (copied == 0 ? LDC_START : 0) |
1614 			  (data_len == size - copied ? LDC_STOP : 0));
1615 
1616 		p->seqid = ++seq;
1617 
1618 		ldcdbg(DATA, "SENT DATA [%02x:%02x:%02x:%02x:%08x]\n",
1619 		       p->type,
1620 		       p->stype,
1621 		       p->ctrl,
1622 		       p->env,
1623 		       p->seqid);
1624 
1625 		memcpy(data, buf, data_len);
1626 		buf += data_len;
1627 		copied += data_len;
1628 
1629 		tail = tx_advance(lp, tail);
1630 	}
1631 
1632 	err = set_tx_tail(lp, tail);
1633 	if (!err) {
1634 		lp->snd_nxt = seq;
1635 		err = size;
1636 	}
1637 
1638 	return err;
1639 }
1640 
1641 static int rx_bad_seq(struct ldc_channel *lp, struct ldc_packet *p,
1642 		      struct ldc_packet *first_frag)
1643 {
1644 	int err;
1645 
1646 	if (first_frag)
1647 		lp->rcv_nxt = first_frag->seqid - 1;
1648 
1649 	err = send_data_nack(lp, p);
1650 	if (err)
1651 		return err;
1652 
1653 	err = ldc_rx_reset(lp);
1654 	if (err < 0)
1655 		return LDC_ABORT(lp);
1656 
1657 	return 0;
1658 }
1659 
1660 static int data_ack_nack(struct ldc_channel *lp, struct ldc_packet *p)
1661 {
1662 	if (p->stype & LDC_ACK) {
1663 		int err = process_data_ack(lp, p);
1664 		if (err)
1665 			return err;
1666 	}
1667 	if (p->stype & LDC_NACK)
1668 		return LDC_ABORT(lp);
1669 
1670 	return 0;
1671 }
1672 
1673 static int rx_data_wait(struct ldc_channel *lp, unsigned long cur_head)
1674 {
1675 	unsigned long dummy;
1676 	int limit = 1000;
1677 
1678 	ldcdbg(DATA, "DATA WAIT cur_head[%lx] rx_head[%lx] rx_tail[%lx]\n",
1679 	       cur_head, lp->rx_head, lp->rx_tail);
1680 	while (limit-- > 0) {
1681 		unsigned long hv_err;
1682 
1683 		hv_err = sun4v_ldc_rx_get_state(lp->id,
1684 						&dummy,
1685 						&lp->rx_tail,
1686 						&lp->chan_state);
1687 		if (hv_err)
1688 			return LDC_ABORT(lp);
1689 
1690 		if (lp->chan_state == LDC_CHANNEL_DOWN ||
1691 		    lp->chan_state == LDC_CHANNEL_RESETTING)
1692 			return -ECONNRESET;
1693 
1694 		if (cur_head != lp->rx_tail) {
1695 			ldcdbg(DATA, "DATA WAIT DONE "
1696 			       "head[%lx] tail[%lx] chan_state[%lx]\n",
1697 			       dummy, lp->rx_tail, lp->chan_state);
1698 			return 0;
1699 		}
1700 
1701 		udelay(1);
1702 	}
1703 	return -EAGAIN;
1704 }
1705 
1706 static int rx_set_head(struct ldc_channel *lp, unsigned long head)
1707 {
1708 	int err = __set_rx_head(lp, head);
1709 
1710 	if (err < 0)
1711 		return LDC_ABORT(lp);
1712 
1713 	lp->rx_head = head;
1714 	return 0;
1715 }
1716 
1717 static void send_data_ack(struct ldc_channel *lp)
1718 {
1719 	unsigned long new_tail;
1720 	struct ldc_packet *p;
1721 
1722 	p = data_get_tx_packet(lp, &new_tail);
1723 	if (likely(p)) {
1724 		int err;
1725 
1726 		memset(p, 0, sizeof(*p));
1727 		p->type = LDC_DATA;
1728 		p->stype = LDC_ACK;
1729 		p->ctrl = 0;
1730 		p->seqid = lp->snd_nxt + 1;
1731 		p->u.r.ackid = lp->rcv_nxt;
1732 
1733 		err = send_tx_packet(lp, p, new_tail);
1734 		if (!err)
1735 			lp->snd_nxt++;
1736 	}
1737 }
1738 
1739 static int read_nonraw(struct ldc_channel *lp, void *buf, unsigned int size)
1740 {
1741 	struct ldc_packet *first_frag;
1742 	unsigned long hv_err, new;
1743 	int err, copied;
1744 
1745 	hv_err = sun4v_ldc_rx_get_state(lp->id,
1746 					&lp->rx_head,
1747 					&lp->rx_tail,
1748 					&lp->chan_state);
1749 	if (hv_err)
1750 		return LDC_ABORT(lp);
1751 
1752 	if (lp->chan_state == LDC_CHANNEL_DOWN ||
1753 	    lp->chan_state == LDC_CHANNEL_RESETTING)
1754 		return -ECONNRESET;
1755 
1756 	if (lp->rx_head == lp->rx_tail)
1757 		return 0;
1758 
1759 	first_frag = NULL;
1760 	copied = err = 0;
1761 	new = lp->rx_head;
1762 	while (1) {
1763 		struct ldc_packet *p;
1764 		int pkt_len;
1765 
1766 		BUG_ON(new == lp->rx_tail);
1767 		p = lp->rx_base + (new / LDC_PACKET_SIZE);
1768 
1769 		ldcdbg(RX, "RX read pkt[%02x:%02x:%02x:%02x:%08x:%08x] "
1770 		       "rcv_nxt[%08x]\n",
1771 		       p->type,
1772 		       p->stype,
1773 		       p->ctrl,
1774 		       p->env,
1775 		       p->seqid,
1776 		       p->u.r.ackid,
1777 		       lp->rcv_nxt);
1778 
1779 		if (unlikely(!rx_seq_ok(lp, p->seqid))) {
1780 			err = rx_bad_seq(lp, p, first_frag);
1781 			copied = 0;
1782 			break;
1783 		}
1784 
1785 		if (p->type & LDC_CTRL) {
1786 			err = process_control_frame(lp, p);
1787 			if (err < 0)
1788 				break;
1789 			err = 0;
1790 		}
1791 
1792 		lp->rcv_nxt = p->seqid;
1793 
1794 		/*
1795 		 * If this is a control-only packet, there is nothing
1796 		 * else to do but advance the rx queue since the packet
1797 		 * was already processed above.
1798 		 */
1799 		if (!(p->type & LDC_DATA)) {
1800 			new = rx_advance(lp, new);
1801 			break;
1802 		}
1803 		if (p->stype & (LDC_ACK | LDC_NACK)) {
1804 			err = data_ack_nack(lp, p);
1805 			if (err)
1806 				break;
1807 		}
1808 		if (!(p->stype & LDC_INFO)) {
1809 			new = rx_advance(lp, new);
1810 			err = rx_set_head(lp, new);
1811 			if (err)
1812 				break;
1813 			goto no_data;
1814 		}
1815 
1816 		pkt_len = p->env & LDC_LEN;
1817 
1818 		/* Every initial packet starts with the START bit set.
1819 		 *
1820 		 * Singleton packets will have both START+STOP set.
1821 		 *
1822 		 * Fragments will have START set in the first frame, STOP
1823 		 * set in the last frame, and neither bit set in middle
1824 		 * frames of the packet.
1825 		 *
1826 		 * Therefore if we are at the beginning of a packet and
1827 		 * we don't see START, or we are in the middle of a fragmented
1828 		 * packet and do see START, we are unsynchronized and should
1829 		 * flush the RX queue.
1830 		 */
1831 		if ((first_frag == NULL && !(p->env & LDC_START)) ||
1832 		    (first_frag != NULL &&  (p->env & LDC_START))) {
1833 			if (!first_frag)
1834 				new = rx_advance(lp, new);
1835 
1836 			err = rx_set_head(lp, new);
1837 			if (err)
1838 				break;
1839 
1840 			if (!first_frag)
1841 				goto no_data;
1842 		}
1843 		if (!first_frag)
1844 			first_frag = p;
1845 
1846 		if (pkt_len > size - copied) {
1847 			/* User didn't give us a big enough buffer,
1848 			 * what to do?  This is a pretty serious error.
1849 			 *
1850 			 * Since we haven't updated the RX ring head to
1851 			 * consume any of the packets, signal the error
1852 			 * to the user and just leave the RX ring alone.
1853 			 *
1854 			 * This seems the best behavior because this allows
1855 			 * a user of the LDC layer to start with a small
1856 			 * RX buffer for ldc_read() calls and use -EMSGSIZE
1857 			 * as a cue to enlarge it's read buffer.
1858 			 */
1859 			err = -EMSGSIZE;
1860 			break;
1861 		}
1862 
1863 		/* Ok, we are gonna eat this one.  */
1864 		new = rx_advance(lp, new);
1865 
1866 		memcpy(buf,
1867 		       (lp->cfg.mode == LDC_MODE_UNRELIABLE ?
1868 			p->u.u_data : p->u.r.r_data), pkt_len);
1869 		buf += pkt_len;
1870 		copied += pkt_len;
1871 
1872 		if (p->env & LDC_STOP)
1873 			break;
1874 
1875 no_data:
1876 		if (new == lp->rx_tail) {
1877 			err = rx_data_wait(lp, new);
1878 			if (err)
1879 				break;
1880 		}
1881 	}
1882 
1883 	if (!err)
1884 		err = rx_set_head(lp, new);
1885 
1886 	if (err && first_frag)
1887 		lp->rcv_nxt = first_frag->seqid - 1;
1888 
1889 	if (!err) {
1890 		err = copied;
1891 		if (err > 0 && lp->cfg.mode != LDC_MODE_UNRELIABLE)
1892 			send_data_ack(lp);
1893 	}
1894 
1895 	return err;
1896 }
1897 
1898 static const struct ldc_mode_ops nonraw_ops = {
1899 	.write		=	write_nonraw,
1900 	.read		=	read_nonraw,
1901 };
1902 
1903 static int write_stream(struct ldc_channel *lp, const void *buf,
1904 			unsigned int size)
1905 {
1906 	if (size > lp->cfg.mtu)
1907 		size = lp->cfg.mtu;
1908 	return write_nonraw(lp, buf, size);
1909 }
1910 
1911 static int read_stream(struct ldc_channel *lp, void *buf, unsigned int size)
1912 {
1913 	if (!lp->mssbuf_len) {
1914 		int err = read_nonraw(lp, lp->mssbuf, lp->cfg.mtu);
1915 		if (err < 0)
1916 			return err;
1917 
1918 		lp->mssbuf_len = err;
1919 		lp->mssbuf_off = 0;
1920 	}
1921 
1922 	if (size > lp->mssbuf_len)
1923 		size = lp->mssbuf_len;
1924 	memcpy(buf, lp->mssbuf + lp->mssbuf_off, size);
1925 
1926 	lp->mssbuf_off += size;
1927 	lp->mssbuf_len -= size;
1928 
1929 	return size;
1930 }
1931 
1932 static const struct ldc_mode_ops stream_ops = {
1933 	.write		=	write_stream,
1934 	.read		=	read_stream,
1935 };
1936 
1937 int ldc_write(struct ldc_channel *lp, const void *buf, unsigned int size)
1938 {
1939 	unsigned long flags;
1940 	int err;
1941 
1942 	if (!buf)
1943 		return -EINVAL;
1944 
1945 	if (!size)
1946 		return 0;
1947 
1948 	spin_lock_irqsave(&lp->lock, flags);
1949 
1950 	if (lp->hs_state != LDC_HS_COMPLETE)
1951 		err = -ENOTCONN;
1952 	else
1953 		err = lp->mops->write(lp, buf, size);
1954 
1955 	spin_unlock_irqrestore(&lp->lock, flags);
1956 
1957 	return err;
1958 }
1959 EXPORT_SYMBOL(ldc_write);
1960 
1961 int ldc_read(struct ldc_channel *lp, void *buf, unsigned int size)
1962 {
1963 	unsigned long flags;
1964 	int err;
1965 
1966 	ldcdbg(RX, "%s: entered size=%d\n", __func__, size);
1967 
1968 	if (!buf)
1969 		return -EINVAL;
1970 
1971 	if (!size)
1972 		return 0;
1973 
1974 	spin_lock_irqsave(&lp->lock, flags);
1975 
1976 	if (lp->hs_state != LDC_HS_COMPLETE)
1977 		err = -ENOTCONN;
1978 	else
1979 		err = lp->mops->read(lp, buf, size);
1980 
1981 	spin_unlock_irqrestore(&lp->lock, flags);
1982 
1983 	ldcdbg(RX, "%s: mode=%d, head=%lu, tail=%lu rv=%d\n", __func__,
1984 	       lp->cfg.mode, lp->rx_head, lp->rx_tail, err);
1985 
1986 	return err;
1987 }
1988 EXPORT_SYMBOL(ldc_read);
1989 
1990 static u64 pagesize_code(void)
1991 {
1992 	switch (PAGE_SIZE) {
1993 	default:
1994 	case (8ULL * 1024ULL):
1995 		return 0;
1996 	case (64ULL * 1024ULL):
1997 		return 1;
1998 	case (512ULL * 1024ULL):
1999 		return 2;
2000 	case (4ULL * 1024ULL * 1024ULL):
2001 		return 3;
2002 	case (32ULL * 1024ULL * 1024ULL):
2003 		return 4;
2004 	case (256ULL * 1024ULL * 1024ULL):
2005 		return 5;
2006 	}
2007 }
2008 
2009 static u64 make_cookie(u64 index, u64 pgsz_code, u64 page_offset)
2010 {
2011 	return ((pgsz_code << COOKIE_PGSZ_CODE_SHIFT) |
2012 		(index << PAGE_SHIFT) |
2013 		page_offset);
2014 }
2015 
2016 
2017 static struct ldc_mtable_entry *alloc_npages(struct ldc_iommu *iommu,
2018 					     unsigned long npages)
2019 {
2020 	long entry;
2021 
2022 	entry = iommu_tbl_range_alloc(NULL, &iommu->iommu_map_table,
2023 				      npages, NULL, (unsigned long)-1, 0);
2024 	if (unlikely(entry == IOMMU_ERROR_CODE))
2025 		return NULL;
2026 
2027 	return iommu->page_table + entry;
2028 }
2029 
2030 static u64 perm_to_mte(unsigned int map_perm)
2031 {
2032 	u64 mte_base;
2033 
2034 	mte_base = pagesize_code();
2035 
2036 	if (map_perm & LDC_MAP_SHADOW) {
2037 		if (map_perm & LDC_MAP_R)
2038 			mte_base |= LDC_MTE_COPY_R;
2039 		if (map_perm & LDC_MAP_W)
2040 			mte_base |= LDC_MTE_COPY_W;
2041 	}
2042 	if (map_perm & LDC_MAP_DIRECT) {
2043 		if (map_perm & LDC_MAP_R)
2044 			mte_base |= LDC_MTE_READ;
2045 		if (map_perm & LDC_MAP_W)
2046 			mte_base |= LDC_MTE_WRITE;
2047 		if (map_perm & LDC_MAP_X)
2048 			mte_base |= LDC_MTE_EXEC;
2049 	}
2050 	if (map_perm & LDC_MAP_IO) {
2051 		if (map_perm & LDC_MAP_R)
2052 			mte_base |= LDC_MTE_IOMMU_R;
2053 		if (map_perm & LDC_MAP_W)
2054 			mte_base |= LDC_MTE_IOMMU_W;
2055 	}
2056 
2057 	return mte_base;
2058 }
2059 
2060 static int pages_in_region(unsigned long base, long len)
2061 {
2062 	int count = 0;
2063 
2064 	do {
2065 		unsigned long new = (base + PAGE_SIZE) & PAGE_MASK;
2066 
2067 		len -= (new - base);
2068 		base = new;
2069 		count++;
2070 	} while (len > 0);
2071 
2072 	return count;
2073 }
2074 
2075 struct cookie_state {
2076 	struct ldc_mtable_entry		*page_table;
2077 	struct ldc_trans_cookie		*cookies;
2078 	u64				mte_base;
2079 	u64				prev_cookie;
2080 	u32				pte_idx;
2081 	u32				nc;
2082 };
2083 
2084 static void fill_cookies(struct cookie_state *sp, unsigned long pa,
2085 			 unsigned long off, unsigned long len)
2086 {
2087 	do {
2088 		unsigned long tlen, new = pa + PAGE_SIZE;
2089 		u64 this_cookie;
2090 
2091 		sp->page_table[sp->pte_idx].mte = sp->mte_base | pa;
2092 
2093 		tlen = PAGE_SIZE;
2094 		if (off)
2095 			tlen = PAGE_SIZE - off;
2096 		if (tlen > len)
2097 			tlen = len;
2098 
2099 		this_cookie = make_cookie(sp->pte_idx,
2100 					  pagesize_code(), off);
2101 
2102 		off = 0;
2103 
2104 		if (this_cookie == sp->prev_cookie) {
2105 			sp->cookies[sp->nc - 1].cookie_size += tlen;
2106 		} else {
2107 			sp->cookies[sp->nc].cookie_addr = this_cookie;
2108 			sp->cookies[sp->nc].cookie_size = tlen;
2109 			sp->nc++;
2110 		}
2111 		sp->prev_cookie = this_cookie + tlen;
2112 
2113 		sp->pte_idx++;
2114 
2115 		len -= tlen;
2116 		pa = new;
2117 	} while (len > 0);
2118 }
2119 
2120 static int sg_count_one(struct scatterlist *sg)
2121 {
2122 	unsigned long base = page_to_pfn(sg_page(sg)) << PAGE_SHIFT;
2123 	long len = sg->length;
2124 
2125 	if ((sg->offset | len) & (8UL - 1))
2126 		return -EFAULT;
2127 
2128 	return pages_in_region(base + sg->offset, len);
2129 }
2130 
2131 static int sg_count_pages(struct scatterlist *sg, int num_sg)
2132 {
2133 	int count;
2134 	int i;
2135 
2136 	count = 0;
2137 	for (i = 0; i < num_sg; i++) {
2138 		int err = sg_count_one(sg + i);
2139 		if (err < 0)
2140 			return err;
2141 		count += err;
2142 	}
2143 
2144 	return count;
2145 }
2146 
2147 int ldc_map_sg(struct ldc_channel *lp,
2148 	       struct scatterlist *sg, int num_sg,
2149 	       struct ldc_trans_cookie *cookies, int ncookies,
2150 	       unsigned int map_perm)
2151 {
2152 	unsigned long i, npages;
2153 	struct ldc_mtable_entry *base;
2154 	struct cookie_state state;
2155 	struct ldc_iommu *iommu;
2156 	int err;
2157 	struct scatterlist *s;
2158 
2159 	if (map_perm & ~LDC_MAP_ALL)
2160 		return -EINVAL;
2161 
2162 	err = sg_count_pages(sg, num_sg);
2163 	if (err < 0)
2164 		return err;
2165 
2166 	npages = err;
2167 	if (err > ncookies)
2168 		return -EMSGSIZE;
2169 
2170 	iommu = &lp->iommu;
2171 
2172 	base = alloc_npages(iommu, npages);
2173 
2174 	if (!base)
2175 		return -ENOMEM;
2176 
2177 	state.page_table = iommu->page_table;
2178 	state.cookies = cookies;
2179 	state.mte_base = perm_to_mte(map_perm);
2180 	state.prev_cookie = ~(u64)0;
2181 	state.pte_idx = (base - iommu->page_table);
2182 	state.nc = 0;
2183 
2184 	for_each_sg(sg, s, num_sg, i) {
2185 		fill_cookies(&state, page_to_pfn(sg_page(s)) << PAGE_SHIFT,
2186 			     s->offset, s->length);
2187 	}
2188 
2189 	return state.nc;
2190 }
2191 EXPORT_SYMBOL(ldc_map_sg);
2192 
2193 int ldc_map_single(struct ldc_channel *lp,
2194 		   void *buf, unsigned int len,
2195 		   struct ldc_trans_cookie *cookies, int ncookies,
2196 		   unsigned int map_perm)
2197 {
2198 	unsigned long npages, pa;
2199 	struct ldc_mtable_entry *base;
2200 	struct cookie_state state;
2201 	struct ldc_iommu *iommu;
2202 
2203 	if ((map_perm & ~LDC_MAP_ALL) || (ncookies < 1))
2204 		return -EINVAL;
2205 
2206 	pa = __pa(buf);
2207 	if ((pa | len) & (8UL - 1))
2208 		return -EFAULT;
2209 
2210 	npages = pages_in_region(pa, len);
2211 
2212 	iommu = &lp->iommu;
2213 
2214 	base = alloc_npages(iommu, npages);
2215 
2216 	if (!base)
2217 		return -ENOMEM;
2218 
2219 	state.page_table = iommu->page_table;
2220 	state.cookies = cookies;
2221 	state.mte_base = perm_to_mte(map_perm);
2222 	state.prev_cookie = ~(u64)0;
2223 	state.pte_idx = (base - iommu->page_table);
2224 	state.nc = 0;
2225 	fill_cookies(&state, (pa & PAGE_MASK), (pa & ~PAGE_MASK), len);
2226 	BUG_ON(state.nc > ncookies);
2227 
2228 	return state.nc;
2229 }
2230 EXPORT_SYMBOL(ldc_map_single);
2231 
2232 
2233 static void free_npages(unsigned long id, struct ldc_iommu *iommu,
2234 			u64 cookie, u64 size)
2235 {
2236 	unsigned long npages, entry;
2237 
2238 	npages = PAGE_ALIGN(((cookie & ~PAGE_MASK) + size)) >> PAGE_SHIFT;
2239 
2240 	entry = ldc_cookie_to_index(cookie, iommu);
2241 	ldc_demap(iommu, id, cookie, entry, npages);
2242 	iommu_tbl_range_free(&iommu->iommu_map_table, cookie, npages, entry);
2243 }
2244 
2245 void ldc_unmap(struct ldc_channel *lp, struct ldc_trans_cookie *cookies,
2246 	       int ncookies)
2247 {
2248 	struct ldc_iommu *iommu = &lp->iommu;
2249 	int i;
2250 	unsigned long flags;
2251 
2252 	spin_lock_irqsave(&iommu->lock, flags);
2253 	for (i = 0; i < ncookies; i++) {
2254 		u64 addr = cookies[i].cookie_addr;
2255 		u64 size = cookies[i].cookie_size;
2256 
2257 		free_npages(lp->id, iommu, addr, size);
2258 	}
2259 	spin_unlock_irqrestore(&iommu->lock, flags);
2260 }
2261 EXPORT_SYMBOL(ldc_unmap);
2262 
2263 int ldc_copy(struct ldc_channel *lp, int copy_dir,
2264 	     void *buf, unsigned int len, unsigned long offset,
2265 	     struct ldc_trans_cookie *cookies, int ncookies)
2266 {
2267 	unsigned int orig_len;
2268 	unsigned long ra;
2269 	int i;
2270 
2271 	if (copy_dir != LDC_COPY_IN && copy_dir != LDC_COPY_OUT) {
2272 		printk(KERN_ERR PFX "ldc_copy: ID[%lu] Bad copy_dir[%d]\n",
2273 		       lp->id, copy_dir);
2274 		return -EINVAL;
2275 	}
2276 
2277 	ra = __pa(buf);
2278 	if ((ra | len | offset) & (8UL - 1)) {
2279 		printk(KERN_ERR PFX "ldc_copy: ID[%lu] Unaligned buffer "
2280 		       "ra[%lx] len[%x] offset[%lx]\n",
2281 		       lp->id, ra, len, offset);
2282 		return -EFAULT;
2283 	}
2284 
2285 	if (lp->hs_state != LDC_HS_COMPLETE ||
2286 	    (lp->flags & LDC_FLAG_RESET)) {
2287 		printk(KERN_ERR PFX "ldc_copy: ID[%lu] Link down hs_state[%x] "
2288 		       "flags[%x]\n", lp->id, lp->hs_state, lp->flags);
2289 		return -ECONNRESET;
2290 	}
2291 
2292 	orig_len = len;
2293 	for (i = 0; i < ncookies; i++) {
2294 		unsigned long cookie_raddr = cookies[i].cookie_addr;
2295 		unsigned long this_len = cookies[i].cookie_size;
2296 		unsigned long actual_len;
2297 
2298 		if (unlikely(offset)) {
2299 			unsigned long this_off = offset;
2300 
2301 			if (this_off > this_len)
2302 				this_off = this_len;
2303 
2304 			offset -= this_off;
2305 			this_len -= this_off;
2306 			if (!this_len)
2307 				continue;
2308 			cookie_raddr += this_off;
2309 		}
2310 
2311 		if (this_len > len)
2312 			this_len = len;
2313 
2314 		while (1) {
2315 			unsigned long hv_err;
2316 
2317 			hv_err = sun4v_ldc_copy(lp->id, copy_dir,
2318 						cookie_raddr, ra,
2319 						this_len, &actual_len);
2320 			if (unlikely(hv_err)) {
2321 				printk(KERN_ERR PFX "ldc_copy: ID[%lu] "
2322 				       "HV error %lu\n",
2323 				       lp->id, hv_err);
2324 				if (lp->hs_state != LDC_HS_COMPLETE ||
2325 				    (lp->flags & LDC_FLAG_RESET))
2326 					return -ECONNRESET;
2327 				else
2328 					return -EFAULT;
2329 			}
2330 
2331 			cookie_raddr += actual_len;
2332 			ra += actual_len;
2333 			len -= actual_len;
2334 			if (actual_len == this_len)
2335 				break;
2336 
2337 			this_len -= actual_len;
2338 		}
2339 
2340 		if (!len)
2341 			break;
2342 	}
2343 
2344 	/* It is caller policy what to do about short copies.
2345 	 * For example, a networking driver can declare the
2346 	 * packet a runt and drop it.
2347 	 */
2348 
2349 	return orig_len - len;
2350 }
2351 EXPORT_SYMBOL(ldc_copy);
2352 
2353 void *ldc_alloc_exp_dring(struct ldc_channel *lp, unsigned int len,
2354 			  struct ldc_trans_cookie *cookies, int *ncookies,
2355 			  unsigned int map_perm)
2356 {
2357 	void *buf;
2358 	int err;
2359 
2360 	if (len & (8UL - 1))
2361 		return ERR_PTR(-EINVAL);
2362 
2363 	buf = kzalloc(len, GFP_ATOMIC);
2364 	if (!buf)
2365 		return ERR_PTR(-ENOMEM);
2366 
2367 	err = ldc_map_single(lp, buf, len, cookies, *ncookies, map_perm);
2368 	if (err < 0) {
2369 		kfree(buf);
2370 		return ERR_PTR(err);
2371 	}
2372 	*ncookies = err;
2373 
2374 	return buf;
2375 }
2376 EXPORT_SYMBOL(ldc_alloc_exp_dring);
2377 
/* Counterpart of ldc_alloc_exp_dring(): unmap the cookies, then free
 * the buffer.  @len is accepted but not used.
 */
void ldc_free_exp_dring(struct ldc_channel *lp, void *buf, unsigned int len,
			struct ldc_trans_cookie *cookies, int ncookies)
{
	/* Unmap first so the mapping never outlives the memory. */
	ldc_unmap(lp, cookies, ncookies);

	kfree(buf);
}
EXPORT_SYMBOL(ldc_free_exp_dring);
2385 
2386 static int __init ldc_init(void)
2387 {
2388 	unsigned long major, minor;
2389 	struct mdesc_handle *hp;
2390 	const u64 *v;
2391 	int err;
2392 	u64 mp;
2393 
2394 	hp = mdesc_grab();
2395 	if (!hp)
2396 		return -ENODEV;
2397 
2398 	mp = mdesc_node_by_name(hp, MDESC_NODE_NULL, "platform");
2399 	err = -ENODEV;
2400 	if (mp == MDESC_NODE_NULL)
2401 		goto out;
2402 
2403 	v = mdesc_get_property(hp, mp, "domaining-enabled", NULL);
2404 	if (!v)
2405 		goto out;
2406 
2407 	major = 1;
2408 	minor = 0;
2409 	if (sun4v_hvapi_register(HV_GRP_LDOM, major, &minor)) {
2410 		printk(KERN_INFO PFX "Could not register LDOM hvapi.\n");
2411 		goto out;
2412 	}
2413 
2414 	printk(KERN_INFO "%s", version);
2415 
2416 	if (!*v) {
2417 		printk(KERN_INFO PFX "Domaining disabled.\n");
2418 		goto out;
2419 	}
2420 	ldom_domaining_enabled = 1;
2421 	err = 0;
2422 
2423 out:
2424 	mdesc_release(hp);
2425 	return err;
2426 }
2427 
2428 core_initcall(ldc_init);
2429