/*
 *  Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 *  AF_SMC protocol family socket handler keeping the AF_INET sock address type
 *  applies to SOCK_STREAM sockets only
 *  offers an alternative communication option for TCP-protocol sockets
 *  applicable with RoCE-cards only
 *
 *  Initial restrictions:
 *    - non-blocking connect postponed
 *    - IPv6 support postponed
 *    - support for alternate links postponed
 *    - partial support for non-blocking sockets only
 *    - support for urgent data postponed
 *
 *  Copyright IBM Corp. 2016
 *
 *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 *              based on prototype from Frank Blaschka
 */
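
/* Illustrative userspace usage (not part of this file): an AF_SMC socket
 * is used like a TCP socket. smc_create() below accepts SOCK_STREAM with
 * IPPROTO_IP or IPPROTO_TCP only, and smc_connect() requires an AF_INET
 * peer address; on any SMC-R setup failure the connection transparently
 * falls back to plain TCP:
 *
 *	sd = socket(AF_SMC, SOCK_STREAM, IPPROTO_TCP);
 *	rc = connect(sd, (struct sockaddr *)&sin, sizeof(sin));
 */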

#define KMSG_COMPONENT "smc"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/module.h>
#include <linux/socket.h>
#include <linux/inetdevice.h>
#include <linux/workqueue.h>
#include <linux/in.h>
#include <linux/sched/signal.h>

#include <net/sock.h>
#include <net/tcp.h>
#include <net/smc.h>

#include "smc.h"
#include "smc_clc.h"
#include "smc_llc.h"
#include "smc_cdc.h"
#include "smc_core.h"
#include "smc_ib.h"
#include "smc_pnet.h"
#include "smc_tx.h"
#include "smc_rx.h"
#include "smc_close.h"

static DEFINE_MUTEX(smc_create_lgr_pending);	/* serialize link group
						 * creation
						 */

struct smc_lgr_list smc_lgr_list = {		/* established link groups */
	.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
	.list = LIST_HEAD_INIT(smc_lgr_list.list),
};

static void smc_tcp_listen_work(struct work_struct *);

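/* .keepalive hook of smc_proto: forward SO_KEEPALIVE to the internal
 * CLC/TCP socket, which carries the actual TCP connection
 */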
static void smc_set_keepalive(struct sock *sk, int val)
{
	struct smc_sock *smc = smc_sk(sk);

	smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
}

static struct smc_hashinfo smc_v4_hashinfo = {
	.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
};

int smc_hash_sk(struct sock *sk)
{
	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
	struct hlist_head *head;

	head = &h->ht;

	write_lock_bh(&h->lock);
	sk_add_node(sk, head);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	write_unlock_bh(&h->lock);

	return 0;
}
EXPORT_SYMBOL_GPL(smc_hash_sk);

void smc_unhash_sk(struct sock *sk)
{
	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;

	write_lock_bh(&h->lock);
	if (sk_del_node_init(sk))
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	write_unlock_bh(&h->lock);
}
EXPORT_SYMBOL_GPL(smc_unhash_sk);

struct proto smc_proto = {
	.name		= "SMC",
	.owner		= THIS_MODULE,
	.keepalive	= smc_set_keepalive,
	.hash		= smc_hash_sk,
	.unhash		= smc_unhash_sk,
	.obj_size	= sizeof(struct smc_sock),
	.h.smc_hash	= &smc_v4_hashinfo,
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto);

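/* release an AF_SMC socket: terminate the SMC connection actively (or
 * just mark the sock closed when in TCP fallback mode), release the
 * internal CLC socket and detach the sock from its socket
 */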
static int smc_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = 0;

	if (!sk)
		goto out;

	smc = smc_sk(sk);
	sock_hold(sk);
	if (sk->sk_state == SMC_LISTEN)
		/* smc_close_non_accepted() is called and acquires
		 * sock lock for child sockets again
		 */
		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
	else
		lock_sock(sk);

	if (smc->use_fallback) {
		sk->sk_state = SMC_CLOSED;
		sk->sk_state_change(sk);
	} else {
		rc = smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	if (smc->clcsock) {
		sock_release(smc->clcsock);
		smc->clcsock = NULL;
	}

	/* detach socket */
	sock_orphan(sk);
	sock->sk = NULL;
	if (smc->use_fallback) {
		schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN);
	} else if (sk->sk_state == SMC_CLOSED) {
		smc_conn_free(&smc->conn);
		schedule_delayed_work(&smc->sock_put_work,
				      SMC_CLOSE_SOCK_PUT_DELAY);
	}
	release_sock(sk);

	sock_put(sk);
out:
	return rc;
}

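/* sk_destruct callback; the refcount debugging counter is only
 * decremented for socks that are closed and already orphaned
 */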
static void smc_destruct(struct sock *sk)
{
	if (sk->sk_state != SMC_CLOSED)
		return;
	if (!sock_flag(sk, SOCK_DEAD))
		return;

	sk_refcnt_debug_dec(sk);
}

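/* allocate a new SMC sock and initialize its state, accept queue and
 * delayed work, then hash it into the SMC hash table
 */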
static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
{
	struct smc_sock *smc;
	struct sock *sk;

	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, &smc_proto, 0);
	if (!sk)
		return NULL;

	sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
	sk->sk_state = SMC_INIT;
	sk->sk_destruct = smc_destruct;
	sk->sk_protocol = SMCPROTO_SMC;
	smc = smc_sk(sk);
	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
	INIT_LIST_HEAD(&smc->accept_q);
	spin_lock_init(&smc->accept_q_lock);
	INIT_DELAYED_WORK(&smc->sock_put_work, smc_close_sock_put_work);
	sk->sk_prot->hash(sk);
	sk_refcnt_debug_inc(sk);

	return sk;
}

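/* bind an AF_SMC socket: validate the address like inet_bind() does,
 * then bind the internal CLC/TCP socket to it
 */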
static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
		    int addr_len)
{
	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);

	/* replicate tests from inet_bind(), to be safe wrt. future changes */
	rc = -EINVAL;
	if (addr_len < sizeof(struct sockaddr_in))
		goto out;

	rc = -EAFNOSUPPORT;
	/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
	if ((addr->sin_family != AF_INET) &&
	    ((addr->sin_family != AF_UNSPEC) ||
	     (addr->sin_addr.s_addr != htonl(INADDR_ANY))))
		goto out;

	lock_sock(sk);

	/* Check if socket is already active */
	rc = -EINVAL;
	if (sk->sk_state != SMC_INIT)
		goto out_rel;

	smc->clcsock->sk->sk_reuse = sk->sk_reuse;
	rc = kernel_bind(smc->clcsock, uaddr, addr_len);

out_rel:
	release_sock(sk);
out:
	return rc;
}

static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
				   unsigned long mask)
{
	/* options for which we don't get control via setsockopt */
	nsk->sk_type = osk->sk_type;
	nsk->sk_sndbuf = osk->sk_sndbuf;
	nsk->sk_rcvbuf = osk->sk_rcvbuf;
	nsk->sk_sndtimeo = osk->sk_sndtimeo;
	nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
	nsk->sk_mark = osk->sk_mark;
	nsk->sk_priority = osk->sk_priority;
	nsk->sk_rcvlowat = osk->sk_rcvlowat;
	nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
	nsk->sk_err = osk->sk_err;

	nsk->sk_flags &= ~mask;
	nsk->sk_flags |= osk->sk_flags & mask;
}

#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_BROADCAST) | \
			     (1UL << SOCK_TIMESTAMP) | \
			     (1UL << SOCK_DBG) | \
			     (1UL << SOCK_RCVTSTAMP) | \
			     (1UL << SOCK_RCVTSTAMPNS) | \
			     (1UL << SOCK_LOCALROUTE) | \
			     (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
			     (1UL << SOCK_RXQ_OVFL) | \
			     (1UL << SOCK_WIFI_STATUS) | \
			     (1UL << SOCK_NOFCS) | \
			     (1UL << SOCK_FILTER_LOCKED))
/* copy only relevant settings and flags of SOL_SOCKET level from smc to
 * clc socket (since smc is not called for these options from net/core)
 */
static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
{
	smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
}

#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_DBG))
/* copy only settings and flags relevant for smc from clc to smc socket */
static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
{
	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
}

/* determine subnet and mask of internal TCP socket */
int smc_netinfo_by_tcpsk(struct socket *clcsock,
			 __be32 *subnet, u8 *prefix_len)
{
	struct dst_entry *dst = sk_dst_get(clcsock->sk);
	struct sockaddr_in addr;
	int rc = -ENOENT;
	int len;

	if (!dst) {
		rc = -ENOTCONN;
		goto out;
	}
	if (!dst->dev) {
		rc = -ENODEV;
		goto out_rel;
	}

	/* get address to which the internal TCP socket is bound */
	kernel_getsockname(clcsock, (struct sockaddr *)&addr, &len);
	/* analyze IPv4 specific data of net_device belonging to TCP socket */
	for_ifa(dst->dev->ip_ptr) {
		if (ifa->ifa_address != addr.sin_addr.s_addr)
			continue;
		*prefix_len = inet_mask_len(ifa->ifa_mask);
		*subnet = ifa->ifa_address & ifa->ifa_mask;
		rc = 0;
		break;
	} endfor_ifa(dst->dev->ip_ptr);

out_rel:
	dst_release(dst);
out:
	return rc;
}

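/* client side of the CONFIRM LINK handshake for the first contact of a
 * link group: wait for the server's CONFIRM LINK request over the RoCE
 * fabric, move the QP to RTS, register the rmb and send the response
 */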
static int smc_clnt_conf_first_link(struct smc_sock *smc, union ib_gid *gid)
{
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];
	/* receive CONFIRM LINK request from server over RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	rc = smc_ib_modify_qp_rts(link);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	smc_wr_remember_qp_attr(link);

	rc = smc_wr_reg_send(link,
			     smc->conn.rmb_desc->mr_rx[SMC_SINGLE_LINK]);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	/* send CONFIRM LINK response over RoCE fabric */
	rc = smc_llc_send_confirm_link(link,
				       link->smcibdev->mac[link->ibport - 1],
				       gid, SMC_LLC_RESP);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	return rc;
}

static void smc_conn_save_peer_info(struct smc_sock *smc,
				    struct smc_clc_msg_accept_confirm *clc)
{
	smc->conn.peer_conn_idx = clc->conn_idx;
	smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
	smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size);
	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
}

static void smc_link_save_peer_info(struct smc_link *link,
				    struct smc_clc_msg_accept_confirm *clc)
{
	link->peer_qpn = ntoh24(clc->qpn);
	memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
	memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
	link->peer_psn = ntoh24(clc->psn);
	link->peer_mtu = clc->qp_mtu;
}

/* setup for RDMA connection of client:
 * exchange CLC PROPOSAL/ACCEPT/CONFIRM messages over the internal TCP
 * socket, create the connection and link group, and confirm the first
 * link; on failure send a CLC DECLINE and fall back to TCP
 */
static int smc_connect_rdma(struct smc_sock *smc)
{
	struct sockaddr_in *inaddr = (struct sockaddr_in *)smc->addr;
	struct smc_clc_msg_accept_confirm aclc;
	int local_contact = SMC_FIRST_CONTACT;
	struct smc_ib_device *smcibdev;
	struct smc_link *link;
	u8 srv_first_contact;
	int reason_code = 0;
	int rc = 0;
	u8 ibport;

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(smc)) {
		reason_code = SMC_CLC_DECL_IPSEC;
		goto decline_rdma;
	}

	/* PNET table lookup: search active ib_device and port
	 * within same PNETID that also contains the ethernet device
	 * used for the internal TCP socket
	 */
	smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport);
	if (!smcibdev) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* do inband token exchange */
	reason_code = smc_clc_send_proposal(smc, smcibdev, ibport);
	if (reason_code < 0) {
		rc = reason_code;
		goto out_err;
	}
	if (reason_code > 0) /* configuration error */
		goto decline_rdma;
	/* receive SMC Accept CLC message */
	reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc),
				       SMC_CLC_ACCEPT);
	if (reason_code < 0) {
		rc = reason_code;
		goto out_err;
	}
	if (reason_code > 0)
		goto decline_rdma;

	srv_first_contact = aclc.hdr.flag;
	mutex_lock(&smc_create_lgr_pending);
	local_contact = smc_conn_create(smc, inaddr->sin_addr.s_addr, smcibdev,
					ibport, &aclc.lcl, srv_first_contact);
	if (local_contact < 0) {
		rc = local_contact;
		if (rc == -ENOMEM)
			reason_code = SMC_CLC_DECL_MEM; /* insufficient memory */
		else if (rc == -ENOLINK)
			reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
		goto decline_rdma_unlock;
	}
	link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	smc_conn_save_peer_info(smc, &aclc);

	/* create send buffer and rmb */
	rc = smc_buf_create(smc);
	if (rc) {
		reason_code = SMC_CLC_DECL_MEM;
		goto decline_rdma_unlock;
	}

	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, &aclc);

	rc = smc_rmb_rtoken_handling(&smc->conn, &aclc);
	if (rc) {
		reason_code = SMC_CLC_DECL_INTERR;
		goto decline_rdma_unlock;
	}

	smc_close_init(smc);
	smc_rx_init(smc);

	if (local_contact == SMC_FIRST_CONTACT) {
		rc = smc_ib_ready_link(link);
		if (rc) {
			reason_code = SMC_CLC_DECL_INTERR;
			goto decline_rdma_unlock;
		}
	} else {
		struct smc_buf_desc *buf_desc = smc->conn.rmb_desc;

		if (!buf_desc->reused) {
			/* register memory region for new rmb */
			rc = smc_wr_reg_send(link,
					     buf_desc->mr_rx[SMC_SINGLE_LINK]);
			if (rc) {
				reason_code = SMC_CLC_DECL_INTERR;
				goto decline_rdma_unlock;
			}
		}
	}
	smc_rmb_sync_sg_for_device(&smc->conn);

	rc = smc_clc_send_confirm(smc);
	if (rc)
		goto out_err_unlock;

	if (local_contact == SMC_FIRST_CONTACT) {
		/* QP confirmation over RoCE fabric */
		reason_code = smc_clnt_conf_first_link(
			smc, &smcibdev->gid[ibport - 1]);
		if (reason_code < 0) {
			rc = reason_code;
			goto out_err_unlock;
		}
		if (reason_code > 0)
			goto decline_rdma_unlock;
	}

	mutex_unlock(&smc_create_lgr_pending);
	smc_tx_init(smc);

out_connected:
	smc_copy_sock_settings_to_clc(smc);
	if (smc->sk.sk_state == SMC_INIT)
		smc->sk.sk_state = SMC_ACTIVE;

	return rc ? rc : local_contact;

decline_rdma_unlock:
	mutex_unlock(&smc_create_lgr_pending);
	smc_conn_free(&smc->conn);
decline_rdma:
	/* RDMA setup failed, switch back to TCP */
	smc->use_fallback = true;
	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
		rc = smc_clc_send_decline(smc, reason_code, 0);
		if (rc < sizeof(struct smc_clc_msg_decline))
			goto out_err;
	}
	goto out_connected;

out_err_unlock:
	mutex_unlock(&smc_create_lgr_pending);
	smc_conn_free(&smc->conn);
out_err:
	return rc;
}

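/* connect an AF_SMC socket: establish the internal CLC/TCP connection
 * first, then try to switch the connection to RDMA mode
 */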
static int smc_connect(struct socket *sock, struct sockaddr *addr,
		       int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EINVAL;

	smc = smc_sk(sk);

	/* separate smc parameter checking to be safe */
	if (alen < sizeof(addr->sa_family))
		goto out_err;
	if (addr->sa_family != AF_INET)
		goto out_err;
	smc->addr = addr;	/* needed for nonblocking connect */

	lock_sock(sk);
	switch (sk->sk_state) {
	default:
		goto out;
	case SMC_ACTIVE:
		rc = -EISCONN;
		goto out;
	case SMC_INIT:
		rc = 0;
		break;
	}

	smc_copy_sock_settings_to_clc(smc);
	rc = kernel_connect(smc->clcsock, addr, alen, flags);
	if (rc)
		goto out;

	/* setup RDMA connection */
	rc = smc_connect_rdma(smc);
	if (rc < 0)
		goto out;
	else
		rc = 0; /* success cases including fallback */

out:
	release_sock(sk);
out_err:
	return rc;
}

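/* accept a connection on the internal CLC/TCP listen socket and wrap it
 * into a freshly allocated SMC sock; returns with the listen sock locked
 */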
static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
{
	struct sock *sk = &lsmc->sk;
	struct socket *new_clcsock;
	struct sock *new_sk;
	int rc;

	release_sock(&lsmc->sk);
	new_sk = smc_sock_alloc(sock_net(sk), NULL);
	if (!new_sk) {
		rc = -ENOMEM;
		lsmc->sk.sk_err = ENOMEM;
		*new_smc = NULL;
		lock_sock(&lsmc->sk);
		goto out;
	}
	*new_smc = smc_sk(new_sk);

	rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
	lock_sock(&lsmc->sk);
	if (rc < 0) {
		lsmc->sk.sk_err = -rc;
		new_sk->sk_state = SMC_CLOSED;
		sock_set_flag(new_sk, SOCK_DEAD);
		sk->sk_prot->unhash(new_sk);
		sock_put(new_sk);
		*new_smc = NULL;
		goto out;
	}
	if (lsmc->sk.sk_state == SMC_CLOSED) {
		if (new_clcsock)
			sock_release(new_clcsock);
		new_sk->sk_state = SMC_CLOSED;
		sock_set_flag(new_sk, SOCK_DEAD);
		sk->sk_prot->unhash(new_sk);
		sock_put(new_sk);
		*new_smc = NULL;
		goto out;
	}

	(*new_smc)->clcsock = new_clcsock;
out:
	return rc;
}

/* add a just created sock to the accept queue of the listen sock as
 * candidate for a following socket accept call from user space
 */
static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
{
	struct smc_sock *par = smc_sk(parent);

	sock_hold(sk);
	spin_lock(&par->accept_q_lock);
	list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
	spin_unlock(&par->accept_q_lock);
	sk_acceptq_added(parent);
}

/* remove a socket from the accept queue of its parental listening socket */
static void smc_accept_unlink(struct sock *sk)
{
	struct smc_sock *par = smc_sk(sk)->listen_smc;

	spin_lock(&par->accept_q_lock);
	list_del_init(&smc_sk(sk)->accept_q);
	spin_unlock(&par->accept_q_lock);
	sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
	sock_put(sk);
}

/* remove a sock from the accept queue to bind it to a new socket created
 * for a socket accept call from user space
 */
struct sock *smc_accept_dequeue(struct sock *parent,
				struct socket *new_sock)
{
	struct smc_sock *isk, *n;
	struct sock *new_sk;

	list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
		new_sk = (struct sock *)isk;

		smc_accept_unlink(new_sk);
		if (new_sk->sk_state == SMC_CLOSED) {
			new_sk->sk_prot->unhash(new_sk);
			sock_put(new_sk);
			continue;
		}
		if (new_sock)
			sock_graft(new_sk, new_sock);
		return new_sk;
	}
	return NULL;
}

/* clean up for a created but never accepted sock */
void smc_close_non_accepted(struct sock *sk)
{
	struct smc_sock *smc = smc_sk(sk);

	sock_hold(sk);
	lock_sock(sk);
	if (!sk->sk_lingertime)
		/* wait for peer closing */
		sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
	if (smc->use_fallback) {
		sk->sk_state = SMC_CLOSED;
	} else {
		smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	if (smc->clcsock) {
		struct socket *tcp;

		tcp = smc->clcsock;
		smc->clcsock = NULL;
		sock_release(tcp);
	}
	if (smc->use_fallback) {
		schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN);
	} else if (sk->sk_state == SMC_CLOSED) {
		smc_conn_free(&smc->conn);
		schedule_delayed_work(&smc->sock_put_work,
				      SMC_CLOSE_SOCK_PUT_DELAY);
	}
	release_sock(sk);
	sock_put(sk);
}

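/* server side of the CONFIRM LINK handshake for the first contact of a
 * link group: register the rmb, send the CONFIRM LINK request and wait
 * for the client's response over the RoCE fabric
 */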
static int smc_serv_conf_first_link(struct smc_sock *smc)
{
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];

	rc = smc_wr_reg_send(link,
			     smc->conn.rmb_desc->mr_rx[SMC_SINGLE_LINK]);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	/* send CONFIRM LINK request to client over the RoCE fabric */
	rc = smc_llc_send_confirm_link(link,
				       link->smcibdev->mac[link->ibport - 1],
				       &link->smcibdev->gid[link->ibport - 1],
				       SMC_LLC_REQ);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	/* receive CONFIRM LINK response from client over the RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm_resp,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
	}

	return rc;
}

/* setup for RDMA connection of server */
static void smc_listen_work(struct work_struct *work)
{
	struct smc_sock *new_smc = container_of(work, struct smc_sock,
						smc_listen_work);
	struct socket *newclcsock = new_smc->clcsock;
	struct smc_sock *lsmc = new_smc->listen_smc;
	struct smc_clc_msg_accept_confirm cclc;
	int local_contact = SMC_REUSE_CONTACT;
	struct sock *newsmcsk = &new_smc->sk;
	struct smc_clc_msg_proposal pclc;
	struct smc_ib_device *smcibdev;
	struct sockaddr_in peeraddr;
	struct smc_link *link;
	int reason_code = 0;
	int rc = 0, len;
	__be32 subnet;
	u8 prefix_len;
	u8 ibport;

	/* do inband token exchange -
	 * wait for and receive SMC Proposal CLC message
	 */
	reason_code = smc_clc_wait_msg(new_smc, &pclc, sizeof(pclc),
				       SMC_CLC_PROPOSAL);
	if (reason_code < 0)
		goto out_err;
	if (reason_code > 0)
		goto decline_rdma;

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(new_smc)) {
		reason_code = SMC_CLC_DECL_IPSEC;
		goto decline_rdma;
	}

	/* PNET table lookup: search active ib_device and port
	 * within same PNETID that also contains the ethernet device
	 * used for the internal TCP socket
	 */
	smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport);
	if (!smcibdev) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* determine subnet and mask from internal TCP socket */
	rc = smc_netinfo_by_tcpsk(newclcsock, &subnet, &prefix_len);
	if (rc) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}
	if ((pclc.outgoing_subnet != subnet) ||
	    (pclc.prefix_len != prefix_len)) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* get address of the peer connected to the internal TCP socket */
	kernel_getpeername(newclcsock, (struct sockaddr *)&peeraddr, &len);

	/* allocate connection / link group */
	mutex_lock(&smc_create_lgr_pending);
	local_contact = smc_conn_create(new_smc, peeraddr.sin_addr.s_addr,
					smcibdev, ibport, &pclc.lcl, 0);
	if (local_contact < 0) {
		rc = local_contact;
		if (rc == -ENOMEM)
			reason_code = SMC_CLC_DECL_MEM; /* insufficient memory */
		else if (rc == -ENOLINK)
			reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
		goto decline_rdma;
	}
	link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	/* create send buffer and rmb */
	rc = smc_buf_create(new_smc);
	if (rc) {
		reason_code = SMC_CLC_DECL_MEM;
		goto decline_rdma;
	}

	smc_close_init(new_smc);
	smc_rx_init(new_smc);

	if (local_contact != SMC_FIRST_CONTACT) {
		struct smc_buf_desc *buf_desc = new_smc->conn.rmb_desc;

		if (!buf_desc->reused) {
			/* register memory region for new rmb */
			rc = smc_wr_reg_send(link,
					     buf_desc->mr_rx[SMC_SINGLE_LINK]);
			if (rc) {
				reason_code = SMC_CLC_DECL_INTERR;
				goto decline_rdma;
			}
		}
	}
	smc_rmb_sync_sg_for_device(&new_smc->conn);

	rc = smc_clc_send_accept(new_smc, local_contact);
	if (rc)
		goto out_err;

	/* receive SMC Confirm CLC message */
	reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
				       SMC_CLC_CONFIRM);
	if (reason_code < 0)
		goto out_err;
	if (reason_code > 0)
		goto decline_rdma;
	smc_conn_save_peer_info(new_smc, &cclc);
	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, &cclc);

	rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc);
	if (rc) {
		reason_code = SMC_CLC_DECL_INTERR;
		goto decline_rdma;
	}

	if (local_contact == SMC_FIRST_CONTACT) {
		rc = smc_ib_ready_link(link);
		if (rc) {
			reason_code = SMC_CLC_DECL_INTERR;
			goto decline_rdma;
		}
		/* QP confirmation over RoCE fabric */
		reason_code = smc_serv_conf_first_link(new_smc);
		if (reason_code < 0) {
			/* peer is not aware of a problem */
			rc = reason_code;
			goto out_err;
		}
		if (reason_code > 0)
			goto decline_rdma;
	}

	smc_tx_init(new_smc);

out_connected:
	sk_refcnt_debug_inc(newsmcsk);
	if (newsmcsk->sk_state == SMC_INIT)
		newsmcsk->sk_state = SMC_ACTIVE;
enqueue:
	mutex_unlock(&smc_create_lgr_pending);
	lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
	if (lsmc->sk.sk_state == SMC_LISTEN) {
		smc_accept_enqueue(&lsmc->sk, newsmcsk);
	} else { /* no longer listening */
		smc_close_non_accepted(newsmcsk);
	}
	release_sock(&lsmc->sk);

	/* Wake up accept */
	lsmc->sk.sk_data_ready(&lsmc->sk);
	sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
	return;

decline_rdma:
	/* RDMA setup failed, switch back to TCP */
	smc_conn_free(&new_smc->conn);
	new_smc->use_fallback = true;
	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
		rc = smc_clc_send_decline(new_smc, reason_code, 0);
		if (rc < sizeof(struct smc_clc_msg_decline))
			goto out_err;
	}
	goto out_connected;

out_err:
	newsmcsk->sk_state = SMC_CLOSED;
	smc_conn_free(&new_smc->conn);
	goto enqueue; /* queue new sock with sk_err set */
}

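/* worker for the listen sock: accept incoming CLC/TCP connections in a
 * loop and hand each new connection over to smc_listen_work()
 */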
static void smc_tcp_listen_work(struct work_struct *work)
{
	struct smc_sock *lsmc = container_of(work, struct smc_sock,
					     tcp_listen_work);
	struct smc_sock *new_smc;
	int rc = 0;

	lock_sock(&lsmc->sk);
	while (lsmc->sk.sk_state == SMC_LISTEN) {
		rc = smc_clcsock_accept(lsmc, &new_smc);
		if (rc)
			goto out;
		if (!new_smc)
			continue;

		new_smc->listen_smc = lsmc;
		new_smc->use_fallback = false; /* assume rdma capability first */
		sock_hold(&lsmc->sk); /* sock_put in smc_listen_work */
		INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
		smc_copy_sock_settings_to_smc(new_smc);
		schedule_work(&new_smc->smc_listen_work);
	}

out:
	release_sock(&lsmc->sk);
	lsmc->sk.sk_data_ready(&lsmc->sk); /* no more listening, wake accept */
}

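/* put an AF_SMC socket into listen state: listen on the internal CLC/TCP
 * socket and kick off the worker that accepts incoming connections
 */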
static int smc_listen(struct socket *sock, int backlog)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);
	lock_sock(sk);

	rc = -EINVAL;
	if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
		goto out;

	rc = 0;
	if (sk->sk_state == SMC_LISTEN) {
		sk->sk_max_ack_backlog = backlog;
		goto out;
	}
	/* some socket options are handled in core, so we cannot apply
	 * them to the clc socket -- copy smc socket options to clc socket
	 */
	smc_copy_sock_settings_to_clc(smc);

	rc = kernel_listen(smc->clcsock, backlog);
	if (rc)
		goto out;
	sk->sk_max_ack_backlog = backlog;
	sk->sk_ack_backlog = 0;
	sk->sk_state = SMC_LISTEN;
	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
	schedule_work(&smc->tcp_listen_work);

out:
	release_sock(sk);
	return rc;
}

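/* accept on an AF_SMC listen socket: block (subject to the receive
 * timeout) until smc_listen_work() has queued a new sock on the
 * accept queue
 */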
static int smc_accept(struct socket *sock, struct socket *new_sock,
		      int flags, bool kern)
{
	struct sock *sk = sock->sk, *nsk;
	DECLARE_WAITQUEUE(wait, current);
	struct smc_sock *lsmc;
	long timeo;
	int rc = 0;

	lsmc = smc_sk(sk);
	lock_sock(sk);

	if (lsmc->sk.sk_state != SMC_LISTEN) {
		rc = -EINVAL;
		goto out;
	}

	/* Wait for an incoming connection */
	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
	add_wait_queue_exclusive(sk_sleep(sk), &wait);
	while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (!timeo) {
			rc = -EAGAIN;
			break;
		}
		release_sock(sk);
		timeo = schedule_timeout(timeo);
		/* wakeup by sk_data_ready in smc_listen_work() */
		sched_annotate_sleep();
		lock_sock(sk);
		if (signal_pending(current)) {
			rc = sock_intr_errno(timeo);
			break;
		}
	}
	set_current_state(TASK_RUNNING);
	remove_wait_queue(sk_sleep(sk), &wait);

	if (!rc)
		rc = sock_error(nsk);

out:
	release_sock(sk);
	return rc;
}

static int smc_getname(struct socket *sock, struct sockaddr *addr,
		       int *len, int peer)
{
	struct smc_sock *smc;

	if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
	    (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
		return -ENOTCONN;

	smc = smc_sk(sock->sk);

	return smc->clcsock->ops->getname(smc->clcsock, addr, len, peer);
}

static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EPIPE;

	smc = smc_sk(sk);
	lock_sock(sk);
	if ((sk->sk_state != SMC_ACTIVE) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
	    (sk->sk_state != SMC_INIT))
		goto out;
	if (smc->use_fallback)
		rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
	else
		rc = smc_tx_sendmsg(smc, msg, len);
out:
	release_sock(sk);
	return rc;
}

static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
		       int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -ENOTCONN;

	smc = smc_sk(sk);
	lock_sock(sk);
	if ((sk->sk_state == SMC_INIT) ||
	    (sk->sk_state == SMC_LISTEN) ||
	    (sk->sk_state == SMC_CLOSED))
		goto out;

	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
		rc = 0;
		goto out;
	}

	if (smc->use_fallback)
		rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
	else
		rc = smc_rx_recvmsg(smc, msg, len, flags);

out:
	release_sock(sk);
	return rc;
}

static unsigned int smc_accept_poll(struct sock *parent)
{
	struct smc_sock *isk;
	struct sock *sk;

	lock_sock(parent);
	list_for_each_entry(isk, &smc_sk(parent)->accept_q, accept_q) {
		sk = (struct sock *)isk;

		if (sk->sk_state == SMC_ACTIVE) {
			release_sock(parent);
			return POLLIN | POLLRDNORM;
		}
	}
	release_sock(parent);

	return 0;
}

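/* poll an AF_SMC socket: before the SMC handshake and in fallback mode
 * the CLC socket is polled (and a pending non-blocking connect is
 * completed here); afterwards the mask is derived from the SMC
 * connection state
 */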
static unsigned int smc_poll(struct file *file, struct socket *sock,
			     poll_table *wait)
{
	struct sock *sk = sock->sk;
	unsigned int mask = 0;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sock->sk);
	if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
		/* delegate to CLC child sock */
		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
		/* if non-blocking connect finished ... */
		lock_sock(sk);
		if ((sk->sk_state == SMC_INIT) && (mask & POLLOUT)) {
			sk->sk_err = smc->clcsock->sk->sk_err;
			if (sk->sk_err) {
				mask |= POLLERR;
			} else {
				rc = smc_connect_rdma(smc);
				if (rc < 0)
					mask |= POLLERR;
				else
					/* success cases including fallback */
					mask |= POLLOUT | POLLWRNORM;
			}
		}
		release_sock(sk);
	} else {
		sock_poll_wait(file, sk_sleep(sk), wait);
		if (sk->sk_state == SMC_LISTEN)
			/* woken up by sk_data_ready in smc_listen_work() */
			mask |= smc_accept_poll(sk);
		if (sk->sk_err)
			mask |= POLLERR;
		if (atomic_read(&smc->conn.sndbuf_space) ||
		    (sk->sk_shutdown & SEND_SHUTDOWN)) {
			mask |= POLLOUT | POLLWRNORM;
		} else {
			sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		}
		if (atomic_read(&smc->conn.bytes_to_rcv))
			mask |= POLLIN | POLLRDNORM;
		if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
		    (sk->sk_state == SMC_CLOSED))
			mask |= POLLHUP;
		if (sk->sk_shutdown & RCV_SHUTDOWN)
			mask |= POLLIN | POLLRDNORM | POLLRDHUP;
		if (sk->sk_state == SMC_APPCLOSEWAIT1)
			mask |= POLLIN;
	}

	return mask;
}

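/* shut down an AF_SMC socket: in fallback mode just shut down the CLC
 * socket; otherwise run the SMC close protocol for the given direction
 * and shut down the internal CLC/TCP socket as well
 */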
static int smc_shutdown(struct socket *sock, int how)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EINVAL;
	int rc1 = 0;

	smc = smc_sk(sk);

	if ((how < SHUT_RD) || (how > SHUT_RDWR))
		return rc;

	lock_sock(sk);

	rc = -ENOTCONN;
	if ((sk->sk_state != SMC_LISTEN) &&
	    (sk->sk_state != SMC_ACTIVE) &&
	    (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
	    (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT2) &&
	    (sk->sk_state != SMC_APPFINCLOSEWAIT))
		goto out;
	if (smc->use_fallback) {
		rc = kernel_sock_shutdown(smc->clcsock, how);
		sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
		if (sk->sk_shutdown == SHUTDOWN_MASK)
			sk->sk_state = SMC_CLOSED;
		goto out;
	}
	switch (how) {
	case SHUT_RDWR:		/* shutdown in both directions */
		rc = smc_close_active(smc);
		break;
	case SHUT_WR:
		rc = smc_close_shutdown_write(smc);
		break;
	case SHUT_RD:
		if (sk->sk_state == SMC_LISTEN)
			rc = smc_close_active(smc);
		else
			rc = 0;
			/* nothing more to do because peer is not involved */
		break;
	}
	rc1 = kernel_sock_shutdown(smc->clcsock, how);
	/* map sock_shutdown_cmd constants to sk_shutdown value range */
	sk->sk_shutdown |= how + 1;

out:
	release_sock(sk);
	return rc ? rc : rc1;
}

static int smc_setsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;

	smc = smc_sk(sk);

	/* generic setsockopts reaching us here always apply to the
	 * CLC socket
	 */
	return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
					     optval, optlen);
}

static int smc_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct smc_sock *smc;

	smc = smc_sk(sock->sk);
	/* socket options apply to the CLC socket */
	return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
					     optval, optlen);
}

static int smc_ioctl(struct socket *sock, unsigned int cmd,
		     unsigned long arg)
{
	struct smc_sock *smc;

	smc = smc_sk(sock->sk);
	if (smc->use_fallback)
		return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
	else
		return sock_no_ioctl(sock, cmd, arg);
}

static ssize_t smc_sendpage(struct socket *sock, struct page *page,
			    int offset, size_t size, int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EPIPE;

	smc = smc_sk(sk);
	lock_sock(sk);
	if (sk->sk_state != SMC_ACTIVE)
		goto out;
	if (smc->use_fallback)
		rc = kernel_sendpage(smc->clcsock, page, offset,
				     size, flags);
	else
		rc = sock_no_sendpage(sock, page, offset, size, flags);

out:
	release_sock(sk);
	return rc;
}

static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
			       struct pipe_inode_info *pipe, size_t len,
			       unsigned int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -ENOTCONN;

	smc = smc_sk(sk);
	lock_sock(sk);
	if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
		goto out;
	if (smc->use_fallback) {
		rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
						    pipe, len, flags);
	} else {
		rc = -EOPNOTSUPP;
	}
out:
	release_sock(sk);
	return rc;
}

/* must look like tcp */
static const struct proto_ops smc_sock_ops = {
	.family		= PF_SMC,
	.owner		= THIS_MODULE,
	.release	= smc_release,
	.bind		= smc_bind,
	.connect	= smc_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= smc_accept,
	.getname	= smc_getname,
	.poll		= smc_poll,
	.ioctl		= smc_ioctl,
	.listen		= smc_listen,
	.shutdown	= smc_shutdown,
	.setsockopt	= smc_setsockopt,
	.getsockopt	= smc_getsockopt,
	.sendmsg	= smc_sendmsg,
	.recvmsg	= smc_recvmsg,
	.mmap		= sock_no_mmap,
	.sendpage	= smc_sendpage,
	.splice_read	= smc_splice_read,
};

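/* create a new AF_SMC socket: SOCK_STREAM with IPPROTO_IP or IPPROTO_TCP
 * only; also creates the internal TCP socket used for the CLC handshake
 * and for the TCP fallback path
 */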
static int smc_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	struct smc_sock *smc;
	struct sock *sk;
	int rc;

	rc = -ESOCKTNOSUPPORT;
	if (sock->type != SOCK_STREAM)
		goto out;

	rc = -EPROTONOSUPPORT;
	if ((protocol != IPPROTO_IP) && (protocol != IPPROTO_TCP))
		goto out;

	rc = -ENOBUFS;
	sock->ops = &smc_sock_ops;
	sk = smc_sock_alloc(net, sock);
	if (!sk)
		goto out;

	/* create internal TCP socket for CLC handshake and fallback */
	smc = smc_sk(sk);
	smc->use_fallback = false; /* assume rdma capability first */
	rc = sock_create_kern(net, PF_INET, SOCK_STREAM,
			      IPPROTO_TCP, &smc->clcsock);
	if (rc) {
		/* bail out; smc->clcsock is NULL and must not be used below */
		sk_common_release(sk);
		goto out;
	}
	smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
	smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);

out:
	return rc;
}

static const struct net_proto_family smc_sock_family_ops = {
	.family	= PF_SMC,
	.owner	= THIS_MODULE,
	.create	= smc_create,
};

static int __init smc_init(void)
{
	int rc;

	rc = smc_pnet_init();
	if (rc)
		return rc;

	rc = smc_llc_init();
	if (rc) {
		pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = smc_cdc_init();
	if (rc) {
		pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = proto_register(&smc_proto, 1);
	if (rc) {
		pr_err("%s: proto_register fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = sock_register(&smc_sock_family_ops);
	if (rc) {
		pr_err("%s: sock_register fails with %d\n", __func__, rc);
		goto out_proto;
	}
	INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);

	rc = smc_ib_register_client();
	if (rc) {
		pr_err("%s: ib_register fails with %d\n", __func__, rc);
		goto out_sock;
	}

	return 0;

out_sock:
	sock_unregister(PF_SMC);
out_proto:
	proto_unregister(&smc_proto);
out_pnet:
	smc_pnet_exit();
	return rc;
}

static void __exit smc_exit(void)
{
	struct smc_link_group *lgr, *lg;
	LIST_HEAD(lgr_freeing_list);

	spin_lock_bh(&smc_lgr_list.lock);
	if (!list_empty(&smc_lgr_list.list))
		list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
	spin_unlock_bh(&smc_lgr_list.lock);
	list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
		list_del_init(&lgr->list);
		smc_lgr_free(lgr); /* free link group */
	}
	smc_ib_unregister_client();
	sock_unregister(PF_SMC);
	proto_unregister(&smc_proto);
	smc_pnet_exit();
}

module_init(smc_init);
module_exit(smc_exit);

MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
MODULE_DESCRIPTION("smc socket address family");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_SMC);