/*
 *  Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 *  AF_SMC protocol family socket handler keeping the AF_INET sock address type
 *  applies to SOCK_STREAM sockets only
 *  offers an alternative communication option for TCP-protocol sockets
 *  applicable with RoCE-cards only
 *
 *  Initial restrictions:
 *    - support for alternate links postponed
 *    - partial support for non-blocking sockets only
 *    - support for urgent data postponed
 *
 *  Copyright IBM Corp. 2016, 2018
 *
 *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 *              based on prototype from Frank Blaschka
 */

#define KMSG_COMPONENT "smc"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/module.h>
#include <linux/socket.h>
#include <linux/workqueue.h>
#include <linux/in.h>
#include <linux/sched/signal.h>

#include <net/sock.h>
#include <net/tcp.h>
#include <net/smc.h>
#include <asm/ioctls.h>

#include "smc.h"
#include "smc_clc.h"
#include "smc_llc.h"
#include "smc_cdc.h"
#include "smc_core.h"
#include "smc_ib.h"
#include "smc_pnet.h"
#include "smc_tx.h"
#include "smc_rx.h"
#include "smc_close.h"

static DEFINE_MUTEX(smc_create_lgr_pending);	/* serialize link group
						 * creation
						 */

static void smc_tcp_listen_work(struct work_struct *);

static void smc_set_keepalive(struct sock *sk, int val)
{
	struct smc_sock *smc = smc_sk(sk);

	smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
}

static struct smc_hashinfo smc_v4_hashinfo = {
	.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
};

static struct smc_hashinfo smc_v6_hashinfo = {
	.lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
};

int smc_hash_sk(struct sock *sk)
{
	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
	struct hlist_head *head;

	head = &h->ht;

	write_lock_bh(&h->lock);
	sk_add_node(sk, head);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	write_unlock_bh(&h->lock);

	return 0;
}
EXPORT_SYMBOL_GPL(smc_hash_sk);

void smc_unhash_sk(struct sock *sk)
{
	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;

	write_lock_bh(&h->lock);
	if (sk_del_node_init(sk))
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	write_unlock_bh(&h->lock);
}
EXPORT_SYMBOL_GPL(smc_unhash_sk);

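/* Two proto instances are needed because the CLC socket below an SMC
 * socket is either IPv4 or IPv6: SMCPROTO_SMC selects smc_proto (IPv4
 * hash table), SMCPROTO_SMC6 selects smc_proto6 (IPv6 hash table);
 * see smc_sock_alloc() and smc_create() below.
 */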
struct proto smc_proto = {
	.name		= "SMC",
	.owner		= THIS_MODULE,
	.keepalive	= smc_set_keepalive,
	.hash		= smc_hash_sk,
	.unhash		= smc_unhash_sk,
	.obj_size	= sizeof(struct smc_sock),
	.h.smc_hash	= &smc_v4_hashinfo,
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto);

struct proto smc_proto6 = {
	.name		= "SMC6",
	.owner		= THIS_MODULE,
	.keepalive	= smc_set_keepalive,
	.hash		= smc_hash_sk,
	.unhash		= smc_unhash_sk,
	.obj_size	= sizeof(struct smc_sock),
	.h.smc_hash	= &smc_v6_hashinfo,
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto6);

static int smc_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = 0;

	if (!sk)
		goto out;

	smc = smc_sk(sk);
	if (sk->sk_state == SMC_LISTEN)
		/* smc_close_non_accepted() is called and acquires
		 * sock lock for child sockets again
		 */
		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
	else
		lock_sock(sk);

	if (!smc->use_fallback) {
		rc = smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	if (smc->clcsock) {
		sock_release(smc->clcsock);
		smc->clcsock = NULL;
	}
	if (smc->use_fallback) {
		sock_put(sk); /* passive closing */
		sk->sk_state = SMC_CLOSED;
		sk->sk_state_change(sk);
	}

	/* detach socket */
	sock_orphan(sk);
	sock->sk = NULL;
	if (!smc->use_fallback && sk->sk_state == SMC_CLOSED)
		smc_conn_free(&smc->conn);
	release_sock(sk);

	sk->sk_prot->unhash(sk);
	sock_put(sk); /* final sock_put */
out:
	return rc;
}

static void smc_destruct(struct sock *sk)
{
	if (sk->sk_state != SMC_CLOSED)
		return;
	if (!sock_flag(sk, SOCK_DEAD))
		return;

	sk_refcnt_debug_dec(sk);
}

static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
				   int protocol)
{
	struct smc_sock *smc;
	struct proto *prot;
	struct sock *sk;

	prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto;
	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
	if (!sk)
		return NULL;

	sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
	sk->sk_state = SMC_INIT;
	sk->sk_destruct = smc_destruct;
	sk->sk_protocol = protocol;
	smc = smc_sk(sk);
	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
	INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
	INIT_LIST_HEAD(&smc->accept_q);
	spin_lock_init(&smc->accept_q_lock);
	spin_lock_init(&smc->conn.send_lock);
	sk->sk_prot->hash(sk);
	sk_refcnt_debug_inc(sk);

	return sk;
}

static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
		    int addr_len)
{
	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);

	/* replicate tests from inet_bind(), to be safe wrt. future changes */
	rc = -EINVAL;
	if (addr_len < sizeof(struct sockaddr_in))
		goto out;

	rc = -EAFNOSUPPORT;
	if (addr->sin_family != AF_INET &&
	    addr->sin_family != AF_INET6 &&
	    addr->sin_family != AF_UNSPEC)
		goto out;
	/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
	if (addr->sin_family == AF_UNSPEC &&
	    addr->sin_addr.s_addr != htonl(INADDR_ANY))
		goto out;

	lock_sock(sk);

	/* Check if socket is already active */
	rc = -EINVAL;
	if (sk->sk_state != SMC_INIT)
		goto out_rel;

	smc->clcsock->sk->sk_reuse = sk->sk_reuse;
	rc = kernel_bind(smc->clcsock, uaddr, addr_len);

out_rel:
	release_sock(sk);
out:
	return rc;
}

static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
				   unsigned long mask)
{
	/* options we don't get control of via setsockopt */
	nsk->sk_type = osk->sk_type;
	nsk->sk_sndbuf = osk->sk_sndbuf;
	nsk->sk_rcvbuf = osk->sk_rcvbuf;
	nsk->sk_sndtimeo = osk->sk_sndtimeo;
	nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
	nsk->sk_mark = osk->sk_mark;
	nsk->sk_priority = osk->sk_priority;
	nsk->sk_rcvlowat = osk->sk_rcvlowat;
	nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
	nsk->sk_err = osk->sk_err;

	nsk->sk_flags &= ~mask;
	nsk->sk_flags |= osk->sk_flags & mask;
}

#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_BROADCAST) | \
			     (1UL << SOCK_TIMESTAMP) | \
			     (1UL << SOCK_DBG) | \
			     (1UL << SOCK_RCVTSTAMP) | \
			     (1UL << SOCK_RCVTSTAMPNS) | \
			     (1UL << SOCK_LOCALROUTE) | \
			     (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
			     (1UL << SOCK_RXQ_OVFL) | \
			     (1UL << SOCK_WIFI_STATUS) | \
			     (1UL << SOCK_NOFCS) | \
			     (1UL << SOCK_FILTER_LOCKED))
/* copy only relevant settings and flags of SOL_SOCKET level from smc to
 * clc socket (since smc is not called for these options from net/core)
 */
static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
{
	smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
}

#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_DBG))
/* copy only settings and flags relevant for smc from clc to smc socket */
static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
{
	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
}

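/* Illustrative effect of the masks above: SO_KEEPALIVE maps to
 * SOCK_KEEPOPEN, which is in both masks and therefore propagates in
 * either direction, while e.g. SOCK_BROADCAST appears only in
 * SK_FLAGS_SMC_TO_CLC and is copied to the clc socket only.
 */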
/* register a new rmb, optionally send confirm_rkey msg to register with peer */
static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc,
		       bool conf_rkey)
{
	/* register memory region for new rmb */
	if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) {
		rmb_desc->regerr = 1;
		return -EFAULT;
	}
	if (!conf_rkey)
		return 0;
	/* exchange confirm_rkey msg with peer */
	if (smc_llc_do_confirm_rkey(link, rmb_desc)) {
		rmb_desc->regerr = 1;
		return -EFAULT;
	}
	return 0;
}

static int smc_clnt_conf_first_link(struct smc_sock *smc)
{
	struct net *net = sock_net(smc->clcsock->sk);
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];
	/* receive CONFIRM LINK request from server over RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	if (link->llc_confirm_rc)
		return SMC_CLC_DECL_RMBE_EC;

	rc = smc_ib_modify_qp_rts(link);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	smc_wr_remember_qp_attr(link);

	if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
		return SMC_CLC_DECL_INTERR;

	/* send CONFIRM LINK response over RoCE fabric */
	rc = smc_llc_send_confirm_link(link,
				       link->smcibdev->mac[link->ibport - 1],
				       &link->smcibdev->gid[link->ibport - 1],
				       SMC_LLC_RESP);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	/* receive ADD LINK request from server over RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(&link->llc_add,
							 SMC_LLC_WAIT_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	/* send add link reject message, only one link supported for now */
	rc = smc_llc_send_add_link(link,
				   link->smcibdev->mac[link->ibport - 1],
				   &link->smcibdev->gid[link->ibport - 1],
				   SMC_LLC_RESP);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);

	return 0;
}

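/* Worked example for the peer info saved below, with hypothetical
 * values: if the accept/confirm message announces an RMBE size that
 * uncompresses to 65536 bytes and rmbe_idx 2, then peer_rmbe_size is
 * 65536 and tx_off = 65536 * (2 - 1) = 65536, i.e. this connection
 * writes into the second RMBE element of the peer's RMB.
 */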
static void smc_conn_save_peer_info(struct smc_sock *smc,
				    struct smc_clc_msg_accept_confirm *clc)
{
	int bufsize = smc_uncompress_bufsize(clc->rmbe_size);

	smc->conn.peer_rmbe_idx = clc->rmbe_idx;
	smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
	smc->conn.peer_rmbe_size = bufsize;
	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
	smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
}

static void smc_link_save_peer_info(struct smc_link *link,
				    struct smc_clc_msg_accept_confirm *clc)
{
	link->peer_qpn = ntoh24(clc->qpn);
	memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
	memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
	link->peer_psn = ntoh24(clc->psn);
	link->peer_mtu = clc->qp_mtu;
}

/* fall back during connect */
static int smc_connect_fallback(struct smc_sock *smc)
{
	smc->use_fallback = true;
	smc_copy_sock_settings_to_clc(smc);
	if (smc->sk.sk_state == SMC_INIT)
		smc->sk.sk_state = SMC_ACTIVE;
	return 0;
}

/* decline and fall back during connect */
static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
{
	int rc;

	if (reason_code < 0) /* error, fallback is not possible */
		return reason_code;
	if (reason_code != SMC_CLC_DECL_REPLY) {
		rc = smc_clc_send_decline(smc, reason_code);
		if (rc < 0)
			return rc;
	}
	return smc_connect_fallback(smc);
}

/* abort connecting */
static int smc_connect_abort(struct smc_sock *smc, int reason_code,
			     int local_contact)
{
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
	smc_conn_free(&smc->conn);
	if (reason_code < 0 && smc->sk.sk_state == SMC_INIT)
		sock_put(&smc->sk); /* passive closing */
	return reason_code;
}

/* check if there is an RDMA device available for this connection;
 * called for both connect and listen
 */
static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev,
			  u8 *ibport)
{
	int reason_code = 0;

	/* PNET table look up: search active ib_device and port
	 * within same PNETID that also contains the ethernet device
	 * used for the internal TCP socket
	 */
	smc_pnet_find_roce_resource(smc->clcsock->sk, ibdev, ibport);
	if (!(*ibdev))
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */

	return reason_code;
}

/* CLC handshake during connect */
static int smc_connect_clc(struct smc_sock *smc,
			   struct smc_clc_msg_accept_confirm *aclc,
			   struct smc_ib_device *ibdev, u8 ibport)
{
	int rc = 0;

	/* do inband token exchange */
	rc = smc_clc_send_proposal(smc, ibdev, ibport);
	if (rc)
		return rc;
	/* receive SMC Accept CLC message */
	return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT);
}

/* setup for RDMA connection of client */
static int smc_connect_rdma(struct smc_sock *smc,
			    struct smc_clc_msg_accept_confirm *aclc,
			    struct smc_ib_device *ibdev, u8 ibport)
{
	int local_contact = SMC_FIRST_CONTACT;
	struct smc_link *link;
	int reason_code = 0;

	mutex_lock(&smc_create_lgr_pending);
	local_contact = smc_conn_create(smc, ibdev, ibport, &aclc->lcl,
					aclc->hdr.flag);
	if (local_contact < 0) {
		if (local_contact == -ENOMEM)
			reason_code = SMC_CLC_DECL_MEM; /* insufficient memory */
		else if (local_contact == -ENOLINK)
			reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
		else
			reason_code = SMC_CLC_DECL_INTERR; /* other error */
		return smc_connect_abort(smc, reason_code, 0);
	}
	link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	smc_conn_save_peer_info(smc, aclc);

	/* create send buffer and rmb */
	if (smc_buf_create(smc))
		return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact);

	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, aclc);

	if (smc_rmb_rtoken_handling(&smc->conn, aclc))
		return smc_connect_abort(smc, SMC_CLC_DECL_INTERR,
					 local_contact);

	smc_close_init(smc);
	smc_rx_init(smc);

	if (local_contact == SMC_FIRST_CONTACT) {
		if (smc_ib_ready_link(link))
			return smc_connect_abort(smc, SMC_CLC_DECL_INTERR,
						 local_contact);
	} else {
		if (!smc->conn.rmb_desc->reused &&
		    smc_reg_rmb(link, smc->conn.rmb_desc, true))
			return smc_connect_abort(smc, SMC_CLC_DECL_INTERR,
						 local_contact);
	}
	smc_rmb_sync_sg_for_device(&smc->conn);

	reason_code = smc_clc_send_confirm(smc);
	if (reason_code)
		return smc_connect_abort(smc, reason_code, local_contact);

	smc_tx_init(smc);

	if (local_contact == SMC_FIRST_CONTACT) {
		/* QP confirmation over RoCE fabric */
		reason_code = smc_clnt_conf_first_link(smc);
		if (reason_code)
			return smc_connect_abort(smc, reason_code,
						 local_contact);
	}
	mutex_unlock(&smc_create_lgr_pending);

	smc_copy_sock_settings_to_clc(smc);
	if (smc->sk.sk_state == SMC_INIT)
		smc->sk.sk_state = SMC_ACTIVE;

	return 0;
}

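/* Overview of the connect fallback ladder implemented below: an
 * explicit fallback request or a peer without the SMC experimental
 * TCP option falls back to plain TCP directly, while IPSec use, a
 * missing RDMA device, or a failed CLC/RDMA setup first sends a
 * decline to the peer and then falls back where possible (see
 * smc_connect_decline_fallback()).
 */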
/* perform steps before actually connecting */
static int __smc_connect(struct smc_sock *smc)
{
	struct smc_clc_msg_accept_confirm aclc;
	struct smc_ib_device *ibdev;
	int rc = 0;
	u8 ibport;

	sock_hold(&smc->sk); /* sock put in passive closing */

	if (smc->use_fallback)
		return smc_connect_fallback(smc);

	/* if peer has not signalled SMC-capability, fall back */
	if (!tcp_sk(smc->clcsock->sk)->syn_smc)
		return smc_connect_fallback(smc);

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(smc))
		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC);

	/* check if an RDMA device is available; if not, fall back */
	if (smc_check_rdma(smc, &ibdev, &ibport))
		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR);

	/* perform CLC handshake */
	rc = smc_connect_clc(smc, &aclc, ibdev, ibport);
	if (rc)
		return smc_connect_decline_fallback(smc, rc);

	/* connect using rdma */
	rc = smc_connect_rdma(smc, &aclc, ibdev, ibport);
	if (rc)
		return smc_connect_decline_fallback(smc, rc);

	return 0;
}

static int smc_connect(struct socket *sock, struct sockaddr *addr,
		       int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EINVAL;

	smc = smc_sk(sk);

	/* separate smc parameter checking to be safe */
	if (alen < sizeof(addr->sa_family))
		goto out_err;
	if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
		goto out_err;

	lock_sock(sk);
	switch (sk->sk_state) {
	default:
		goto out;
	case SMC_ACTIVE:
		rc = -EISCONN;
		goto out;
	case SMC_INIT:
		rc = 0;
		break;
	}

	smc_copy_sock_settings_to_clc(smc);
	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
	rc = kernel_connect(smc->clcsock, addr, alen, flags);
	if (rc)
		goto out;

	rc = __smc_connect(smc);
	if (rc < 0)
		goto out;
	else
		rc = 0; /* success cases including fallback */

out:
	release_sock(sk);
out_err:
	return rc;
}

static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
{
	struct socket *new_clcsock = NULL;
	struct sock *lsk = &lsmc->sk;
	struct sock *new_sk;
	int rc;

	release_sock(lsk);
	new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
	if (!new_sk) {
		rc = -ENOMEM;
		lsk->sk_err = ENOMEM;
		*new_smc = NULL;
		lock_sock(lsk);
		goto out;
	}
	*new_smc = smc_sk(new_sk);

	rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
	lock_sock(lsk);
	if (rc < 0)
		lsk->sk_err = -rc;
	if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
		if (new_clcsock)
			sock_release(new_clcsock);
		new_sk->sk_state = SMC_CLOSED;
		sock_set_flag(new_sk, SOCK_DEAD);
		new_sk->sk_prot->unhash(new_sk);
		sock_put(new_sk); /* final */
		*new_smc = NULL;
		goto out;
	}

	(*new_smc)->clcsock = new_clcsock;
out:
	return rc;
}

/* add a just created sock to the accept queue of the listen sock as
 * candidate for a following socket accept call from user space
 */
static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
{
	struct smc_sock *par = smc_sk(parent);

	sock_hold(sk); /* sock_put in smc_accept_unlink() */
	spin_lock(&par->accept_q_lock);
	list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
	spin_unlock(&par->accept_q_lock);
	sk_acceptq_added(parent);
}

/* remove a socket from the accept queue of its parental listening socket */
static void smc_accept_unlink(struct sock *sk)
{
	struct smc_sock *par = smc_sk(sk)->listen_smc;

	spin_lock(&par->accept_q_lock);
	list_del_init(&smc_sk(sk)->accept_q);
	spin_unlock(&par->accept_q_lock);
	sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
	sock_put(sk); /* sock_hold in smc_accept_enqueue */
}

/* remove a sock from the accept queue to bind it to a new socket created
 * for a socket accept call from user space
 */
struct sock *smc_accept_dequeue(struct sock *parent,
				struct socket *new_sock)
{
	struct smc_sock *isk, *n;
	struct sock *new_sk;

	list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
		new_sk = (struct sock *)isk;

		smc_accept_unlink(new_sk);
		if (new_sk->sk_state == SMC_CLOSED) {
			if (isk->clcsock) {
				sock_release(isk->clcsock);
				isk->clcsock = NULL;
			}
			new_sk->sk_prot->unhash(new_sk);
			sock_put(new_sk); /* final */
			continue;
		}
		if (new_sock)
			sock_graft(new_sk, new_sock);
		return new_sk;
	}
	return NULL;
}

/* clean up for a created but never accepted sock */
void smc_close_non_accepted(struct sock *sk)
{
	struct smc_sock *smc = smc_sk(sk);

	lock_sock(sk);
	if (!sk->sk_lingertime)
		/* wait for peer closing */
		sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
	if (!smc->use_fallback) {
		smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	if (smc->clcsock) {
		struct socket *tcp;

		tcp = smc->clcsock;
		smc->clcsock = NULL;
		sock_release(tcp);
	}
	if (smc->use_fallback) {
		sock_put(sk); /* passive closing */
		sk->sk_state = SMC_CLOSED;
	} else {
		if (sk->sk_state == SMC_CLOSED)
			smc_conn_free(&smc->conn);
	}
	release_sock(sk);
	sk->sk_prot->unhash(sk);
	sock_put(sk); /* final sock_put */
}

static int smc_serv_conf_first_link(struct smc_sock *smc)
{
	struct net *net = sock_net(smc->clcsock->sk);
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];

	if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
		return SMC_CLC_DECL_INTERR;

	/* send CONFIRM LINK request to client over the RoCE fabric */
	rc = smc_llc_send_confirm_link(link,
				       link->smcibdev->mac[link->ibport - 1],
				       &link->smcibdev->gid[link->ibport - 1],
				       SMC_LLC_REQ);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	/* receive CONFIRM LINK response from client over the RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm_resp,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	if (link->llc_confirm_resp_rc)
		return SMC_CLC_DECL_RMBE_EC;

	/* send ADD LINK request to client over the RoCE fabric */
	rc = smc_llc_send_add_link(link,
				   link->smcibdev->mac[link->ibport - 1],
				   &link->smcibdev->gid[link->ibport - 1],
				   SMC_LLC_REQ);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	/* receive ADD LINK response from client over the RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp,
							 SMC_LLC_WAIT_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);

	return 0;
}

/* listen worker: finish */
static void smc_listen_out(struct smc_sock *new_smc)
{
	struct smc_sock *lsmc = new_smc->listen_smc;
	struct sock *newsmcsk = &new_smc->sk;

	lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
	if (lsmc->sk.sk_state == SMC_LISTEN) {
		smc_accept_enqueue(&lsmc->sk, newsmcsk);
	} else { /* no longer listening */
		smc_close_non_accepted(newsmcsk);
	}
	release_sock(&lsmc->sk);

	/* Wake up accept */
	lsmc->sk.sk_data_ready(&lsmc->sk);
	sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
}

/* listen worker: finish in state connected */
static void smc_listen_out_connected(struct smc_sock *new_smc)
{
	struct sock *newsmcsk = &new_smc->sk;

	sk_refcnt_debug_inc(newsmcsk);
	if (newsmcsk->sk_state == SMC_INIT)
		newsmcsk->sk_state = SMC_ACTIVE;

	smc_listen_out(new_smc);
}

/* listen worker: finish in error state */
static void smc_listen_out_err(struct smc_sock *new_smc)
{
	struct sock *newsmcsk = &new_smc->sk;

	if (newsmcsk->sk_state == SMC_INIT)
		sock_put(&new_smc->sk); /* passive closing */
	newsmcsk->sk_state = SMC_CLOSED;
	smc_conn_free(&new_smc->conn);

	smc_listen_out(new_smc);
}

/* listen worker: decline and fall back if possible */
static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
			       int local_contact)
{
	/* RDMA setup failed, switch back to TCP */
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(new_smc->conn.lgr);
	if (reason_code < 0) { /* error, no fallback possible */
		smc_listen_out_err(new_smc);
		return;
	}
	smc_conn_free(&new_smc->conn);
	new_smc->use_fallback = true;
	if (reason_code && reason_code != SMC_CLC_DECL_REPLY) {
		if (smc_clc_send_decline(new_smc, reason_code) < 0) {
			smc_listen_out_err(new_smc);
			return;
		}
	}
	smc_listen_out_connected(new_smc);
}

/* listen worker: check prefixes */
static int smc_listen_rdma_check(struct smc_sock *new_smc,
				 struct smc_clc_msg_proposal *pclc)
{
	struct smc_clc_msg_proposal_prefix *pclc_prfx;
	struct socket *newclcsock = new_smc->clcsock;

	pclc_prfx = smc_clc_proposal_get_prefix(pclc);
	if (smc_clc_prfx_match(newclcsock, pclc_prfx))
		return SMC_CLC_DECL_CNFERR;

	return 0;
}

/* listen worker: initialize connection and buffers */
static int smc_listen_rdma_init(struct smc_sock *new_smc,
				struct smc_clc_msg_proposal *pclc,
				struct smc_ib_device *ibdev, u8 ibport,
				int *local_contact)
{
	/* allocate connection / link group */
	*local_contact = smc_conn_create(new_smc, ibdev, ibport, &pclc->lcl, 0);
	if (*local_contact < 0) {
		if (*local_contact == -ENOMEM)
			return SMC_CLC_DECL_MEM; /* insufficient memory */
		return SMC_CLC_DECL_INTERR; /* other error */
	}

	/* create send buffer and rmb */
	if (smc_buf_create(new_smc))
		return SMC_CLC_DECL_MEM;

	return 0;
}

/* listen worker: register buffers */
static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact)
{
	struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	if (local_contact != SMC_FIRST_CONTACT) {
		if (!new_smc->conn.rmb_desc->reused) {
			if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true))
				return SMC_CLC_DECL_INTERR;
		}
	}
	smc_rmb_sync_sg_for_device(&new_smc->conn);

	return 0;
}

/* listen worker: finish RDMA setup */
static void smc_listen_rdma_finish(struct smc_sock *new_smc,
				   struct smc_clc_msg_accept_confirm *cclc,
				   int local_contact)
{
	struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
	int reason_code = 0;

	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, cclc);

	if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) {
		reason_code = SMC_CLC_DECL_INTERR;
		goto decline;
	}

	if (local_contact == SMC_FIRST_CONTACT) {
		if (smc_ib_ready_link(link)) {
			reason_code = SMC_CLC_DECL_INTERR;
			goto decline;
		}
		/* QP confirmation over RoCE fabric */
		reason_code = smc_serv_conf_first_link(new_smc);
		if (reason_code)
			goto decline;
	}
	return;

decline:
	mutex_unlock(&smc_create_lgr_pending);
	smc_listen_decline(new_smc, reason_code, local_contact);
}

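/* Server-side CLC handshake as implemented below: receive PROPOSAL,
 * check RDMA device and prefixes, initialize connection and buffers,
 * register buffers, send ACCEPT, wait for CONFIRM, then finish the
 * RDMA setup; any failure along the way declines to the client.
 */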
/* setup for RDMA connection of server */
static void smc_listen_work(struct work_struct *work)
{
	struct smc_sock *new_smc = container_of(work, struct smc_sock,
						smc_listen_work);
	struct socket *newclcsock = new_smc->clcsock;
	struct smc_clc_msg_accept_confirm cclc;
	struct smc_clc_msg_proposal *pclc;
	struct smc_ib_device *ibdev;
	u8 buf[SMC_CLC_MAX_LEN];
	int local_contact = 0;
	int reason_code = 0;
	int rc = 0;
	u8 ibport;

	if (new_smc->use_fallback) {
		smc_listen_out_connected(new_smc);
		return;
	}

	/* check if peer is smc capable */
	if (!tcp_sk(newclcsock->sk)->syn_smc) {
		new_smc->use_fallback = true;
		smc_listen_out_connected(new_smc);
		return;
	}

	/* do inband token exchange -
	 * wait for and receive SMC Proposal CLC message
	 */
	pclc = (struct smc_clc_msg_proposal *)&buf;
	reason_code = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN,
				       SMC_CLC_PROPOSAL);
	if (reason_code) {
		smc_listen_decline(new_smc, reason_code, 0);
		return;
	}

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(new_smc)) {
		smc_listen_decline(new_smc, SMC_CLC_DECL_IPSEC, 0);
		return;
	}

	mutex_lock(&smc_create_lgr_pending);
	smc_close_init(new_smc);
	smc_rx_init(new_smc);
	smc_tx_init(new_smc);

	/* check if RDMA is available */
	if (smc_check_rdma(new_smc, &ibdev, &ibport) ||
	    smc_listen_rdma_check(new_smc, pclc) ||
	    smc_listen_rdma_init(new_smc, pclc, ibdev, ibport,
				 &local_contact) ||
	    smc_listen_rdma_reg(new_smc, local_contact)) {
		/* SMC not supported, decline */
		mutex_unlock(&smc_create_lgr_pending);
		smc_listen_decline(new_smc, SMC_CLC_DECL_CNFERR, local_contact);
		return;
	}

	/* send SMC Accept CLC message */
	rc = smc_clc_send_accept(new_smc, local_contact);
	if (rc) {
		mutex_unlock(&smc_create_lgr_pending);
		smc_listen_decline(new_smc, rc, local_contact);
		return;
	}

	/* receive SMC Confirm CLC message */
	reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
				       SMC_CLC_CONFIRM);
	if (reason_code) {
		mutex_unlock(&smc_create_lgr_pending);
		smc_listen_decline(new_smc, reason_code, local_contact);
		return;
	}

	/* finish worker */
	smc_listen_rdma_finish(new_smc, &cclc, local_contact);
	smc_conn_save_peer_info(new_smc, &cclc);
	mutex_unlock(&smc_create_lgr_pending);
	smc_listen_out_connected(new_smc);
}

static void smc_tcp_listen_work(struct work_struct *work)
{
	struct smc_sock *lsmc = container_of(work, struct smc_sock,
					     tcp_listen_work);
	struct sock *lsk = &lsmc->sk;
	struct smc_sock *new_smc;
	int rc = 0;

	lock_sock(lsk);
	while (lsk->sk_state == SMC_LISTEN) {
		rc = smc_clcsock_accept(lsmc, &new_smc);
		if (rc)
			goto out;
		if (!new_smc)
			continue;

		new_smc->listen_smc = lsmc;
		new_smc->use_fallback = lsmc->use_fallback;
		sock_hold(lsk); /* sock_put in smc_listen_work */
		INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
		smc_copy_sock_settings_to_smc(new_smc);
		sock_hold(&new_smc->sk); /* sock_put in passive closing */
		if (!schedule_work(&new_smc->smc_listen_work))
			sock_put(&new_smc->sk);
	}

out:
	release_sock(lsk);
	sock_put(&lsmc->sk); /* sock_hold in smc_listen */
}

static int smc_listen(struct socket *sock, int backlog)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);
	lock_sock(sk);

	rc = -EINVAL;
	if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
		goto out;

	rc = 0;
	if (sk->sk_state == SMC_LISTEN) {
		sk->sk_max_ack_backlog = backlog;
		goto out;
	}
	/* some socket options are handled in core, so we cannot apply
	 * them to the clc socket -- copy smc socket options to clc socket
	 */
	smc_copy_sock_settings_to_clc(smc);
	if (!smc->use_fallback)
		tcp_sk(smc->clcsock->sk)->syn_smc = 1;

	rc = kernel_listen(smc->clcsock, backlog);
	if (rc)
		goto out;
	sk->sk_max_ack_backlog = backlog;
	sk->sk_ack_backlog = 0;
	sk->sk_state = SMC_LISTEN;
	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
	sock_hold(sk); /* sock_put in smc_tcp_listen_work */
	if (!schedule_work(&smc->tcp_listen_work))
		sock_put(sk);

out:
	release_sock(sk);
	return rc;
}

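/* accept, with emulation of TCP_DEFER_ACCEPT: if sockopt_defer_accept
 * is set, a blocking accept() additionally waits up to that many
 * seconds for payload data on the new socket, for both fallback (TCP)
 * and native SMC connections.
 */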
static int smc_accept(struct socket *sock, struct socket *new_sock,
		      int flags, bool kern)
{
	struct sock *sk = sock->sk, *nsk;
	DECLARE_WAITQUEUE(wait, current);
	struct smc_sock *lsmc;
	long timeo;
	int rc = 0;

	lsmc = smc_sk(sk);
	sock_hold(sk); /* sock_put below */
	lock_sock(sk);

	if (lsmc->sk.sk_state != SMC_LISTEN) {
		rc = -EINVAL;
		release_sock(sk);
		goto out;
	}

	/* Wait for an incoming connection */
	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
	add_wait_queue_exclusive(sk_sleep(sk), &wait);
	while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (!timeo) {
			rc = -EAGAIN;
			break;
		}
		release_sock(sk);
		timeo = schedule_timeout(timeo);
		/* wakeup by sk_data_ready in smc_listen_work() */
		sched_annotate_sleep();
		lock_sock(sk);
		if (signal_pending(current)) {
			rc = sock_intr_errno(timeo);
			break;
		}
	}
	set_current_state(TASK_RUNNING);
	remove_wait_queue(sk_sleep(sk), &wait);

	if (!rc)
		rc = sock_error(nsk);
	release_sock(sk);
	if (rc)
		goto out;

	if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) {
		/* wait till data arrives on the socket */
		timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept *
								MSEC_PER_SEC);
		if (smc_sk(nsk)->use_fallback) {
			struct sock *clcsk = smc_sk(nsk)->clcsock->sk;

			lock_sock(clcsk);
			if (skb_queue_empty(&clcsk->sk_receive_queue))
				sk_wait_data(clcsk, &timeo, NULL);
			release_sock(clcsk);
		} else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) {
			lock_sock(nsk);
			smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available);
			release_sock(nsk);
		}
	}

out:
	sock_put(sk); /* sock_hold above */
	return rc;
}

static int smc_getname(struct socket *sock, struct sockaddr *addr,
		       int peer)
{
	struct smc_sock *smc;

	if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
	    (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
		return -ENOTCONN;

	smc = smc_sk(sock->sk);

	return smc->clcsock->ops->getname(smc->clcsock, addr, peer);
}

static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EPIPE;

	smc = smc_sk(sk);
	lock_sock(sk);
	if ((sk->sk_state != SMC_ACTIVE) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
	    (sk->sk_state != SMC_INIT))
		goto out;

	if (msg->msg_flags & MSG_FASTOPEN) {
		if (sk->sk_state == SMC_INIT) {
			smc->use_fallback = true;
		} else {
			rc = -EINVAL;
			goto out;
		}
	}

	if (smc->use_fallback)
		rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
	else
		rc = smc_tx_sendmsg(smc, msg, len);
out:
	release_sock(sk);
	return rc;
}

static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
		       int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -ENOTCONN;

	smc = smc_sk(sk);
	lock_sock(sk);
	if ((sk->sk_state == SMC_INIT) ||
	    (sk->sk_state == SMC_LISTEN) ||
	    (sk->sk_state == SMC_CLOSED))
		goto out;

	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
		rc = 0;
		goto out;
	}

	if (smc->use_fallback) {
		rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
	} else {
		msg->msg_namelen = 0;
		rc = smc_rx_recvmsg(smc, msg, NULL, len, flags);
	}

out:
	release_sock(sk);
	return rc;
}

static __poll_t smc_accept_poll(struct sock *parent)
{
	struct smc_sock *isk = smc_sk(parent);
	__poll_t mask = 0;

	spin_lock(&isk->accept_q_lock);
	if (!list_empty(&isk->accept_q))
		mask = EPOLLIN | EPOLLRDNORM;
	spin_unlock(&isk->accept_q_lock);

	return mask;
}

static __poll_t smc_poll(struct file *file, struct socket *sock,
			     poll_table *wait)
{
	struct sock *sk = sock->sk;
	__poll_t mask = 0;
	struct smc_sock *smc;
	int rc;

	if (!sk)
		return EPOLLNVAL;

	smc = smc_sk(sock->sk);
	sock_hold(sk);
	lock_sock(sk);
	if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
		/* delegate to CLC child sock */
		release_sock(sk);
		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
		lock_sock(sk);
		sk->sk_err = smc->clcsock->sk->sk_err;
		if (sk->sk_err) {
			mask |= EPOLLERR;
		} else {
			/* if non-blocking connect finished ... */
			if (sk->sk_state == SMC_INIT &&
			    (mask & EPOLLOUT) &&
			    smc->clcsock->sk->sk_state != TCP_CLOSE) {
				rc = __smc_connect(smc);
				if (rc < 0)
					mask |= EPOLLERR;
				/* success cases including fallback */
				mask |= EPOLLOUT | EPOLLWRNORM;
			}
		}
	} else {
		if (sk->sk_state != SMC_CLOSED) {
			release_sock(sk);
			sock_poll_wait(file, sk_sleep(sk), wait);
			lock_sock(sk);
		}
		if (sk->sk_err)
			mask |= EPOLLERR;
		if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
		    (sk->sk_state == SMC_CLOSED))
			mask |= EPOLLHUP;
		if (sk->sk_state == SMC_LISTEN) {
			/* woken up by sk_data_ready in smc_listen_work() */
			mask = smc_accept_poll(sk);
		} else {
			if (atomic_read(&smc->conn.sndbuf_space) ||
			    sk->sk_shutdown & SEND_SHUTDOWN) {
				mask |= EPOLLOUT | EPOLLWRNORM;
			} else {
				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
			}
			if (atomic_read(&smc->conn.bytes_to_rcv))
				mask |= EPOLLIN | EPOLLRDNORM;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
			if (sk->sk_state == SMC_APPCLOSEWAIT1)
				mask |= EPOLLIN;
		}
	}
	release_sock(sk);
	sock_put(sk);

	return mask;
}

static int smc_shutdown(struct socket *sock, int how)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EINVAL;
	int rc1 = 0;

	smc = smc_sk(sk);

	if ((how < SHUT_RD) || (how > SHUT_RDWR))
		return rc;

	lock_sock(sk);

	rc = -ENOTCONN;
	if ((sk->sk_state != SMC_LISTEN) &&
	    (sk->sk_state != SMC_ACTIVE) &&
	    (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
	    (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT2) &&
	    (sk->sk_state != SMC_APPFINCLOSEWAIT))
		goto out;
	if (smc->use_fallback) {
		rc = kernel_sock_shutdown(smc->clcsock, how);
		sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
		if (sk->sk_shutdown == SHUTDOWN_MASK)
			sk->sk_state = SMC_CLOSED;
		goto out;
	}
	switch (how) {
	case SHUT_RDWR:		/* shutdown in both directions */
		rc = smc_close_active(smc);
		break;
	case SHUT_WR:
		rc = smc_close_shutdown_write(smc);
		break;
	case SHUT_RD:
		rc = 0;
		/* nothing more to do because peer is not involved */
		break;
	}
	if (smc->clcsock)
		rc1 = kernel_sock_shutdown(smc->clcsock, how);
	/* map sock_shutdown_cmd constants to sk_shutdown value range */
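	/* e.g. SHUT_RD (0) -> RCV_SHUTDOWN (1), SHUT_WR (1) ->
	 * SEND_SHUTDOWN (2), SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
	 */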
	sk->sk_shutdown |= how + 1;

out:
	release_sock(sk);
	return rc ? rc : rc1;
}

static int smc_setsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int val, rc;

	smc = smc_sk(sk);

	/* generic setsockopts reaching us here always apply to the
	 * CLC socket
	 */
	rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
					   optval, optlen);
	if (smc->clcsock->sk->sk_err) {
		sk->sk_err = smc->clcsock->sk->sk_err;
		sk->sk_error_report(sk);
	}
	if (rc)
		return rc;

	if (optlen < sizeof(int))
		return rc;
	get_user(val, (int __user *)optval);

	lock_sock(sk);
	switch (optname) {
	case TCP_ULP:
	case TCP_FASTOPEN:
	case TCP_FASTOPEN_CONNECT:
	case TCP_FASTOPEN_KEY:
	case TCP_FASTOPEN_NO_COOKIE:
		/* option not supported by SMC */
		if (sk->sk_state == SMC_INIT) {
			smc->use_fallback = true;
		} else {
			if (!smc->use_fallback)
				rc = -EINVAL;
		}
		break;
	case TCP_NODELAY:
		if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
			if (val && !smc->use_fallback)
				mod_delayed_work(system_wq, &smc->conn.tx_work,
						 0);
		}
		break;
	case TCP_CORK:
		if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
			if (!val && !smc->use_fallback)
				mod_delayed_work(system_wq, &smc->conn.tx_work,
						 0);
		}
		break;
	case TCP_DEFER_ACCEPT:
		smc->sockopt_defer_accept = val;
		break;
	default:
		break;
	}
	release_sock(sk);

	return rc;
}

static int smc_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct smc_sock *smc;

	smc = smc_sk(sock->sk);
	/* socket options apply to the CLC socket */
	return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
					     optval, optlen);
}

static int smc_ioctl(struct socket *sock, unsigned int cmd,
		     unsigned long arg)
{
	struct smc_sock *smc;
	int answ;

	smc = smc_sk(sock->sk);
	if (smc->use_fallback) {
		if (!smc->clcsock)
			return -EBADF;
		return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
	}
	switch (cmd) {
	case SIOCINQ: /* same as FIONREAD */
		if (smc->sk.sk_state == SMC_LISTEN)
			return -EINVAL;
		answ = atomic_read(&smc->conn.bytes_to_rcv);
		break;
	case SIOCOUTQ:
		/* output queue size (not sent + not acked) */
		if (smc->sk.sk_state == SMC_LISTEN)
			return -EINVAL;
		answ = smc->conn.sndbuf_desc->len -
					atomic_read(&smc->conn.sndbuf_space);
		break;
	case SIOCOUTQNSD:
		/* output queue size (not sent only) */
		if (smc->sk.sk_state == SMC_LISTEN)
			return -EINVAL;
		answ = smc_tx_prepared_sends(&smc->conn);
		break;
	default:
		return -ENOIOCTLCMD;
	}

	return put_user(answ, (int __user *)arg);
}

static ssize_t smc_sendpage(struct socket *sock, struct page *page,
			    int offset, size_t size, int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EPIPE;

	smc = smc_sk(sk);
	lock_sock(sk);
	if (sk->sk_state != SMC_ACTIVE) {
		release_sock(sk);
		goto out;
	}
	release_sock(sk);
	if (smc->use_fallback)
		rc = kernel_sendpage(smc->clcsock, page, offset,
				     size, flags);
	else
		rc = sock_no_sendpage(sock, page, offset, size, flags);

out:
	return rc;
}

/* Map the affected portions of the rmbe into an spd, note the number of bytes
 * to splice in conn->splice_pending, and press 'go'. Delays consumer cursor
 * updates till whenever a respective page has been fully processed.
 * Note that subsequent recv() calls have to wait till all splice() processing
 * completed.
 */
static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
			       struct pipe_inode_info *pipe, size_t len,
			       unsigned int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -ENOTCONN;

	smc = smc_sk(sk);
	lock_sock(sk);

	if (sk->sk_state == SMC_INIT ||
	    sk->sk_state == SMC_LISTEN ||
	    sk->sk_state == SMC_CLOSED)
		goto out;

	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
		rc = 0;
		goto out;
	}

	if (smc->use_fallback) {
		rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
						    pipe, len, flags);
	} else {
		if (*ppos) {
			rc = -ESPIPE;
			goto out;
		}
		if (flags & SPLICE_F_NONBLOCK)
			flags = MSG_DONTWAIT;
		else
			flags = 0;
		rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags);
	}
out:
	release_sock(sk);

	return rc;
}

/* must look like tcp */
static const struct proto_ops smc_sock_ops = {
	.family		= PF_SMC,
	.owner		= THIS_MODULE,
	.release	= smc_release,
	.bind		= smc_bind,
	.connect	= smc_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= smc_accept,
	.getname	= smc_getname,
	.poll		= smc_poll,
	.ioctl		= smc_ioctl,
	.listen		= smc_listen,
	.shutdown	= smc_shutdown,
	.setsockopt	= smc_setsockopt,
	.getsockopt	= smc_getsockopt,
	.sendmsg	= smc_sendmsg,
	.recvmsg	= smc_recvmsg,
	.mmap		= sock_no_mmap,
	.sendpage	= smc_sendpage,
	.splice_read	= smc_splice_read,
};

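/* Illustrative userspace usage (SMCPROTO_SMC is 0, SMCPROTO_SMC6 is 1,
 * see smc.h; userspace may have to provide AF_SMC and these constants
 * itself). The resulting socket behaves like a TCP socket and falls
 * back to TCP transparently when SMC cannot be used:
 *
 *	fd = socket(AF_SMC, SOCK_STREAM, SMCPROTO_SMC);		// IPv4
 *	fd = socket(AF_SMC, SOCK_STREAM, SMCPROTO_SMC6);	// IPv6
 */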
static int smc_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET;
	struct smc_sock *smc;
	struct sock *sk;
	int rc;

	rc = -ESOCKTNOSUPPORT;
	if (sock->type != SOCK_STREAM)
		goto out;

	rc = -EPROTONOSUPPORT;
	if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6)
		goto out;

	rc = -ENOBUFS;
	sock->ops = &smc_sock_ops;
	sk = smc_sock_alloc(net, sock, protocol);
	if (!sk)
		goto out;

	/* create internal TCP socket for CLC handshake and fallback */
	smc = smc_sk(sk);
	smc->use_fallback = false; /* assume rdma capability first */
	rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
			      &smc->clcsock);
	if (rc) {
		sk_common_release(sk);
		goto out;
	}
	smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
	smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);

out:
	return rc;
}

static const struct net_proto_family smc_sock_family_ops = {
	.family	= PF_SMC,
	.owner	= THIS_MODULE,
	.create	= smc_create,
};

static int __init smc_init(void)
{
	int rc;

	rc = smc_pnet_init();
	if (rc)
		return rc;

	rc = smc_llc_init();
	if (rc) {
		pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = smc_cdc_init();
	if (rc) {
		pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = proto_register(&smc_proto, 1);
	if (rc) {
		pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = proto_register(&smc_proto6, 1);
	if (rc) {
		pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc);
		goto out_proto;
	}

	rc = sock_register(&smc_sock_family_ops);
	if (rc) {
		pr_err("%s: sock_register fails with %d\n", __func__, rc);
		goto out_proto6;
	}
	INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
	INIT_HLIST_HEAD(&smc_v6_hashinfo.ht);

	rc = smc_ib_register_client();
	if (rc) {
		pr_err("%s: ib_register fails with %d\n", __func__, rc);
		goto out_sock;
	}

	static_branch_enable(&tcp_have_smc);
	return 0;

out_sock:
	sock_unregister(PF_SMC);
out_proto6:
	proto_unregister(&smc_proto6);
out_proto:
	proto_unregister(&smc_proto);
out_pnet:
	smc_pnet_exit();
	return rc;
}

static void __exit smc_exit(void)
{
	smc_core_exit();
	static_branch_disable(&tcp_have_smc);
	smc_ib_unregister_client();
	sock_unregister(PF_SMC);
	proto_unregister(&smc_proto6);
	proto_unregister(&smc_proto);
	smc_pnet_exit();
}

module_init(smc_init);
module_exit(smc_exit);

MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
MODULE_DESCRIPTION("smc socket address family");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_SMC);