xref: /openbmc/linux/net/smc/af_smc.c (revision 96ac6d43)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *  Shared Memory Communications over RDMA (SMC-R) and RoCE
4  *
5  *  AF_SMC protocol family socket handler keeping the AF_INET sock address type
6  *  applies to SOCK_STREAM sockets only
7  *  offers an alternative communication option for TCP-protocol sockets
8  *  applicable with RoCE-cards only
9  *
10  *  Initial restrictions:
11  *    - support for alternate links postponed
12  *
13  *  Copyright IBM Corp. 2016, 2018
14  *
15  *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
16  *              based on prototype from Frank Blaschka
17  */
18 
19 #define KMSG_COMPONENT "smc"
20 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
21 
22 #include <linux/module.h>
23 #include <linux/socket.h>
24 #include <linux/workqueue.h>
25 #include <linux/in.h>
26 #include <linux/sched/signal.h>
27 #include <linux/if_vlan.h>
28 
29 #include <net/sock.h>
30 #include <net/tcp.h>
31 #include <net/smc.h>
32 #include <asm/ioctls.h>
33 
34 #include <net/net_namespace.h>
35 #include <net/netns/generic.h>
36 #include "smc_netns.h"
37 
38 #include "smc.h"
39 #include "smc_clc.h"
40 #include "smc_llc.h"
41 #include "smc_cdc.h"
42 #include "smc_core.h"
43 #include "smc_ib.h"
44 #include "smc_ism.h"
45 #include "smc_pnet.h"
46 #include "smc_tx.h"
47 #include "smc_rx.h"
48 #include "smc_close.h"
49 
/* serialize link group creation on server */
static DEFINE_MUTEX(smc_server_lgr_pending);
/* serialize link group creation on client */
static DEFINE_MUTEX(smc_client_lgr_pending);

/* work handlers, attached to each smc socket in smc_sock_alloc() */
static void smc_connect_work(struct work_struct *);
static void smc_tcp_listen_work(struct work_struct *);
59 
60 static void smc_set_keepalive(struct sock *sk, int val)
61 {
62 	struct smc_sock *smc = smc_sk(sk);
63 
64 	smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
65 }
66 
/* socket list and lock referenced by smc_proto (see .h.smc_hash there) */
static struct smc_hashinfo smc_v4_hashinfo = {
	.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
};

/* socket list and lock referenced by smc_proto6 (see .h.smc_hash there) */
static struct smc_hashinfo smc_v6_hashinfo = {
	.lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
};
74 
75 int smc_hash_sk(struct sock *sk)
76 {
77 	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
78 	struct hlist_head *head;
79 
80 	head = &h->ht;
81 
82 	write_lock_bh(&h->lock);
83 	sk_add_node(sk, head);
84 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
85 	write_unlock_bh(&h->lock);
86 
87 	return 0;
88 }
89 EXPORT_SYMBOL_GPL(smc_hash_sk);
90 
91 void smc_unhash_sk(struct sock *sk)
92 {
93 	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
94 
95 	write_lock_bh(&h->lock);
96 	if (sk_del_node_init(sk))
97 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
98 	write_unlock_bh(&h->lock);
99 }
100 EXPORT_SYMBOL_GPL(smc_unhash_sk);
101 
102 struct proto smc_proto = {
103 	.name		= "SMC",
104 	.owner		= THIS_MODULE,
105 	.keepalive	= smc_set_keepalive,
106 	.hash		= smc_hash_sk,
107 	.unhash		= smc_unhash_sk,
108 	.obj_size	= sizeof(struct smc_sock),
109 	.h.smc_hash	= &smc_v4_hashinfo,
110 	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
111 };
112 EXPORT_SYMBOL_GPL(smc_proto);
113 
114 struct proto smc_proto6 = {
115 	.name		= "SMC6",
116 	.owner		= THIS_MODULE,
117 	.keepalive	= smc_set_keepalive,
118 	.hash		= smc_hash_sk,
119 	.unhash		= smc_unhash_sk,
120 	.obj_size	= sizeof(struct smc_sock),
121 	.h.smc_hash	= &smc_v6_hashinfo,
122 	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
123 };
124 EXPORT_SYMBOL_GPL(smc_proto6);
125 
126 static int smc_release(struct socket *sock)
127 {
128 	struct sock *sk = sock->sk;
129 	struct smc_sock *smc;
130 	int rc = 0;
131 
132 	if (!sk)
133 		goto out;
134 
135 	smc = smc_sk(sk);
136 
137 	/* cleanup for a dangling non-blocking connect */
138 	if (smc->connect_nonblock && sk->sk_state == SMC_INIT)
139 		tcp_abort(smc->clcsock->sk, ECONNABORTED);
140 	flush_work(&smc->connect_work);
141 
142 	if (sk->sk_state == SMC_LISTEN)
143 		/* smc_close_non_accepted() is called and acquires
144 		 * sock lock for child sockets again
145 		 */
146 		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
147 	else
148 		lock_sock(sk);
149 
150 	if (!smc->use_fallback) {
151 		rc = smc_close_active(smc);
152 		sock_set_flag(sk, SOCK_DEAD);
153 		sk->sk_shutdown |= SHUTDOWN_MASK;
154 	} else {
155 		if (sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_INIT)
156 			sock_put(sk); /* passive closing */
157 		if (sk->sk_state == SMC_LISTEN) {
158 			/* wake up clcsock accept */
159 			rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR);
160 		}
161 		sk->sk_state = SMC_CLOSED;
162 		sk->sk_state_change(sk);
163 	}
164 
165 	sk->sk_prot->unhash(sk);
166 
167 	if (sk->sk_state == SMC_CLOSED) {
168 		if (smc->clcsock) {
169 			release_sock(sk);
170 			smc_clcsock_release(smc);
171 			lock_sock(sk);
172 		}
173 		if (!smc->use_fallback)
174 			smc_conn_free(&smc->conn);
175 	}
176 
177 	/* detach socket */
178 	sock_orphan(sk);
179 	sock->sk = NULL;
180 	release_sock(sk);
181 
182 	sock_put(sk); /* final sock_put */
183 out:
184 	return rc;
185 }
186 
187 static void smc_destruct(struct sock *sk)
188 {
189 	if (sk->sk_state != SMC_CLOSED)
190 		return;
191 	if (!sock_flag(sk, SOCK_DEAD))
192 		return;
193 
194 	sk_refcnt_debug_dec(sk);
195 }
196 
197 static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
198 				   int protocol)
199 {
200 	struct smc_sock *smc;
201 	struct proto *prot;
202 	struct sock *sk;
203 
204 	prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto;
205 	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
206 	if (!sk)
207 		return NULL;
208 
209 	sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
210 	sk->sk_state = SMC_INIT;
211 	sk->sk_destruct = smc_destruct;
212 	sk->sk_protocol = protocol;
213 	smc = smc_sk(sk);
214 	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
215 	INIT_WORK(&smc->connect_work, smc_connect_work);
216 	INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
217 	INIT_LIST_HEAD(&smc->accept_q);
218 	spin_lock_init(&smc->accept_q_lock);
219 	spin_lock_init(&smc->conn.send_lock);
220 	sk->sk_prot->hash(sk);
221 	sk_refcnt_debug_inc(sk);
222 	mutex_init(&smc->clcsock_release_lock);
223 
224 	return sk;
225 }
226 
227 static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
228 		    int addr_len)
229 {
230 	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
231 	struct sock *sk = sock->sk;
232 	struct smc_sock *smc;
233 	int rc;
234 
235 	smc = smc_sk(sk);
236 
237 	/* replicate tests from inet_bind(), to be safe wrt. future changes */
238 	rc = -EINVAL;
239 	if (addr_len < sizeof(struct sockaddr_in))
240 		goto out;
241 
242 	rc = -EAFNOSUPPORT;
243 	if (addr->sin_family != AF_INET &&
244 	    addr->sin_family != AF_INET6 &&
245 	    addr->sin_family != AF_UNSPEC)
246 		goto out;
247 	/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
248 	if (addr->sin_family == AF_UNSPEC &&
249 	    addr->sin_addr.s_addr != htonl(INADDR_ANY))
250 		goto out;
251 
252 	lock_sock(sk);
253 
254 	/* Check if socket is already active */
255 	rc = -EINVAL;
256 	if (sk->sk_state != SMC_INIT)
257 		goto out_rel;
258 
259 	smc->clcsock->sk->sk_reuse = sk->sk_reuse;
260 	rc = kernel_bind(smc->clcsock, uaddr, addr_len);
261 
262 out_rel:
263 	release_sock(sk);
264 out:
265 	return rc;
266 }
267 
268 static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
269 				   unsigned long mask)
270 {
271 	/* options we don't get control via setsockopt for */
272 	nsk->sk_type = osk->sk_type;
273 	nsk->sk_sndbuf = osk->sk_sndbuf;
274 	nsk->sk_rcvbuf = osk->sk_rcvbuf;
275 	nsk->sk_sndtimeo = osk->sk_sndtimeo;
276 	nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
277 	nsk->sk_mark = osk->sk_mark;
278 	nsk->sk_priority = osk->sk_priority;
279 	nsk->sk_rcvlowat = osk->sk_rcvlowat;
280 	nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
281 	nsk->sk_err = osk->sk_err;
282 
283 	nsk->sk_flags &= ~mask;
284 	nsk->sk_flags |= osk->sk_flags & mask;
285 }
286 
287 #define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
288 			     (1UL << SOCK_KEEPOPEN) | \
289 			     (1UL << SOCK_LINGER) | \
290 			     (1UL << SOCK_BROADCAST) | \
291 			     (1UL << SOCK_TIMESTAMP) | \
292 			     (1UL << SOCK_DBG) | \
293 			     (1UL << SOCK_RCVTSTAMP) | \
294 			     (1UL << SOCK_RCVTSTAMPNS) | \
295 			     (1UL << SOCK_LOCALROUTE) | \
296 			     (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
297 			     (1UL << SOCK_RXQ_OVFL) | \
298 			     (1UL << SOCK_WIFI_STATUS) | \
299 			     (1UL << SOCK_NOFCS) | \
300 			     (1UL << SOCK_FILTER_LOCKED) | \
301 			     (1UL << SOCK_TSTAMP_NEW))
302 /* copy only relevant settings and flags of SOL_SOCKET level from smc to
303  * clc socket (since smc is not called for these options from net/core)
304  */
305 static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
306 {
307 	smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
308 }
309 
310 #define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
311 			     (1UL << SOCK_KEEPOPEN) | \
312 			     (1UL << SOCK_LINGER) | \
313 			     (1UL << SOCK_DBG))
314 /* copy only settings and flags relevant for smc from clc to smc socket */
315 static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
316 {
317 	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
318 }
319 
320 /* register a new rmb, send confirm_rkey msg to register with peer */
321 static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc,
322 		       bool conf_rkey)
323 {
324 	if (!rmb_desc->wr_reg) {
325 		/* register memory region for new rmb */
326 		if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) {
327 			rmb_desc->regerr = 1;
328 			return -EFAULT;
329 		}
330 		rmb_desc->wr_reg = 1;
331 	}
332 	if (!conf_rkey)
333 		return 0;
334 	/* exchange confirm_rkey msg with peer */
335 	if (smc_llc_do_confirm_rkey(link, rmb_desc)) {
336 		rmb_desc->regerr = 1;
337 		return -EFAULT;
338 	}
339 	return 0;
340 }
341 
342 static int smc_clnt_conf_first_link(struct smc_sock *smc)
343 {
344 	struct net *net = sock_net(smc->clcsock->sk);
345 	struct smc_link_group *lgr = smc->conn.lgr;
346 	struct smc_link *link;
347 	int rest;
348 	int rc;
349 
350 	link = &lgr->lnk[SMC_SINGLE_LINK];
351 	/* receive CONFIRM LINK request from server over RoCE fabric */
352 	rest = wait_for_completion_interruptible_timeout(
353 		&link->llc_confirm,
354 		SMC_LLC_WAIT_FIRST_TIME);
355 	if (rest <= 0) {
356 		struct smc_clc_msg_decline dclc;
357 
358 		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
359 				      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
360 		return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
361 	}
362 
363 	if (link->llc_confirm_rc)
364 		return SMC_CLC_DECL_RMBE_EC;
365 
366 	rc = smc_ib_modify_qp_rts(link);
367 	if (rc)
368 		return SMC_CLC_DECL_ERR_RDYLNK;
369 
370 	smc_wr_remember_qp_attr(link);
371 
372 	if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
373 		return SMC_CLC_DECL_ERR_REGRMB;
374 
375 	/* send CONFIRM LINK response over RoCE fabric */
376 	rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP);
377 	if (rc < 0)
378 		return SMC_CLC_DECL_TIMEOUT_CL;
379 
380 	/* receive ADD LINK request from server over RoCE fabric */
381 	rest = wait_for_completion_interruptible_timeout(&link->llc_add,
382 							 SMC_LLC_WAIT_TIME);
383 	if (rest <= 0) {
384 		struct smc_clc_msg_decline dclc;
385 
386 		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
387 				      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
388 		return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc;
389 	}
390 
391 	/* send add link reject message, only one link supported for now */
392 	rc = smc_llc_send_add_link(link,
393 				   link->smcibdev->mac[link->ibport - 1],
394 				   link->gid, SMC_LLC_RESP);
395 	if (rc < 0)
396 		return SMC_CLC_DECL_TIMEOUT_AL;
397 
398 	smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
399 
400 	return 0;
401 }
402 
403 static void smcr_conn_save_peer_info(struct smc_sock *smc,
404 				     struct smc_clc_msg_accept_confirm *clc)
405 {
406 	int bufsize = smc_uncompress_bufsize(clc->rmbe_size);
407 
408 	smc->conn.peer_rmbe_idx = clc->rmbe_idx;
409 	smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
410 	smc->conn.peer_rmbe_size = bufsize;
411 	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
412 	smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
413 }
414 
415 static void smcd_conn_save_peer_info(struct smc_sock *smc,
416 				     struct smc_clc_msg_accept_confirm *clc)
417 {
418 	int bufsize = smc_uncompress_bufsize(clc->dmbe_size);
419 
420 	smc->conn.peer_rmbe_idx = clc->dmbe_idx;
421 	smc->conn.peer_token = clc->token;
422 	/* msg header takes up space in the buffer */
423 	smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg);
424 	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
425 	smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx;
426 }
427 
428 static void smc_conn_save_peer_info(struct smc_sock *smc,
429 				    struct smc_clc_msg_accept_confirm *clc)
430 {
431 	if (smc->conn.lgr->is_smcd)
432 		smcd_conn_save_peer_info(smc, clc);
433 	else
434 		smcr_conn_save_peer_info(smc, clc);
435 }
436 
437 static void smc_link_save_peer_info(struct smc_link *link,
438 				    struct smc_clc_msg_accept_confirm *clc)
439 {
440 	link->peer_qpn = ntoh24(clc->qpn);
441 	memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
442 	memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
443 	link->peer_psn = ntoh24(clc->psn);
444 	link->peer_mtu = clc->qp_mtu;
445 }
446 
447 static void smc_switch_to_fallback(struct smc_sock *smc)
448 {
449 	smc->use_fallback = true;
450 	if (smc->sk.sk_socket && smc->sk.sk_socket->file) {
451 		smc->clcsock->file = smc->sk.sk_socket->file;
452 		smc->clcsock->file->private_data = smc->clcsock;
453 	}
454 }
455 
456 /* fall back during connect */
457 static int smc_connect_fallback(struct smc_sock *smc, int reason_code)
458 {
459 	smc_switch_to_fallback(smc);
460 	smc->fallback_rsn = reason_code;
461 	smc_copy_sock_settings_to_clc(smc);
462 	smc->connect_nonblock = 0;
463 	if (smc->sk.sk_state == SMC_INIT)
464 		smc->sk.sk_state = SMC_ACTIVE;
465 	return 0;
466 }
467 
468 /* decline and fall back during connect */
469 static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
470 {
471 	int rc;
472 
473 	if (reason_code < 0) { /* error, fallback is not possible */
474 		if (smc->sk.sk_state == SMC_INIT)
475 			sock_put(&smc->sk); /* passive closing */
476 		return reason_code;
477 	}
478 	if (reason_code != SMC_CLC_DECL_PEERDECL) {
479 		rc = smc_clc_send_decline(smc, reason_code);
480 		if (rc < 0) {
481 			if (smc->sk.sk_state == SMC_INIT)
482 				sock_put(&smc->sk); /* passive closing */
483 			return rc;
484 		}
485 	}
486 	return smc_connect_fallback(smc, reason_code);
487 }
488 
489 /* abort connecting */
490 static int smc_connect_abort(struct smc_sock *smc, int reason_code,
491 			     int local_contact)
492 {
493 	if (local_contact == SMC_FIRST_CONTACT)
494 		smc_lgr_forget(smc->conn.lgr);
495 	if (smc->conn.lgr->is_smcd)
496 		/* there is only one lgr role for SMC-D; use server lock */
497 		mutex_unlock(&smc_server_lgr_pending);
498 	else
499 		mutex_unlock(&smc_client_lgr_pending);
500 
501 	smc_conn_free(&smc->conn);
502 	smc->connect_nonblock = 0;
503 	return reason_code;
504 }
505 
506 /* check if there is a rdma device available for this connection. */
507 /* called for connect and listen */
508 static int smc_find_rdma_device(struct smc_sock *smc, struct smc_init_info *ini)
509 {
510 	/* PNET table look up: search active ib_device and port
511 	 * within same PNETID that also contains the ethernet device
512 	 * used for the internal TCP socket
513 	 */
514 	smc_pnet_find_roce_resource(smc->clcsock->sk, ini);
515 	if (!ini->ib_dev)
516 		return SMC_CLC_DECL_NOSMCRDEV;
517 	return 0;
518 }
519 
520 /* check if there is an ISM device available for this connection. */
521 /* called for connect and listen */
522 static int smc_find_ism_device(struct smc_sock *smc, struct smc_init_info *ini)
523 {
524 	/* Find ISM device with same PNETID as connecting interface  */
525 	smc_pnet_find_ism_resource(smc->clcsock->sk, ini);
526 	if (!ini->ism_dev)
527 		return SMC_CLC_DECL_NOSMCDDEV;
528 	return 0;
529 }
530 
531 /* Check for VLAN ID and register it on ISM device just for CLC handshake */
532 static int smc_connect_ism_vlan_setup(struct smc_sock *smc,
533 				      struct smc_init_info *ini)
534 {
535 	if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev, ini->vlan_id))
536 		return SMC_CLC_DECL_ISMVLANERR;
537 	return 0;
538 }
539 
540 /* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is
541  * used, the VLAN ID will be registered again during the connection setup.
542  */
543 static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd,
544 					struct smc_init_info *ini)
545 {
546 	if (!is_smcd)
547 		return 0;
548 	if (ini->vlan_id && smc_ism_put_vlan(ini->ism_dev, ini->vlan_id))
549 		return SMC_CLC_DECL_CNFERR;
550 	return 0;
551 }
552 
553 /* CLC handshake during connect */
554 static int smc_connect_clc(struct smc_sock *smc, int smc_type,
555 			   struct smc_clc_msg_accept_confirm *aclc,
556 			   struct smc_init_info *ini)
557 {
558 	int rc = 0;
559 
560 	/* do inband token exchange */
561 	rc = smc_clc_send_proposal(smc, smc_type, ini);
562 	if (rc)
563 		return rc;
564 	/* receive SMC Accept CLC message */
565 	return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT,
566 				CLC_WAIT_TIME);
567 }
568 
569 /* setup for RDMA connection of client */
570 static int smc_connect_rdma(struct smc_sock *smc,
571 			    struct smc_clc_msg_accept_confirm *aclc,
572 			    struct smc_init_info *ini)
573 {
574 	struct smc_link *link;
575 	int reason_code = 0;
576 
577 	ini->is_smcd = false;
578 	ini->ib_lcl = &aclc->lcl;
579 	ini->ib_clcqpn = ntoh24(aclc->qpn);
580 	ini->srv_first_contact = aclc->hdr.flag;
581 
582 	mutex_lock(&smc_client_lgr_pending);
583 	reason_code = smc_conn_create(smc, ini);
584 	if (reason_code) {
585 		mutex_unlock(&smc_client_lgr_pending);
586 		return reason_code;
587 	}
588 	link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];
589 
590 	smc_conn_save_peer_info(smc, aclc);
591 
592 	/* create send buffer and rmb */
593 	if (smc_buf_create(smc, false))
594 		return smc_connect_abort(smc, SMC_CLC_DECL_MEM,
595 					 ini->cln_first_contact);
596 
597 	if (ini->cln_first_contact == SMC_FIRST_CONTACT)
598 		smc_link_save_peer_info(link, aclc);
599 
600 	if (smc_rmb_rtoken_handling(&smc->conn, aclc))
601 		return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK,
602 					 ini->cln_first_contact);
603 
604 	smc_close_init(smc);
605 	smc_rx_init(smc);
606 
607 	if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
608 		if (smc_ib_ready_link(link))
609 			return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK,
610 						 ini->cln_first_contact);
611 	} else {
612 		if (smc_reg_rmb(link, smc->conn.rmb_desc, true))
613 			return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB,
614 						 ini->cln_first_contact);
615 	}
616 	smc_rmb_sync_sg_for_device(&smc->conn);
617 
618 	reason_code = smc_clc_send_confirm(smc);
619 	if (reason_code)
620 		return smc_connect_abort(smc, reason_code,
621 					 ini->cln_first_contact);
622 
623 	smc_tx_init(smc);
624 
625 	if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
626 		/* QP confirmation over RoCE fabric */
627 		reason_code = smc_clnt_conf_first_link(smc);
628 		if (reason_code)
629 			return smc_connect_abort(smc, reason_code,
630 						 ini->cln_first_contact);
631 	}
632 	mutex_unlock(&smc_client_lgr_pending);
633 
634 	smc_copy_sock_settings_to_clc(smc);
635 	smc->connect_nonblock = 0;
636 	if (smc->sk.sk_state == SMC_INIT)
637 		smc->sk.sk_state = SMC_ACTIVE;
638 
639 	return 0;
640 }
641 
642 /* setup for ISM connection of client */
643 static int smc_connect_ism(struct smc_sock *smc,
644 			   struct smc_clc_msg_accept_confirm *aclc,
645 			   struct smc_init_info *ini)
646 {
647 	int rc = 0;
648 
649 	ini->is_smcd = true;
650 	ini->ism_gid = aclc->gid;
651 	ini->srv_first_contact = aclc->hdr.flag;
652 
653 	/* there is only one lgr role for SMC-D; use server lock */
654 	mutex_lock(&smc_server_lgr_pending);
655 	rc = smc_conn_create(smc, ini);
656 	if (rc) {
657 		mutex_unlock(&smc_server_lgr_pending);
658 		return rc;
659 	}
660 
661 	/* Create send and receive buffers */
662 	if (smc_buf_create(smc, true))
663 		return smc_connect_abort(smc, SMC_CLC_DECL_MEM,
664 					 ini->cln_first_contact);
665 
666 	smc_conn_save_peer_info(smc, aclc);
667 	smc_close_init(smc);
668 	smc_rx_init(smc);
669 	smc_tx_init(smc);
670 
671 	rc = smc_clc_send_confirm(smc);
672 	if (rc)
673 		return smc_connect_abort(smc, rc, ini->cln_first_contact);
674 	mutex_unlock(&smc_server_lgr_pending);
675 
676 	smc_copy_sock_settings_to_clc(smc);
677 	smc->connect_nonblock = 0;
678 	if (smc->sk.sk_state == SMC_INIT)
679 		smc->sk.sk_state = SMC_ACTIVE;
680 
681 	return 0;
682 }
683 
684 /* perform steps before actually connecting */
685 static int __smc_connect(struct smc_sock *smc)
686 {
687 	bool ism_supported = false, rdma_supported = false;
688 	struct smc_clc_msg_accept_confirm aclc;
689 	struct smc_init_info ini = {0};
690 	int smc_type;
691 	int rc = 0;
692 
693 	sock_hold(&smc->sk); /* sock put in passive closing */
694 
695 	if (smc->use_fallback)
696 		return smc_connect_fallback(smc, smc->fallback_rsn);
697 
698 	/* if peer has not signalled SMC-capability, fall back */
699 	if (!tcp_sk(smc->clcsock->sk)->syn_smc)
700 		return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC);
701 
702 	/* IPSec connections opt out of SMC-R optimizations */
703 	if (using_ipsec(smc))
704 		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC);
705 
706 	/* get vlan id from IP device */
707 	if (smc_vlan_by_tcpsk(smc->clcsock, &ini))
708 		return smc_connect_decline_fallback(smc,
709 						    SMC_CLC_DECL_GETVLANERR);
710 
711 	/* check if there is an ism device available */
712 	if (!smc_find_ism_device(smc, &ini) &&
713 	    !smc_connect_ism_vlan_setup(smc, &ini)) {
714 		/* ISM is supported for this connection */
715 		ism_supported = true;
716 		smc_type = SMC_TYPE_D;
717 	}
718 
719 	/* check if there is a rdma device available */
720 	if (!smc_find_rdma_device(smc, &ini)) {
721 		/* RDMA is supported for this connection */
722 		rdma_supported = true;
723 		if (ism_supported)
724 			smc_type = SMC_TYPE_B; /* both */
725 		else
726 			smc_type = SMC_TYPE_R; /* only RDMA */
727 	}
728 
729 	/* if neither ISM nor RDMA are supported, fallback */
730 	if (!rdma_supported && !ism_supported)
731 		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_NOSMCDEV);
732 
733 	/* perform CLC handshake */
734 	rc = smc_connect_clc(smc, smc_type, &aclc, &ini);
735 	if (rc) {
736 		smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
737 		return smc_connect_decline_fallback(smc, rc);
738 	}
739 
740 	/* depending on previous steps, connect using rdma or ism */
741 	if (rdma_supported && aclc.hdr.path == SMC_TYPE_R)
742 		rc = smc_connect_rdma(smc, &aclc, &ini);
743 	else if (ism_supported && aclc.hdr.path == SMC_TYPE_D)
744 		rc = smc_connect_ism(smc, &aclc, &ini);
745 	else
746 		rc = SMC_CLC_DECL_MODEUNSUPP;
747 	if (rc) {
748 		smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
749 		return smc_connect_decline_fallback(smc, rc);
750 	}
751 
752 	smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
753 	return 0;
754 }
755 
756 static void smc_connect_work(struct work_struct *work)
757 {
758 	struct smc_sock *smc = container_of(work, struct smc_sock,
759 					    connect_work);
760 	long timeo = smc->sk.sk_sndtimeo;
761 	int rc = 0;
762 
763 	if (!timeo)
764 		timeo = MAX_SCHEDULE_TIMEOUT;
765 	lock_sock(smc->clcsock->sk);
766 	if (smc->clcsock->sk->sk_err) {
767 		smc->sk.sk_err = smc->clcsock->sk->sk_err;
768 	} else if ((1 << smc->clcsock->sk->sk_state) &
769 					(TCPF_SYN_SENT | TCP_SYN_RECV)) {
770 		rc = sk_stream_wait_connect(smc->clcsock->sk, &timeo);
771 		if ((rc == -EPIPE) &&
772 		    ((1 << smc->clcsock->sk->sk_state) &
773 					(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)))
774 			rc = 0;
775 	}
776 	release_sock(smc->clcsock->sk);
777 	lock_sock(&smc->sk);
778 	if (rc != 0 || smc->sk.sk_err) {
779 		smc->sk.sk_state = SMC_CLOSED;
780 		if (rc == -EPIPE || rc == -EAGAIN)
781 			smc->sk.sk_err = EPIPE;
782 		else if (signal_pending(current))
783 			smc->sk.sk_err = -sock_intr_errno(timeo);
784 		goto out;
785 	}
786 
787 	rc = __smc_connect(smc);
788 	if (rc < 0)
789 		smc->sk.sk_err = -rc;
790 
791 out:
792 	if (!sock_flag(&smc->sk, SOCK_DEAD)) {
793 		if (smc->sk.sk_err) {
794 			smc->sk.sk_state_change(&smc->sk);
795 		} else { /* allow polling before and after fallback decision */
796 			smc->clcsock->sk->sk_write_space(smc->clcsock->sk);
797 			smc->sk.sk_write_space(&smc->sk);
798 		}
799 	}
800 	release_sock(&smc->sk);
801 }
802 
803 static int smc_connect(struct socket *sock, struct sockaddr *addr,
804 		       int alen, int flags)
805 {
806 	struct sock *sk = sock->sk;
807 	struct smc_sock *smc;
808 	int rc = -EINVAL;
809 
810 	smc = smc_sk(sk);
811 
812 	/* separate smc parameter checking to be safe */
813 	if (alen < sizeof(addr->sa_family))
814 		goto out_err;
815 	if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
816 		goto out_err;
817 
818 	lock_sock(sk);
819 	switch (sk->sk_state) {
820 	default:
821 		goto out;
822 	case SMC_ACTIVE:
823 		rc = -EISCONN;
824 		goto out;
825 	case SMC_INIT:
826 		rc = 0;
827 		break;
828 	}
829 
830 	smc_copy_sock_settings_to_clc(smc);
831 	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
832 	if (smc->connect_nonblock) {
833 		rc = -EALREADY;
834 		goto out;
835 	}
836 	rc = kernel_connect(smc->clcsock, addr, alen, flags);
837 	if (rc && rc != -EINPROGRESS)
838 		goto out;
839 	if (flags & O_NONBLOCK) {
840 		if (schedule_work(&smc->connect_work))
841 			smc->connect_nonblock = 1;
842 		rc = -EINPROGRESS;
843 	} else {
844 		rc = __smc_connect(smc);
845 		if (rc < 0)
846 			goto out;
847 		else
848 			rc = 0; /* success cases including fallback */
849 	}
850 
851 out:
852 	release_sock(sk);
853 out_err:
854 	return rc;
855 }
856 
857 static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
858 {
859 	struct socket *new_clcsock = NULL;
860 	struct sock *lsk = &lsmc->sk;
861 	struct sock *new_sk;
862 	int rc = -EINVAL;
863 
864 	release_sock(lsk);
865 	new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
866 	if (!new_sk) {
867 		rc = -ENOMEM;
868 		lsk->sk_err = ENOMEM;
869 		*new_smc = NULL;
870 		lock_sock(lsk);
871 		goto out;
872 	}
873 	*new_smc = smc_sk(new_sk);
874 
875 	mutex_lock(&lsmc->clcsock_release_lock);
876 	if (lsmc->clcsock)
877 		rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
878 	mutex_unlock(&lsmc->clcsock_release_lock);
879 	lock_sock(lsk);
880 	if  (rc < 0)
881 		lsk->sk_err = -rc;
882 	if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
883 		new_sk->sk_prot->unhash(new_sk);
884 		if (new_clcsock)
885 			sock_release(new_clcsock);
886 		new_sk->sk_state = SMC_CLOSED;
887 		sock_set_flag(new_sk, SOCK_DEAD);
888 		sock_put(new_sk); /* final */
889 		*new_smc = NULL;
890 		goto out;
891 	}
892 
893 	(*new_smc)->clcsock = new_clcsock;
894 out:
895 	return rc;
896 }
897 
898 /* add a just created sock to the accept queue of the listen sock as
899  * candidate for a following socket accept call from user space
900  */
901 static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
902 {
903 	struct smc_sock *par = smc_sk(parent);
904 
905 	sock_hold(sk); /* sock_put in smc_accept_unlink () */
906 	spin_lock(&par->accept_q_lock);
907 	list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
908 	spin_unlock(&par->accept_q_lock);
909 	sk_acceptq_added(parent);
910 }
911 
912 /* remove a socket from the accept queue of its parental listening socket */
913 static void smc_accept_unlink(struct sock *sk)
914 {
915 	struct smc_sock *par = smc_sk(sk)->listen_smc;
916 
917 	spin_lock(&par->accept_q_lock);
918 	list_del_init(&smc_sk(sk)->accept_q);
919 	spin_unlock(&par->accept_q_lock);
920 	sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
921 	sock_put(sk); /* sock_hold in smc_accept_enqueue */
922 }
923 
924 /* remove a sock from the accept queue to bind it to a new socket created
925  * for a socket accept call from user space
926  */
927 struct sock *smc_accept_dequeue(struct sock *parent,
928 				struct socket *new_sock)
929 {
930 	struct smc_sock *isk, *n;
931 	struct sock *new_sk;
932 
933 	list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
934 		new_sk = (struct sock *)isk;
935 
936 		smc_accept_unlink(new_sk);
937 		if (new_sk->sk_state == SMC_CLOSED) {
938 			new_sk->sk_prot->unhash(new_sk);
939 			if (isk->clcsock) {
940 				sock_release(isk->clcsock);
941 				isk->clcsock = NULL;
942 			}
943 			sock_put(new_sk); /* final */
944 			continue;
945 		}
946 		if (new_sock) {
947 			sock_graft(new_sk, new_sock);
948 			if (isk->use_fallback) {
949 				smc_sk(new_sk)->clcsock->file = new_sock->file;
950 				isk->clcsock->file->private_data = isk->clcsock;
951 			}
952 		}
953 		return new_sk;
954 	}
955 	return NULL;
956 }
957 
958 /* clean up for a created but never accepted sock */
959 void smc_close_non_accepted(struct sock *sk)
960 {
961 	struct smc_sock *smc = smc_sk(sk);
962 
963 	lock_sock(sk);
964 	if (!sk->sk_lingertime)
965 		/* wait for peer closing */
966 		sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
967 	if (!smc->use_fallback) {
968 		smc_close_active(smc);
969 		sock_set_flag(sk, SOCK_DEAD);
970 		sk->sk_shutdown |= SHUTDOWN_MASK;
971 	}
972 	sk->sk_prot->unhash(sk);
973 	if (smc->clcsock) {
974 		struct socket *tcp;
975 
976 		tcp = smc->clcsock;
977 		smc->clcsock = NULL;
978 		sock_release(tcp);
979 	}
980 	if (smc->use_fallback) {
981 		sock_put(sk); /* passive closing */
982 		sk->sk_state = SMC_CLOSED;
983 	} else {
984 		if (sk->sk_state == SMC_CLOSED)
985 			smc_conn_free(&smc->conn);
986 	}
987 	release_sock(sk);
988 	sock_put(sk); /* final sock_put */
989 }
990 
991 static int smc_serv_conf_first_link(struct smc_sock *smc)
992 {
993 	struct net *net = sock_net(smc->clcsock->sk);
994 	struct smc_link_group *lgr = smc->conn.lgr;
995 	struct smc_link *link;
996 	int rest;
997 	int rc;
998 
999 	link = &lgr->lnk[SMC_SINGLE_LINK];
1000 
1001 	if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
1002 		return SMC_CLC_DECL_ERR_REGRMB;
1003 
1004 	/* send CONFIRM LINK request to client over the RoCE fabric */
1005 	rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ);
1006 	if (rc < 0)
1007 		return SMC_CLC_DECL_TIMEOUT_CL;
1008 
1009 	/* receive CONFIRM LINK response from client over the RoCE fabric */
1010 	rest = wait_for_completion_interruptible_timeout(
1011 		&link->llc_confirm_resp,
1012 		SMC_LLC_WAIT_FIRST_TIME);
1013 	if (rest <= 0) {
1014 		struct smc_clc_msg_decline dclc;
1015 
1016 		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
1017 				      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
1018 		return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
1019 	}
1020 
1021 	if (link->llc_confirm_resp_rc)
1022 		return SMC_CLC_DECL_RMBE_EC;
1023 
1024 	/* send ADD LINK request to client over the RoCE fabric */
1025 	rc = smc_llc_send_add_link(link,
1026 				   link->smcibdev->mac[link->ibport - 1],
1027 				   link->gid, SMC_LLC_REQ);
1028 	if (rc < 0)
1029 		return SMC_CLC_DECL_TIMEOUT_AL;
1030 
1031 	/* receive ADD LINK response from client over the RoCE fabric */
1032 	rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp,
1033 							 SMC_LLC_WAIT_TIME);
1034 	if (rest <= 0) {
1035 		struct smc_clc_msg_decline dclc;
1036 
1037 		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
1038 				      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
1039 		return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc;
1040 	}
1041 
1042 	smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
1043 
1044 	return 0;
1045 }
1046 
1047 /* listen worker: finish */
1048 static void smc_listen_out(struct smc_sock *new_smc)
1049 {
1050 	struct smc_sock *lsmc = new_smc->listen_smc;
1051 	struct sock *newsmcsk = &new_smc->sk;
1052 
1053 	if (lsmc->sk.sk_state == SMC_LISTEN) {
1054 		lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
1055 		smc_accept_enqueue(&lsmc->sk, newsmcsk);
1056 		release_sock(&lsmc->sk);
1057 	} else { /* no longer listening */
1058 		smc_close_non_accepted(newsmcsk);
1059 	}
1060 
1061 	/* Wake up accept */
1062 	lsmc->sk.sk_data_ready(&lsmc->sk);
1063 	sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
1064 }
1065 
1066 /* listen worker: finish in state connected */
1067 static void smc_listen_out_connected(struct smc_sock *new_smc)
1068 {
1069 	struct sock *newsmcsk = &new_smc->sk;
1070 
1071 	sk_refcnt_debug_inc(newsmcsk);
1072 	if (newsmcsk->sk_state == SMC_INIT)
1073 		newsmcsk->sk_state = SMC_ACTIVE;
1074 
1075 	smc_listen_out(new_smc);
1076 }
1077 
1078 /* listen worker: finish in error state */
1079 static void smc_listen_out_err(struct smc_sock *new_smc)
1080 {
1081 	struct sock *newsmcsk = &new_smc->sk;
1082 
1083 	if (newsmcsk->sk_state == SMC_INIT)
1084 		sock_put(&new_smc->sk); /* passive closing */
1085 	newsmcsk->sk_state = SMC_CLOSED;
1086 	smc_conn_free(&new_smc->conn);
1087 
1088 	smc_listen_out(new_smc);
1089 }
1090 
1091 /* listen worker: decline and fall back if possible */
1092 static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
1093 			       int local_contact)
1094 {
1095 	/* RDMA setup failed, switch back to TCP */
1096 	if (local_contact == SMC_FIRST_CONTACT)
1097 		smc_lgr_forget(new_smc->conn.lgr);
1098 	if (reason_code < 0) { /* error, no fallback possible */
1099 		smc_listen_out_err(new_smc);
1100 		return;
1101 	}
1102 	smc_conn_free(&new_smc->conn);
1103 	smc_switch_to_fallback(new_smc);
1104 	new_smc->fallback_rsn = reason_code;
1105 	if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) {
1106 		if (smc_clc_send_decline(new_smc, reason_code) < 0) {
1107 			smc_listen_out_err(new_smc);
1108 			return;
1109 		}
1110 	}
1111 	smc_listen_out_connected(new_smc);
1112 }
1113 
1114 /* listen worker: check prefixes */
1115 static int smc_listen_prfx_check(struct smc_sock *new_smc,
1116 				 struct smc_clc_msg_proposal *pclc)
1117 {
1118 	struct smc_clc_msg_proposal_prefix *pclc_prfx;
1119 	struct socket *newclcsock = new_smc->clcsock;
1120 
1121 	pclc_prfx = smc_clc_proposal_get_prefix(pclc);
1122 	if (smc_clc_prfx_match(newclcsock, pclc_prfx))
1123 		return SMC_CLC_DECL_DIFFPREFIX;
1124 
1125 	return 0;
1126 }
1127 
1128 /* listen worker: initialize connection and buffers */
1129 static int smc_listen_rdma_init(struct smc_sock *new_smc,
1130 				struct smc_init_info *ini)
1131 {
1132 	int rc;
1133 
1134 	/* allocate connection / link group */
1135 	rc = smc_conn_create(new_smc, ini);
1136 	if (rc)
1137 		return rc;
1138 
1139 	/* create send buffer and rmb */
1140 	if (smc_buf_create(new_smc, false))
1141 		return SMC_CLC_DECL_MEM;
1142 
1143 	return 0;
1144 }
1145 
1146 /* listen worker: initialize connection and buffers for SMC-D */
1147 static int smc_listen_ism_init(struct smc_sock *new_smc,
1148 			       struct smc_clc_msg_proposal *pclc,
1149 			       struct smc_init_info *ini)
1150 {
1151 	struct smc_clc_msg_smcd *pclc_smcd;
1152 	int rc;
1153 
1154 	pclc_smcd = smc_get_clc_msg_smcd(pclc);
1155 	ini->ism_gid = pclc_smcd->gid;
1156 	rc = smc_conn_create(new_smc, ini);
1157 	if (rc)
1158 		return rc;
1159 
1160 	/* Check if peer can be reached via ISM device */
1161 	if (smc_ism_cantalk(new_smc->conn.lgr->peer_gid,
1162 			    new_smc->conn.lgr->vlan_id,
1163 			    new_smc->conn.lgr->smcd)) {
1164 		if (ini->cln_first_contact == SMC_FIRST_CONTACT)
1165 			smc_lgr_forget(new_smc->conn.lgr);
1166 		smc_conn_free(&new_smc->conn);
1167 		return SMC_CLC_DECL_SMCDNOTALK;
1168 	}
1169 
1170 	/* Create send and receive buffers */
1171 	if (smc_buf_create(new_smc, true)) {
1172 		if (ini->cln_first_contact == SMC_FIRST_CONTACT)
1173 			smc_lgr_forget(new_smc->conn.lgr);
1174 		smc_conn_free(&new_smc->conn);
1175 		return SMC_CLC_DECL_MEM;
1176 	}
1177 
1178 	return 0;
1179 }
1180 
1181 /* listen worker: register buffers */
1182 static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact)
1183 {
1184 	struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
1185 
1186 	if (local_contact != SMC_FIRST_CONTACT) {
1187 		if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true))
1188 			return SMC_CLC_DECL_ERR_REGRMB;
1189 	}
1190 	smc_rmb_sync_sg_for_device(&new_smc->conn);
1191 
1192 	return 0;
1193 }
1194 
1195 /* listen worker: finish RDMA setup */
1196 static int smc_listen_rdma_finish(struct smc_sock *new_smc,
1197 				  struct smc_clc_msg_accept_confirm *cclc,
1198 				  int local_contact)
1199 {
1200 	struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
1201 	int reason_code = 0;
1202 
1203 	if (local_contact == SMC_FIRST_CONTACT)
1204 		smc_link_save_peer_info(link, cclc);
1205 
1206 	if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) {
1207 		reason_code = SMC_CLC_DECL_ERR_RTOK;
1208 		goto decline;
1209 	}
1210 
1211 	if (local_contact == SMC_FIRST_CONTACT) {
1212 		if (smc_ib_ready_link(link)) {
1213 			reason_code = SMC_CLC_DECL_ERR_RDYLNK;
1214 			goto decline;
1215 		}
1216 		/* QP confirmation over RoCE fabric */
1217 		reason_code = smc_serv_conf_first_link(new_smc);
1218 		if (reason_code)
1219 			goto decline;
1220 	}
1221 	return 0;
1222 
1223 decline:
1224 	smc_listen_decline(new_smc, reason_code, local_contact);
1225 	return reason_code;
1226 }
1227 
1228 /* setup for RDMA connection of server */
1229 static void smc_listen_work(struct work_struct *work)
1230 {
1231 	struct smc_sock *new_smc = container_of(work, struct smc_sock,
1232 						smc_listen_work);
1233 	struct socket *newclcsock = new_smc->clcsock;
1234 	struct smc_clc_msg_accept_confirm cclc;
1235 	struct smc_clc_msg_proposal *pclc;
1236 	struct smc_init_info ini = {0};
1237 	bool ism_supported = false;
1238 	u8 buf[SMC_CLC_MAX_LEN];
1239 	int rc = 0;
1240 
1241 	if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN)
1242 		return smc_listen_out_err(new_smc);
1243 
1244 	if (new_smc->use_fallback) {
1245 		smc_listen_out_connected(new_smc);
1246 		return;
1247 	}
1248 
1249 	/* check if peer is smc capable */
1250 	if (!tcp_sk(newclcsock->sk)->syn_smc) {
1251 		smc_switch_to_fallback(new_smc);
1252 		new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC;
1253 		smc_listen_out_connected(new_smc);
1254 		return;
1255 	}
1256 
1257 	/* do inband token exchange -
1258 	 * wait for and receive SMC Proposal CLC message
1259 	 */
1260 	pclc = (struct smc_clc_msg_proposal *)&buf;
1261 	rc = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN,
1262 			      SMC_CLC_PROPOSAL, CLC_WAIT_TIME);
1263 	if (rc)
1264 		goto out_decl;
1265 
1266 	/* IPSec connections opt out of SMC-R optimizations */
1267 	if (using_ipsec(new_smc)) {
1268 		rc = SMC_CLC_DECL_IPSEC;
1269 		goto out_decl;
1270 	}
1271 
1272 	/* check for matching IP prefix and subnet length */
1273 	rc = smc_listen_prfx_check(new_smc, pclc);
1274 	if (rc)
1275 		goto out_decl;
1276 
1277 	/* get vlan id from IP device */
1278 	if (smc_vlan_by_tcpsk(new_smc->clcsock, &ini)) {
1279 		rc = SMC_CLC_DECL_GETVLANERR;
1280 		goto out_decl;
1281 	}
1282 
1283 	mutex_lock(&smc_server_lgr_pending);
1284 	smc_close_init(new_smc);
1285 	smc_rx_init(new_smc);
1286 	smc_tx_init(new_smc);
1287 
1288 	/* check if ISM is available */
1289 	if (pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) {
1290 		ini.is_smcd = true; /* prepare ISM check */
1291 		rc = smc_find_ism_device(new_smc, &ini);
1292 		if (!rc)
1293 			rc = smc_listen_ism_init(new_smc, pclc, &ini);
1294 		if (!rc)
1295 			ism_supported = true;
1296 		else if (pclc->hdr.path == SMC_TYPE_D)
1297 			goto out_unlock; /* skip RDMA and decline */
1298 	}
1299 
1300 	/* check if RDMA is available */
1301 	if (!ism_supported) { /* SMC_TYPE_R or SMC_TYPE_B */
1302 		/* prepare RDMA check */
1303 		memset(&ini, 0, sizeof(ini));
1304 		ini.is_smcd = false;
1305 		ini.ib_lcl = &pclc->lcl;
1306 		rc = smc_find_rdma_device(new_smc, &ini);
1307 		if (rc) {
1308 			/* no RDMA device found */
1309 			if (pclc->hdr.path == SMC_TYPE_B)
1310 				/* neither ISM nor RDMA device found */
1311 				rc = SMC_CLC_DECL_NOSMCDEV;
1312 			goto out_unlock;
1313 		}
1314 		rc = smc_listen_rdma_init(new_smc, &ini);
1315 		if (rc)
1316 			goto out_unlock;
1317 		rc = smc_listen_rdma_reg(new_smc, ini.cln_first_contact);
1318 		if (rc)
1319 			goto out_unlock;
1320 	}
1321 
1322 	/* send SMC Accept CLC message */
1323 	rc = smc_clc_send_accept(new_smc, ini.cln_first_contact);
1324 	if (rc)
1325 		goto out_unlock;
1326 
1327 	/* SMC-D does not need this lock any more */
1328 	if (ism_supported)
1329 		mutex_unlock(&smc_server_lgr_pending);
1330 
1331 	/* receive SMC Confirm CLC message */
1332 	rc = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
1333 			      SMC_CLC_CONFIRM, CLC_WAIT_TIME);
1334 	if (rc) {
1335 		if (!ism_supported)
1336 			goto out_unlock;
1337 		goto out_decl;
1338 	}
1339 
1340 	/* finish worker */
1341 	if (!ism_supported) {
1342 		rc = smc_listen_rdma_finish(new_smc, &cclc,
1343 					    ini.cln_first_contact);
1344 		mutex_unlock(&smc_server_lgr_pending);
1345 		if (rc)
1346 			return;
1347 	}
1348 	smc_conn_save_peer_info(new_smc, &cclc);
1349 	smc_listen_out_connected(new_smc);
1350 	return;
1351 
1352 out_unlock:
1353 	mutex_unlock(&smc_server_lgr_pending);
1354 out_decl:
1355 	smc_listen_decline(new_smc, rc, ini.cln_first_contact);
1356 }
1357 
1358 static void smc_tcp_listen_work(struct work_struct *work)
1359 {
1360 	struct smc_sock *lsmc = container_of(work, struct smc_sock,
1361 					     tcp_listen_work);
1362 	struct sock *lsk = &lsmc->sk;
1363 	struct smc_sock *new_smc;
1364 	int rc = 0;
1365 
1366 	lock_sock(lsk);
1367 	while (lsk->sk_state == SMC_LISTEN) {
1368 		rc = smc_clcsock_accept(lsmc, &new_smc);
1369 		if (rc)
1370 			goto out;
1371 		if (!new_smc)
1372 			continue;
1373 
1374 		new_smc->listen_smc = lsmc;
1375 		new_smc->use_fallback = lsmc->use_fallback;
1376 		new_smc->fallback_rsn = lsmc->fallback_rsn;
1377 		sock_hold(lsk); /* sock_put in smc_listen_work */
1378 		INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
1379 		smc_copy_sock_settings_to_smc(new_smc);
1380 		new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf;
1381 		new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf;
1382 		sock_hold(&new_smc->sk); /* sock_put in passive closing */
1383 		if (!schedule_work(&new_smc->smc_listen_work))
1384 			sock_put(&new_smc->sk);
1385 	}
1386 
1387 out:
1388 	release_sock(lsk);
1389 	sock_put(&lsmc->sk); /* sock_hold in smc_listen */
1390 }
1391 
1392 static int smc_listen(struct socket *sock, int backlog)
1393 {
1394 	struct sock *sk = sock->sk;
1395 	struct smc_sock *smc;
1396 	int rc;
1397 
1398 	smc = smc_sk(sk);
1399 	lock_sock(sk);
1400 
1401 	rc = -EINVAL;
1402 	if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
1403 		goto out;
1404 
1405 	rc = 0;
1406 	if (sk->sk_state == SMC_LISTEN) {
1407 		sk->sk_max_ack_backlog = backlog;
1408 		goto out;
1409 	}
1410 	/* some socket options are handled in core, so we could not apply
1411 	 * them to the clc socket -- copy smc socket options to clc socket
1412 	 */
1413 	smc_copy_sock_settings_to_clc(smc);
1414 	if (!smc->use_fallback)
1415 		tcp_sk(smc->clcsock->sk)->syn_smc = 1;
1416 
1417 	rc = kernel_listen(smc->clcsock, backlog);
1418 	if (rc)
1419 		goto out;
1420 	sk->sk_max_ack_backlog = backlog;
1421 	sk->sk_ack_backlog = 0;
1422 	sk->sk_state = SMC_LISTEN;
1423 	sock_hold(sk); /* sock_hold in tcp_listen_worker */
1424 	if (!schedule_work(&smc->tcp_listen_work))
1425 		sock_put(sk);
1426 
1427 out:
1428 	release_sock(sk);
1429 	return rc;
1430 }
1431 
/* accept() for an SMC socket.
 *
 * Waits until smc_accept_dequeue() yields a socket for @new_sock, the
 * receive timeout runs out, or a signal is pending.  If a defer-accept
 * value is set on the listener and the call is blocking, additionally waits
 * up to that many seconds for data on the accepted socket.
 *
 * Returns 0 on success; -EINVAL if the socket is not in SMC_LISTEN; -EAGAIN
 * if the timeout is exhausted; the value of sock_intr_errno() if a signal is
 * pending; otherwise the value of sock_error() for the accepted socket.
 */
static int smc_accept(struct socket *sock, struct socket *new_sock,
		      int flags, bool kern)
{
	struct sock *sk = sock->sk, *nsk;
	DECLARE_WAITQUEUE(wait, current);
	struct smc_sock *lsmc;
	long timeo;
	int rc = 0;

	lsmc = smc_sk(sk);
	sock_hold(sk); /* sock_put below */
	lock_sock(sk);

	if (lsmc->sk.sk_state != SMC_LISTEN) {
		rc = -EINVAL;
		release_sock(sk);
		goto out;
	}

	/* Wait for an incoming connection */
	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
	add_wait_queue_exclusive(sk_sleep(sk), &wait);
	while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (!timeo) {
			rc = -EAGAIN;
			break;
		}
		/* the socket lock is not held while sleeping */
		release_sock(sk);
		timeo = schedule_timeout(timeo);
		/* wakeup by sk_data_ready in smc_listen_work() */
		sched_annotate_sleep();
		lock_sock(sk);
		if (signal_pending(current)) {
			rc = sock_intr_errno(timeo);
			break;
		}
	}
	set_current_state(TASK_RUNNING);
	remove_wait_queue(sk_sleep(sk), &wait);

	/* rc == 0 means the loop ended because a socket was dequeued */
	if (!rc)
		rc = sock_error(nsk);
	release_sock(sk);
	if (rc)
		goto out;

	if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) {
		/* wait till data arrives on the socket */
		timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept *
								MSEC_PER_SEC);
		if (smc_sk(nsk)->use_fallback) {
			/* fallback: data arrives on the CLC socket */
			struct sock *clcsk = smc_sk(nsk)->clcsock->sk;

			lock_sock(clcsk);
			if (skb_queue_empty(&clcsk->sk_receive_queue))
				sk_wait_data(clcsk, &timeo, NULL);
			release_sock(clcsk);
		} else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) {
			lock_sock(nsk);
			smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available);
			release_sock(nsk);
		}
	}

out:
	sock_put(sk); /* sock_hold above */
	return rc;
}
1501 
1502 static int smc_getname(struct socket *sock, struct sockaddr *addr,
1503 		       int peer)
1504 {
1505 	struct smc_sock *smc;
1506 
1507 	if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
1508 	    (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
1509 		return -ENOTCONN;
1510 
1511 	smc = smc_sk(sock->sk);
1512 
1513 	return smc->clcsock->ops->getname(smc->clcsock, addr, peer);
1514 }
1515 
1516 static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
1517 {
1518 	struct sock *sk = sock->sk;
1519 	struct smc_sock *smc;
1520 	int rc = -EPIPE;
1521 
1522 	smc = smc_sk(sk);
1523 	lock_sock(sk);
1524 	if ((sk->sk_state != SMC_ACTIVE) &&
1525 	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1526 	    (sk->sk_state != SMC_INIT))
1527 		goto out;
1528 
1529 	if (msg->msg_flags & MSG_FASTOPEN) {
1530 		if (sk->sk_state == SMC_INIT) {
1531 			smc_switch_to_fallback(smc);
1532 			smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
1533 		} else {
1534 			rc = -EINVAL;
1535 			goto out;
1536 		}
1537 	}
1538 
1539 	if (smc->use_fallback)
1540 		rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
1541 	else
1542 		rc = smc_tx_sendmsg(smc, msg, len);
1543 out:
1544 	release_sock(sk);
1545 	return rc;
1546 }
1547 
1548 static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
1549 		       int flags)
1550 {
1551 	struct sock *sk = sock->sk;
1552 	struct smc_sock *smc;
1553 	int rc = -ENOTCONN;
1554 
1555 	smc = smc_sk(sk);
1556 	lock_sock(sk);
1557 	if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
1558 		/* socket was connected before, no more data to read */
1559 		rc = 0;
1560 		goto out;
1561 	}
1562 	if ((sk->sk_state == SMC_INIT) ||
1563 	    (sk->sk_state == SMC_LISTEN) ||
1564 	    (sk->sk_state == SMC_CLOSED))
1565 		goto out;
1566 
1567 	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1568 		rc = 0;
1569 		goto out;
1570 	}
1571 
1572 	if (smc->use_fallback) {
1573 		rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
1574 	} else {
1575 		msg->msg_namelen = 0;
1576 		rc = smc_rx_recvmsg(smc, msg, NULL, len, flags);
1577 	}
1578 
1579 out:
1580 	release_sock(sk);
1581 	return rc;
1582 }
1583 
1584 static __poll_t smc_accept_poll(struct sock *parent)
1585 {
1586 	struct smc_sock *isk = smc_sk(parent);
1587 	__poll_t mask = 0;
1588 
1589 	spin_lock(&isk->accept_q_lock);
1590 	if (!list_empty(&isk->accept_q))
1591 		mask = EPOLLIN | EPOLLRDNORM;
1592 	spin_unlock(&isk->accept_q_lock);
1593 
1594 	return mask;
1595 }
1596 
/* poll() for an SMC socket.
 *
 * In fallback mode the poll is delegated to the CLC socket and its sk_err
 * is copied to the SMC socket.  Otherwise the mask is built from the SMC
 * socket state and the connection's send/receive counters.
 *
 * Returns EPOLLNVAL if the socket has no sock attached, otherwise the
 * computed event mask.  The socket lock is not taken here.
 */
static __poll_t smc_poll(struct file *file, struct socket *sock,
			     poll_table *wait)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	__poll_t mask = 0;

	if (!sk)
		return EPOLLNVAL;

	smc = smc_sk(sock->sk);
	if (smc->use_fallback) {
		/* delegate to CLC child sock */
		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
		sk->sk_err = smc->clcsock->sk->sk_err;
	} else {
		if (sk->sk_state != SMC_CLOSED)
			sock_poll_wait(file, sock, wait);
		if (sk->sk_err)
			mask |= EPOLLERR;
		if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
		    (sk->sk_state == SMC_CLOSED))
			mask |= EPOLLHUP;
		if (sk->sk_state == SMC_LISTEN) {
			/* woken up by sk_data_ready in smc_listen_work() */
			mask |= smc_accept_poll(sk);
		} else if (smc->use_fallback) { /* as result of connect_work()*/
			/* use_fallback is checked a second time: it may have
			 * been set since the test at the top
			 */
			mask |= smc->clcsock->ops->poll(file, smc->clcsock,
							   wait);
			sk->sk_err = smc->clcsock->sk->sk_err;
		} else {
			/* writable: send buffer space left outside SMC_INIT,
			 * or sending already shut down
			 */
			if ((sk->sk_state != SMC_INIT &&
			     atomic_read(&smc->conn.sndbuf_space)) ||
			    sk->sk_shutdown & SEND_SHUTDOWN) {
				mask |= EPOLLOUT | EPOLLWRNORM;
			} else {
				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
			}
			if (atomic_read(&smc->conn.bytes_to_rcv))
				mask |= EPOLLIN | EPOLLRDNORM;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
			if (sk->sk_state == SMC_APPCLOSEWAIT1)
				mask |= EPOLLIN;
			if (smc->conn.urg_state == SMC_URG_VALID)
				mask |= EPOLLPRI;
		}
	}

	return mask;
}
1649 
1650 static int smc_shutdown(struct socket *sock, int how)
1651 {
1652 	struct sock *sk = sock->sk;
1653 	struct smc_sock *smc;
1654 	int rc = -EINVAL;
1655 	int rc1 = 0;
1656 
1657 	smc = smc_sk(sk);
1658 
1659 	if ((how < SHUT_RD) || (how > SHUT_RDWR))
1660 		return rc;
1661 
1662 	lock_sock(sk);
1663 
1664 	rc = -ENOTCONN;
1665 	if ((sk->sk_state != SMC_ACTIVE) &&
1666 	    (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
1667 	    (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
1668 	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1669 	    (sk->sk_state != SMC_APPCLOSEWAIT2) &&
1670 	    (sk->sk_state != SMC_APPFINCLOSEWAIT))
1671 		goto out;
1672 	if (smc->use_fallback) {
1673 		rc = kernel_sock_shutdown(smc->clcsock, how);
1674 		sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
1675 		if (sk->sk_shutdown == SHUTDOWN_MASK)
1676 			sk->sk_state = SMC_CLOSED;
1677 		goto out;
1678 	}
1679 	switch (how) {
1680 	case SHUT_RDWR:		/* shutdown in both directions */
1681 		rc = smc_close_active(smc);
1682 		break;
1683 	case SHUT_WR:
1684 		rc = smc_close_shutdown_write(smc);
1685 		break;
1686 	case SHUT_RD:
1687 		rc = 0;
1688 		/* nothing more to do because peer is not involved */
1689 		break;
1690 	}
1691 	if (smc->clcsock)
1692 		rc1 = kernel_sock_shutdown(smc->clcsock, how);
1693 	/* map sock_shutdown_cmd constants to sk_shutdown value range */
1694 	sk->sk_shutdown |= how + 1;
1695 
1696 out:
1697 	release_sock(sk);
1698 	return rc ? rc : rc1;
1699 }
1700 
/* setsockopt() for an SMC socket.
 *
 * Every option is first applied to the internal CLC socket; an error
 * pending on that socket afterwards is copied to the SMC socket and
 * reported.  If the CLC call succeeded, the option value is read as an int
 * and a few options get additional SMC handling under the socket lock.
 *
 * Returns the CLC socket's error, -EINVAL if @optlen is smaller than an
 * int, -EFAULT if the value cannot be read, -EINVAL for an unsupported
 * option on a non-fallback socket that already left SMC_INIT, else 0.
 *
 * NOTE(review): the switch below keys on @optname only; @level is not
 * checked in this function.
 */
static int smc_setsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int val, rc;

	smc = smc_sk(sk);

	/* generic setsockopts reaching us here always apply to the
	 * CLC socket
	 */
	rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
					   optval, optlen);
	if (smc->clcsock->sk->sk_err) {
		sk->sk_err = smc->clcsock->sk->sk_err;
		sk->sk_error_report(sk);
	}
	if (rc)
		return rc;

	if (optlen < sizeof(int))
		return -EINVAL;
	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	lock_sock(sk);
	switch (optname) {
	case TCP_ULP:
	case TCP_FASTOPEN:
	case TCP_FASTOPEN_CONNECT:
	case TCP_FASTOPEN_KEY:
	case TCP_FASTOPEN_NO_COOKIE:
		/* option not supported by SMC */
		if (sk->sk_state == SMC_INIT) {
			smc_switch_to_fallback(smc);
			smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
		} else {
			if (!smc->use_fallback)
				rc = -EINVAL;
		}
		break;
	case TCP_NODELAY:
		/* switching it on triggers the tx worker right away */
		if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
			if (val && !smc->use_fallback)
				mod_delayed_work(system_wq, &smc->conn.tx_work,
						 0);
		}
		break;
	case TCP_CORK:
		/* switching it off triggers the tx worker right away */
		if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
			if (!val && !smc->use_fallback)
				mod_delayed_work(system_wq, &smc->conn.tx_work,
						 0);
		}
		break;
	case TCP_DEFER_ACCEPT:
		/* remembered for smc_accept() */
		smc->sockopt_defer_accept = val;
		break;
	default:
		break;
	}
	release_sock(sk);

	return rc;
}
1767 
1768 static int smc_getsockopt(struct socket *sock, int level, int optname,
1769 			  char __user *optval, int __user *optlen)
1770 {
1771 	struct smc_sock *smc;
1772 
1773 	smc = smc_sk(sock->sk);
1774 	/* socket options apply to the CLC socket */
1775 	return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
1776 					     optval, optlen);
1777 }
1778 
1779 static int smc_ioctl(struct socket *sock, unsigned int cmd,
1780 		     unsigned long arg)
1781 {
1782 	union smc_host_cursor cons, urg;
1783 	struct smc_connection *conn;
1784 	struct smc_sock *smc;
1785 	int answ;
1786 
1787 	smc = smc_sk(sock->sk);
1788 	conn = &smc->conn;
1789 	lock_sock(&smc->sk);
1790 	if (smc->use_fallback) {
1791 		if (!smc->clcsock) {
1792 			release_sock(&smc->sk);
1793 			return -EBADF;
1794 		}
1795 		answ = smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
1796 		release_sock(&smc->sk);
1797 		return answ;
1798 	}
1799 	switch (cmd) {
1800 	case SIOCINQ: /* same as FIONREAD */
1801 		if (smc->sk.sk_state == SMC_LISTEN) {
1802 			release_sock(&smc->sk);
1803 			return -EINVAL;
1804 		}
1805 		if (smc->sk.sk_state == SMC_INIT ||
1806 		    smc->sk.sk_state == SMC_CLOSED)
1807 			answ = 0;
1808 		else
1809 			answ = atomic_read(&smc->conn.bytes_to_rcv);
1810 		break;
1811 	case SIOCOUTQ:
1812 		/* output queue size (not send + not acked) */
1813 		if (smc->sk.sk_state == SMC_LISTEN) {
1814 			release_sock(&smc->sk);
1815 			return -EINVAL;
1816 		}
1817 		if (smc->sk.sk_state == SMC_INIT ||
1818 		    smc->sk.sk_state == SMC_CLOSED)
1819 			answ = 0;
1820 		else
1821 			answ = smc->conn.sndbuf_desc->len -
1822 					atomic_read(&smc->conn.sndbuf_space);
1823 		break;
1824 	case SIOCOUTQNSD:
1825 		/* output queue size (not send only) */
1826 		if (smc->sk.sk_state == SMC_LISTEN) {
1827 			release_sock(&smc->sk);
1828 			return -EINVAL;
1829 		}
1830 		if (smc->sk.sk_state == SMC_INIT ||
1831 		    smc->sk.sk_state == SMC_CLOSED)
1832 			answ = 0;
1833 		else
1834 			answ = smc_tx_prepared_sends(&smc->conn);
1835 		break;
1836 	case SIOCATMARK:
1837 		if (smc->sk.sk_state == SMC_LISTEN) {
1838 			release_sock(&smc->sk);
1839 			return -EINVAL;
1840 		}
1841 		if (smc->sk.sk_state == SMC_INIT ||
1842 		    smc->sk.sk_state == SMC_CLOSED) {
1843 			answ = 0;
1844 		} else {
1845 			smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
1846 			smc_curs_copy(&urg, &conn->urg_curs, conn);
1847 			answ = smc_curs_diff(conn->rmb_desc->len,
1848 					     &cons, &urg) == 1;
1849 		}
1850 		break;
1851 	default:
1852 		release_sock(&smc->sk);
1853 		return -ENOIOCTLCMD;
1854 	}
1855 	release_sock(&smc->sk);
1856 
1857 	return put_user(answ, (int __user *)arg);
1858 }
1859 
1860 static ssize_t smc_sendpage(struct socket *sock, struct page *page,
1861 			    int offset, size_t size, int flags)
1862 {
1863 	struct sock *sk = sock->sk;
1864 	struct smc_sock *smc;
1865 	int rc = -EPIPE;
1866 
1867 	smc = smc_sk(sk);
1868 	lock_sock(sk);
1869 	if (sk->sk_state != SMC_ACTIVE) {
1870 		release_sock(sk);
1871 		goto out;
1872 	}
1873 	release_sock(sk);
1874 	if (smc->use_fallback)
1875 		rc = kernel_sendpage(smc->clcsock, page, offset,
1876 				     size, flags);
1877 	else
1878 		rc = sock_no_sendpage(sock, page, offset, size, flags);
1879 
1880 out:
1881 	return rc;
1882 }
1883 
1884 /* Map the affected portions of the rmbe into an spd, note the number of bytes
1885  * to splice in conn->splice_pending, and press 'go'. Delays consumer cursor
1886  * updates till whenever a respective page has been fully processed.
1887  * Note that subsequent recv() calls have to wait till all splice() processing
1888  * completed.
1889  */
1890 static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
1891 			       struct pipe_inode_info *pipe, size_t len,
1892 			       unsigned int flags)
1893 {
1894 	struct sock *sk = sock->sk;
1895 	struct smc_sock *smc;
1896 	int rc = -ENOTCONN;
1897 
1898 	smc = smc_sk(sk);
1899 	lock_sock(sk);
1900 	if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
1901 		/* socket was connected before, no more data to read */
1902 		rc = 0;
1903 		goto out;
1904 	}
1905 	if (sk->sk_state == SMC_INIT ||
1906 	    sk->sk_state == SMC_LISTEN ||
1907 	    sk->sk_state == SMC_CLOSED)
1908 		goto out;
1909 
1910 	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1911 		rc = 0;
1912 		goto out;
1913 	}
1914 
1915 	if (smc->use_fallback) {
1916 		rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
1917 						    pipe, len, flags);
1918 	} else {
1919 		if (*ppos) {
1920 			rc = -ESPIPE;
1921 			goto out;
1922 		}
1923 		if (flags & SPLICE_F_NONBLOCK)
1924 			flags = MSG_DONTWAIT;
1925 		else
1926 			flags = 0;
1927 		rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags);
1928 	}
1929 out:
1930 	release_sock(sk);
1931 
1932 	return rc;
1933 }
1934 
/* must look like tcp
 *
 * Socket operations installed on every PF_SMC socket by smc_create().
 * socketpair and mmap are wired to the generic sock_no_*() handlers.
 */
static const struct proto_ops smc_sock_ops = {
	.family		= PF_SMC,
	.owner		= THIS_MODULE,
	.release	= smc_release,
	.bind		= smc_bind,
	.connect	= smc_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= smc_accept,
	.getname	= smc_getname,
	.poll		= smc_poll,
	.ioctl		= smc_ioctl,
	.listen		= smc_listen,
	.shutdown	= smc_shutdown,
	.setsockopt	= smc_setsockopt,
	.getsockopt	= smc_getsockopt,
	.sendmsg	= smc_sendmsg,
	.recvmsg	= smc_recvmsg,
	.mmap		= sock_no_mmap,
	.sendpage	= smc_sendpage,
	.splice_read	= smc_splice_read,
};
1957 
1958 static int smc_create(struct net *net, struct socket *sock, int protocol,
1959 		      int kern)
1960 {
1961 	int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET;
1962 	struct smc_sock *smc;
1963 	struct sock *sk;
1964 	int rc;
1965 
1966 	rc = -ESOCKTNOSUPPORT;
1967 	if (sock->type != SOCK_STREAM)
1968 		goto out;
1969 
1970 	rc = -EPROTONOSUPPORT;
1971 	if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6)
1972 		goto out;
1973 
1974 	rc = -ENOBUFS;
1975 	sock->ops = &smc_sock_ops;
1976 	sk = smc_sock_alloc(net, sock, protocol);
1977 	if (!sk)
1978 		goto out;
1979 
1980 	/* create internal TCP socket for CLC handshake and fallback */
1981 	smc = smc_sk(sk);
1982 	smc->use_fallback = false; /* assume rdma capability first */
1983 	smc->fallback_rsn = 0;
1984 	rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
1985 			      &smc->clcsock);
1986 	if (rc) {
1987 		sk_common_release(sk);
1988 		goto out;
1989 	}
1990 	smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
1991 	smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
1992 
1993 out:
1994 	return rc;
1995 }
1996 
/* PF_SMC family descriptor: new sockets are set up by smc_create().
 * Registered in smc_init().
 */
static const struct net_proto_family smc_sock_family_ops = {
	.family	= PF_SMC,
	.owner	= THIS_MODULE,
	.create	= smc_create,
};
2002 
/* per-net id of this subsystem; smc_net_ops.id points here */
unsigned int smc_net_id;
2004 
2005 static __net_init int smc_net_init(struct net *net)
2006 {
2007 	return smc_pnet_net_init(net);
2008 }
2009 
/* per-net exit: counterpart of smc_net_init() */
static void __net_exit smc_net_exit(struct net *net)
{
	smc_pnet_net_exit(net);
}
2014 
/* per-net operations: init/exit callbacks, id slot and the size of the
 * per-net data (struct smc_net); registered in smc_init()
 */
static struct pernet_operations smc_net_ops = {
	.init = smc_net_init,
	.exit = smc_net_exit,
	.id   = &smc_net_id,
	.size = sizeof(struct smc_net),
};
2021 
2022 static int __init smc_init(void)
2023 {
2024 	int rc;
2025 
2026 	rc = register_pernet_subsys(&smc_net_ops);
2027 	if (rc)
2028 		return rc;
2029 
2030 	rc = smc_pnet_init();
2031 	if (rc)
2032 		return rc;
2033 
2034 	rc = smc_llc_init();
2035 	if (rc) {
2036 		pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
2037 		goto out_pnet;
2038 	}
2039 
2040 	rc = smc_cdc_init();
2041 	if (rc) {
2042 		pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
2043 		goto out_pnet;
2044 	}
2045 
2046 	rc = proto_register(&smc_proto, 1);
2047 	if (rc) {
2048 		pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc);
2049 		goto out_pnet;
2050 	}
2051 
2052 	rc = proto_register(&smc_proto6, 1);
2053 	if (rc) {
2054 		pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc);
2055 		goto out_proto;
2056 	}
2057 
2058 	rc = sock_register(&smc_sock_family_ops);
2059 	if (rc) {
2060 		pr_err("%s: sock_register fails with %d\n", __func__, rc);
2061 		goto out_proto6;
2062 	}
2063 	INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
2064 	INIT_HLIST_HEAD(&smc_v6_hashinfo.ht);
2065 
2066 	rc = smc_ib_register_client();
2067 	if (rc) {
2068 		pr_err("%s: ib_register fails with %d\n", __func__, rc);
2069 		goto out_sock;
2070 	}
2071 
2072 	static_branch_enable(&tcp_have_smc);
2073 	return 0;
2074 
2075 out_sock:
2076 	sock_unregister(PF_SMC);
2077 out_proto6:
2078 	proto_unregister(&smc_proto6);
2079 out_proto:
2080 	proto_unregister(&smc_proto);
2081 out_pnet:
2082 	smc_pnet_exit();
2083 	return rc;
2084 }
2085 
/* Module exit: calls smc_core_exit(), disables the tcp_have_smc static
 * branch and then undoes the registrations of smc_init() in reverse order.
 */
static void __exit smc_exit(void)
{
	smc_core_exit();
	static_branch_disable(&tcp_have_smc);
	smc_ib_unregister_client();
	sock_unregister(PF_SMC);
	proto_unregister(&smc_proto6);
	proto_unregister(&smc_proto);
	smc_pnet_exit();
	unregister_pernet_subsys(&smc_net_ops);
}
2097 
/* module entry/exit points and metadata */
module_init(smc_init);
module_exit(smc_exit);

MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
MODULE_DESCRIPTION("smc socket address family");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_SMC);
2104 MODULE_ALIAS_NETPROTO(PF_SMC);
2105