xref: /openbmc/linux/net/smc/af_smc.c (revision b8d312aa075f33282565467662c4628dae0a2aff)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *  Shared Memory Communications over RDMA (SMC-R) and RoCE
4  *
5  *  AF_SMC protocol family socket handler keeping the AF_INET sock address type
6  *  applies to SOCK_STREAM sockets only
7  *  offers an alternative communication option for TCP-protocol sockets
8  *  applicable with RoCE-cards only
9  *
10  *  Initial restrictions:
11  *    - support for alternate links postponed
12  *
13  *  Copyright IBM Corp. 2016, 2018
14  *
15  *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
16  *              based on prototype from Frank Blaschka
17  */
18 
19 #define KMSG_COMPONENT "smc"
20 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
21 
22 #include <linux/module.h>
23 #include <linux/socket.h>
24 #include <linux/workqueue.h>
25 #include <linux/in.h>
26 #include <linux/sched/signal.h>
27 #include <linux/if_vlan.h>
28 
29 #include <net/sock.h>
30 #include <net/tcp.h>
31 #include <net/smc.h>
32 #include <asm/ioctls.h>
33 
34 #include <net/net_namespace.h>
35 #include <net/netns/generic.h>
36 #include "smc_netns.h"
37 
38 #include "smc.h"
39 #include "smc_clc.h"
40 #include "smc_llc.h"
41 #include "smc_cdc.h"
42 #include "smc_core.h"
43 #include "smc_ib.h"
44 #include "smc_ism.h"
45 #include "smc_pnet.h"
46 #include "smc_tx.h"
47 #include "smc_rx.h"
48 #include "smc_close.h"
49 
/* serialize link group creation on server */
static DEFINE_MUTEX(smc_server_lgr_pending);
/* serialize link group creation on client */
static DEFINE_MUTEX(smc_client_lgr_pending);

/* work handlers wired up in smc_sock_alloc() */
static void smc_connect_work(struct work_struct *work);
static void smc_tcp_listen_work(struct work_struct *work);
59 
60 static void smc_set_keepalive(struct sock *sk, int val)
61 {
62 	struct smc_sock *smc = smc_sk(sk);
63 
64 	smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
65 }
66 
67 static struct smc_hashinfo smc_v4_hashinfo = {
68 	.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
69 };
70 
71 static struct smc_hashinfo smc_v6_hashinfo = {
72 	.lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
73 };
74 
75 int smc_hash_sk(struct sock *sk)
76 {
77 	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
78 	struct hlist_head *head;
79 
80 	head = &h->ht;
81 
82 	write_lock_bh(&h->lock);
83 	sk_add_node(sk, head);
84 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
85 	write_unlock_bh(&h->lock);
86 
87 	return 0;
88 }
89 EXPORT_SYMBOL_GPL(smc_hash_sk);
90 
91 void smc_unhash_sk(struct sock *sk)
92 {
93 	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
94 
95 	write_lock_bh(&h->lock);
96 	if (sk_del_node_init(sk))
97 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
98 	write_unlock_bh(&h->lock);
99 }
100 EXPORT_SYMBOL_GPL(smc_unhash_sk);
101 
102 struct proto smc_proto = {
103 	.name		= "SMC",
104 	.owner		= THIS_MODULE,
105 	.keepalive	= smc_set_keepalive,
106 	.hash		= smc_hash_sk,
107 	.unhash		= smc_unhash_sk,
108 	.obj_size	= sizeof(struct smc_sock),
109 	.h.smc_hash	= &smc_v4_hashinfo,
110 	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
111 };
112 EXPORT_SYMBOL_GPL(smc_proto);
113 
114 struct proto smc_proto6 = {
115 	.name		= "SMC6",
116 	.owner		= THIS_MODULE,
117 	.keepalive	= smc_set_keepalive,
118 	.hash		= smc_hash_sk,
119 	.unhash		= smc_unhash_sk,
120 	.obj_size	= sizeof(struct smc_sock),
121 	.h.smc_hash	= &smc_v6_hashinfo,
122 	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
123 };
124 EXPORT_SYMBOL_GPL(smc_proto6);
125 
126 static int __smc_release(struct smc_sock *smc)
127 {
128 	struct sock *sk = &smc->sk;
129 	int rc = 0;
130 
131 	if (!smc->use_fallback) {
132 		rc = smc_close_active(smc);
133 		sock_set_flag(sk, SOCK_DEAD);
134 		sk->sk_shutdown |= SHUTDOWN_MASK;
135 	} else {
136 		if (sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_INIT)
137 			sock_put(sk); /* passive closing */
138 		if (sk->sk_state == SMC_LISTEN) {
139 			/* wake up clcsock accept */
140 			rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR);
141 		}
142 		sk->sk_state = SMC_CLOSED;
143 		sk->sk_state_change(sk);
144 	}
145 
146 	sk->sk_prot->unhash(sk);
147 
148 	if (sk->sk_state == SMC_CLOSED) {
149 		if (smc->clcsock) {
150 			release_sock(sk);
151 			smc_clcsock_release(smc);
152 			lock_sock(sk);
153 		}
154 		if (!smc->use_fallback)
155 			smc_conn_free(&smc->conn);
156 	}
157 
158 	return rc;
159 }
160 
161 static int smc_release(struct socket *sock)
162 {
163 	struct sock *sk = sock->sk;
164 	struct smc_sock *smc;
165 	int rc = 0;
166 
167 	if (!sk)
168 		goto out;
169 
170 	smc = smc_sk(sk);
171 
172 	/* cleanup for a dangling non-blocking connect */
173 	if (smc->connect_nonblock && sk->sk_state == SMC_INIT)
174 		tcp_abort(smc->clcsock->sk, ECONNABORTED);
175 	flush_work(&smc->connect_work);
176 
177 	if (sk->sk_state == SMC_LISTEN)
178 		/* smc_close_non_accepted() is called and acquires
179 		 * sock lock for child sockets again
180 		 */
181 		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
182 	else
183 		lock_sock(sk);
184 
185 	rc = __smc_release(smc);
186 
187 	/* detach socket */
188 	sock_orphan(sk);
189 	sock->sk = NULL;
190 	release_sock(sk);
191 
192 	sock_put(sk); /* final sock_put */
193 out:
194 	return rc;
195 }
196 
197 static void smc_destruct(struct sock *sk)
198 {
199 	if (sk->sk_state != SMC_CLOSED)
200 		return;
201 	if (!sock_flag(sk, SOCK_DEAD))
202 		return;
203 
204 	sk_refcnt_debug_dec(sk);
205 }
206 
207 static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
208 				   int protocol)
209 {
210 	struct smc_sock *smc;
211 	struct proto *prot;
212 	struct sock *sk;
213 
214 	prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto;
215 	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
216 	if (!sk)
217 		return NULL;
218 
219 	sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
220 	sk->sk_state = SMC_INIT;
221 	sk->sk_destruct = smc_destruct;
222 	sk->sk_protocol = protocol;
223 	smc = smc_sk(sk);
224 	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
225 	INIT_WORK(&smc->connect_work, smc_connect_work);
226 	INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
227 	INIT_LIST_HEAD(&smc->accept_q);
228 	spin_lock_init(&smc->accept_q_lock);
229 	spin_lock_init(&smc->conn.send_lock);
230 	sk->sk_prot->hash(sk);
231 	sk_refcnt_debug_inc(sk);
232 	mutex_init(&smc->clcsock_release_lock);
233 
234 	return sk;
235 }
236 
237 static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
238 		    int addr_len)
239 {
240 	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
241 	struct sock *sk = sock->sk;
242 	struct smc_sock *smc;
243 	int rc;
244 
245 	smc = smc_sk(sk);
246 
247 	/* replicate tests from inet_bind(), to be safe wrt. future changes */
248 	rc = -EINVAL;
249 	if (addr_len < sizeof(struct sockaddr_in))
250 		goto out;
251 
252 	rc = -EAFNOSUPPORT;
253 	if (addr->sin_family != AF_INET &&
254 	    addr->sin_family != AF_INET6 &&
255 	    addr->sin_family != AF_UNSPEC)
256 		goto out;
257 	/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
258 	if (addr->sin_family == AF_UNSPEC &&
259 	    addr->sin_addr.s_addr != htonl(INADDR_ANY))
260 		goto out;
261 
262 	lock_sock(sk);
263 
264 	/* Check if socket is already active */
265 	rc = -EINVAL;
266 	if (sk->sk_state != SMC_INIT)
267 		goto out_rel;
268 
269 	smc->clcsock->sk->sk_reuse = sk->sk_reuse;
270 	rc = kernel_bind(smc->clcsock, uaddr, addr_len);
271 
272 out_rel:
273 	release_sock(sk);
274 out:
275 	return rc;
276 }
277 
278 static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
279 				   unsigned long mask)
280 {
281 	/* options we don't get control via setsockopt for */
282 	nsk->sk_type = osk->sk_type;
283 	nsk->sk_sndbuf = osk->sk_sndbuf;
284 	nsk->sk_rcvbuf = osk->sk_rcvbuf;
285 	nsk->sk_sndtimeo = osk->sk_sndtimeo;
286 	nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
287 	nsk->sk_mark = osk->sk_mark;
288 	nsk->sk_priority = osk->sk_priority;
289 	nsk->sk_rcvlowat = osk->sk_rcvlowat;
290 	nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
291 	nsk->sk_err = osk->sk_err;
292 
293 	nsk->sk_flags &= ~mask;
294 	nsk->sk_flags |= osk->sk_flags & mask;
295 }
296 
/* sk_flags bits taken over from the smc socket to the clc socket */
#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_BROADCAST) | \
			     (1UL << SOCK_DBG) | \
			     (1UL << SOCK_FILTER_LOCKED) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_LOCALROUTE) | \
			     (1UL << SOCK_NOFCS) | \
			     (1UL << SOCK_RCVTSTAMP) | \
			     (1UL << SOCK_RCVTSTAMPNS) | \
			     (1UL << SOCK_RXQ_OVFL) | \
			     (1UL << SOCK_TIMESTAMP) | \
			     (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
			     (1UL << SOCK_TSTAMP_NEW) | \
			     (1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_WIFI_STATUS))
312 /* copy only relevant settings and flags of SOL_SOCKET level from smc to
313  * clc socket (since smc is not called for these options from net/core)
314  */
315 static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
316 {
317 	smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
318 }
319 
/* sk_flags bits taken over from the clc socket to the smc socket */
#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_DBG) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_URGINLINE))
324 /* copy only settings and flags relevant for smc from clc to smc socket */
325 static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
326 {
327 	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
328 }
329 
330 /* register a new rmb, send confirm_rkey msg to register with peer */
331 static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc,
332 		       bool conf_rkey)
333 {
334 	if (!rmb_desc->wr_reg) {
335 		/* register memory region for new rmb */
336 		if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) {
337 			rmb_desc->regerr = 1;
338 			return -EFAULT;
339 		}
340 		rmb_desc->wr_reg = 1;
341 	}
342 	if (!conf_rkey)
343 		return 0;
344 	/* exchange confirm_rkey msg with peer */
345 	if (smc_llc_do_confirm_rkey(link, rmb_desc)) {
346 		rmb_desc->regerr = 1;
347 		return -EFAULT;
348 	}
349 	return 0;
350 }
351 
352 static int smc_clnt_conf_first_link(struct smc_sock *smc)
353 {
354 	struct net *net = sock_net(smc->clcsock->sk);
355 	struct smc_link_group *lgr = smc->conn.lgr;
356 	struct smc_link *link;
357 	int rest;
358 	int rc;
359 
360 	link = &lgr->lnk[SMC_SINGLE_LINK];
361 	/* receive CONFIRM LINK request from server over RoCE fabric */
362 	rest = wait_for_completion_interruptible_timeout(
363 		&link->llc_confirm,
364 		SMC_LLC_WAIT_FIRST_TIME);
365 	if (rest <= 0) {
366 		struct smc_clc_msg_decline dclc;
367 
368 		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
369 				      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
370 		return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
371 	}
372 
373 	if (link->llc_confirm_rc)
374 		return SMC_CLC_DECL_RMBE_EC;
375 
376 	rc = smc_ib_modify_qp_rts(link);
377 	if (rc)
378 		return SMC_CLC_DECL_ERR_RDYLNK;
379 
380 	smc_wr_remember_qp_attr(link);
381 
382 	if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
383 		return SMC_CLC_DECL_ERR_REGRMB;
384 
385 	/* send CONFIRM LINK response over RoCE fabric */
386 	rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP);
387 	if (rc < 0)
388 		return SMC_CLC_DECL_TIMEOUT_CL;
389 
390 	/* receive ADD LINK request from server over RoCE fabric */
391 	rest = wait_for_completion_interruptible_timeout(&link->llc_add,
392 							 SMC_LLC_WAIT_TIME);
393 	if (rest <= 0) {
394 		struct smc_clc_msg_decline dclc;
395 
396 		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
397 				      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
398 		return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc;
399 	}
400 
401 	/* send add link reject message, only one link supported for now */
402 	rc = smc_llc_send_add_link(link,
403 				   link->smcibdev->mac[link->ibport - 1],
404 				   link->gid, SMC_LLC_RESP);
405 	if (rc < 0)
406 		return SMC_CLC_DECL_TIMEOUT_AL;
407 
408 	smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
409 
410 	return 0;
411 }
412 
413 static void smcr_conn_save_peer_info(struct smc_sock *smc,
414 				     struct smc_clc_msg_accept_confirm *clc)
415 {
416 	int bufsize = smc_uncompress_bufsize(clc->rmbe_size);
417 
418 	smc->conn.peer_rmbe_idx = clc->rmbe_idx;
419 	smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
420 	smc->conn.peer_rmbe_size = bufsize;
421 	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
422 	smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
423 }
424 
425 static void smcd_conn_save_peer_info(struct smc_sock *smc,
426 				     struct smc_clc_msg_accept_confirm *clc)
427 {
428 	int bufsize = smc_uncompress_bufsize(clc->dmbe_size);
429 
430 	smc->conn.peer_rmbe_idx = clc->dmbe_idx;
431 	smc->conn.peer_token = clc->token;
432 	/* msg header takes up space in the buffer */
433 	smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg);
434 	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
435 	smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx;
436 }
437 
438 static void smc_conn_save_peer_info(struct smc_sock *smc,
439 				    struct smc_clc_msg_accept_confirm *clc)
440 {
441 	if (smc->conn.lgr->is_smcd)
442 		smcd_conn_save_peer_info(smc, clc);
443 	else
444 		smcr_conn_save_peer_info(smc, clc);
445 }
446 
447 static void smc_link_save_peer_info(struct smc_link *link,
448 				    struct smc_clc_msg_accept_confirm *clc)
449 {
450 	link->peer_qpn = ntoh24(clc->qpn);
451 	memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
452 	memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
453 	link->peer_psn = ntoh24(clc->psn);
454 	link->peer_mtu = clc->qp_mtu;
455 }
456 
457 static void smc_switch_to_fallback(struct smc_sock *smc)
458 {
459 	smc->use_fallback = true;
460 	if (smc->sk.sk_socket && smc->sk.sk_socket->file) {
461 		smc->clcsock->file = smc->sk.sk_socket->file;
462 		smc->clcsock->file->private_data = smc->clcsock;
463 	}
464 }
465 
466 /* fall back during connect */
467 static int smc_connect_fallback(struct smc_sock *smc, int reason_code)
468 {
469 	smc_switch_to_fallback(smc);
470 	smc->fallback_rsn = reason_code;
471 	smc_copy_sock_settings_to_clc(smc);
472 	smc->connect_nonblock = 0;
473 	if (smc->sk.sk_state == SMC_INIT)
474 		smc->sk.sk_state = SMC_ACTIVE;
475 	return 0;
476 }
477 
478 /* decline and fall back during connect */
479 static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
480 {
481 	int rc;
482 
483 	if (reason_code < 0) { /* error, fallback is not possible */
484 		if (smc->sk.sk_state == SMC_INIT)
485 			sock_put(&smc->sk); /* passive closing */
486 		return reason_code;
487 	}
488 	if (reason_code != SMC_CLC_DECL_PEERDECL) {
489 		rc = smc_clc_send_decline(smc, reason_code);
490 		if (rc < 0) {
491 			if (smc->sk.sk_state == SMC_INIT)
492 				sock_put(&smc->sk); /* passive closing */
493 			return rc;
494 		}
495 	}
496 	return smc_connect_fallback(smc, reason_code);
497 }
498 
499 /* abort connecting */
500 static int smc_connect_abort(struct smc_sock *smc, int reason_code,
501 			     int local_contact)
502 {
503 	if (local_contact == SMC_FIRST_CONTACT)
504 		smc_lgr_forget(smc->conn.lgr);
505 	if (smc->conn.lgr->is_smcd)
506 		/* there is only one lgr role for SMC-D; use server lock */
507 		mutex_unlock(&smc_server_lgr_pending);
508 	else
509 		mutex_unlock(&smc_client_lgr_pending);
510 
511 	smc_conn_free(&smc->conn);
512 	smc->connect_nonblock = 0;
513 	return reason_code;
514 }
515 
516 /* check if there is a rdma device available for this connection. */
517 /* called for connect and listen */
518 static int smc_find_rdma_device(struct smc_sock *smc, struct smc_init_info *ini)
519 {
520 	/* PNET table look up: search active ib_device and port
521 	 * within same PNETID that also contains the ethernet device
522 	 * used for the internal TCP socket
523 	 */
524 	smc_pnet_find_roce_resource(smc->clcsock->sk, ini);
525 	if (!ini->ib_dev)
526 		return SMC_CLC_DECL_NOSMCRDEV;
527 	return 0;
528 }
529 
530 /* check if there is an ISM device available for this connection. */
531 /* called for connect and listen */
532 static int smc_find_ism_device(struct smc_sock *smc, struct smc_init_info *ini)
533 {
534 	/* Find ISM device with same PNETID as connecting interface  */
535 	smc_pnet_find_ism_resource(smc->clcsock->sk, ini);
536 	if (!ini->ism_dev)
537 		return SMC_CLC_DECL_NOSMCDDEV;
538 	return 0;
539 }
540 
541 /* Check for VLAN ID and register it on ISM device just for CLC handshake */
542 static int smc_connect_ism_vlan_setup(struct smc_sock *smc,
543 				      struct smc_init_info *ini)
544 {
545 	if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev, ini->vlan_id))
546 		return SMC_CLC_DECL_ISMVLANERR;
547 	return 0;
548 }
549 
550 /* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is
551  * used, the VLAN ID will be registered again during the connection setup.
552  */
553 static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd,
554 					struct smc_init_info *ini)
555 {
556 	if (!is_smcd)
557 		return 0;
558 	if (ini->vlan_id && smc_ism_put_vlan(ini->ism_dev, ini->vlan_id))
559 		return SMC_CLC_DECL_CNFERR;
560 	return 0;
561 }
562 
563 /* CLC handshake during connect */
564 static int smc_connect_clc(struct smc_sock *smc, int smc_type,
565 			   struct smc_clc_msg_accept_confirm *aclc,
566 			   struct smc_init_info *ini)
567 {
568 	int rc = 0;
569 
570 	/* do inband token exchange */
571 	rc = smc_clc_send_proposal(smc, smc_type, ini);
572 	if (rc)
573 		return rc;
574 	/* receive SMC Accept CLC message */
575 	return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT,
576 				CLC_WAIT_TIME);
577 }
578 
579 /* setup for RDMA connection of client */
580 static int smc_connect_rdma(struct smc_sock *smc,
581 			    struct smc_clc_msg_accept_confirm *aclc,
582 			    struct smc_init_info *ini)
583 {
584 	struct smc_link *link;
585 	int reason_code = 0;
586 
587 	ini->is_smcd = false;
588 	ini->ib_lcl = &aclc->lcl;
589 	ini->ib_clcqpn = ntoh24(aclc->qpn);
590 	ini->srv_first_contact = aclc->hdr.flag;
591 
592 	mutex_lock(&smc_client_lgr_pending);
593 	reason_code = smc_conn_create(smc, ini);
594 	if (reason_code) {
595 		mutex_unlock(&smc_client_lgr_pending);
596 		return reason_code;
597 	}
598 	link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];
599 
600 	smc_conn_save_peer_info(smc, aclc);
601 
602 	/* create send buffer and rmb */
603 	if (smc_buf_create(smc, false))
604 		return smc_connect_abort(smc, SMC_CLC_DECL_MEM,
605 					 ini->cln_first_contact);
606 
607 	if (ini->cln_first_contact == SMC_FIRST_CONTACT)
608 		smc_link_save_peer_info(link, aclc);
609 
610 	if (smc_rmb_rtoken_handling(&smc->conn, aclc))
611 		return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK,
612 					 ini->cln_first_contact);
613 
614 	smc_close_init(smc);
615 	smc_rx_init(smc);
616 
617 	if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
618 		if (smc_ib_ready_link(link))
619 			return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK,
620 						 ini->cln_first_contact);
621 	} else {
622 		if (smc_reg_rmb(link, smc->conn.rmb_desc, true))
623 			return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB,
624 						 ini->cln_first_contact);
625 	}
626 	smc_rmb_sync_sg_for_device(&smc->conn);
627 
628 	reason_code = smc_clc_send_confirm(smc);
629 	if (reason_code)
630 		return smc_connect_abort(smc, reason_code,
631 					 ini->cln_first_contact);
632 
633 	smc_tx_init(smc);
634 
635 	if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
636 		/* QP confirmation over RoCE fabric */
637 		reason_code = smc_clnt_conf_first_link(smc);
638 		if (reason_code)
639 			return smc_connect_abort(smc, reason_code,
640 						 ini->cln_first_contact);
641 	}
642 	mutex_unlock(&smc_client_lgr_pending);
643 
644 	smc_copy_sock_settings_to_clc(smc);
645 	smc->connect_nonblock = 0;
646 	if (smc->sk.sk_state == SMC_INIT)
647 		smc->sk.sk_state = SMC_ACTIVE;
648 
649 	return 0;
650 }
651 
652 /* setup for ISM connection of client */
653 static int smc_connect_ism(struct smc_sock *smc,
654 			   struct smc_clc_msg_accept_confirm *aclc,
655 			   struct smc_init_info *ini)
656 {
657 	int rc = 0;
658 
659 	ini->is_smcd = true;
660 	ini->ism_gid = aclc->gid;
661 	ini->srv_first_contact = aclc->hdr.flag;
662 
663 	/* there is only one lgr role for SMC-D; use server lock */
664 	mutex_lock(&smc_server_lgr_pending);
665 	rc = smc_conn_create(smc, ini);
666 	if (rc) {
667 		mutex_unlock(&smc_server_lgr_pending);
668 		return rc;
669 	}
670 
671 	/* Create send and receive buffers */
672 	if (smc_buf_create(smc, true))
673 		return smc_connect_abort(smc, SMC_CLC_DECL_MEM,
674 					 ini->cln_first_contact);
675 
676 	smc_conn_save_peer_info(smc, aclc);
677 	smc_close_init(smc);
678 	smc_rx_init(smc);
679 	smc_tx_init(smc);
680 
681 	rc = smc_clc_send_confirm(smc);
682 	if (rc)
683 		return smc_connect_abort(smc, rc, ini->cln_first_contact);
684 	mutex_unlock(&smc_server_lgr_pending);
685 
686 	smc_copy_sock_settings_to_clc(smc);
687 	smc->connect_nonblock = 0;
688 	if (smc->sk.sk_state == SMC_INIT)
689 		smc->sk.sk_state = SMC_ACTIVE;
690 
691 	return 0;
692 }
693 
694 /* perform steps before actually connecting */
695 static int __smc_connect(struct smc_sock *smc)
696 {
697 	bool ism_supported = false, rdma_supported = false;
698 	struct smc_clc_msg_accept_confirm aclc;
699 	struct smc_init_info ini = {0};
700 	int smc_type;
701 	int rc = 0;
702 
703 	sock_hold(&smc->sk); /* sock put in passive closing */
704 
705 	if (smc->use_fallback)
706 		return smc_connect_fallback(smc, smc->fallback_rsn);
707 
708 	/* if peer has not signalled SMC-capability, fall back */
709 	if (!tcp_sk(smc->clcsock->sk)->syn_smc)
710 		return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC);
711 
712 	/* IPSec connections opt out of SMC-R optimizations */
713 	if (using_ipsec(smc))
714 		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC);
715 
716 	/* get vlan id from IP device */
717 	if (smc_vlan_by_tcpsk(smc->clcsock, &ini))
718 		return smc_connect_decline_fallback(smc,
719 						    SMC_CLC_DECL_GETVLANERR);
720 
721 	/* check if there is an ism device available */
722 	if (!smc_find_ism_device(smc, &ini) &&
723 	    !smc_connect_ism_vlan_setup(smc, &ini)) {
724 		/* ISM is supported for this connection */
725 		ism_supported = true;
726 		smc_type = SMC_TYPE_D;
727 	}
728 
729 	/* check if there is a rdma device available */
730 	if (!smc_find_rdma_device(smc, &ini)) {
731 		/* RDMA is supported for this connection */
732 		rdma_supported = true;
733 		if (ism_supported)
734 			smc_type = SMC_TYPE_B; /* both */
735 		else
736 			smc_type = SMC_TYPE_R; /* only RDMA */
737 	}
738 
739 	/* if neither ISM nor RDMA are supported, fallback */
740 	if (!rdma_supported && !ism_supported)
741 		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_NOSMCDEV);
742 
743 	/* perform CLC handshake */
744 	rc = smc_connect_clc(smc, smc_type, &aclc, &ini);
745 	if (rc) {
746 		smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
747 		return smc_connect_decline_fallback(smc, rc);
748 	}
749 
750 	/* depending on previous steps, connect using rdma or ism */
751 	if (rdma_supported && aclc.hdr.path == SMC_TYPE_R)
752 		rc = smc_connect_rdma(smc, &aclc, &ini);
753 	else if (ism_supported && aclc.hdr.path == SMC_TYPE_D)
754 		rc = smc_connect_ism(smc, &aclc, &ini);
755 	else
756 		rc = SMC_CLC_DECL_MODEUNSUPP;
757 	if (rc) {
758 		smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
759 		return smc_connect_decline_fallback(smc, rc);
760 	}
761 
762 	smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
763 	return 0;
764 }
765 
766 static void smc_connect_work(struct work_struct *work)
767 {
768 	struct smc_sock *smc = container_of(work, struct smc_sock,
769 					    connect_work);
770 	long timeo = smc->sk.sk_sndtimeo;
771 	int rc = 0;
772 
773 	if (!timeo)
774 		timeo = MAX_SCHEDULE_TIMEOUT;
775 	lock_sock(smc->clcsock->sk);
776 	if (smc->clcsock->sk->sk_err) {
777 		smc->sk.sk_err = smc->clcsock->sk->sk_err;
778 	} else if ((1 << smc->clcsock->sk->sk_state) &
779 					(TCPF_SYN_SENT | TCP_SYN_RECV)) {
780 		rc = sk_stream_wait_connect(smc->clcsock->sk, &timeo);
781 		if ((rc == -EPIPE) &&
782 		    ((1 << smc->clcsock->sk->sk_state) &
783 					(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)))
784 			rc = 0;
785 	}
786 	release_sock(smc->clcsock->sk);
787 	lock_sock(&smc->sk);
788 	if (rc != 0 || smc->sk.sk_err) {
789 		smc->sk.sk_state = SMC_CLOSED;
790 		if (rc == -EPIPE || rc == -EAGAIN)
791 			smc->sk.sk_err = EPIPE;
792 		else if (signal_pending(current))
793 			smc->sk.sk_err = -sock_intr_errno(timeo);
794 		goto out;
795 	}
796 
797 	rc = __smc_connect(smc);
798 	if (rc < 0)
799 		smc->sk.sk_err = -rc;
800 
801 out:
802 	if (!sock_flag(&smc->sk, SOCK_DEAD)) {
803 		if (smc->sk.sk_err) {
804 			smc->sk.sk_state_change(&smc->sk);
805 		} else { /* allow polling before and after fallback decision */
806 			smc->clcsock->sk->sk_write_space(smc->clcsock->sk);
807 			smc->sk.sk_write_space(&smc->sk);
808 		}
809 	}
810 	release_sock(&smc->sk);
811 }
812 
813 static int smc_connect(struct socket *sock, struct sockaddr *addr,
814 		       int alen, int flags)
815 {
816 	struct sock *sk = sock->sk;
817 	struct smc_sock *smc;
818 	int rc = -EINVAL;
819 
820 	smc = smc_sk(sk);
821 
822 	/* separate smc parameter checking to be safe */
823 	if (alen < sizeof(addr->sa_family))
824 		goto out_err;
825 	if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
826 		goto out_err;
827 
828 	lock_sock(sk);
829 	switch (sk->sk_state) {
830 	default:
831 		goto out;
832 	case SMC_ACTIVE:
833 		rc = -EISCONN;
834 		goto out;
835 	case SMC_INIT:
836 		rc = 0;
837 		break;
838 	}
839 
840 	smc_copy_sock_settings_to_clc(smc);
841 	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
842 	if (smc->connect_nonblock) {
843 		rc = -EALREADY;
844 		goto out;
845 	}
846 	rc = kernel_connect(smc->clcsock, addr, alen, flags);
847 	if (rc && rc != -EINPROGRESS)
848 		goto out;
849 	if (flags & O_NONBLOCK) {
850 		if (schedule_work(&smc->connect_work))
851 			smc->connect_nonblock = 1;
852 		rc = -EINPROGRESS;
853 	} else {
854 		rc = __smc_connect(smc);
855 		if (rc < 0)
856 			goto out;
857 		else
858 			rc = 0; /* success cases including fallback */
859 	}
860 
861 out:
862 	release_sock(sk);
863 out_err:
864 	return rc;
865 }
866 
867 static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
868 {
869 	struct socket *new_clcsock = NULL;
870 	struct sock *lsk = &lsmc->sk;
871 	struct sock *new_sk;
872 	int rc = -EINVAL;
873 
874 	release_sock(lsk);
875 	new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
876 	if (!new_sk) {
877 		rc = -ENOMEM;
878 		lsk->sk_err = ENOMEM;
879 		*new_smc = NULL;
880 		lock_sock(lsk);
881 		goto out;
882 	}
883 	*new_smc = smc_sk(new_sk);
884 
885 	mutex_lock(&lsmc->clcsock_release_lock);
886 	if (lsmc->clcsock)
887 		rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
888 	mutex_unlock(&lsmc->clcsock_release_lock);
889 	lock_sock(lsk);
890 	if  (rc < 0)
891 		lsk->sk_err = -rc;
892 	if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
893 		new_sk->sk_prot->unhash(new_sk);
894 		if (new_clcsock)
895 			sock_release(new_clcsock);
896 		new_sk->sk_state = SMC_CLOSED;
897 		sock_set_flag(new_sk, SOCK_DEAD);
898 		sock_put(new_sk); /* final */
899 		*new_smc = NULL;
900 		goto out;
901 	}
902 
903 	(*new_smc)->clcsock = new_clcsock;
904 out:
905 	return rc;
906 }
907 
908 /* add a just created sock to the accept queue of the listen sock as
909  * candidate for a following socket accept call from user space
910  */
911 static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
912 {
913 	struct smc_sock *par = smc_sk(parent);
914 
915 	sock_hold(sk); /* sock_put in smc_accept_unlink () */
916 	spin_lock(&par->accept_q_lock);
917 	list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
918 	spin_unlock(&par->accept_q_lock);
919 	sk_acceptq_added(parent);
920 }
921 
922 /* remove a socket from the accept queue of its parental listening socket */
923 static void smc_accept_unlink(struct sock *sk)
924 {
925 	struct smc_sock *par = smc_sk(sk)->listen_smc;
926 
927 	spin_lock(&par->accept_q_lock);
928 	list_del_init(&smc_sk(sk)->accept_q);
929 	spin_unlock(&par->accept_q_lock);
930 	sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
931 	sock_put(sk); /* sock_hold in smc_accept_enqueue */
932 }
933 
934 /* remove a sock from the accept queue to bind it to a new socket created
935  * for a socket accept call from user space
936  */
937 struct sock *smc_accept_dequeue(struct sock *parent,
938 				struct socket *new_sock)
939 {
940 	struct smc_sock *isk, *n;
941 	struct sock *new_sk;
942 
943 	list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
944 		new_sk = (struct sock *)isk;
945 
946 		smc_accept_unlink(new_sk);
947 		if (new_sk->sk_state == SMC_CLOSED) {
948 			new_sk->sk_prot->unhash(new_sk);
949 			if (isk->clcsock) {
950 				sock_release(isk->clcsock);
951 				isk->clcsock = NULL;
952 			}
953 			sock_put(new_sk); /* final */
954 			continue;
955 		}
956 		if (new_sock) {
957 			sock_graft(new_sk, new_sock);
958 			if (isk->use_fallback) {
959 				smc_sk(new_sk)->clcsock->file = new_sock->file;
960 				isk->clcsock->file->private_data = isk->clcsock;
961 			}
962 		}
963 		return new_sk;
964 	}
965 	return NULL;
966 }
967 
968 /* clean up for a created but never accepted sock */
969 void smc_close_non_accepted(struct sock *sk)
970 {
971 	struct smc_sock *smc = smc_sk(sk);
972 
973 	lock_sock(sk);
974 	if (!sk->sk_lingertime)
975 		/* wait for peer closing */
976 		sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
977 	__smc_release(smc);
978 	release_sock(sk);
979 	sock_put(sk); /* final sock_put */
980 }
981 
982 static int smc_serv_conf_first_link(struct smc_sock *smc)
983 {
984 	struct net *net = sock_net(smc->clcsock->sk);
985 	struct smc_link_group *lgr = smc->conn.lgr;
986 	struct smc_link *link;
987 	int rest;
988 	int rc;
989 
990 	link = &lgr->lnk[SMC_SINGLE_LINK];
991 
992 	if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
993 		return SMC_CLC_DECL_ERR_REGRMB;
994 
995 	/* send CONFIRM LINK request to client over the RoCE fabric */
996 	rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ);
997 	if (rc < 0)
998 		return SMC_CLC_DECL_TIMEOUT_CL;
999 
1000 	/* receive CONFIRM LINK response from client over the RoCE fabric */
1001 	rest = wait_for_completion_interruptible_timeout(
1002 		&link->llc_confirm_resp,
1003 		SMC_LLC_WAIT_FIRST_TIME);
1004 	if (rest <= 0) {
1005 		struct smc_clc_msg_decline dclc;
1006 
1007 		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
1008 				      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
1009 		return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
1010 	}
1011 
1012 	if (link->llc_confirm_resp_rc)
1013 		return SMC_CLC_DECL_RMBE_EC;
1014 
1015 	/* send ADD LINK request to client over the RoCE fabric */
1016 	rc = smc_llc_send_add_link(link,
1017 				   link->smcibdev->mac[link->ibport - 1],
1018 				   link->gid, SMC_LLC_REQ);
1019 	if (rc < 0)
1020 		return SMC_CLC_DECL_TIMEOUT_AL;
1021 
1022 	/* receive ADD LINK response from client over the RoCE fabric */
1023 	rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp,
1024 							 SMC_LLC_WAIT_TIME);
1025 	if (rest <= 0) {
1026 		struct smc_clc_msg_decline dclc;
1027 
1028 		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
1029 				      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
1030 		return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc;
1031 	}
1032 
1033 	smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
1034 
1035 	return 0;
1036 }
1037 
1038 /* listen worker: finish */
1039 static void smc_listen_out(struct smc_sock *new_smc)
1040 {
1041 	struct smc_sock *lsmc = new_smc->listen_smc;
1042 	struct sock *newsmcsk = &new_smc->sk;
1043 
1044 	if (lsmc->sk.sk_state == SMC_LISTEN) {
1045 		lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
1046 		smc_accept_enqueue(&lsmc->sk, newsmcsk);
1047 		release_sock(&lsmc->sk);
1048 	} else { /* no longer listening */
1049 		smc_close_non_accepted(newsmcsk);
1050 	}
1051 
1052 	/* Wake up accept */
1053 	lsmc->sk.sk_data_ready(&lsmc->sk);
1054 	sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
1055 }
1056 
1057 /* listen worker: finish in state connected */
1058 static void smc_listen_out_connected(struct smc_sock *new_smc)
1059 {
1060 	struct sock *newsmcsk = &new_smc->sk;
1061 
1062 	sk_refcnt_debug_inc(newsmcsk);
1063 	if (newsmcsk->sk_state == SMC_INIT)
1064 		newsmcsk->sk_state = SMC_ACTIVE;
1065 
1066 	smc_listen_out(new_smc);
1067 }
1068 
1069 /* listen worker: finish in error state */
1070 static void smc_listen_out_err(struct smc_sock *new_smc)
1071 {
1072 	struct sock *newsmcsk = &new_smc->sk;
1073 
1074 	if (newsmcsk->sk_state == SMC_INIT)
1075 		sock_put(&new_smc->sk); /* passive closing */
1076 	newsmcsk->sk_state = SMC_CLOSED;
1077 	smc_conn_free(&new_smc->conn);
1078 
1079 	smc_listen_out(new_smc);
1080 }
1081 
1082 /* listen worker: decline and fall back if possible */
1083 static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
1084 			       int local_contact)
1085 {
1086 	/* RDMA setup failed, switch back to TCP */
1087 	if (local_contact == SMC_FIRST_CONTACT)
1088 		smc_lgr_forget(new_smc->conn.lgr);
1089 	if (reason_code < 0) { /* error, no fallback possible */
1090 		smc_listen_out_err(new_smc);
1091 		return;
1092 	}
1093 	smc_conn_free(&new_smc->conn);
1094 	smc_switch_to_fallback(new_smc);
1095 	new_smc->fallback_rsn = reason_code;
1096 	if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) {
1097 		if (smc_clc_send_decline(new_smc, reason_code) < 0) {
1098 			smc_listen_out_err(new_smc);
1099 			return;
1100 		}
1101 	}
1102 	smc_listen_out_connected(new_smc);
1103 }
1104 
1105 /* listen worker: check prefixes */
1106 static int smc_listen_prfx_check(struct smc_sock *new_smc,
1107 				 struct smc_clc_msg_proposal *pclc)
1108 {
1109 	struct smc_clc_msg_proposal_prefix *pclc_prfx;
1110 	struct socket *newclcsock = new_smc->clcsock;
1111 
1112 	pclc_prfx = smc_clc_proposal_get_prefix(pclc);
1113 	if (smc_clc_prfx_match(newclcsock, pclc_prfx))
1114 		return SMC_CLC_DECL_DIFFPREFIX;
1115 
1116 	return 0;
1117 }
1118 
1119 /* listen worker: initialize connection and buffers */
1120 static int smc_listen_rdma_init(struct smc_sock *new_smc,
1121 				struct smc_init_info *ini)
1122 {
1123 	int rc;
1124 
1125 	/* allocate connection / link group */
1126 	rc = smc_conn_create(new_smc, ini);
1127 	if (rc)
1128 		return rc;
1129 
1130 	/* create send buffer and rmb */
1131 	if (smc_buf_create(new_smc, false))
1132 		return SMC_CLC_DECL_MEM;
1133 
1134 	return 0;
1135 }
1136 
1137 /* listen worker: initialize connection and buffers for SMC-D */
1138 static int smc_listen_ism_init(struct smc_sock *new_smc,
1139 			       struct smc_clc_msg_proposal *pclc,
1140 			       struct smc_init_info *ini)
1141 {
1142 	struct smc_clc_msg_smcd *pclc_smcd;
1143 	int rc;
1144 
1145 	pclc_smcd = smc_get_clc_msg_smcd(pclc);
1146 	ini->ism_gid = pclc_smcd->gid;
1147 	rc = smc_conn_create(new_smc, ini);
1148 	if (rc)
1149 		return rc;
1150 
1151 	/* Check if peer can be reached via ISM device */
1152 	if (smc_ism_cantalk(new_smc->conn.lgr->peer_gid,
1153 			    new_smc->conn.lgr->vlan_id,
1154 			    new_smc->conn.lgr->smcd)) {
1155 		if (ini->cln_first_contact == SMC_FIRST_CONTACT)
1156 			smc_lgr_forget(new_smc->conn.lgr);
1157 		smc_conn_free(&new_smc->conn);
1158 		return SMC_CLC_DECL_SMCDNOTALK;
1159 	}
1160 
1161 	/* Create send and receive buffers */
1162 	if (smc_buf_create(new_smc, true)) {
1163 		if (ini->cln_first_contact == SMC_FIRST_CONTACT)
1164 			smc_lgr_forget(new_smc->conn.lgr);
1165 		smc_conn_free(&new_smc->conn);
1166 		return SMC_CLC_DECL_MEM;
1167 	}
1168 
1169 	return 0;
1170 }
1171 
1172 /* listen worker: register buffers */
1173 static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact)
1174 {
1175 	struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
1176 
1177 	if (local_contact != SMC_FIRST_CONTACT) {
1178 		if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true))
1179 			return SMC_CLC_DECL_ERR_REGRMB;
1180 	}
1181 	smc_rmb_sync_sg_for_device(&new_smc->conn);
1182 
1183 	return 0;
1184 }
1185 
1186 /* listen worker: finish RDMA setup */
1187 static int smc_listen_rdma_finish(struct smc_sock *new_smc,
1188 				  struct smc_clc_msg_accept_confirm *cclc,
1189 				  int local_contact)
1190 {
1191 	struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
1192 	int reason_code = 0;
1193 
1194 	if (local_contact == SMC_FIRST_CONTACT)
1195 		smc_link_save_peer_info(link, cclc);
1196 
1197 	if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) {
1198 		reason_code = SMC_CLC_DECL_ERR_RTOK;
1199 		goto decline;
1200 	}
1201 
1202 	if (local_contact == SMC_FIRST_CONTACT) {
1203 		if (smc_ib_ready_link(link)) {
1204 			reason_code = SMC_CLC_DECL_ERR_RDYLNK;
1205 			goto decline;
1206 		}
1207 		/* QP confirmation over RoCE fabric */
1208 		reason_code = smc_serv_conf_first_link(new_smc);
1209 		if (reason_code)
1210 			goto decline;
1211 	}
1212 	return 0;
1213 
1214 decline:
1215 	smc_listen_decline(new_smc, reason_code, local_contact);
1216 	return reason_code;
1217 }
1218 
1219 /* setup for RDMA connection of server */
1220 static void smc_listen_work(struct work_struct *work)
1221 {
1222 	struct smc_sock *new_smc = container_of(work, struct smc_sock,
1223 						smc_listen_work);
1224 	struct socket *newclcsock = new_smc->clcsock;
1225 	struct smc_clc_msg_accept_confirm cclc;
1226 	struct smc_clc_msg_proposal *pclc;
1227 	struct smc_init_info ini = {0};
1228 	bool ism_supported = false;
1229 	u8 buf[SMC_CLC_MAX_LEN];
1230 	int rc = 0;
1231 
1232 	if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN)
1233 		return smc_listen_out_err(new_smc);
1234 
1235 	if (new_smc->use_fallback) {
1236 		smc_listen_out_connected(new_smc);
1237 		return;
1238 	}
1239 
1240 	/* check if peer is smc capable */
1241 	if (!tcp_sk(newclcsock->sk)->syn_smc) {
1242 		smc_switch_to_fallback(new_smc);
1243 		new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC;
1244 		smc_listen_out_connected(new_smc);
1245 		return;
1246 	}
1247 
1248 	/* do inband token exchange -
1249 	 * wait for and receive SMC Proposal CLC message
1250 	 */
1251 	pclc = (struct smc_clc_msg_proposal *)&buf;
1252 	rc = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN,
1253 			      SMC_CLC_PROPOSAL, CLC_WAIT_TIME);
1254 	if (rc)
1255 		goto out_decl;
1256 
1257 	/* IPSec connections opt out of SMC-R optimizations */
1258 	if (using_ipsec(new_smc)) {
1259 		rc = SMC_CLC_DECL_IPSEC;
1260 		goto out_decl;
1261 	}
1262 
1263 	/* check for matching IP prefix and subnet length */
1264 	rc = smc_listen_prfx_check(new_smc, pclc);
1265 	if (rc)
1266 		goto out_decl;
1267 
1268 	/* get vlan id from IP device */
1269 	if (smc_vlan_by_tcpsk(new_smc->clcsock, &ini)) {
1270 		rc = SMC_CLC_DECL_GETVLANERR;
1271 		goto out_decl;
1272 	}
1273 
1274 	mutex_lock(&smc_server_lgr_pending);
1275 	smc_close_init(new_smc);
1276 	smc_rx_init(new_smc);
1277 	smc_tx_init(new_smc);
1278 
1279 	/* check if ISM is available */
1280 	if (pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) {
1281 		ini.is_smcd = true; /* prepare ISM check */
1282 		rc = smc_find_ism_device(new_smc, &ini);
1283 		if (!rc)
1284 			rc = smc_listen_ism_init(new_smc, pclc, &ini);
1285 		if (!rc)
1286 			ism_supported = true;
1287 		else if (pclc->hdr.path == SMC_TYPE_D)
1288 			goto out_unlock; /* skip RDMA and decline */
1289 	}
1290 
1291 	/* check if RDMA is available */
1292 	if (!ism_supported) { /* SMC_TYPE_R or SMC_TYPE_B */
1293 		/* prepare RDMA check */
1294 		memset(&ini, 0, sizeof(ini));
1295 		ini.is_smcd = false;
1296 		ini.ib_lcl = &pclc->lcl;
1297 		rc = smc_find_rdma_device(new_smc, &ini);
1298 		if (rc) {
1299 			/* no RDMA device found */
1300 			if (pclc->hdr.path == SMC_TYPE_B)
1301 				/* neither ISM nor RDMA device found */
1302 				rc = SMC_CLC_DECL_NOSMCDEV;
1303 			goto out_unlock;
1304 		}
1305 		rc = smc_listen_rdma_init(new_smc, &ini);
1306 		if (rc)
1307 			goto out_unlock;
1308 		rc = smc_listen_rdma_reg(new_smc, ini.cln_first_contact);
1309 		if (rc)
1310 			goto out_unlock;
1311 	}
1312 
1313 	/* send SMC Accept CLC message */
1314 	rc = smc_clc_send_accept(new_smc, ini.cln_first_contact);
1315 	if (rc)
1316 		goto out_unlock;
1317 
1318 	/* SMC-D does not need this lock any more */
1319 	if (ism_supported)
1320 		mutex_unlock(&smc_server_lgr_pending);
1321 
1322 	/* receive SMC Confirm CLC message */
1323 	rc = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
1324 			      SMC_CLC_CONFIRM, CLC_WAIT_TIME);
1325 	if (rc) {
1326 		if (!ism_supported)
1327 			goto out_unlock;
1328 		goto out_decl;
1329 	}
1330 
1331 	/* finish worker */
1332 	if (!ism_supported) {
1333 		rc = smc_listen_rdma_finish(new_smc, &cclc,
1334 					    ini.cln_first_contact);
1335 		mutex_unlock(&smc_server_lgr_pending);
1336 		if (rc)
1337 			return;
1338 	}
1339 	smc_conn_save_peer_info(new_smc, &cclc);
1340 	smc_listen_out_connected(new_smc);
1341 	return;
1342 
1343 out_unlock:
1344 	mutex_unlock(&smc_server_lgr_pending);
1345 out_decl:
1346 	smc_listen_decline(new_smc, rc, ini.cln_first_contact);
1347 }
1348 
1349 static void smc_tcp_listen_work(struct work_struct *work)
1350 {
1351 	struct smc_sock *lsmc = container_of(work, struct smc_sock,
1352 					     tcp_listen_work);
1353 	struct sock *lsk = &lsmc->sk;
1354 	struct smc_sock *new_smc;
1355 	int rc = 0;
1356 
1357 	lock_sock(lsk);
1358 	while (lsk->sk_state == SMC_LISTEN) {
1359 		rc = smc_clcsock_accept(lsmc, &new_smc);
1360 		if (rc)
1361 			goto out;
1362 		if (!new_smc)
1363 			continue;
1364 
1365 		new_smc->listen_smc = lsmc;
1366 		new_smc->use_fallback = lsmc->use_fallback;
1367 		new_smc->fallback_rsn = lsmc->fallback_rsn;
1368 		sock_hold(lsk); /* sock_put in smc_listen_work */
1369 		INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
1370 		smc_copy_sock_settings_to_smc(new_smc);
1371 		new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf;
1372 		new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf;
1373 		sock_hold(&new_smc->sk); /* sock_put in passive closing */
1374 		if (!schedule_work(&new_smc->smc_listen_work))
1375 			sock_put(&new_smc->sk);
1376 	}
1377 
1378 out:
1379 	release_sock(lsk);
1380 	sock_put(&lsmc->sk); /* sock_hold in smc_listen */
1381 }
1382 
1383 static int smc_listen(struct socket *sock, int backlog)
1384 {
1385 	struct sock *sk = sock->sk;
1386 	struct smc_sock *smc;
1387 	int rc;
1388 
1389 	smc = smc_sk(sk);
1390 	lock_sock(sk);
1391 
1392 	rc = -EINVAL;
1393 	if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
1394 		goto out;
1395 
1396 	rc = 0;
1397 	if (sk->sk_state == SMC_LISTEN) {
1398 		sk->sk_max_ack_backlog = backlog;
1399 		goto out;
1400 	}
1401 	/* some socket options are handled in core, so we could not apply
1402 	 * them to the clc socket -- copy smc socket options to clc socket
1403 	 */
1404 	smc_copy_sock_settings_to_clc(smc);
1405 	if (!smc->use_fallback)
1406 		tcp_sk(smc->clcsock->sk)->syn_smc = 1;
1407 
1408 	rc = kernel_listen(smc->clcsock, backlog);
1409 	if (rc)
1410 		goto out;
1411 	sk->sk_max_ack_backlog = backlog;
1412 	sk->sk_ack_backlog = 0;
1413 	sk->sk_state = SMC_LISTEN;
1414 	sock_hold(sk); /* sock_hold in tcp_listen_worker */
1415 	if (!schedule_work(&smc->tcp_listen_work))
1416 		sock_put(sk);
1417 
1418 out:
1419 	release_sock(sk);
1420 	return rc;
1421 }
1422 
1423 static int smc_accept(struct socket *sock, struct socket *new_sock,
1424 		      int flags, bool kern)
1425 {
1426 	struct sock *sk = sock->sk, *nsk;
1427 	DECLARE_WAITQUEUE(wait, current);
1428 	struct smc_sock *lsmc;
1429 	long timeo;
1430 	int rc = 0;
1431 
1432 	lsmc = smc_sk(sk);
1433 	sock_hold(sk); /* sock_put below */
1434 	lock_sock(sk);
1435 
1436 	if (lsmc->sk.sk_state != SMC_LISTEN) {
1437 		rc = -EINVAL;
1438 		release_sock(sk);
1439 		goto out;
1440 	}
1441 
1442 	/* Wait for an incoming connection */
1443 	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1444 	add_wait_queue_exclusive(sk_sleep(sk), &wait);
1445 	while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
1446 		set_current_state(TASK_INTERRUPTIBLE);
1447 		if (!timeo) {
1448 			rc = -EAGAIN;
1449 			break;
1450 		}
1451 		release_sock(sk);
1452 		timeo = schedule_timeout(timeo);
1453 		/* wakeup by sk_data_ready in smc_listen_work() */
1454 		sched_annotate_sleep();
1455 		lock_sock(sk);
1456 		if (signal_pending(current)) {
1457 			rc = sock_intr_errno(timeo);
1458 			break;
1459 		}
1460 	}
1461 	set_current_state(TASK_RUNNING);
1462 	remove_wait_queue(sk_sleep(sk), &wait);
1463 
1464 	if (!rc)
1465 		rc = sock_error(nsk);
1466 	release_sock(sk);
1467 	if (rc)
1468 		goto out;
1469 
1470 	if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) {
1471 		/* wait till data arrives on the socket */
1472 		timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept *
1473 								MSEC_PER_SEC);
1474 		if (smc_sk(nsk)->use_fallback) {
1475 			struct sock *clcsk = smc_sk(nsk)->clcsock->sk;
1476 
1477 			lock_sock(clcsk);
1478 			if (skb_queue_empty(&clcsk->sk_receive_queue))
1479 				sk_wait_data(clcsk, &timeo, NULL);
1480 			release_sock(clcsk);
1481 		} else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) {
1482 			lock_sock(nsk);
1483 			smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available);
1484 			release_sock(nsk);
1485 		}
1486 	}
1487 
1488 out:
1489 	sock_put(sk); /* sock_hold above */
1490 	return rc;
1491 }
1492 
1493 static int smc_getname(struct socket *sock, struct sockaddr *addr,
1494 		       int peer)
1495 {
1496 	struct smc_sock *smc;
1497 
1498 	if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
1499 	    (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
1500 		return -ENOTCONN;
1501 
1502 	smc = smc_sk(sock->sk);
1503 
1504 	return smc->clcsock->ops->getname(smc->clcsock, addr, peer);
1505 }
1506 
1507 static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
1508 {
1509 	struct sock *sk = sock->sk;
1510 	struct smc_sock *smc;
1511 	int rc = -EPIPE;
1512 
1513 	smc = smc_sk(sk);
1514 	lock_sock(sk);
1515 	if ((sk->sk_state != SMC_ACTIVE) &&
1516 	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1517 	    (sk->sk_state != SMC_INIT))
1518 		goto out;
1519 
1520 	if (msg->msg_flags & MSG_FASTOPEN) {
1521 		if (sk->sk_state == SMC_INIT) {
1522 			smc_switch_to_fallback(smc);
1523 			smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
1524 		} else {
1525 			rc = -EINVAL;
1526 			goto out;
1527 		}
1528 	}
1529 
1530 	if (smc->use_fallback)
1531 		rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
1532 	else
1533 		rc = smc_tx_sendmsg(smc, msg, len);
1534 out:
1535 	release_sock(sk);
1536 	return rc;
1537 }
1538 
1539 static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
1540 		       int flags)
1541 {
1542 	struct sock *sk = sock->sk;
1543 	struct smc_sock *smc;
1544 	int rc = -ENOTCONN;
1545 
1546 	smc = smc_sk(sk);
1547 	lock_sock(sk);
1548 	if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
1549 		/* socket was connected before, no more data to read */
1550 		rc = 0;
1551 		goto out;
1552 	}
1553 	if ((sk->sk_state == SMC_INIT) ||
1554 	    (sk->sk_state == SMC_LISTEN) ||
1555 	    (sk->sk_state == SMC_CLOSED))
1556 		goto out;
1557 
1558 	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1559 		rc = 0;
1560 		goto out;
1561 	}
1562 
1563 	if (smc->use_fallback) {
1564 		rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
1565 	} else {
1566 		msg->msg_namelen = 0;
1567 		rc = smc_rx_recvmsg(smc, msg, NULL, len, flags);
1568 	}
1569 
1570 out:
1571 	release_sock(sk);
1572 	return rc;
1573 }
1574 
1575 static __poll_t smc_accept_poll(struct sock *parent)
1576 {
1577 	struct smc_sock *isk = smc_sk(parent);
1578 	__poll_t mask = 0;
1579 
1580 	spin_lock(&isk->accept_q_lock);
1581 	if (!list_empty(&isk->accept_q))
1582 		mask = EPOLLIN | EPOLLRDNORM;
1583 	spin_unlock(&isk->accept_q_lock);
1584 
1585 	return mask;
1586 }
1587 
1588 static __poll_t smc_poll(struct file *file, struct socket *sock,
1589 			     poll_table *wait)
1590 {
1591 	struct sock *sk = sock->sk;
1592 	struct smc_sock *smc;
1593 	__poll_t mask = 0;
1594 
1595 	if (!sk)
1596 		return EPOLLNVAL;
1597 
1598 	smc = smc_sk(sock->sk);
1599 	if (smc->use_fallback) {
1600 		/* delegate to CLC child sock */
1601 		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
1602 		sk->sk_err = smc->clcsock->sk->sk_err;
1603 	} else {
1604 		if (sk->sk_state != SMC_CLOSED)
1605 			sock_poll_wait(file, sock, wait);
1606 		if (sk->sk_err)
1607 			mask |= EPOLLERR;
1608 		if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
1609 		    (sk->sk_state == SMC_CLOSED))
1610 			mask |= EPOLLHUP;
1611 		if (sk->sk_state == SMC_LISTEN) {
1612 			/* woken up by sk_data_ready in smc_listen_work() */
1613 			mask |= smc_accept_poll(sk);
1614 		} else if (smc->use_fallback) { /* as result of connect_work()*/
1615 			mask |= smc->clcsock->ops->poll(file, smc->clcsock,
1616 							   wait);
1617 			sk->sk_err = smc->clcsock->sk->sk_err;
1618 		} else {
1619 			if ((sk->sk_state != SMC_INIT &&
1620 			     atomic_read(&smc->conn.sndbuf_space)) ||
1621 			    sk->sk_shutdown & SEND_SHUTDOWN) {
1622 				mask |= EPOLLOUT | EPOLLWRNORM;
1623 			} else {
1624 				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1625 				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1626 			}
1627 			if (atomic_read(&smc->conn.bytes_to_rcv))
1628 				mask |= EPOLLIN | EPOLLRDNORM;
1629 			if (sk->sk_shutdown & RCV_SHUTDOWN)
1630 				mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
1631 			if (sk->sk_state == SMC_APPCLOSEWAIT1)
1632 				mask |= EPOLLIN;
1633 			if (smc->conn.urg_state == SMC_URG_VALID)
1634 				mask |= EPOLLPRI;
1635 		}
1636 	}
1637 
1638 	return mask;
1639 }
1640 
1641 static int smc_shutdown(struct socket *sock, int how)
1642 {
1643 	struct sock *sk = sock->sk;
1644 	struct smc_sock *smc;
1645 	int rc = -EINVAL;
1646 	int rc1 = 0;
1647 
1648 	smc = smc_sk(sk);
1649 
1650 	if ((how < SHUT_RD) || (how > SHUT_RDWR))
1651 		return rc;
1652 
1653 	lock_sock(sk);
1654 
1655 	rc = -ENOTCONN;
1656 	if ((sk->sk_state != SMC_ACTIVE) &&
1657 	    (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
1658 	    (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
1659 	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1660 	    (sk->sk_state != SMC_APPCLOSEWAIT2) &&
1661 	    (sk->sk_state != SMC_APPFINCLOSEWAIT))
1662 		goto out;
1663 	if (smc->use_fallback) {
1664 		rc = kernel_sock_shutdown(smc->clcsock, how);
1665 		sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
1666 		if (sk->sk_shutdown == SHUTDOWN_MASK)
1667 			sk->sk_state = SMC_CLOSED;
1668 		goto out;
1669 	}
1670 	switch (how) {
1671 	case SHUT_RDWR:		/* shutdown in both directions */
1672 		rc = smc_close_active(smc);
1673 		break;
1674 	case SHUT_WR:
1675 		rc = smc_close_shutdown_write(smc);
1676 		break;
1677 	case SHUT_RD:
1678 		rc = 0;
1679 		/* nothing more to do because peer is not involved */
1680 		break;
1681 	}
1682 	if (smc->clcsock)
1683 		rc1 = kernel_sock_shutdown(smc->clcsock, how);
1684 	/* map sock_shutdown_cmd constants to sk_shutdown value range */
1685 	sk->sk_shutdown |= how + 1;
1686 
1687 out:
1688 	release_sock(sk);
1689 	return rc ? rc : rc1;
1690 }
1691 
1692 static int smc_setsockopt(struct socket *sock, int level, int optname,
1693 			  char __user *optval, unsigned int optlen)
1694 {
1695 	struct sock *sk = sock->sk;
1696 	struct smc_sock *smc;
1697 	int val, rc;
1698 
1699 	smc = smc_sk(sk);
1700 
1701 	/* generic setsockopts reaching us here always apply to the
1702 	 * CLC socket
1703 	 */
1704 	rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
1705 					   optval, optlen);
1706 	if (smc->clcsock->sk->sk_err) {
1707 		sk->sk_err = smc->clcsock->sk->sk_err;
1708 		sk->sk_error_report(sk);
1709 	}
1710 	if (rc)
1711 		return rc;
1712 
1713 	if (optlen < sizeof(int))
1714 		return -EINVAL;
1715 	if (get_user(val, (int __user *)optval))
1716 		return -EFAULT;
1717 
1718 	lock_sock(sk);
1719 	switch (optname) {
1720 	case TCP_ULP:
1721 	case TCP_FASTOPEN:
1722 	case TCP_FASTOPEN_CONNECT:
1723 	case TCP_FASTOPEN_KEY:
1724 	case TCP_FASTOPEN_NO_COOKIE:
1725 		/* option not supported by SMC */
1726 		if (sk->sk_state == SMC_INIT) {
1727 			smc_switch_to_fallback(smc);
1728 			smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
1729 		} else {
1730 			if (!smc->use_fallback)
1731 				rc = -EINVAL;
1732 		}
1733 		break;
1734 	case TCP_NODELAY:
1735 		if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
1736 			if (val && !smc->use_fallback)
1737 				mod_delayed_work(system_wq, &smc->conn.tx_work,
1738 						 0);
1739 		}
1740 		break;
1741 	case TCP_CORK:
1742 		if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
1743 			if (!val && !smc->use_fallback)
1744 				mod_delayed_work(system_wq, &smc->conn.tx_work,
1745 						 0);
1746 		}
1747 		break;
1748 	case TCP_DEFER_ACCEPT:
1749 		smc->sockopt_defer_accept = val;
1750 		break;
1751 	default:
1752 		break;
1753 	}
1754 	release_sock(sk);
1755 
1756 	return rc;
1757 }
1758 
1759 static int smc_getsockopt(struct socket *sock, int level, int optname,
1760 			  char __user *optval, int __user *optlen)
1761 {
1762 	struct smc_sock *smc;
1763 
1764 	smc = smc_sk(sock->sk);
1765 	/* socket options apply to the CLC socket */
1766 	return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
1767 					     optval, optlen);
1768 }
1769 
1770 static int smc_ioctl(struct socket *sock, unsigned int cmd,
1771 		     unsigned long arg)
1772 {
1773 	union smc_host_cursor cons, urg;
1774 	struct smc_connection *conn;
1775 	struct smc_sock *smc;
1776 	int answ;
1777 
1778 	smc = smc_sk(sock->sk);
1779 	conn = &smc->conn;
1780 	lock_sock(&smc->sk);
1781 	if (smc->use_fallback) {
1782 		if (!smc->clcsock) {
1783 			release_sock(&smc->sk);
1784 			return -EBADF;
1785 		}
1786 		answ = smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
1787 		release_sock(&smc->sk);
1788 		return answ;
1789 	}
1790 	switch (cmd) {
1791 	case SIOCINQ: /* same as FIONREAD */
1792 		if (smc->sk.sk_state == SMC_LISTEN) {
1793 			release_sock(&smc->sk);
1794 			return -EINVAL;
1795 		}
1796 		if (smc->sk.sk_state == SMC_INIT ||
1797 		    smc->sk.sk_state == SMC_CLOSED)
1798 			answ = 0;
1799 		else
1800 			answ = atomic_read(&smc->conn.bytes_to_rcv);
1801 		break;
1802 	case SIOCOUTQ:
1803 		/* output queue size (not send + not acked) */
1804 		if (smc->sk.sk_state == SMC_LISTEN) {
1805 			release_sock(&smc->sk);
1806 			return -EINVAL;
1807 		}
1808 		if (smc->sk.sk_state == SMC_INIT ||
1809 		    smc->sk.sk_state == SMC_CLOSED)
1810 			answ = 0;
1811 		else
1812 			answ = smc->conn.sndbuf_desc->len -
1813 					atomic_read(&smc->conn.sndbuf_space);
1814 		break;
1815 	case SIOCOUTQNSD:
1816 		/* output queue size (not send only) */
1817 		if (smc->sk.sk_state == SMC_LISTEN) {
1818 			release_sock(&smc->sk);
1819 			return -EINVAL;
1820 		}
1821 		if (smc->sk.sk_state == SMC_INIT ||
1822 		    smc->sk.sk_state == SMC_CLOSED)
1823 			answ = 0;
1824 		else
1825 			answ = smc_tx_prepared_sends(&smc->conn);
1826 		break;
1827 	case SIOCATMARK:
1828 		if (smc->sk.sk_state == SMC_LISTEN) {
1829 			release_sock(&smc->sk);
1830 			return -EINVAL;
1831 		}
1832 		if (smc->sk.sk_state == SMC_INIT ||
1833 		    smc->sk.sk_state == SMC_CLOSED) {
1834 			answ = 0;
1835 		} else {
1836 			smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
1837 			smc_curs_copy(&urg, &conn->urg_curs, conn);
1838 			answ = smc_curs_diff(conn->rmb_desc->len,
1839 					     &cons, &urg) == 1;
1840 		}
1841 		break;
1842 	default:
1843 		release_sock(&smc->sk);
1844 		return -ENOIOCTLCMD;
1845 	}
1846 	release_sock(&smc->sk);
1847 
1848 	return put_user(answ, (int __user *)arg);
1849 }
1850 
1851 static ssize_t smc_sendpage(struct socket *sock, struct page *page,
1852 			    int offset, size_t size, int flags)
1853 {
1854 	struct sock *sk = sock->sk;
1855 	struct smc_sock *smc;
1856 	int rc = -EPIPE;
1857 
1858 	smc = smc_sk(sk);
1859 	lock_sock(sk);
1860 	if (sk->sk_state != SMC_ACTIVE) {
1861 		release_sock(sk);
1862 		goto out;
1863 	}
1864 	release_sock(sk);
1865 	if (smc->use_fallback)
1866 		rc = kernel_sendpage(smc->clcsock, page, offset,
1867 				     size, flags);
1868 	else
1869 		rc = sock_no_sendpage(sock, page, offset, size, flags);
1870 
1871 out:
1872 	return rc;
1873 }
1874 
1875 /* Map the affected portions of the rmbe into an spd, note the number of bytes
1876  * to splice in conn->splice_pending, and press 'go'. Delays consumer cursor
1877  * updates till whenever a respective page has been fully processed.
1878  * Note that subsequent recv() calls have to wait till all splice() processing
1879  * completed.
1880  */
1881 static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
1882 			       struct pipe_inode_info *pipe, size_t len,
1883 			       unsigned int flags)
1884 {
1885 	struct sock *sk = sock->sk;
1886 	struct smc_sock *smc;
1887 	int rc = -ENOTCONN;
1888 
1889 	smc = smc_sk(sk);
1890 	lock_sock(sk);
1891 	if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
1892 		/* socket was connected before, no more data to read */
1893 		rc = 0;
1894 		goto out;
1895 	}
1896 	if (sk->sk_state == SMC_INIT ||
1897 	    sk->sk_state == SMC_LISTEN ||
1898 	    sk->sk_state == SMC_CLOSED)
1899 		goto out;
1900 
1901 	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1902 		rc = 0;
1903 		goto out;
1904 	}
1905 
1906 	if (smc->use_fallback) {
1907 		rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
1908 						    pipe, len, flags);
1909 	} else {
1910 		if (*ppos) {
1911 			rc = -ESPIPE;
1912 			goto out;
1913 		}
1914 		if (flags & SPLICE_F_NONBLOCK)
1915 			flags = MSG_DONTWAIT;
1916 		else
1917 			flags = 0;
1918 		rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags);
1919 	}
1920 out:
1921 	release_sock(sk);
1922 
1923 	return rc;
1924 }
1925 
1926 /* must look like tcp */
1927 static const struct proto_ops smc_sock_ops = {
1928 	.family		= PF_SMC,
1929 	.owner		= THIS_MODULE,
1930 	.release	= smc_release,
1931 	.bind		= smc_bind,
1932 	.connect	= smc_connect,
1933 	.socketpair	= sock_no_socketpair,
1934 	.accept		= smc_accept,
1935 	.getname	= smc_getname,
1936 	.poll		= smc_poll,
1937 	.ioctl		= smc_ioctl,
1938 	.listen		= smc_listen,
1939 	.shutdown	= smc_shutdown,
1940 	.setsockopt	= smc_setsockopt,
1941 	.getsockopt	= smc_getsockopt,
1942 	.sendmsg	= smc_sendmsg,
1943 	.recvmsg	= smc_recvmsg,
1944 	.mmap		= sock_no_mmap,
1945 	.sendpage	= smc_sendpage,
1946 	.splice_read	= smc_splice_read,
1947 };
1948 
1949 static int smc_create(struct net *net, struct socket *sock, int protocol,
1950 		      int kern)
1951 {
1952 	int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET;
1953 	struct smc_sock *smc;
1954 	struct sock *sk;
1955 	int rc;
1956 
1957 	rc = -ESOCKTNOSUPPORT;
1958 	if (sock->type != SOCK_STREAM)
1959 		goto out;
1960 
1961 	rc = -EPROTONOSUPPORT;
1962 	if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6)
1963 		goto out;
1964 
1965 	rc = -ENOBUFS;
1966 	sock->ops = &smc_sock_ops;
1967 	sk = smc_sock_alloc(net, sock, protocol);
1968 	if (!sk)
1969 		goto out;
1970 
1971 	/* create internal TCP socket for CLC handshake and fallback */
1972 	smc = smc_sk(sk);
1973 	smc->use_fallback = false; /* assume rdma capability first */
1974 	smc->fallback_rsn = 0;
1975 	rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
1976 			      &smc->clcsock);
1977 	if (rc) {
1978 		sk_common_release(sk);
1979 		goto out;
1980 	}
1981 	smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
1982 	smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
1983 
1984 out:
1985 	return rc;
1986 }
1987 
1988 static const struct net_proto_family smc_sock_family_ops = {
1989 	.family	= PF_SMC,
1990 	.owner	= THIS_MODULE,
1991 	.create	= smc_create,
1992 };
1993 
/* pernet id of the SMC subsystem; filled in through smc_net_ops.id */
unsigned int smc_net_id;
1995 
1996 static __net_init int smc_net_init(struct net *net)
1997 {
1998 	return smc_pnet_net_init(net);
1999 }
2000 
/* per network namespace exit: counterpart of smc_net_init(), delegates to
 * the pnet code
 */
static void __net_exit smc_net_exit(struct net *net)
{
	smc_pnet_net_exit(net);
}
2005 
2006 static struct pernet_operations smc_net_ops = {
2007 	.init = smc_net_init,
2008 	.exit = smc_net_exit,
2009 	.id   = &smc_net_id,
2010 	.size = sizeof(struct smc_net),
2011 };
2012 
2013 static int __init smc_init(void)
2014 {
2015 	int rc;
2016 
2017 	rc = register_pernet_subsys(&smc_net_ops);
2018 	if (rc)
2019 		return rc;
2020 
2021 	rc = smc_pnet_init();
2022 	if (rc)
2023 		goto out_pernet_subsys;
2024 
2025 	rc = smc_llc_init();
2026 	if (rc) {
2027 		pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
2028 		goto out_pnet;
2029 	}
2030 
2031 	rc = smc_cdc_init();
2032 	if (rc) {
2033 		pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
2034 		goto out_pnet;
2035 	}
2036 
2037 	rc = proto_register(&smc_proto, 1);
2038 	if (rc) {
2039 		pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc);
2040 		goto out_pnet;
2041 	}
2042 
2043 	rc = proto_register(&smc_proto6, 1);
2044 	if (rc) {
2045 		pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc);
2046 		goto out_proto;
2047 	}
2048 
2049 	rc = sock_register(&smc_sock_family_ops);
2050 	if (rc) {
2051 		pr_err("%s: sock_register fails with %d\n", __func__, rc);
2052 		goto out_proto6;
2053 	}
2054 	INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
2055 	INIT_HLIST_HEAD(&smc_v6_hashinfo.ht);
2056 
2057 	rc = smc_ib_register_client();
2058 	if (rc) {
2059 		pr_err("%s: ib_register fails with %d\n", __func__, rc);
2060 		goto out_sock;
2061 	}
2062 
2063 	static_branch_enable(&tcp_have_smc);
2064 	return 0;
2065 
2066 out_sock:
2067 	sock_unregister(PF_SMC);
2068 out_proto6:
2069 	proto_unregister(&smc_proto6);
2070 out_proto:
2071 	proto_unregister(&smc_proto);
2072 out_pnet:
2073 	smc_pnet_exit();
2074 out_pernet_subsys:
2075 	unregister_pernet_subsys(&smc_net_ops);
2076 
2077 	return rc;
2078 }
2079 
/* Module exit: calls smc_core_exit(), disables the tcp_have_smc static
 * branch and then undoes the registrations of smc_init() in reverse order.
 */
static void __exit smc_exit(void)
{
	smc_core_exit();
	static_branch_disable(&tcp_have_smc);
	smc_ib_unregister_client();
	sock_unregister(PF_SMC);
	proto_unregister(&smc_proto6);
	proto_unregister(&smc_proto);
	smc_pnet_exit();
	unregister_pernet_subsys(&smc_net_ops);
}
2091 
/* module entry/exit points */
module_init(smc_init);
module_exit(smc_exit);

/* module metadata; the alias ties the module to the PF_SMC protocol family */
MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
MODULE_DESCRIPTION("smc socket address family");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_SMC);
2099