xref: /openbmc/linux/net/smc/af_smc.c (revision a17922de)
1 /*
2  *  Shared Memory Communications over RDMA (SMC-R) and RoCE
3  *
4  *  AF_SMC protocol family socket handler keeping the AF_INET sock address type
5  *  applies to SOCK_STREAM sockets only
6  *  offers an alternative communication option for TCP-protocol sockets
7  *  applicable with RoCE-cards only
8  *
9  *  Initial restrictions:
10  *    - support for alternate links postponed
11  *
12  *  Copyright IBM Corp. 2016, 2018
13  *
14  *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
15  *              based on prototype from Frank Blaschka
16  */
17 
18 #define KMSG_COMPONENT "smc"
19 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
20 
21 #include <linux/module.h>
22 #include <linux/socket.h>
23 #include <linux/workqueue.h>
24 #include <linux/in.h>
25 #include <linux/sched/signal.h>
26 #include <linux/if_vlan.h>
27 
28 #include <net/sock.h>
29 #include <net/tcp.h>
30 #include <net/smc.h>
31 #include <asm/ioctls.h>
32 
33 #include "smc.h"
34 #include "smc_clc.h"
35 #include "smc_llc.h"
36 #include "smc_cdc.h"
37 #include "smc_core.h"
38 #include "smc_ib.h"
39 #include "smc_ism.h"
40 #include "smc_pnet.h"
41 #include "smc_tx.h"
42 #include "smc_rx.h"
43 #include "smc_close.h"
44 
/* serializes the creation of link groups */
static DEFINE_MUTEX(smc_create_lgr_pending);

/* work handlers that smc_sock_alloc() attaches to every new smc socket */
static void smc_tcp_listen_work(struct work_struct *work);
static void smc_connect_work(struct work_struct *work);
51 
52 static void smc_set_keepalive(struct sock *sk, int val)
53 {
54 	struct smc_sock *smc = smc_sk(sk);
55 
56 	smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
57 }
58 
59 static struct smc_hashinfo smc_v4_hashinfo = {
60 	.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
61 };
62 
63 static struct smc_hashinfo smc_v6_hashinfo = {
64 	.lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
65 };
66 
67 int smc_hash_sk(struct sock *sk)
68 {
69 	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
70 	struct hlist_head *head;
71 
72 	head = &h->ht;
73 
74 	write_lock_bh(&h->lock);
75 	sk_add_node(sk, head);
76 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
77 	write_unlock_bh(&h->lock);
78 
79 	return 0;
80 }
81 EXPORT_SYMBOL_GPL(smc_hash_sk);
82 
83 void smc_unhash_sk(struct sock *sk)
84 {
85 	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
86 
87 	write_lock_bh(&h->lock);
88 	if (sk_del_node_init(sk))
89 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
90 	write_unlock_bh(&h->lock);
91 }
92 EXPORT_SYMBOL_GPL(smc_unhash_sk);
93 
94 struct proto smc_proto = {
95 	.name		= "SMC",
96 	.owner		= THIS_MODULE,
97 	.keepalive	= smc_set_keepalive,
98 	.hash		= smc_hash_sk,
99 	.unhash		= smc_unhash_sk,
100 	.obj_size	= sizeof(struct smc_sock),
101 	.h.smc_hash	= &smc_v4_hashinfo,
102 	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
103 };
104 EXPORT_SYMBOL_GPL(smc_proto);
105 
106 struct proto smc_proto6 = {
107 	.name		= "SMC6",
108 	.owner		= THIS_MODULE,
109 	.keepalive	= smc_set_keepalive,
110 	.hash		= smc_hash_sk,
111 	.unhash		= smc_unhash_sk,
112 	.obj_size	= sizeof(struct smc_sock),
113 	.h.smc_hash	= &smc_v6_hashinfo,
114 	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
115 };
116 EXPORT_SYMBOL_GPL(smc_proto6);
117 
118 static int smc_release(struct socket *sock)
119 {
120 	struct sock *sk = sock->sk;
121 	struct smc_sock *smc;
122 	int rc = 0;
123 
124 	if (!sk)
125 		goto out;
126 
127 	smc = smc_sk(sk);
128 
129 	/* cleanup for a dangling non-blocking connect */
130 	flush_work(&smc->connect_work);
131 	kfree(smc->connect_info);
132 	smc->connect_info = NULL;
133 
134 	if (sk->sk_state == SMC_LISTEN)
135 		/* smc_close_non_accepted() is called and acquires
136 		 * sock lock for child sockets again
137 		 */
138 		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
139 	else
140 		lock_sock(sk);
141 
142 	if (!smc->use_fallback) {
143 		rc = smc_close_active(smc);
144 		sock_set_flag(sk, SOCK_DEAD);
145 		sk->sk_shutdown |= SHUTDOWN_MASK;
146 	}
147 	if (smc->clcsock) {
148 		sock_release(smc->clcsock);
149 		smc->clcsock = NULL;
150 	}
151 	if (smc->use_fallback) {
152 		if (sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_INIT)
153 			sock_put(sk); /* passive closing */
154 		sk->sk_state = SMC_CLOSED;
155 		sk->sk_state_change(sk);
156 	}
157 
158 	/* detach socket */
159 	sock_orphan(sk);
160 	sock->sk = NULL;
161 	if (!smc->use_fallback && sk->sk_state == SMC_CLOSED)
162 		smc_conn_free(&smc->conn);
163 	release_sock(sk);
164 
165 	sk->sk_prot->unhash(sk);
166 	sock_put(sk); /* final sock_put */
167 out:
168 	return rc;
169 }
170 
171 static void smc_destruct(struct sock *sk)
172 {
173 	if (sk->sk_state != SMC_CLOSED)
174 		return;
175 	if (!sock_flag(sk, SOCK_DEAD))
176 		return;
177 
178 	sk_refcnt_debug_dec(sk);
179 }
180 
181 static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
182 				   int protocol)
183 {
184 	struct smc_sock *smc;
185 	struct proto *prot;
186 	struct sock *sk;
187 
188 	prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto;
189 	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
190 	if (!sk)
191 		return NULL;
192 
193 	sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
194 	sk->sk_state = SMC_INIT;
195 	sk->sk_destruct = smc_destruct;
196 	sk->sk_protocol = protocol;
197 	smc = smc_sk(sk);
198 	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
199 	INIT_WORK(&smc->connect_work, smc_connect_work);
200 	INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
201 	INIT_LIST_HEAD(&smc->accept_q);
202 	spin_lock_init(&smc->accept_q_lock);
203 	spin_lock_init(&smc->conn.send_lock);
204 	sk->sk_prot->hash(sk);
205 	sk_refcnt_debug_inc(sk);
206 
207 	return sk;
208 }
209 
210 static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
211 		    int addr_len)
212 {
213 	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
214 	struct sock *sk = sock->sk;
215 	struct smc_sock *smc;
216 	int rc;
217 
218 	smc = smc_sk(sk);
219 
220 	/* replicate tests from inet_bind(), to be safe wrt. future changes */
221 	rc = -EINVAL;
222 	if (addr_len < sizeof(struct sockaddr_in))
223 		goto out;
224 
225 	rc = -EAFNOSUPPORT;
226 	if (addr->sin_family != AF_INET &&
227 	    addr->sin_family != AF_INET6 &&
228 	    addr->sin_family != AF_UNSPEC)
229 		goto out;
230 	/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
231 	if (addr->sin_family == AF_UNSPEC &&
232 	    addr->sin_addr.s_addr != htonl(INADDR_ANY))
233 		goto out;
234 
235 	lock_sock(sk);
236 
237 	/* Check if socket is already active */
238 	rc = -EINVAL;
239 	if (sk->sk_state != SMC_INIT)
240 		goto out_rel;
241 
242 	smc->clcsock->sk->sk_reuse = sk->sk_reuse;
243 	rc = kernel_bind(smc->clcsock, uaddr, addr_len);
244 
245 out_rel:
246 	release_sock(sk);
247 out:
248 	return rc;
249 }
250 
251 static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
252 				   unsigned long mask)
253 {
254 	/* options we don't get control via setsockopt for */
255 	nsk->sk_type = osk->sk_type;
256 	nsk->sk_sndbuf = osk->sk_sndbuf;
257 	nsk->sk_rcvbuf = osk->sk_rcvbuf;
258 	nsk->sk_sndtimeo = osk->sk_sndtimeo;
259 	nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
260 	nsk->sk_mark = osk->sk_mark;
261 	nsk->sk_priority = osk->sk_priority;
262 	nsk->sk_rcvlowat = osk->sk_rcvlowat;
263 	nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
264 	nsk->sk_err = osk->sk_err;
265 
266 	nsk->sk_flags &= ~mask;
267 	nsk->sk_flags |= osk->sk_flags & mask;
268 }
269 
270 #define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
271 			     (1UL << SOCK_KEEPOPEN) | \
272 			     (1UL << SOCK_LINGER) | \
273 			     (1UL << SOCK_BROADCAST) | \
274 			     (1UL << SOCK_TIMESTAMP) | \
275 			     (1UL << SOCK_DBG) | \
276 			     (1UL << SOCK_RCVTSTAMP) | \
277 			     (1UL << SOCK_RCVTSTAMPNS) | \
278 			     (1UL << SOCK_LOCALROUTE) | \
279 			     (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
280 			     (1UL << SOCK_RXQ_OVFL) | \
281 			     (1UL << SOCK_WIFI_STATUS) | \
282 			     (1UL << SOCK_NOFCS) | \
283 			     (1UL << SOCK_FILTER_LOCKED))
284 /* copy only relevant settings and flags of SOL_SOCKET level from smc to
285  * clc socket (since smc is not called for these options from net/core)
286  */
287 static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
288 {
289 	smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
290 }
291 
292 #define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
293 			     (1UL << SOCK_KEEPOPEN) | \
294 			     (1UL << SOCK_LINGER) | \
295 			     (1UL << SOCK_DBG))
296 /* copy only settings and flags relevant for smc from clc to smc socket */
297 static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
298 {
299 	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
300 }
301 
302 /* register a new rmb, optionally send confirm_rkey msg to register with peer */
303 static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc,
304 		       bool conf_rkey)
305 {
306 	/* register memory region for new rmb */
307 	if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) {
308 		rmb_desc->regerr = 1;
309 		return -EFAULT;
310 	}
311 	if (!conf_rkey)
312 		return 0;
313 	/* exchange confirm_rkey msg with peer */
314 	if (smc_llc_do_confirm_rkey(link, rmb_desc)) {
315 		rmb_desc->regerr = 1;
316 		return -EFAULT;
317 	}
318 	return 0;
319 }
320 
321 static int smc_clnt_conf_first_link(struct smc_sock *smc)
322 {
323 	struct net *net = sock_net(smc->clcsock->sk);
324 	struct smc_link_group *lgr = smc->conn.lgr;
325 	struct smc_link *link;
326 	int rest;
327 	int rc;
328 
329 	link = &lgr->lnk[SMC_SINGLE_LINK];
330 	/* receive CONFIRM LINK request from server over RoCE fabric */
331 	rest = wait_for_completion_interruptible_timeout(
332 		&link->llc_confirm,
333 		SMC_LLC_WAIT_FIRST_TIME);
334 	if (rest <= 0) {
335 		struct smc_clc_msg_decline dclc;
336 
337 		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
338 				      SMC_CLC_DECLINE);
339 		return rc;
340 	}
341 
342 	if (link->llc_confirm_rc)
343 		return SMC_CLC_DECL_RMBE_EC;
344 
345 	rc = smc_ib_modify_qp_rts(link);
346 	if (rc)
347 		return SMC_CLC_DECL_INTERR;
348 
349 	smc_wr_remember_qp_attr(link);
350 
351 	if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
352 		return SMC_CLC_DECL_INTERR;
353 
354 	/* send CONFIRM LINK response over RoCE fabric */
355 	rc = smc_llc_send_confirm_link(link,
356 				       link->smcibdev->mac[link->ibport - 1],
357 				       &link->smcibdev->gid[link->ibport - 1],
358 				       SMC_LLC_RESP);
359 	if (rc < 0)
360 		return SMC_CLC_DECL_TCL;
361 
362 	/* receive ADD LINK request from server over RoCE fabric */
363 	rest = wait_for_completion_interruptible_timeout(&link->llc_add,
364 							 SMC_LLC_WAIT_TIME);
365 	if (rest <= 0) {
366 		struct smc_clc_msg_decline dclc;
367 
368 		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
369 				      SMC_CLC_DECLINE);
370 		return rc;
371 	}
372 
373 	/* send add link reject message, only one link supported for now */
374 	rc = smc_llc_send_add_link(link,
375 				   link->smcibdev->mac[link->ibport - 1],
376 				   &link->smcibdev->gid[link->ibport - 1],
377 				   SMC_LLC_RESP);
378 	if (rc < 0)
379 		return SMC_CLC_DECL_TCL;
380 
381 	smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
382 
383 	return 0;
384 }
385 
386 static void smcr_conn_save_peer_info(struct smc_sock *smc,
387 				     struct smc_clc_msg_accept_confirm *clc)
388 {
389 	int bufsize = smc_uncompress_bufsize(clc->rmbe_size);
390 
391 	smc->conn.peer_rmbe_idx = clc->rmbe_idx;
392 	smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
393 	smc->conn.peer_rmbe_size = bufsize;
394 	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
395 	smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
396 }
397 
398 static void smcd_conn_save_peer_info(struct smc_sock *smc,
399 				     struct smc_clc_msg_accept_confirm *clc)
400 {
401 	int bufsize = smc_uncompress_bufsize(clc->dmbe_size);
402 
403 	smc->conn.peer_rmbe_idx = clc->dmbe_idx;
404 	smc->conn.peer_token = clc->token;
405 	/* msg header takes up space in the buffer */
406 	smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg);
407 	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
408 	smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx;
409 }
410 
411 static void smc_conn_save_peer_info(struct smc_sock *smc,
412 				    struct smc_clc_msg_accept_confirm *clc)
413 {
414 	if (smc->conn.lgr->is_smcd)
415 		smcd_conn_save_peer_info(smc, clc);
416 	else
417 		smcr_conn_save_peer_info(smc, clc);
418 }
419 
420 static void smc_link_save_peer_info(struct smc_link *link,
421 				    struct smc_clc_msg_accept_confirm *clc)
422 {
423 	link->peer_qpn = ntoh24(clc->qpn);
424 	memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
425 	memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
426 	link->peer_psn = ntoh24(clc->psn);
427 	link->peer_mtu = clc->qp_mtu;
428 }
429 
430 /* fall back during connect */
431 static int smc_connect_fallback(struct smc_sock *smc)
432 {
433 	smc->use_fallback = true;
434 	smc_copy_sock_settings_to_clc(smc);
435 	if (smc->sk.sk_state == SMC_INIT)
436 		smc->sk.sk_state = SMC_ACTIVE;
437 	return 0;
438 }
439 
440 /* decline and fall back during connect */
441 static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
442 {
443 	int rc;
444 
445 	if (reason_code < 0) { /* error, fallback is not possible */
446 		if (smc->sk.sk_state == SMC_INIT)
447 			sock_put(&smc->sk); /* passive closing */
448 		return reason_code;
449 	}
450 	if (reason_code != SMC_CLC_DECL_REPLY) {
451 		rc = smc_clc_send_decline(smc, reason_code);
452 		if (rc < 0) {
453 			if (smc->sk.sk_state == SMC_INIT)
454 				sock_put(&smc->sk); /* passive closing */
455 			return rc;
456 		}
457 	}
458 	return smc_connect_fallback(smc);
459 }
460 
461 /* abort connecting */
462 static int smc_connect_abort(struct smc_sock *smc, int reason_code,
463 			     int local_contact)
464 {
465 	if (local_contact == SMC_FIRST_CONTACT)
466 		smc_lgr_forget(smc->conn.lgr);
467 	mutex_unlock(&smc_create_lgr_pending);
468 	smc_conn_free(&smc->conn);
469 	return reason_code;
470 }
471 
472 /* check if there is a rdma device available for this connection. */
473 /* called for connect and listen */
474 static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev,
475 			  u8 *ibport)
476 {
477 	int reason_code = 0;
478 
479 	/* PNET table look up: search active ib_device and port
480 	 * within same PNETID that also contains the ethernet device
481 	 * used for the internal TCP socket
482 	 */
483 	smc_pnet_find_roce_resource(smc->clcsock->sk, ibdev, ibport);
484 	if (!(*ibdev))
485 		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
486 
487 	return reason_code;
488 }
489 
490 /* check if there is an ISM device available for this connection. */
491 /* called for connect and listen */
492 static int smc_check_ism(struct smc_sock *smc, struct smcd_dev **ismdev)
493 {
494 	/* Find ISM device with same PNETID as connecting interface  */
495 	smc_pnet_find_ism_resource(smc->clcsock->sk, ismdev);
496 	if (!(*ismdev))
497 		return SMC_CLC_DECL_CNFERR; /* configuration error */
498 	return 0;
499 }
500 
501 /* Check for VLAN ID and register it on ISM device just for CLC handshake */
502 static int smc_connect_ism_vlan_setup(struct smc_sock *smc,
503 				      struct smcd_dev *ismdev,
504 				      unsigned short vlan_id)
505 {
506 	if (vlan_id && smc_ism_get_vlan(ismdev, vlan_id))
507 		return SMC_CLC_DECL_CNFERR;
508 	return 0;
509 }
510 
511 /* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is
512  * used, the VLAN ID will be registered again during the connection setup.
513  */
514 static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd,
515 					struct smcd_dev *ismdev,
516 					unsigned short vlan_id)
517 {
518 	if (!is_smcd)
519 		return 0;
520 	if (vlan_id && smc_ism_put_vlan(ismdev, vlan_id))
521 		return SMC_CLC_DECL_CNFERR;
522 	return 0;
523 }
524 
525 /* CLC handshake during connect */
526 static int smc_connect_clc(struct smc_sock *smc, int smc_type,
527 			   struct smc_clc_msg_accept_confirm *aclc,
528 			   struct smc_ib_device *ibdev, u8 ibport,
529 			   struct smcd_dev *ismdev)
530 {
531 	int rc = 0;
532 
533 	/* do inband token exchange */
534 	rc = smc_clc_send_proposal(smc, smc_type, ibdev, ibport, ismdev);
535 	if (rc)
536 		return rc;
537 	/* receive SMC Accept CLC message */
538 	return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT);
539 }
540 
541 /* setup for RDMA connection of client */
542 static int smc_connect_rdma(struct smc_sock *smc,
543 			    struct smc_clc_msg_accept_confirm *aclc,
544 			    struct smc_ib_device *ibdev, u8 ibport)
545 {
546 	int local_contact = SMC_FIRST_CONTACT;
547 	struct smc_link *link;
548 	int reason_code = 0;
549 
550 	mutex_lock(&smc_create_lgr_pending);
551 	local_contact = smc_conn_create(smc, false, aclc->hdr.flag, ibdev,
552 					ibport, &aclc->lcl, NULL, 0);
553 	if (local_contact < 0) {
554 		if (local_contact == -ENOMEM)
555 			reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
556 		else if (local_contact == -ENOLINK)
557 			reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
558 		else
559 			reason_code = SMC_CLC_DECL_INTERR; /* other error */
560 		return smc_connect_abort(smc, reason_code, 0);
561 	}
562 	link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];
563 
564 	smc_conn_save_peer_info(smc, aclc);
565 
566 	/* create send buffer and rmb */
567 	if (smc_buf_create(smc, false))
568 		return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact);
569 
570 	if (local_contact == SMC_FIRST_CONTACT)
571 		smc_link_save_peer_info(link, aclc);
572 
573 	if (smc_rmb_rtoken_handling(&smc->conn, aclc))
574 		return smc_connect_abort(smc, SMC_CLC_DECL_INTERR,
575 					 local_contact);
576 
577 	smc_close_init(smc);
578 	smc_rx_init(smc);
579 
580 	if (local_contact == SMC_FIRST_CONTACT) {
581 		if (smc_ib_ready_link(link))
582 			return smc_connect_abort(smc, SMC_CLC_DECL_INTERR,
583 						 local_contact);
584 	} else {
585 		if (!smc->conn.rmb_desc->reused &&
586 		    smc_reg_rmb(link, smc->conn.rmb_desc, true))
587 			return smc_connect_abort(smc, SMC_CLC_DECL_INTERR,
588 						 local_contact);
589 	}
590 	smc_rmb_sync_sg_for_device(&smc->conn);
591 
592 	reason_code = smc_clc_send_confirm(smc);
593 	if (reason_code)
594 		return smc_connect_abort(smc, reason_code, local_contact);
595 
596 	smc_tx_init(smc);
597 
598 	if (local_contact == SMC_FIRST_CONTACT) {
599 		/* QP confirmation over RoCE fabric */
600 		reason_code = smc_clnt_conf_first_link(smc);
601 		if (reason_code)
602 			return smc_connect_abort(smc, reason_code,
603 						 local_contact);
604 	}
605 	mutex_unlock(&smc_create_lgr_pending);
606 
607 	smc_copy_sock_settings_to_clc(smc);
608 	if (smc->sk.sk_state == SMC_INIT)
609 		smc->sk.sk_state = SMC_ACTIVE;
610 
611 	return 0;
612 }
613 
614 /* setup for ISM connection of client */
615 static int smc_connect_ism(struct smc_sock *smc,
616 			   struct smc_clc_msg_accept_confirm *aclc,
617 			   struct smcd_dev *ismdev)
618 {
619 	int local_contact = SMC_FIRST_CONTACT;
620 	int rc = 0;
621 
622 	mutex_lock(&smc_create_lgr_pending);
623 	local_contact = smc_conn_create(smc, true, aclc->hdr.flag, NULL, 0,
624 					NULL, ismdev, aclc->gid);
625 	if (local_contact < 0)
626 		return smc_connect_abort(smc, SMC_CLC_DECL_MEM, 0);
627 
628 	/* Create send and receive buffers */
629 	if (smc_buf_create(smc, true))
630 		return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact);
631 
632 	smc_conn_save_peer_info(smc, aclc);
633 	smc_close_init(smc);
634 	smc_rx_init(smc);
635 	smc_tx_init(smc);
636 
637 	rc = smc_clc_send_confirm(smc);
638 	if (rc)
639 		return smc_connect_abort(smc, rc, local_contact);
640 	mutex_unlock(&smc_create_lgr_pending);
641 
642 	smc_copy_sock_settings_to_clc(smc);
643 	if (smc->sk.sk_state == SMC_INIT)
644 		smc->sk.sk_state = SMC_ACTIVE;
645 
646 	return 0;
647 }
648 
649 /* perform steps before actually connecting */
650 static int __smc_connect(struct smc_sock *smc)
651 {
652 	bool ism_supported = false, rdma_supported = false;
653 	struct smc_clc_msg_accept_confirm aclc;
654 	struct smc_ib_device *ibdev;
655 	struct smcd_dev *ismdev;
656 	unsigned short vlan;
657 	int smc_type;
658 	int rc = 0;
659 	u8 ibport;
660 
661 	sock_hold(&smc->sk); /* sock put in passive closing */
662 
663 	if (smc->use_fallback)
664 		return smc_connect_fallback(smc);
665 
666 	/* if peer has not signalled SMC-capability, fall back */
667 	if (!tcp_sk(smc->clcsock->sk)->syn_smc)
668 		return smc_connect_fallback(smc);
669 
670 	/* IPSec connections opt out of SMC-R optimizations */
671 	if (using_ipsec(smc))
672 		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC);
673 
674 	/* check for VLAN ID */
675 	if (smc_vlan_by_tcpsk(smc->clcsock, &vlan))
676 		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR);
677 
678 	/* check if there is an ism device available */
679 	if (!smc_check_ism(smc, &ismdev) &&
680 	    !smc_connect_ism_vlan_setup(smc, ismdev, vlan)) {
681 		/* ISM is supported for this connection */
682 		ism_supported = true;
683 		smc_type = SMC_TYPE_D;
684 	}
685 
686 	/* check if there is a rdma device available */
687 	if (!smc_check_rdma(smc, &ibdev, &ibport)) {
688 		/* RDMA is supported for this connection */
689 		rdma_supported = true;
690 		if (ism_supported)
691 			smc_type = SMC_TYPE_B; /* both */
692 		else
693 			smc_type = SMC_TYPE_R; /* only RDMA */
694 	}
695 
696 	/* if neither ISM nor RDMA are supported, fallback */
697 	if (!rdma_supported && !ism_supported)
698 		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR);
699 
700 	/* perform CLC handshake */
701 	rc = smc_connect_clc(smc, smc_type, &aclc, ibdev, ibport, ismdev);
702 	if (rc) {
703 		smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
704 		return smc_connect_decline_fallback(smc, rc);
705 	}
706 
707 	/* depending on previous steps, connect using rdma or ism */
708 	if (rdma_supported && aclc.hdr.path == SMC_TYPE_R)
709 		rc = smc_connect_rdma(smc, &aclc, ibdev, ibport);
710 	else if (ism_supported && aclc.hdr.path == SMC_TYPE_D)
711 		rc = smc_connect_ism(smc, &aclc, ismdev);
712 	else
713 		rc = SMC_CLC_DECL_CNFERR;
714 	if (rc) {
715 		smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
716 		return smc_connect_decline_fallback(smc, rc);
717 	}
718 
719 	smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
720 	return 0;
721 }
722 
723 static void smc_connect_work(struct work_struct *work)
724 {
725 	struct smc_sock *smc = container_of(work, struct smc_sock,
726 					    connect_work);
727 	int rc;
728 
729 	lock_sock(&smc->sk);
730 	rc = kernel_connect(smc->clcsock, &smc->connect_info->addr,
731 			    smc->connect_info->alen, smc->connect_info->flags);
732 	if (smc->clcsock->sk->sk_err) {
733 		smc->sk.sk_err = smc->clcsock->sk->sk_err;
734 		goto out;
735 	}
736 	if (rc < 0) {
737 		smc->sk.sk_err = -rc;
738 		goto out;
739 	}
740 
741 	rc = __smc_connect(smc);
742 	if (rc < 0)
743 		smc->sk.sk_err = -rc;
744 
745 out:
746 	smc->sk.sk_state_change(&smc->sk);
747 	kfree(smc->connect_info);
748 	smc->connect_info = NULL;
749 	release_sock(&smc->sk);
750 }
751 
752 static int smc_connect(struct socket *sock, struct sockaddr *addr,
753 		       int alen, int flags)
754 {
755 	struct sock *sk = sock->sk;
756 	struct smc_sock *smc;
757 	int rc = -EINVAL;
758 
759 	smc = smc_sk(sk);
760 
761 	/* separate smc parameter checking to be safe */
762 	if (alen < sizeof(addr->sa_family))
763 		goto out_err;
764 	if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
765 		goto out_err;
766 
767 	lock_sock(sk);
768 	switch (sk->sk_state) {
769 	default:
770 		goto out;
771 	case SMC_ACTIVE:
772 		rc = -EISCONN;
773 		goto out;
774 	case SMC_INIT:
775 		rc = 0;
776 		break;
777 	}
778 
779 	smc_copy_sock_settings_to_clc(smc);
780 	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
781 	if (flags & O_NONBLOCK) {
782 		if (smc->connect_info) {
783 			rc = -EALREADY;
784 			goto out;
785 		}
786 		smc->connect_info = kzalloc(alen + 2 * sizeof(int), GFP_KERNEL);
787 		if (!smc->connect_info) {
788 			rc = -ENOMEM;
789 			goto out;
790 		}
791 		smc->connect_info->alen = alen;
792 		smc->connect_info->flags = flags ^ O_NONBLOCK;
793 		memcpy(&smc->connect_info->addr, addr, alen);
794 		schedule_work(&smc->connect_work);
795 		rc = -EINPROGRESS;
796 	} else {
797 		rc = kernel_connect(smc->clcsock, addr, alen, flags);
798 		if (rc)
799 			goto out;
800 
801 		rc = __smc_connect(smc);
802 		if (rc < 0)
803 			goto out;
804 		else
805 			rc = 0; /* success cases including fallback */
806 	}
807 
808 out:
809 	release_sock(sk);
810 out_err:
811 	return rc;
812 }
813 
814 static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
815 {
816 	struct socket *new_clcsock = NULL;
817 	struct sock *lsk = &lsmc->sk;
818 	struct sock *new_sk;
819 	int rc;
820 
821 	release_sock(lsk);
822 	new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
823 	if (!new_sk) {
824 		rc = -ENOMEM;
825 		lsk->sk_err = ENOMEM;
826 		*new_smc = NULL;
827 		lock_sock(lsk);
828 		goto out;
829 	}
830 	*new_smc = smc_sk(new_sk);
831 
832 	rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
833 	lock_sock(lsk);
834 	if  (rc < 0)
835 		lsk->sk_err = -rc;
836 	if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
837 		if (new_clcsock)
838 			sock_release(new_clcsock);
839 		new_sk->sk_state = SMC_CLOSED;
840 		sock_set_flag(new_sk, SOCK_DEAD);
841 		new_sk->sk_prot->unhash(new_sk);
842 		sock_put(new_sk); /* final */
843 		*new_smc = NULL;
844 		goto out;
845 	}
846 
847 	(*new_smc)->clcsock = new_clcsock;
848 out:
849 	return rc;
850 }
851 
852 /* add a just created sock to the accept queue of the listen sock as
853  * candidate for a following socket accept call from user space
854  */
855 static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
856 {
857 	struct smc_sock *par = smc_sk(parent);
858 
859 	sock_hold(sk); /* sock_put in smc_accept_unlink () */
860 	spin_lock(&par->accept_q_lock);
861 	list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
862 	spin_unlock(&par->accept_q_lock);
863 	sk_acceptq_added(parent);
864 }
865 
866 /* remove a socket from the accept queue of its parental listening socket */
867 static void smc_accept_unlink(struct sock *sk)
868 {
869 	struct smc_sock *par = smc_sk(sk)->listen_smc;
870 
871 	spin_lock(&par->accept_q_lock);
872 	list_del_init(&smc_sk(sk)->accept_q);
873 	spin_unlock(&par->accept_q_lock);
874 	sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
875 	sock_put(sk); /* sock_hold in smc_accept_enqueue */
876 }
877 
878 /* remove a sock from the accept queue to bind it to a new socket created
879  * for a socket accept call from user space
880  */
881 struct sock *smc_accept_dequeue(struct sock *parent,
882 				struct socket *new_sock)
883 {
884 	struct smc_sock *isk, *n;
885 	struct sock *new_sk;
886 
887 	list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
888 		new_sk = (struct sock *)isk;
889 
890 		smc_accept_unlink(new_sk);
891 		if (new_sk->sk_state == SMC_CLOSED) {
892 			if (isk->clcsock) {
893 				sock_release(isk->clcsock);
894 				isk->clcsock = NULL;
895 			}
896 			new_sk->sk_prot->unhash(new_sk);
897 			sock_put(new_sk); /* final */
898 			continue;
899 		}
900 		if (new_sock)
901 			sock_graft(new_sk, new_sock);
902 		return new_sk;
903 	}
904 	return NULL;
905 }
906 
907 /* clean up for a created but never accepted sock */
908 void smc_close_non_accepted(struct sock *sk)
909 {
910 	struct smc_sock *smc = smc_sk(sk);
911 
912 	lock_sock(sk);
913 	if (!sk->sk_lingertime)
914 		/* wait for peer closing */
915 		sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
916 	if (!smc->use_fallback) {
917 		smc_close_active(smc);
918 		sock_set_flag(sk, SOCK_DEAD);
919 		sk->sk_shutdown |= SHUTDOWN_MASK;
920 	}
921 	if (smc->clcsock) {
922 		struct socket *tcp;
923 
924 		tcp = smc->clcsock;
925 		smc->clcsock = NULL;
926 		sock_release(tcp);
927 	}
928 	if (smc->use_fallback) {
929 		sock_put(sk); /* passive closing */
930 		sk->sk_state = SMC_CLOSED;
931 	} else {
932 		if (sk->sk_state == SMC_CLOSED)
933 			smc_conn_free(&smc->conn);
934 	}
935 	release_sock(sk);
936 	sk->sk_prot->unhash(sk);
937 	sock_put(sk); /* final sock_put */
938 }
939 
940 static int smc_serv_conf_first_link(struct smc_sock *smc)
941 {
942 	struct net *net = sock_net(smc->clcsock->sk);
943 	struct smc_link_group *lgr = smc->conn.lgr;
944 	struct smc_link *link;
945 	int rest;
946 	int rc;
947 
948 	link = &lgr->lnk[SMC_SINGLE_LINK];
949 
950 	if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
951 		return SMC_CLC_DECL_INTERR;
952 
953 	/* send CONFIRM LINK request to client over the RoCE fabric */
954 	rc = smc_llc_send_confirm_link(link,
955 				       link->smcibdev->mac[link->ibport - 1],
956 				       &link->smcibdev->gid[link->ibport - 1],
957 				       SMC_LLC_REQ);
958 	if (rc < 0)
959 		return SMC_CLC_DECL_TCL;
960 
961 	/* receive CONFIRM LINK response from client over the RoCE fabric */
962 	rest = wait_for_completion_interruptible_timeout(
963 		&link->llc_confirm_resp,
964 		SMC_LLC_WAIT_FIRST_TIME);
965 	if (rest <= 0) {
966 		struct smc_clc_msg_decline dclc;
967 
968 		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
969 				      SMC_CLC_DECLINE);
970 		return rc;
971 	}
972 
973 	if (link->llc_confirm_resp_rc)
974 		return SMC_CLC_DECL_RMBE_EC;
975 
976 	/* send ADD LINK request to client over the RoCE fabric */
977 	rc = smc_llc_send_add_link(link,
978 				   link->smcibdev->mac[link->ibport - 1],
979 				   &link->smcibdev->gid[link->ibport - 1],
980 				   SMC_LLC_REQ);
981 	if (rc < 0)
982 		return SMC_CLC_DECL_TCL;
983 
984 	/* receive ADD LINK response from client over the RoCE fabric */
985 	rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp,
986 							 SMC_LLC_WAIT_TIME);
987 	if (rest <= 0) {
988 		struct smc_clc_msg_decline dclc;
989 
990 		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
991 				      SMC_CLC_DECLINE);
992 		return rc;
993 	}
994 
995 	smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
996 
997 	return 0;
998 }
999 
1000 /* listen worker: finish */
1001 static void smc_listen_out(struct smc_sock *new_smc)
1002 {
1003 	struct smc_sock *lsmc = new_smc->listen_smc;
1004 	struct sock *newsmcsk = &new_smc->sk;
1005 
1006 	lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
1007 	if (lsmc->sk.sk_state == SMC_LISTEN) {
1008 		smc_accept_enqueue(&lsmc->sk, newsmcsk);
1009 	} else { /* no longer listening */
1010 		smc_close_non_accepted(newsmcsk);
1011 	}
1012 	release_sock(&lsmc->sk);
1013 
1014 	/* Wake up accept */
1015 	lsmc->sk.sk_data_ready(&lsmc->sk);
1016 	sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
1017 }
1018 
1019 /* listen worker: finish in state connected */
1020 static void smc_listen_out_connected(struct smc_sock *new_smc)
1021 {
1022 	struct sock *newsmcsk = &new_smc->sk;
1023 
1024 	sk_refcnt_debug_inc(newsmcsk);
1025 	if (newsmcsk->sk_state == SMC_INIT)
1026 		newsmcsk->sk_state = SMC_ACTIVE;
1027 
1028 	smc_listen_out(new_smc);
1029 }
1030 
1031 /* listen worker: finish in error state */
1032 static void smc_listen_out_err(struct smc_sock *new_smc)
1033 {
1034 	struct sock *newsmcsk = &new_smc->sk;
1035 
1036 	if (newsmcsk->sk_state == SMC_INIT)
1037 		sock_put(&new_smc->sk); /* passive closing */
1038 	newsmcsk->sk_state = SMC_CLOSED;
1039 	smc_conn_free(&new_smc->conn);
1040 
1041 	smc_listen_out(new_smc);
1042 }
1043 
1044 /* listen worker: decline and fall back if possible */
1045 static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
1046 			       int local_contact)
1047 {
1048 	/* RDMA setup failed, switch back to TCP */
1049 	if (local_contact == SMC_FIRST_CONTACT)
1050 		smc_lgr_forget(new_smc->conn.lgr);
1051 	if (reason_code < 0) { /* error, no fallback possible */
1052 		smc_listen_out_err(new_smc);
1053 		return;
1054 	}
1055 	smc_conn_free(&new_smc->conn);
1056 	new_smc->use_fallback = true;
1057 	if (reason_code && reason_code != SMC_CLC_DECL_REPLY) {
1058 		if (smc_clc_send_decline(new_smc, reason_code) < 0) {
1059 			smc_listen_out_err(new_smc);
1060 			return;
1061 		}
1062 	}
1063 	smc_listen_out_connected(new_smc);
1064 }
1065 
1066 /* listen worker: check prefixes */
1067 static int smc_listen_rdma_check(struct smc_sock *new_smc,
1068 				 struct smc_clc_msg_proposal *pclc)
1069 {
1070 	struct smc_clc_msg_proposal_prefix *pclc_prfx;
1071 	struct socket *newclcsock = new_smc->clcsock;
1072 
1073 	pclc_prfx = smc_clc_proposal_get_prefix(pclc);
1074 	if (smc_clc_prfx_match(newclcsock, pclc_prfx))
1075 		return SMC_CLC_DECL_CNFERR;
1076 
1077 	return 0;
1078 }
1079 
1080 /* listen worker: initialize connection and buffers */
1081 static int smc_listen_rdma_init(struct smc_sock *new_smc,
1082 				struct smc_clc_msg_proposal *pclc,
1083 				struct smc_ib_device *ibdev, u8 ibport,
1084 				int *local_contact)
1085 {
1086 	/* allocate connection / link group */
1087 	*local_contact = smc_conn_create(new_smc, false, 0, ibdev, ibport,
1088 					 &pclc->lcl, NULL, 0);
1089 	if (*local_contact < 0) {
1090 		if (*local_contact == -ENOMEM)
1091 			return SMC_CLC_DECL_MEM;/* insufficient memory*/
1092 		return SMC_CLC_DECL_INTERR; /* other error */
1093 	}
1094 
1095 	/* create send buffer and rmb */
1096 	if (smc_buf_create(new_smc, false))
1097 		return SMC_CLC_DECL_MEM;
1098 
1099 	return 0;
1100 }
1101 
1102 /* listen worker: initialize connection and buffers for SMC-D */
1103 static int smc_listen_ism_init(struct smc_sock *new_smc,
1104 			       struct smc_clc_msg_proposal *pclc,
1105 			       struct smcd_dev *ismdev,
1106 			       int *local_contact)
1107 {
1108 	struct smc_clc_msg_smcd *pclc_smcd;
1109 
1110 	pclc_smcd = smc_get_clc_msg_smcd(pclc);
1111 	*local_contact = smc_conn_create(new_smc, true, 0, NULL, 0, NULL,
1112 					 ismdev, pclc_smcd->gid);
1113 	if (*local_contact < 0) {
1114 		if (*local_contact == -ENOMEM)
1115 			return SMC_CLC_DECL_MEM;/* insufficient memory*/
1116 		return SMC_CLC_DECL_INTERR; /* other error */
1117 	}
1118 
1119 	/* Check if peer can be reached via ISM device */
1120 	if (smc_ism_cantalk(new_smc->conn.lgr->peer_gid,
1121 			    new_smc->conn.lgr->vlan_id,
1122 			    new_smc->conn.lgr->smcd)) {
1123 		if (*local_contact == SMC_FIRST_CONTACT)
1124 			smc_lgr_forget(new_smc->conn.lgr);
1125 		smc_conn_free(&new_smc->conn);
1126 		return SMC_CLC_DECL_CNFERR;
1127 	}
1128 
1129 	/* Create send and receive buffers */
1130 	if (smc_buf_create(new_smc, true)) {
1131 		if (*local_contact == SMC_FIRST_CONTACT)
1132 			smc_lgr_forget(new_smc->conn.lgr);
1133 		smc_conn_free(&new_smc->conn);
1134 		return SMC_CLC_DECL_MEM;
1135 	}
1136 
1137 	return 0;
1138 }
1139 
1140 /* listen worker: register buffers */
1141 static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact)
1142 {
1143 	struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
1144 
1145 	if (local_contact != SMC_FIRST_CONTACT) {
1146 		if (!new_smc->conn.rmb_desc->reused) {
1147 			if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true))
1148 				return SMC_CLC_DECL_INTERR;
1149 		}
1150 	}
1151 	smc_rmb_sync_sg_for_device(&new_smc->conn);
1152 
1153 	return 0;
1154 }
1155 
1156 /* listen worker: finish RDMA setup */
1157 static void smc_listen_rdma_finish(struct smc_sock *new_smc,
1158 				   struct smc_clc_msg_accept_confirm *cclc,
1159 				   int local_contact)
1160 {
1161 	struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
1162 	int reason_code = 0;
1163 
1164 	if (local_contact == SMC_FIRST_CONTACT)
1165 		smc_link_save_peer_info(link, cclc);
1166 
1167 	if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) {
1168 		reason_code = SMC_CLC_DECL_INTERR;
1169 		goto decline;
1170 	}
1171 
1172 	if (local_contact == SMC_FIRST_CONTACT) {
1173 		if (smc_ib_ready_link(link)) {
1174 			reason_code = SMC_CLC_DECL_INTERR;
1175 			goto decline;
1176 		}
1177 		/* QP confirmation over RoCE fabric */
1178 		reason_code = smc_serv_conf_first_link(new_smc);
1179 		if (reason_code)
1180 			goto decline;
1181 	}
1182 	return;
1183 
1184 decline:
1185 	mutex_unlock(&smc_create_lgr_pending);
1186 	smc_listen_decline(new_smc, reason_code, local_contact);
1187 }
1188 
1189 /* setup for RDMA connection of server */
1190 static void smc_listen_work(struct work_struct *work)
1191 {
1192 	struct smc_sock *new_smc = container_of(work, struct smc_sock,
1193 						smc_listen_work);
1194 	struct socket *newclcsock = new_smc->clcsock;
1195 	struct smc_clc_msg_accept_confirm cclc;
1196 	struct smc_clc_msg_proposal *pclc;
1197 	struct smc_ib_device *ibdev;
1198 	bool ism_supported = false;
1199 	struct smcd_dev *ismdev;
1200 	u8 buf[SMC_CLC_MAX_LEN];
1201 	int local_contact = 0;
1202 	int reason_code = 0;
1203 	int rc = 0;
1204 	u8 ibport;
1205 
1206 	if (new_smc->use_fallback) {
1207 		smc_listen_out_connected(new_smc);
1208 		return;
1209 	}
1210 
1211 	/* check if peer is smc capable */
1212 	if (!tcp_sk(newclcsock->sk)->syn_smc) {
1213 		new_smc->use_fallback = true;
1214 		smc_listen_out_connected(new_smc);
1215 		return;
1216 	}
1217 
1218 	/* do inband token exchange -
1219 	 * wait for and receive SMC Proposal CLC message
1220 	 */
1221 	pclc = (struct smc_clc_msg_proposal *)&buf;
1222 	reason_code = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN,
1223 				       SMC_CLC_PROPOSAL);
1224 	if (reason_code) {
1225 		smc_listen_decline(new_smc, reason_code, 0);
1226 		return;
1227 	}
1228 
1229 	/* IPSec connections opt out of SMC-R optimizations */
1230 	if (using_ipsec(new_smc)) {
1231 		smc_listen_decline(new_smc, SMC_CLC_DECL_IPSEC, 0);
1232 		return;
1233 	}
1234 
1235 	mutex_lock(&smc_create_lgr_pending);
1236 	smc_close_init(new_smc);
1237 	smc_rx_init(new_smc);
1238 	smc_tx_init(new_smc);
1239 
1240 	/* check if ISM is available */
1241 	if ((pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) &&
1242 	    !smc_check_ism(new_smc, &ismdev) &&
1243 	    !smc_listen_ism_init(new_smc, pclc, ismdev, &local_contact)) {
1244 		ism_supported = true;
1245 	}
1246 
1247 	/* check if RDMA is available */
1248 	if (!ism_supported &&
1249 	    ((pclc->hdr.path != SMC_TYPE_R && pclc->hdr.path != SMC_TYPE_B) ||
1250 	     smc_check_rdma(new_smc, &ibdev, &ibport) ||
1251 	     smc_listen_rdma_check(new_smc, pclc) ||
1252 	     smc_listen_rdma_init(new_smc, pclc, ibdev, ibport,
1253 				  &local_contact) ||
1254 	     smc_listen_rdma_reg(new_smc, local_contact))) {
1255 		/* SMC not supported, decline */
1256 		mutex_unlock(&smc_create_lgr_pending);
1257 		smc_listen_decline(new_smc, SMC_CLC_DECL_CNFERR, local_contact);
1258 		return;
1259 	}
1260 
1261 	/* send SMC Accept CLC message */
1262 	rc = smc_clc_send_accept(new_smc, local_contact);
1263 	if (rc) {
1264 		mutex_unlock(&smc_create_lgr_pending);
1265 		smc_listen_decline(new_smc, rc, local_contact);
1266 		return;
1267 	}
1268 
1269 	/* receive SMC Confirm CLC message */
1270 	reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
1271 				       SMC_CLC_CONFIRM);
1272 	if (reason_code) {
1273 		mutex_unlock(&smc_create_lgr_pending);
1274 		smc_listen_decline(new_smc, reason_code, local_contact);
1275 		return;
1276 	}
1277 
1278 	/* finish worker */
1279 	if (!ism_supported)
1280 		smc_listen_rdma_finish(new_smc, &cclc, local_contact);
1281 	smc_conn_save_peer_info(new_smc, &cclc);
1282 	mutex_unlock(&smc_create_lgr_pending);
1283 	smc_listen_out_connected(new_smc);
1284 }
1285 
1286 static void smc_tcp_listen_work(struct work_struct *work)
1287 {
1288 	struct smc_sock *lsmc = container_of(work, struct smc_sock,
1289 					     tcp_listen_work);
1290 	struct sock *lsk = &lsmc->sk;
1291 	struct smc_sock *new_smc;
1292 	int rc = 0;
1293 
1294 	lock_sock(lsk);
1295 	while (lsk->sk_state == SMC_LISTEN) {
1296 		rc = smc_clcsock_accept(lsmc, &new_smc);
1297 		if (rc)
1298 			goto out;
1299 		if (!new_smc)
1300 			continue;
1301 
1302 		new_smc->listen_smc = lsmc;
1303 		new_smc->use_fallback = lsmc->use_fallback;
1304 		sock_hold(lsk); /* sock_put in smc_listen_work */
1305 		INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
1306 		smc_copy_sock_settings_to_smc(new_smc);
1307 		sock_hold(&new_smc->sk); /* sock_put in passive closing */
1308 		if (!schedule_work(&new_smc->smc_listen_work))
1309 			sock_put(&new_smc->sk);
1310 	}
1311 
1312 out:
1313 	release_sock(lsk);
1314 	sock_put(&lsmc->sk); /* sock_hold in smc_listen */
1315 }
1316 
1317 static int smc_listen(struct socket *sock, int backlog)
1318 {
1319 	struct sock *sk = sock->sk;
1320 	struct smc_sock *smc;
1321 	int rc;
1322 
1323 	smc = smc_sk(sk);
1324 	lock_sock(sk);
1325 
1326 	rc = -EINVAL;
1327 	if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
1328 		goto out;
1329 
1330 	rc = 0;
1331 	if (sk->sk_state == SMC_LISTEN) {
1332 		sk->sk_max_ack_backlog = backlog;
1333 		goto out;
1334 	}
1335 	/* some socket options are handled in core, so we could not apply
1336 	 * them to the clc socket -- copy smc socket options to clc socket
1337 	 */
1338 	smc_copy_sock_settings_to_clc(smc);
1339 	if (!smc->use_fallback)
1340 		tcp_sk(smc->clcsock->sk)->syn_smc = 1;
1341 
1342 	rc = kernel_listen(smc->clcsock, backlog);
1343 	if (rc)
1344 		goto out;
1345 	sk->sk_max_ack_backlog = backlog;
1346 	sk->sk_ack_backlog = 0;
1347 	sk->sk_state = SMC_LISTEN;
1348 	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
1349 	sock_hold(sk); /* sock_hold in tcp_listen_worker */
1350 	if (!schedule_work(&smc->tcp_listen_work))
1351 		sock_put(sk);
1352 
1353 out:
1354 	release_sock(sk);
1355 	return rc;
1356 }
1357 
1358 static int smc_accept(struct socket *sock, struct socket *new_sock,
1359 		      int flags, bool kern)
1360 {
1361 	struct sock *sk = sock->sk, *nsk;
1362 	DECLARE_WAITQUEUE(wait, current);
1363 	struct smc_sock *lsmc;
1364 	long timeo;
1365 	int rc = 0;
1366 
1367 	lsmc = smc_sk(sk);
1368 	sock_hold(sk); /* sock_put below */
1369 	lock_sock(sk);
1370 
1371 	if (lsmc->sk.sk_state != SMC_LISTEN) {
1372 		rc = -EINVAL;
1373 		release_sock(sk);
1374 		goto out;
1375 	}
1376 
1377 	/* Wait for an incoming connection */
1378 	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1379 	add_wait_queue_exclusive(sk_sleep(sk), &wait);
1380 	while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
1381 		set_current_state(TASK_INTERRUPTIBLE);
1382 		if (!timeo) {
1383 			rc = -EAGAIN;
1384 			break;
1385 		}
1386 		release_sock(sk);
1387 		timeo = schedule_timeout(timeo);
1388 		/* wakeup by sk_data_ready in smc_listen_work() */
1389 		sched_annotate_sleep();
1390 		lock_sock(sk);
1391 		if (signal_pending(current)) {
1392 			rc = sock_intr_errno(timeo);
1393 			break;
1394 		}
1395 	}
1396 	set_current_state(TASK_RUNNING);
1397 	remove_wait_queue(sk_sleep(sk), &wait);
1398 
1399 	if (!rc)
1400 		rc = sock_error(nsk);
1401 	release_sock(sk);
1402 	if (rc)
1403 		goto out;
1404 
1405 	if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) {
1406 		/* wait till data arrives on the socket */
1407 		timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept *
1408 								MSEC_PER_SEC);
1409 		if (smc_sk(nsk)->use_fallback) {
1410 			struct sock *clcsk = smc_sk(nsk)->clcsock->sk;
1411 
1412 			lock_sock(clcsk);
1413 			if (skb_queue_empty(&clcsk->sk_receive_queue))
1414 				sk_wait_data(clcsk, &timeo, NULL);
1415 			release_sock(clcsk);
1416 		} else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) {
1417 			lock_sock(nsk);
1418 			smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available);
1419 			release_sock(nsk);
1420 		}
1421 	}
1422 
1423 out:
1424 	sock_put(sk); /* sock_hold above */
1425 	return rc;
1426 }
1427 
1428 static int smc_getname(struct socket *sock, struct sockaddr *addr,
1429 		       int peer)
1430 {
1431 	struct smc_sock *smc;
1432 
1433 	if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
1434 	    (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
1435 		return -ENOTCONN;
1436 
1437 	smc = smc_sk(sock->sk);
1438 
1439 	return smc->clcsock->ops->getname(smc->clcsock, addr, peer);
1440 }
1441 
1442 static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
1443 {
1444 	struct sock *sk = sock->sk;
1445 	struct smc_sock *smc;
1446 	int rc = -EPIPE;
1447 
1448 	smc = smc_sk(sk);
1449 	lock_sock(sk);
1450 	if ((sk->sk_state != SMC_ACTIVE) &&
1451 	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1452 	    (sk->sk_state != SMC_INIT))
1453 		goto out;
1454 
1455 	if (msg->msg_flags & MSG_FASTOPEN) {
1456 		if (sk->sk_state == SMC_INIT) {
1457 			smc->use_fallback = true;
1458 		} else {
1459 			rc = -EINVAL;
1460 			goto out;
1461 		}
1462 	}
1463 
1464 	if (smc->use_fallback)
1465 		rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
1466 	else
1467 		rc = smc_tx_sendmsg(smc, msg, len);
1468 out:
1469 	release_sock(sk);
1470 	return rc;
1471 }
1472 
1473 static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
1474 		       int flags)
1475 {
1476 	struct sock *sk = sock->sk;
1477 	struct smc_sock *smc;
1478 	int rc = -ENOTCONN;
1479 
1480 	smc = smc_sk(sk);
1481 	lock_sock(sk);
1482 	if ((sk->sk_state == SMC_INIT) ||
1483 	    (sk->sk_state == SMC_LISTEN) ||
1484 	    (sk->sk_state == SMC_CLOSED))
1485 		goto out;
1486 
1487 	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1488 		rc = 0;
1489 		goto out;
1490 	}
1491 
1492 	if (smc->use_fallback) {
1493 		rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
1494 	} else {
1495 		msg->msg_namelen = 0;
1496 		rc = smc_rx_recvmsg(smc, msg, NULL, len, flags);
1497 	}
1498 
1499 out:
1500 	release_sock(sk);
1501 	return rc;
1502 }
1503 
1504 static __poll_t smc_accept_poll(struct sock *parent)
1505 {
1506 	struct smc_sock *isk = smc_sk(parent);
1507 	__poll_t mask = 0;
1508 
1509 	spin_lock(&isk->accept_q_lock);
1510 	if (!list_empty(&isk->accept_q))
1511 		mask = EPOLLIN | EPOLLRDNORM;
1512 	spin_unlock(&isk->accept_q_lock);
1513 
1514 	return mask;
1515 }
1516 
1517 static __poll_t smc_poll(struct file *file, struct socket *sock,
1518 			     poll_table *wait)
1519 {
1520 	struct sock *sk = sock->sk;
1521 	__poll_t mask = 0;
1522 	struct smc_sock *smc;
1523 
1524 	if (!sk)
1525 		return EPOLLNVAL;
1526 
1527 	smc = smc_sk(sock->sk);
1528 	if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
1529 		/* delegate to CLC child sock */
1530 		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
1531 		sk->sk_err = smc->clcsock->sk->sk_err;
1532 		if (sk->sk_err)
1533 			mask |= EPOLLERR;
1534 	} else {
1535 		if (sk->sk_state != SMC_CLOSED)
1536 			sock_poll_wait(file, sk_sleep(sk), wait);
1537 		if (sk->sk_err)
1538 			mask |= EPOLLERR;
1539 		if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
1540 		    (sk->sk_state == SMC_CLOSED))
1541 			mask |= EPOLLHUP;
1542 		if (sk->sk_state == SMC_LISTEN) {
1543 			/* woken up by sk_data_ready in smc_listen_work() */
1544 			mask = smc_accept_poll(sk);
1545 		} else {
1546 			if (atomic_read(&smc->conn.sndbuf_space) ||
1547 			    sk->sk_shutdown & SEND_SHUTDOWN) {
1548 				mask |= EPOLLOUT | EPOLLWRNORM;
1549 			} else {
1550 				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1551 				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1552 			}
1553 			if (atomic_read(&smc->conn.bytes_to_rcv))
1554 				mask |= EPOLLIN | EPOLLRDNORM;
1555 			if (sk->sk_shutdown & RCV_SHUTDOWN)
1556 				mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
1557 			if (sk->sk_state == SMC_APPCLOSEWAIT1)
1558 				mask |= EPOLLIN;
1559 		}
1560 		if (smc->conn.urg_state == SMC_URG_VALID)
1561 			mask |= EPOLLPRI;
1562 	}
1563 
1564 	return mask;
1565 }
1566 
1567 static int smc_shutdown(struct socket *sock, int how)
1568 {
1569 	struct sock *sk = sock->sk;
1570 	struct smc_sock *smc;
1571 	int rc = -EINVAL;
1572 	int rc1 = 0;
1573 
1574 	smc = smc_sk(sk);
1575 
1576 	if ((how < SHUT_RD) || (how > SHUT_RDWR))
1577 		return rc;
1578 
1579 	lock_sock(sk);
1580 
1581 	rc = -ENOTCONN;
1582 	if ((sk->sk_state != SMC_LISTEN) &&
1583 	    (sk->sk_state != SMC_ACTIVE) &&
1584 	    (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
1585 	    (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
1586 	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1587 	    (sk->sk_state != SMC_APPCLOSEWAIT2) &&
1588 	    (sk->sk_state != SMC_APPFINCLOSEWAIT))
1589 		goto out;
1590 	if (smc->use_fallback) {
1591 		rc = kernel_sock_shutdown(smc->clcsock, how);
1592 		sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
1593 		if (sk->sk_shutdown == SHUTDOWN_MASK)
1594 			sk->sk_state = SMC_CLOSED;
1595 		goto out;
1596 	}
1597 	switch (how) {
1598 	case SHUT_RDWR:		/* shutdown in both directions */
1599 		rc = smc_close_active(smc);
1600 		break;
1601 	case SHUT_WR:
1602 		rc = smc_close_shutdown_write(smc);
1603 		break;
1604 	case SHUT_RD:
1605 		rc = 0;
1606 		/* nothing more to do because peer is not involved */
1607 		break;
1608 	}
1609 	if (smc->clcsock)
1610 		rc1 = kernel_sock_shutdown(smc->clcsock, how);
1611 	/* map sock_shutdown_cmd constants to sk_shutdown value range */
1612 	sk->sk_shutdown |= how + 1;
1613 
1614 out:
1615 	release_sock(sk);
1616 	return rc ? rc : rc1;
1617 }
1618 
1619 static int smc_setsockopt(struct socket *sock, int level, int optname,
1620 			  char __user *optval, unsigned int optlen)
1621 {
1622 	struct sock *sk = sock->sk;
1623 	struct smc_sock *smc;
1624 	int val, rc;
1625 
1626 	smc = smc_sk(sk);
1627 
1628 	/* generic setsockopts reaching us here always apply to the
1629 	 * CLC socket
1630 	 */
1631 	rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
1632 					   optval, optlen);
1633 	if (smc->clcsock->sk->sk_err) {
1634 		sk->sk_err = smc->clcsock->sk->sk_err;
1635 		sk->sk_error_report(sk);
1636 	}
1637 	if (rc)
1638 		return rc;
1639 
1640 	if (optlen < sizeof(int))
1641 		return -EINVAL;
1642 	if (get_user(val, (int __user *)optval))
1643 		return -EFAULT;
1644 
1645 	lock_sock(sk);
1646 	switch (optname) {
1647 	case TCP_ULP:
1648 	case TCP_FASTOPEN:
1649 	case TCP_FASTOPEN_CONNECT:
1650 	case TCP_FASTOPEN_KEY:
1651 	case TCP_FASTOPEN_NO_COOKIE:
1652 		/* option not supported by SMC */
1653 		if (sk->sk_state == SMC_INIT) {
1654 			smc->use_fallback = true;
1655 		} else {
1656 			if (!smc->use_fallback)
1657 				rc = -EINVAL;
1658 		}
1659 		break;
1660 	case TCP_NODELAY:
1661 		if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
1662 			if (val && !smc->use_fallback)
1663 				mod_delayed_work(system_wq, &smc->conn.tx_work,
1664 						 0);
1665 		}
1666 		break;
1667 	case TCP_CORK:
1668 		if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
1669 			if (!val && !smc->use_fallback)
1670 				mod_delayed_work(system_wq, &smc->conn.tx_work,
1671 						 0);
1672 		}
1673 		break;
1674 	case TCP_DEFER_ACCEPT:
1675 		smc->sockopt_defer_accept = val;
1676 		break;
1677 	default:
1678 		break;
1679 	}
1680 	release_sock(sk);
1681 
1682 	return rc;
1683 }
1684 
1685 static int smc_getsockopt(struct socket *sock, int level, int optname,
1686 			  char __user *optval, int __user *optlen)
1687 {
1688 	struct smc_sock *smc;
1689 
1690 	smc = smc_sk(sock->sk);
1691 	/* socket options apply to the CLC socket */
1692 	return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
1693 					     optval, optlen);
1694 }
1695 
1696 static int smc_ioctl(struct socket *sock, unsigned int cmd,
1697 		     unsigned long arg)
1698 {
1699 	union smc_host_cursor cons, urg;
1700 	struct smc_connection *conn;
1701 	struct smc_sock *smc;
1702 	int answ;
1703 
1704 	smc = smc_sk(sock->sk);
1705 	conn = &smc->conn;
1706 	if (smc->use_fallback) {
1707 		if (!smc->clcsock)
1708 			return -EBADF;
1709 		return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
1710 	}
1711 	lock_sock(&smc->sk);
1712 	switch (cmd) {
1713 	case SIOCINQ: /* same as FIONREAD */
1714 		if (smc->sk.sk_state == SMC_LISTEN) {
1715 			release_sock(&smc->sk);
1716 			return -EINVAL;
1717 		}
1718 		if (smc->sk.sk_state == SMC_INIT ||
1719 		    smc->sk.sk_state == SMC_CLOSED)
1720 			answ = 0;
1721 		else
1722 			answ = atomic_read(&smc->conn.bytes_to_rcv);
1723 		break;
1724 	case SIOCOUTQ:
1725 		/* output queue size (not send + not acked) */
1726 		if (smc->sk.sk_state == SMC_LISTEN) {
1727 			release_sock(&smc->sk);
1728 			return -EINVAL;
1729 		}
1730 		if (smc->sk.sk_state == SMC_INIT ||
1731 		    smc->sk.sk_state == SMC_CLOSED)
1732 			answ = 0;
1733 		else
1734 			answ = smc->conn.sndbuf_desc->len -
1735 					atomic_read(&smc->conn.sndbuf_space);
1736 		break;
1737 	case SIOCOUTQNSD:
1738 		/* output queue size (not send only) */
1739 		if (smc->sk.sk_state == SMC_LISTEN) {
1740 			release_sock(&smc->sk);
1741 			return -EINVAL;
1742 		}
1743 		if (smc->sk.sk_state == SMC_INIT ||
1744 		    smc->sk.sk_state == SMC_CLOSED)
1745 			answ = 0;
1746 		else
1747 			answ = smc_tx_prepared_sends(&smc->conn);
1748 		break;
1749 	case SIOCATMARK:
1750 		if (smc->sk.sk_state == SMC_LISTEN) {
1751 			release_sock(&smc->sk);
1752 			return -EINVAL;
1753 		}
1754 		if (smc->sk.sk_state == SMC_INIT ||
1755 		    smc->sk.sk_state == SMC_CLOSED) {
1756 			answ = 0;
1757 		} else {
1758 			smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
1759 			smc_curs_copy(&urg, &conn->urg_curs, conn);
1760 			answ = smc_curs_diff(conn->rmb_desc->len,
1761 					     &cons, &urg) == 1;
1762 		}
1763 		break;
1764 	default:
1765 		release_sock(&smc->sk);
1766 		return -ENOIOCTLCMD;
1767 	}
1768 	release_sock(&smc->sk);
1769 
1770 	return put_user(answ, (int __user *)arg);
1771 }
1772 
1773 static ssize_t smc_sendpage(struct socket *sock, struct page *page,
1774 			    int offset, size_t size, int flags)
1775 {
1776 	struct sock *sk = sock->sk;
1777 	struct smc_sock *smc;
1778 	int rc = -EPIPE;
1779 
1780 	smc = smc_sk(sk);
1781 	lock_sock(sk);
1782 	if (sk->sk_state != SMC_ACTIVE) {
1783 		release_sock(sk);
1784 		goto out;
1785 	}
1786 	release_sock(sk);
1787 	if (smc->use_fallback)
1788 		rc = kernel_sendpage(smc->clcsock, page, offset,
1789 				     size, flags);
1790 	else
1791 		rc = sock_no_sendpage(sock, page, offset, size, flags);
1792 
1793 out:
1794 	return rc;
1795 }
1796 
1797 /* Map the affected portions of the rmbe into an spd, note the number of bytes
1798  * to splice in conn->splice_pending, and press 'go'. Delays consumer cursor
1799  * updates till whenever a respective page has been fully processed.
1800  * Note that subsequent recv() calls have to wait till all splice() processing
1801  * completed.
1802  */
1803 static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
1804 			       struct pipe_inode_info *pipe, size_t len,
1805 			       unsigned int flags)
1806 {
1807 	struct sock *sk = sock->sk;
1808 	struct smc_sock *smc;
1809 	int rc = -ENOTCONN;
1810 
1811 	smc = smc_sk(sk);
1812 	lock_sock(sk);
1813 
1814 	if (sk->sk_state == SMC_INIT ||
1815 	    sk->sk_state == SMC_LISTEN ||
1816 	    sk->sk_state == SMC_CLOSED)
1817 		goto out;
1818 
1819 	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1820 		rc = 0;
1821 		goto out;
1822 	}
1823 
1824 	if (smc->use_fallback) {
1825 		rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
1826 						    pipe, len, flags);
1827 	} else {
1828 		if (*ppos) {
1829 			rc = -ESPIPE;
1830 			goto out;
1831 		}
1832 		if (flags & SPLICE_F_NONBLOCK)
1833 			flags = MSG_DONTWAIT;
1834 		else
1835 			flags = 0;
1836 		rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags);
1837 	}
1838 out:
1839 	release_sock(sk);
1840 
1841 	return rc;
1842 }
1843 
1844 /* must look like tcp */
1845 static const struct proto_ops smc_sock_ops = {
1846 	.family		= PF_SMC,
1847 	.owner		= THIS_MODULE,
1848 	.release	= smc_release,
1849 	.bind		= smc_bind,
1850 	.connect	= smc_connect,
1851 	.socketpair	= sock_no_socketpair,
1852 	.accept		= smc_accept,
1853 	.getname	= smc_getname,
1854 	.poll		= smc_poll,
1855 	.ioctl		= smc_ioctl,
1856 	.listen		= smc_listen,
1857 	.shutdown	= smc_shutdown,
1858 	.setsockopt	= smc_setsockopt,
1859 	.getsockopt	= smc_getsockopt,
1860 	.sendmsg	= smc_sendmsg,
1861 	.recvmsg	= smc_recvmsg,
1862 	.mmap		= sock_no_mmap,
1863 	.sendpage	= smc_sendpage,
1864 	.splice_read	= smc_splice_read,
1865 };
1866 
1867 static int smc_create(struct net *net, struct socket *sock, int protocol,
1868 		      int kern)
1869 {
1870 	int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET;
1871 	struct smc_sock *smc;
1872 	struct sock *sk;
1873 	int rc;
1874 
1875 	rc = -ESOCKTNOSUPPORT;
1876 	if (sock->type != SOCK_STREAM)
1877 		goto out;
1878 
1879 	rc = -EPROTONOSUPPORT;
1880 	if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6)
1881 		goto out;
1882 
1883 	rc = -ENOBUFS;
1884 	sock->ops = &smc_sock_ops;
1885 	sk = smc_sock_alloc(net, sock, protocol);
1886 	if (!sk)
1887 		goto out;
1888 
1889 	/* create internal TCP socket for CLC handshake and fallback */
1890 	smc = smc_sk(sk);
1891 	smc->use_fallback = false; /* assume rdma capability first */
1892 	rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
1893 			      &smc->clcsock);
1894 	if (rc) {
1895 		sk_common_release(sk);
1896 		goto out;
1897 	}
1898 	smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
1899 	smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
1900 
1901 out:
1902 	return rc;
1903 }
1904 
1905 static const struct net_proto_family smc_sock_family_ops = {
1906 	.family	= PF_SMC,
1907 	.owner	= THIS_MODULE,
1908 	.create	= smc_create,
1909 };
1910 
1911 static int __init smc_init(void)
1912 {
1913 	int rc;
1914 
1915 	rc = smc_pnet_init();
1916 	if (rc)
1917 		return rc;
1918 
1919 	rc = smc_llc_init();
1920 	if (rc) {
1921 		pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
1922 		goto out_pnet;
1923 	}
1924 
1925 	rc = smc_cdc_init();
1926 	if (rc) {
1927 		pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
1928 		goto out_pnet;
1929 	}
1930 
1931 	rc = proto_register(&smc_proto, 1);
1932 	if (rc) {
1933 		pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc);
1934 		goto out_pnet;
1935 	}
1936 
1937 	rc = proto_register(&smc_proto6, 1);
1938 	if (rc) {
1939 		pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc);
1940 		goto out_proto;
1941 	}
1942 
1943 	rc = sock_register(&smc_sock_family_ops);
1944 	if (rc) {
1945 		pr_err("%s: sock_register fails with %d\n", __func__, rc);
1946 		goto out_proto6;
1947 	}
1948 	INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
1949 	INIT_HLIST_HEAD(&smc_v6_hashinfo.ht);
1950 
1951 	rc = smc_ib_register_client();
1952 	if (rc) {
1953 		pr_err("%s: ib_register fails with %d\n", __func__, rc);
1954 		goto out_sock;
1955 	}
1956 
1957 	static_branch_enable(&tcp_have_smc);
1958 	return 0;
1959 
1960 out_sock:
1961 	sock_unregister(PF_SMC);
1962 out_proto6:
1963 	proto_unregister(&smc_proto6);
1964 out_proto:
1965 	proto_unregister(&smc_proto);
1966 out_pnet:
1967 	smc_pnet_exit();
1968 	return rc;
1969 }
1970 
/* module exit: after smc_core_exit(), undo the registrations done in
 * smc_init() in reverse order
 */
static void __exit smc_exit(void)
{
	smc_core_exit();
	static_branch_disable(&tcp_have_smc);
	smc_ib_unregister_client();
	sock_unregister(PF_SMC);
	proto_unregister(&smc_proto6);
	proto_unregister(&smc_proto);
	smc_pnet_exit();
}
1981 
/* module entry and exit points */
module_init(smc_init);
module_exit(smc_exit);

/* module metadata */
MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
MODULE_DESCRIPTION("smc socket address family");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_SMC);
1989