/*
 *  Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 *  AF_SMC protocol family socket handler keeping the AF_INET sock address type
 *  applies to SOCK_STREAM sockets only
 *  offers an alternative communication option for TCP-protocol sockets
 *  applicable with RoCE-cards only
 *
 *  Initial restrictions:
 *    - support for alternate links postponed
 *
 *  Copyright IBM Corp. 2016, 2018
 *
 *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 *              based on prototype from Frank Blaschka
 */

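/*
 * A minimal userspace sketch (not part of this module; address and port
 * are hypothetical). An AF_SMC socket is used exactly like a TCP socket;
 * the CLC handshake falls back to TCP transparently when the peer is not
 * SMC-capable. SMCPROTO_SMC selects the AF_INET flavour, SMCPROTO_SMC6
 * the AF_INET6 one:
 *
 *	int fd = socket(AF_SMC, SOCK_STREAM, SMCPROTO_SMC);
 *	struct sockaddr_in sa = {
 *		.sin_family = AF_INET,
 *		.sin_port = htons(12345),
 *		.sin_addr.s_addr = htonl(INADDR_LOOPBACK),
 *	};
 *
 *	if (fd >= 0 && !connect(fd, (struct sockaddr *)&sa, sizeof(sa)))
 *		write(fd, "hello", 5);
 */
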
#define KMSG_COMPONENT "smc"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/module.h>
#include <linux/socket.h>
#include <linux/workqueue.h>
#include <linux/in.h>
#include <linux/sched/signal.h>

#include <net/sock.h>
#include <net/tcp.h>
#include <net/smc.h>
#include <asm/ioctls.h>

#include "smc.h"
#include "smc_clc.h"
#include "smc_llc.h"
#include "smc_cdc.h"
#include "smc_core.h"
#include "smc_ib.h"
#include "smc_pnet.h"
#include "smc_tx.h"
#include "smc_rx.h"
#include "smc_close.h"

static DEFINE_MUTEX(smc_create_lgr_pending);	/* serialize link group
						 * creation
						 */

static void smc_tcp_listen_work(struct work_struct *);
static void smc_connect_work(struct work_struct *);

static void smc_set_keepalive(struct sock *sk, int val)
{
	struct smc_sock *smc = smc_sk(sk);

	smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
}

static struct smc_hashinfo smc_v4_hashinfo = {
	.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
};

static struct smc_hashinfo smc_v6_hashinfo = {
	.lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
};

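/* add an SMC socket to the global list of SMC sockets for its proto
 * (smc_v4_hashinfo or smc_v6_hashinfo), guarded by the hashinfo lock
 */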
int smc_hash_sk(struct sock *sk)
{
	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
	struct hlist_head *head;

	head = &h->ht;

	write_lock_bh(&h->lock);
	sk_add_node(sk, head);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	write_unlock_bh(&h->lock);

	return 0;
}
EXPORT_SYMBOL_GPL(smc_hash_sk);

void smc_unhash_sk(struct sock *sk)
{
	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;

	write_lock_bh(&h->lock);
	if (sk_del_node_init(sk))
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	write_unlock_bh(&h->lock);
}
EXPORT_SYMBOL_GPL(smc_unhash_sk);

struct proto smc_proto = {
	.name		= "SMC",
	.owner		= THIS_MODULE,
	.keepalive	= smc_set_keepalive,
	.hash		= smc_hash_sk,
	.unhash		= smc_unhash_sk,
	.obj_size	= sizeof(struct smc_sock),
	.h.smc_hash	= &smc_v4_hashinfo,
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto);

struct proto smc_proto6 = {
	.name		= "SMC6",
	.owner		= THIS_MODULE,
	.keepalive	= smc_set_keepalive,
	.hash		= smc_hash_sk,
	.unhash		= smc_unhash_sk,
	.obj_size	= sizeof(struct smc_sock),
	.h.smc_hash	= &smc_v6_hashinfo,
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto6);

static int smc_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = 0;

	if (!sk)
		goto out;

	smc = smc_sk(sk);

	/* cleanup for a dangling non-blocking connect */
	flush_work(&smc->connect_work);
	kfree(smc->connect_info);
	smc->connect_info = NULL;

	if (sk->sk_state == SMC_LISTEN)
		/* smc_close_non_accepted() is called and acquires
		 * sock lock for child sockets again
		 */
		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
	else
		lock_sock(sk);

	if (!smc->use_fallback) {
		rc = smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	if (smc->clcsock) {
		sock_release(smc->clcsock);
		smc->clcsock = NULL;
	}
	if (smc->use_fallback) {
		sock_put(sk); /* passive closing */
		sk->sk_state = SMC_CLOSED;
		sk->sk_state_change(sk);
	}

	/* detach socket */
	sock_orphan(sk);
	sock->sk = NULL;
	if (!smc->use_fallback && sk->sk_state == SMC_CLOSED)
		smc_conn_free(&smc->conn);
	release_sock(sk);

	sk->sk_prot->unhash(sk);
	sock_put(sk); /* final sock_put */
out:
	return rc;
}

static void smc_destruct(struct sock *sk)
{
	if (sk->sk_state != SMC_CLOSED)
		return;
	if (!sock_flag(sk, SOCK_DEAD))
		return;

	sk_refcnt_debug_dec(sk);
}

static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
				   int protocol)
{
	struct smc_sock *smc;
	struct proto *prot;
	struct sock *sk;

	prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto;
	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
	if (!sk)
		return NULL;

	sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
	sk->sk_state = SMC_INIT;
	sk->sk_destruct = smc_destruct;
	sk->sk_protocol = protocol;
	smc = smc_sk(sk);
	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
	INIT_WORK(&smc->connect_work, smc_connect_work);
	INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
	INIT_LIST_HEAD(&smc->accept_q);
	spin_lock_init(&smc->accept_q_lock);
	spin_lock_init(&smc->conn.send_lock);
	sk->sk_prot->hash(sk);
	sk_refcnt_debug_inc(sk);

	return sk;
}

static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
		    int addr_len)
{
	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);

	/* replicate tests from inet_bind(), to be safe wrt. future changes */
	rc = -EINVAL;
	if (addr_len < sizeof(struct sockaddr_in))
		goto out;

	rc = -EAFNOSUPPORT;
	if (addr->sin_family != AF_INET &&
	    addr->sin_family != AF_INET6 &&
	    addr->sin_family != AF_UNSPEC)
		goto out;
	/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
	if (addr->sin_family == AF_UNSPEC &&
	    addr->sin_addr.s_addr != htonl(INADDR_ANY))
		goto out;

	lock_sock(sk);

	/* Check if socket is already active */
	rc = -EINVAL;
	if (sk->sk_state != SMC_INIT)
		goto out_rel;

	smc->clcsock->sk->sk_reuse = sk->sk_reuse;
	rc = kernel_bind(smc->clcsock, uaddr, addr_len);

out_rel:
	release_sock(sk);
out:
	return rc;
}

static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
				   unsigned long mask)
{
	/* options we don't get control of via setsockopt */
	nsk->sk_type = osk->sk_type;
	nsk->sk_sndbuf = osk->sk_sndbuf;
	nsk->sk_rcvbuf = osk->sk_rcvbuf;
	nsk->sk_sndtimeo = osk->sk_sndtimeo;
	nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
	nsk->sk_mark = osk->sk_mark;
	nsk->sk_priority = osk->sk_priority;
	nsk->sk_rcvlowat = osk->sk_rcvlowat;
	nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
	nsk->sk_err = osk->sk_err;

	nsk->sk_flags &= ~mask;
	nsk->sk_flags |= osk->sk_flags & mask;
}

#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_BROADCAST) | \
			     (1UL << SOCK_TIMESTAMP) | \
			     (1UL << SOCK_DBG) | \
			     (1UL << SOCK_RCVTSTAMP) | \
			     (1UL << SOCK_RCVTSTAMPNS) | \
			     (1UL << SOCK_LOCALROUTE) | \
			     (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
			     (1UL << SOCK_RXQ_OVFL) | \
			     (1UL << SOCK_WIFI_STATUS) | \
			     (1UL << SOCK_NOFCS) | \
			     (1UL << SOCK_FILTER_LOCKED))
/* copy only relevant settings and flags of SOL_SOCKET level from smc to
 * clc socket (since smc is not called for these options from net/core)
 */
static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
{
	smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
}

#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_DBG))
/* copy only settings and flags relevant for smc from clc to smc socket */
static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
{
	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
}

/* register a new rmb, optionally send confirm_rkey msg to register with peer */
static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc,
		       bool conf_rkey)
{
	/* register memory region for new rmb */
	if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) {
		rmb_desc->regerr = 1;
		return -EFAULT;
	}
	if (!conf_rkey)
		return 0;
	/* exchange confirm_rkey msg with peer */
	if (smc_llc_do_confirm_rkey(link, rmb_desc)) {
		rmb_desc->regerr = 1;
		return -EFAULT;
	}
	return 0;
}

static int smc_clnt_conf_first_link(struct smc_sock *smc)
{
	struct net *net = sock_net(smc->clcsock->sk);
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];
	/* receive CONFIRM LINK request from server over RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	if (link->llc_confirm_rc)
		return SMC_CLC_DECL_RMBE_EC;

	rc = smc_ib_modify_qp_rts(link);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	smc_wr_remember_qp_attr(link);

	if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
		return SMC_CLC_DECL_INTERR;

	/* send CONFIRM LINK response over RoCE fabric */
	rc = smc_llc_send_confirm_link(link,
				       link->smcibdev->mac[link->ibport - 1],
				       &link->smcibdev->gid[link->ibport - 1],
				       SMC_LLC_RESP);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	/* receive ADD LINK request from server over RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(&link->llc_add,
							 SMC_LLC_WAIT_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	/* send add link reject message, only one link supported for now */
	rc = smc_llc_send_add_link(link,
				   link->smcibdev->mac[link->ibport - 1],
				   &link->smcibdev->gid[link->ibport - 1],
				   SMC_LLC_RESP);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);

	return 0;
}

static void smc_conn_save_peer_info(struct smc_sock *smc,
				    struct smc_clc_msg_accept_confirm *clc)
{
	int bufsize = smc_uncompress_bufsize(clc->rmbe_size);

	smc->conn.peer_rmbe_idx = clc->rmbe_idx;
	smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
	smc->conn.peer_rmbe_size = bufsize;
	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
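	/* peer_rmbe_idx is 1-based; precompute the RDMA write offset of
	 * this connection's RMBE within the peer's RMB
	 */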
	smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
}

static void smc_link_save_peer_info(struct smc_link *link,
				    struct smc_clc_msg_accept_confirm *clc)
{
	link->peer_qpn = ntoh24(clc->qpn);
	memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
	memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
	link->peer_psn = ntoh24(clc->psn);
	link->peer_mtu = clc->qp_mtu;
}

/* fall back during connect */
static int smc_connect_fallback(struct smc_sock *smc)
{
	smc->use_fallback = true;
	smc_copy_sock_settings_to_clc(smc);
	if (smc->sk.sk_state == SMC_INIT)
		smc->sk.sk_state = SMC_ACTIVE;
	return 0;
}

/* decline and fall back during connect */
static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
{
	int rc;

	if (reason_code < 0) /* error, fallback is not possible */
		return reason_code;
	if (reason_code != SMC_CLC_DECL_REPLY) {
		rc = smc_clc_send_decline(smc, reason_code);
		if (rc < 0)
			return rc;
	}
	return smc_connect_fallback(smc);
}

/* abort connecting */
static int smc_connect_abort(struct smc_sock *smc, int reason_code,
			     int local_contact)
{
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
	smc_conn_free(&smc->conn);
	if (reason_code < 0 && smc->sk.sk_state == SMC_INIT)
		sock_put(&smc->sk); /* passive closing */
	return reason_code;
}

/* check if there is an RDMA device available for this connection. */
/* called for connect and listen */
static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev,
			  u8 *ibport)
{
	int reason_code = 0;

	/* PNET table look up: search active ib_device and port
	 * within same PNETID that also contains the ethernet device
	 * used for the internal TCP socket
	 */
	smc_pnet_find_roce_resource(smc->clcsock->sk, ibdev, ibport);
	if (!(*ibdev))
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */

	return reason_code;
}

/* CLC handshake during connect */
static int smc_connect_clc(struct smc_sock *smc,
			   struct smc_clc_msg_accept_confirm *aclc,
			   struct smc_ib_device *ibdev, u8 ibport)
{
	int rc = 0;

	/* do inband token exchange */
	rc = smc_clc_send_proposal(smc, ibdev, ibport);
	if (rc)
		return rc;
	/* receive SMC Accept CLC message */
	return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT);
}

/* setup for RDMA connection of client */
static int smc_connect_rdma(struct smc_sock *smc,
			    struct smc_clc_msg_accept_confirm *aclc,
			    struct smc_ib_device *ibdev, u8 ibport)
{
	int local_contact = SMC_FIRST_CONTACT;
	struct smc_link *link;
	int reason_code = 0;

	mutex_lock(&smc_create_lgr_pending);
	local_contact = smc_conn_create(smc, ibdev, ibport, &aclc->lcl,
					aclc->hdr.flag);
	if (local_contact < 0) {
		if (local_contact == -ENOMEM)
			reason_code = SMC_CLC_DECL_MEM; /* insufficient memory */
		else if (local_contact == -ENOLINK)
			reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
		else
			reason_code = SMC_CLC_DECL_INTERR; /* other error */
		return smc_connect_abort(smc, reason_code, 0);
	}
	link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	smc_conn_save_peer_info(smc, aclc);

	/* create send buffer and rmb */
	if (smc_buf_create(smc))
		return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact);

	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, aclc);

	if (smc_rmb_rtoken_handling(&smc->conn, aclc))
		return smc_connect_abort(smc, SMC_CLC_DECL_INTERR,
					 local_contact);

	smc_close_init(smc);
	smc_rx_init(smc);

	if (local_contact == SMC_FIRST_CONTACT) {
		if (smc_ib_ready_link(link))
			return smc_connect_abort(smc, SMC_CLC_DECL_INTERR,
						 local_contact);
	} else {
		if (!smc->conn.rmb_desc->reused &&
		    smc_reg_rmb(link, smc->conn.rmb_desc, true))
			return smc_connect_abort(smc, SMC_CLC_DECL_INTERR,
						 local_contact);
	}
	smc_rmb_sync_sg_for_device(&smc->conn);

	reason_code = smc_clc_send_confirm(smc);
	if (reason_code)
		return smc_connect_abort(smc, reason_code, local_contact);

	smc_tx_init(smc);

	if (local_contact == SMC_FIRST_CONTACT) {
		/* QP confirmation over RoCE fabric */
		reason_code = smc_clnt_conf_first_link(smc);
		if (reason_code)
			return smc_connect_abort(smc, reason_code,
						 local_contact);
	}
	mutex_unlock(&smc_create_lgr_pending);

	smc_copy_sock_settings_to_clc(smc);
	if (smc->sk.sk_state == SMC_INIT)
		smc->sk.sk_state = SMC_ACTIVE;

	return 0;
}

/* perform steps before actually connecting */
static int __smc_connect(struct smc_sock *smc)
{
	struct smc_clc_msg_accept_confirm aclc;
	struct smc_ib_device *ibdev;
	int rc = 0;
	u8 ibport;

	sock_hold(&smc->sk); /* sock put in passive closing */

	if (smc->use_fallback)
		return smc_connect_fallback(smc);

	/* if peer has not signalled SMC-capability, fall back */
	if (!tcp_sk(smc->clcsock->sk)->syn_smc)
		return smc_connect_fallback(smc);

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(smc))
		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC);

	/* check if an RDMA device is available; if not, fall back */
	if (smc_check_rdma(smc, &ibdev, &ibport))
		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR);

	/* perform CLC handshake */
	rc = smc_connect_clc(smc, &aclc, ibdev, ibport);
	if (rc)
		return smc_connect_decline_fallback(smc, rc);

	/* connect using rdma */
	rc = smc_connect_rdma(smc, &aclc, ibdev, ibport);
	if (rc)
		return smc_connect_decline_fallback(smc, rc);

	return 0;
}

static void smc_connect_work(struct work_struct *work)
{
	struct smc_sock *smc = container_of(work, struct smc_sock,
					    connect_work);
	int rc;

	lock_sock(&smc->sk);
	rc = kernel_connect(smc->clcsock, &smc->connect_info->addr,
			    smc->connect_info->alen, smc->connect_info->flags);
	if (smc->clcsock->sk->sk_err) {
		smc->sk.sk_err = smc->clcsock->sk->sk_err;
		goto out;
	}
	if (rc < 0) {
		smc->sk.sk_err = -rc;
		goto out;
	}

	rc = __smc_connect(smc);
	if (rc < 0)
		smc->sk.sk_err = -rc;

out:
	smc->sk.sk_state_change(&smc->sk);
	kfree(smc->connect_info);
	smc->connect_info = NULL;
	release_sock(&smc->sk);
}

static int smc_connect(struct socket *sock, struct sockaddr *addr,
		       int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EINVAL;

	smc = smc_sk(sk);

	/* separate smc parameter checking to be safe */
	if (alen < sizeof(addr->sa_family))
		goto out_err;
	if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
		goto out_err;

	lock_sock(sk);
	switch (sk->sk_state) {
	default:
		goto out;
	case SMC_ACTIVE:
		rc = -EISCONN;
		goto out;
	case SMC_INIT:
		rc = 0;
		break;
	}

	smc_copy_sock_settings_to_clc(smc);
	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
	if (flags & O_NONBLOCK) {
		if (smc->connect_info) {
			rc = -EALREADY;
			goto out;
		}
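		/* connect_info keeps flags and alen in front of the copied
		 * address, hence the extra 2 * sizeof(int) bytes
		 */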
		smc->connect_info = kzalloc(alen + 2 * sizeof(int), GFP_KERNEL);
		if (!smc->connect_info) {
			rc = -ENOMEM;
			goto out;
		}
		smc->connect_info->alen = alen;
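		/* the worker connects in blocking mode; clear O_NONBLOCK */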
		smc->connect_info->flags = flags ^ O_NONBLOCK;
		memcpy(&smc->connect_info->addr, addr, alen);
		schedule_work(&smc->connect_work);
		rc = -EINPROGRESS;
	} else {
		rc = kernel_connect(smc->clcsock, addr, alen, flags);
		if (rc)
			goto out;

		rc = __smc_connect(smc);
		if (rc < 0)
			goto out;
		else
			rc = 0; /* success cases including fallback */
	}

out:
	release_sock(sk);
out_err:
	return rc;
}

static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
{
	struct socket *new_clcsock = NULL;
	struct sock *lsk = &lsmc->sk;
	struct sock *new_sk;
	int rc;

	release_sock(lsk);
	new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
	if (!new_sk) {
		rc = -ENOMEM;
		lsk->sk_err = ENOMEM;
		*new_smc = NULL;
		lock_sock(lsk);
		goto out;
	}
	*new_smc = smc_sk(new_sk);

	rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
	lock_sock(lsk);
	if (rc < 0)
		lsk->sk_err = -rc;
	if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
		if (new_clcsock)
			sock_release(new_clcsock);
		new_sk->sk_state = SMC_CLOSED;
		sock_set_flag(new_sk, SOCK_DEAD);
		new_sk->sk_prot->unhash(new_sk);
		sock_put(new_sk); /* final */
		*new_smc = NULL;
		goto out;
	}

	(*new_smc)->clcsock = new_clcsock;
out:
	return rc;
}

/* add a just created sock to the accept queue of the listen sock as
 * candidate for a following socket accept call from user space
 */
static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
{
	struct smc_sock *par = smc_sk(parent);

	sock_hold(sk); /* sock_put in smc_accept_unlink() */
	spin_lock(&par->accept_q_lock);
	list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
	spin_unlock(&par->accept_q_lock);
	sk_acceptq_added(parent);
}

/* remove a socket from the accept queue of its parental listening socket */
static void smc_accept_unlink(struct sock *sk)
{
	struct smc_sock *par = smc_sk(sk)->listen_smc;

	spin_lock(&par->accept_q_lock);
	list_del_init(&smc_sk(sk)->accept_q);
	spin_unlock(&par->accept_q_lock);
	sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
	sock_put(sk); /* sock_hold in smc_accept_enqueue */
}

/* remove a sock from the accept queue to bind it to a new socket created
 * for a socket accept call from user space
 */
struct sock *smc_accept_dequeue(struct sock *parent,
				struct socket *new_sock)
{
	struct smc_sock *isk, *n;
	struct sock *new_sk;

	list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
		new_sk = (struct sock *)isk;

		smc_accept_unlink(new_sk);
		if (new_sk->sk_state == SMC_CLOSED) {
			if (isk->clcsock) {
				sock_release(isk->clcsock);
				isk->clcsock = NULL;
			}
			new_sk->sk_prot->unhash(new_sk);
			sock_put(new_sk); /* final */
			continue;
		}
		if (new_sock)
			sock_graft(new_sk, new_sock);
		return new_sk;
	}
	return NULL;
}

/* clean up for a created but never accepted sock */
void smc_close_non_accepted(struct sock *sk)
{
	struct smc_sock *smc = smc_sk(sk);

	lock_sock(sk);
	if (!sk->sk_lingertime)
		/* wait for peer closing */
		sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
	if (!smc->use_fallback) {
		smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	if (smc->clcsock) {
		struct socket *tcp;

		tcp = smc->clcsock;
		smc->clcsock = NULL;
		sock_release(tcp);
	}
	if (smc->use_fallback) {
		sock_put(sk); /* passive closing */
		sk->sk_state = SMC_CLOSED;
	} else {
		if (sk->sk_state == SMC_CLOSED)
			smc_conn_free(&smc->conn);
	}
	release_sock(sk);
	sk->sk_prot->unhash(sk);
	sock_put(sk); /* final sock_put */
}

static int smc_serv_conf_first_link(struct smc_sock *smc)
{
	struct net *net = sock_net(smc->clcsock->sk);
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];

	if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
		return SMC_CLC_DECL_INTERR;

	/* send CONFIRM LINK request to client over the RoCE fabric */
	rc = smc_llc_send_confirm_link(link,
				       link->smcibdev->mac[link->ibport - 1],
				       &link->smcibdev->gid[link->ibport - 1],
				       SMC_LLC_REQ);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	/* receive CONFIRM LINK response from client over the RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm_resp,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	if (link->llc_confirm_resp_rc)
		return SMC_CLC_DECL_RMBE_EC;

	/* send ADD LINK request to client over the RoCE fabric */
	rc = smc_llc_send_add_link(link,
				   link->smcibdev->mac[link->ibport - 1],
				   &link->smcibdev->gid[link->ibport - 1],
				   SMC_LLC_REQ);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	/* receive ADD LINK response from client over the RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp,
							 SMC_LLC_WAIT_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);

	return 0;
}

/* listen worker: finish */
static void smc_listen_out(struct smc_sock *new_smc)
{
	struct smc_sock *lsmc = new_smc->listen_smc;
	struct sock *newsmcsk = &new_smc->sk;

	lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
	if (lsmc->sk.sk_state == SMC_LISTEN) {
		smc_accept_enqueue(&lsmc->sk, newsmcsk);
	} else { /* no longer listening */
		smc_close_non_accepted(newsmcsk);
	}
	release_sock(&lsmc->sk);

	/* Wake up accept */
	lsmc->sk.sk_data_ready(&lsmc->sk);
	sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
}

/* listen worker: finish in state connected */
static void smc_listen_out_connected(struct smc_sock *new_smc)
{
	struct sock *newsmcsk = &new_smc->sk;

	sk_refcnt_debug_inc(newsmcsk);
	if (newsmcsk->sk_state == SMC_INIT)
		newsmcsk->sk_state = SMC_ACTIVE;

	smc_listen_out(new_smc);
}

/* listen worker: finish in error state */
static void smc_listen_out_err(struct smc_sock *new_smc)
{
	struct sock *newsmcsk = &new_smc->sk;

	if (newsmcsk->sk_state == SMC_INIT)
		sock_put(&new_smc->sk); /* passive closing */
	newsmcsk->sk_state = SMC_CLOSED;
	smc_conn_free(&new_smc->conn);

	smc_listen_out(new_smc);
}

/* listen worker: decline and fall back if possible */
static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
			       int local_contact)
{
	/* RDMA setup failed, switch back to TCP */
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(new_smc->conn.lgr);
	if (reason_code < 0) { /* error, no fallback possible */
		smc_listen_out_err(new_smc);
		return;
	}
	smc_conn_free(&new_smc->conn);
	new_smc->use_fallback = true;
	if (reason_code && reason_code != SMC_CLC_DECL_REPLY) {
		if (smc_clc_send_decline(new_smc, reason_code) < 0) {
			smc_listen_out_err(new_smc);
			return;
		}
	}
	smc_listen_out_connected(new_smc);
}

/* listen worker: check prefixes */
static int smc_listen_rdma_check(struct smc_sock *new_smc,
				 struct smc_clc_msg_proposal *pclc)
{
	struct smc_clc_msg_proposal_prefix *pclc_prfx;
	struct socket *newclcsock = new_smc->clcsock;

	pclc_prfx = smc_clc_proposal_get_prefix(pclc);
	if (smc_clc_prfx_match(newclcsock, pclc_prfx))
		return SMC_CLC_DECL_CNFERR;

	return 0;
}

/* listen worker: initialize connection and buffers */
static int smc_listen_rdma_init(struct smc_sock *new_smc,
				struct smc_clc_msg_proposal *pclc,
				struct smc_ib_device *ibdev, u8 ibport,
				int *local_contact)
{
	/* allocate connection / link group */
	*local_contact = smc_conn_create(new_smc, ibdev, ibport, &pclc->lcl, 0);
	if (*local_contact < 0) {
		if (*local_contact == -ENOMEM)
			return SMC_CLC_DECL_MEM; /* insufficient memory */
		return SMC_CLC_DECL_INTERR; /* other error */
	}

	/* create send buffer and rmb */
	if (smc_buf_create(new_smc))
		return SMC_CLC_DECL_MEM;

	return 0;
}

/* listen worker: register buffers */
static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact)
{
	struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	if (local_contact != SMC_FIRST_CONTACT) {
		if (!new_smc->conn.rmb_desc->reused) {
			if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true))
				return SMC_CLC_DECL_INTERR;
		}
	}
	smc_rmb_sync_sg_for_device(&new_smc->conn);

	return 0;
}

/* listen worker: finish RDMA setup */
static void smc_listen_rdma_finish(struct smc_sock *new_smc,
				   struct smc_clc_msg_accept_confirm *cclc,
				   int local_contact)
{
	struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
	int reason_code = 0;

	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, cclc);

	if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) {
		reason_code = SMC_CLC_DECL_INTERR;
		goto decline;
	}

	if (local_contact == SMC_FIRST_CONTACT) {
		if (smc_ib_ready_link(link)) {
			reason_code = SMC_CLC_DECL_INTERR;
			goto decline;
		}
		/* QP confirmation over RoCE fabric */
		reason_code = smc_serv_conf_first_link(new_smc);
		if (reason_code)
			goto decline;
	}
	return;

decline:
	mutex_unlock(&smc_create_lgr_pending);
	smc_listen_decline(new_smc, reason_code, local_contact);
}

/* setup for RDMA connection of server */
static void smc_listen_work(struct work_struct *work)
{
	struct smc_sock *new_smc = container_of(work, struct smc_sock,
						smc_listen_work);
	struct socket *newclcsock = new_smc->clcsock;
	struct smc_clc_msg_accept_confirm cclc;
	struct smc_clc_msg_proposal *pclc;
	struct smc_ib_device *ibdev;
	u8 buf[SMC_CLC_MAX_LEN];
	int local_contact = 0;
	int reason_code = 0;
	int rc = 0;
	u8 ibport;

	if (new_smc->use_fallback) {
		smc_listen_out_connected(new_smc);
		return;
	}

	/* check if peer is smc capable */
	if (!tcp_sk(newclcsock->sk)->syn_smc) {
		new_smc->use_fallback = true;
		smc_listen_out_connected(new_smc);
		return;
	}

	/* do inband token exchange -
	 * wait for and receive SMC Proposal CLC message
	 */
	pclc = (struct smc_clc_msg_proposal *)&buf;
	reason_code = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN,
				       SMC_CLC_PROPOSAL);
	if (reason_code) {
		smc_listen_decline(new_smc, reason_code, 0);
		return;
	}

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(new_smc)) {
		smc_listen_decline(new_smc, SMC_CLC_DECL_IPSEC, 0);
		return;
	}

	mutex_lock(&smc_create_lgr_pending);
	smc_close_init(new_smc);
	smc_rx_init(new_smc);
	smc_tx_init(new_smc);

	/* check if RDMA is available */
	if (smc_check_rdma(new_smc, &ibdev, &ibport) ||
	    smc_listen_rdma_check(new_smc, pclc) ||
	    smc_listen_rdma_init(new_smc, pclc, ibdev, ibport,
				 &local_contact) ||
	    smc_listen_rdma_reg(new_smc, local_contact)) {
		/* SMC not supported, decline */
		mutex_unlock(&smc_create_lgr_pending);
		smc_listen_decline(new_smc, SMC_CLC_DECL_CNFERR, local_contact);
		return;
	}

	/* send SMC Accept CLC message */
	rc = smc_clc_send_accept(new_smc, local_contact);
	if (rc) {
		mutex_unlock(&smc_create_lgr_pending);
		smc_listen_decline(new_smc, rc, local_contact);
		return;
	}

	/* receive SMC Confirm CLC message */
	reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
				       SMC_CLC_CONFIRM);
	if (reason_code) {
		mutex_unlock(&smc_create_lgr_pending);
		smc_listen_decline(new_smc, reason_code, local_contact);
		return;
	}

	/* finish worker */
	smc_listen_rdma_finish(new_smc, &cclc, local_contact);
	smc_conn_save_peer_info(new_smc, &cclc);
	mutex_unlock(&smc_create_lgr_pending);
	smc_listen_out_connected(new_smc);
}

static void smc_tcp_listen_work(struct work_struct *work)
{
	struct smc_sock *lsmc = container_of(work, struct smc_sock,
					     tcp_listen_work);
	struct sock *lsk = &lsmc->sk;
	struct smc_sock *new_smc;
	int rc = 0;

	lock_sock(lsk);
	while (lsk->sk_state == SMC_LISTEN) {
		rc = smc_clcsock_accept(lsmc, &new_smc);
		if (rc)
			goto out;
		if (!new_smc)
			continue;

		new_smc->listen_smc = lsmc;
		new_smc->use_fallback = lsmc->use_fallback;
		sock_hold(lsk); /* sock_put in smc_listen_work */
		INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
		smc_copy_sock_settings_to_smc(new_smc);
		sock_hold(&new_smc->sk); /* sock_put in passive closing */
		if (!schedule_work(&new_smc->smc_listen_work))
			sock_put(&new_smc->sk);
	}

out:
	release_sock(lsk);
	sock_put(&lsmc->sk); /* sock_hold in smc_listen */
}

static int smc_listen(struct socket *sock, int backlog)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);
	lock_sock(sk);

	rc = -EINVAL;
	if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
		goto out;

	rc = 0;
	if (sk->sk_state == SMC_LISTEN) {
		sk->sk_max_ack_backlog = backlog;
		goto out;
	}
	/* some socket options are handled in core, so we cannot apply
	 * them to the clc socket -- copy smc socket options to clc socket
	 */
	smc_copy_sock_settings_to_clc(smc);
	if (!smc->use_fallback)
		tcp_sk(smc->clcsock->sk)->syn_smc = 1;

	rc = kernel_listen(smc->clcsock, backlog);
	if (rc)
		goto out;
	sk->sk_max_ack_backlog = backlog;
	sk->sk_ack_backlog = 0;
	sk->sk_state = SMC_LISTEN;
	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
	sock_hold(sk); /* sock_put in smc_tcp_listen_work() */
	if (!schedule_work(&smc->tcp_listen_work))
		sock_put(sk);

out:
	release_sock(sk);
	return rc;
}

static int smc_accept(struct socket *sock, struct socket *new_sock,
		      int flags, bool kern)
{
	struct sock *sk = sock->sk, *nsk;
	DECLARE_WAITQUEUE(wait, current);
	struct smc_sock *lsmc;
	long timeo;
	int rc = 0;

	lsmc = smc_sk(sk);
	sock_hold(sk); /* sock_put below */
	lock_sock(sk);

	if (lsmc->sk.sk_state != SMC_LISTEN) {
		rc = -EINVAL;
		release_sock(sk);
		goto out;
	}

	/* Wait for an incoming connection */
	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
	add_wait_queue_exclusive(sk_sleep(sk), &wait);
	while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (!timeo) {
			rc = -EAGAIN;
			break;
		}
		release_sock(sk);
		timeo = schedule_timeout(timeo);
		/* wakeup by sk_data_ready in smc_listen_work() */
		sched_annotate_sleep();
		lock_sock(sk);
		if (signal_pending(current)) {
			rc = sock_intr_errno(timeo);
			break;
		}
	}
	set_current_state(TASK_RUNNING);
	remove_wait_queue(sk_sleep(sk), &wait);

	if (!rc)
		rc = sock_error(nsk);
	release_sock(sk);
	if (rc)
		goto out;

	if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) {
		/* wait till data arrives on the socket */
		timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept *
								MSEC_PER_SEC);
		if (smc_sk(nsk)->use_fallback) {
			struct sock *clcsk = smc_sk(nsk)->clcsock->sk;

			lock_sock(clcsk);
			if (skb_queue_empty(&clcsk->sk_receive_queue))
				sk_wait_data(clcsk, &timeo, NULL);
			release_sock(clcsk);
		} else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) {
			lock_sock(nsk);
			smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available);
			release_sock(nsk);
		}
	}

out:
	sock_put(sk); /* sock_hold above */
	return rc;
}

static int smc_getname(struct socket *sock, struct sockaddr *addr,
		       int peer)
{
	struct smc_sock *smc;

	if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
	    (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
		return -ENOTCONN;

	smc = smc_sk(sock->sk);

	return smc->clcsock->ops->getname(smc->clcsock, addr, peer);
}

static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EPIPE;

	smc = smc_sk(sk);
	lock_sock(sk);
	if ((sk->sk_state != SMC_ACTIVE) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
	    (sk->sk_state != SMC_INIT))
		goto out;

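	/* MSG_FASTOPEN implies data on the SYN; SMC cannot do that, so fall
	 * back to TCP while the socket is still in SMC_INIT
	 */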
	if (msg->msg_flags & MSG_FASTOPEN) {
		if (sk->sk_state == SMC_INIT) {
			smc->use_fallback = true;
		} else {
			rc = -EINVAL;
			goto out;
		}
	}

	if (smc->use_fallback)
		rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
	else
		rc = smc_tx_sendmsg(smc, msg, len);
out:
	release_sock(sk);
	return rc;
}

static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
		       int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -ENOTCONN;

	smc = smc_sk(sk);
	lock_sock(sk);
	if ((sk->sk_state == SMC_INIT) ||
	    (sk->sk_state == SMC_LISTEN) ||
	    (sk->sk_state == SMC_CLOSED))
		goto out;

	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
		rc = 0;
		goto out;
	}

	if (smc->use_fallback) {
		rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
	} else {
		msg->msg_namelen = 0;
		rc = smc_rx_recvmsg(smc, msg, NULL, len, flags);
	}

out:
	release_sock(sk);
	return rc;
}

static __poll_t smc_accept_poll(struct sock *parent)
{
	struct smc_sock *isk = smc_sk(parent);
	__poll_t mask = 0;

	spin_lock(&isk->accept_q_lock);
	if (!list_empty(&isk->accept_q))
		mask = EPOLLIN | EPOLLRDNORM;
	spin_unlock(&isk->accept_q_lock);

	return mask;
}

static __poll_t smc_poll(struct file *file, struct socket *sock,
			     poll_table *wait)
{
	struct sock *sk = sock->sk;
	__poll_t mask = 0;
	struct smc_sock *smc;

	if (!sk)
		return EPOLLNVAL;

	smc = smc_sk(sock->sk);
	if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
		/* delegate to CLC child sock */
		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
		sk->sk_err = smc->clcsock->sk->sk_err;
		if (sk->sk_err)
			mask |= EPOLLERR;
	} else {
		if (sk->sk_state != SMC_CLOSED)
			sock_poll_wait(file, sk_sleep(sk), wait);
		if (sk->sk_err)
			mask |= EPOLLERR;
		if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
		    (sk->sk_state == SMC_CLOSED))
			mask |= EPOLLHUP;
		if (sk->sk_state == SMC_LISTEN) {
			/* woken up by sk_data_ready in smc_listen_work() */
			mask = smc_accept_poll(sk);
		} else {
			if (atomic_read(&smc->conn.sndbuf_space) ||
			    sk->sk_shutdown & SEND_SHUTDOWN) {
				mask |= EPOLLOUT | EPOLLWRNORM;
			} else {
				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
			}
			if (atomic_read(&smc->conn.bytes_to_rcv))
				mask |= EPOLLIN | EPOLLRDNORM;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
			if (sk->sk_state == SMC_APPCLOSEWAIT1)
				mask |= EPOLLIN;
		}
		if (smc->conn.urg_state == SMC_URG_VALID)
			mask |= EPOLLPRI;
	}

	return mask;
}

static int smc_shutdown(struct socket *sock, int how)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EINVAL;
	int rc1 = 0;

	smc = smc_sk(sk);

	if ((how < SHUT_RD) || (how > SHUT_RDWR))
		return rc;

	lock_sock(sk);

	rc = -ENOTCONN;
	if ((sk->sk_state != SMC_LISTEN) &&
	    (sk->sk_state != SMC_ACTIVE) &&
	    (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
	    (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT2) &&
	    (sk->sk_state != SMC_APPFINCLOSEWAIT))
		goto out;
	if (smc->use_fallback) {
		rc = kernel_sock_shutdown(smc->clcsock, how);
		sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
		if (sk->sk_shutdown == SHUTDOWN_MASK)
			sk->sk_state = SMC_CLOSED;
		goto out;
	}
	switch (how) {
	case SHUT_RDWR:		/* shutdown in both directions */
		rc = smc_close_active(smc);
		break;
	case SHUT_WR:
		rc = smc_close_shutdown_write(smc);
		break;
	case SHUT_RD:
		rc = 0;
		/* nothing more to do because peer is not involved */
		break;
	}
	if (smc->clcsock)
		rc1 = kernel_sock_shutdown(smc->clcsock, how);
	/* map sock_shutdown_cmd constants to sk_shutdown value range */
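	/* SHUT_RD (0) -> RCV_SHUTDOWN (1), SHUT_WR (1) -> SEND_SHUTDOWN (2),
	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
	 */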
	sk->sk_shutdown |= how + 1;

out:
	release_sock(sk);
	return rc ? rc : rc1;
}

static int smc_setsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int val, rc;

	smc = smc_sk(sk);

	/* generic setsockopts reaching us here always apply to the
	 * CLC socket
	 */
	rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
					   optval, optlen);
	if (smc->clcsock->sk->sk_err) {
		sk->sk_err = smc->clcsock->sk->sk_err;
		sk->sk_error_report(sk);
	}
	if (rc)
		return rc;

	if (optlen < sizeof(int))
		return -EINVAL;
	get_user(val, (int __user *)optval);

	lock_sock(sk);
	switch (optname) {
	case TCP_ULP:
	case TCP_FASTOPEN:
	case TCP_FASTOPEN_CONNECT:
	case TCP_FASTOPEN_KEY:
	case TCP_FASTOPEN_NO_COOKIE:
		/* option not supported by SMC */
		if (sk->sk_state == SMC_INIT) {
			smc->use_fallback = true;
		} else {
			if (!smc->use_fallback)
				rc = -EINVAL;
		}
		break;
	case TCP_NODELAY:
		if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
			if (val && !smc->use_fallback)
				mod_delayed_work(system_wq, &smc->conn.tx_work,
						 0);
		}
		break;
	case TCP_CORK:
		if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
			if (!val && !smc->use_fallback)
				mod_delayed_work(system_wq, &smc->conn.tx_work,
						 0);
		}
		break;
	case TCP_DEFER_ACCEPT:
		smc->sockopt_defer_accept = val;
		break;
	default:
		break;
	}
	release_sock(sk);

	return rc;
}

static int smc_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct smc_sock *smc;

	smc = smc_sk(sock->sk);
	/* socket options apply to the CLC socket */
	return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
					     optval, optlen);
}

static int smc_ioctl(struct socket *sock, unsigned int cmd,
		     unsigned long arg)
{
	union smc_host_cursor cons, urg;
	struct smc_connection *conn;
	struct smc_sock *smc;
	int answ;

	smc = smc_sk(sock->sk);
	conn = &smc->conn;
	if (smc->use_fallback) {
		if (!smc->clcsock)
			return -EBADF;
		return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
	}
	switch (cmd) {
	case SIOCINQ: /* same as FIONREAD */
		if (smc->sk.sk_state == SMC_LISTEN)
			return -EINVAL;
		if (smc->sk.sk_state == SMC_INIT ||
		    smc->sk.sk_state == SMC_CLOSED)
			answ = 0;
		else
			answ = atomic_read(&smc->conn.bytes_to_rcv);
		break;
	case SIOCOUTQ:
		/* output queue size (not sent + not acked) */
		if (smc->sk.sk_state == SMC_LISTEN)
			return -EINVAL;
		if (smc->sk.sk_state == SMC_INIT ||
		    smc->sk.sk_state == SMC_CLOSED)
			answ = 0;
		else
			answ = smc->conn.sndbuf_desc->len -
					atomic_read(&smc->conn.sndbuf_space);
		break;
	case SIOCOUTQNSD:
		/* output queue size (not sent only) */
		if (smc->sk.sk_state == SMC_LISTEN)
			return -EINVAL;
		if (smc->sk.sk_state == SMC_INIT ||
		    smc->sk.sk_state == SMC_CLOSED)
			answ = 0;
		else
			answ = smc_tx_prepared_sends(&smc->conn);
		break;
	case SIOCATMARK:
		if (smc->sk.sk_state == SMC_LISTEN)
			return -EINVAL;
		if (smc->sk.sk_state == SMC_INIT ||
		    smc->sk.sk_state == SMC_CLOSED) {
			answ = 0;
		} else {
			smc_curs_write(&cons,
			       smc_curs_read(&conn->local_tx_ctrl.cons, conn),
				       conn);
			smc_curs_write(&urg,
				       smc_curs_read(&conn->urg_curs, conn),
				       conn);
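			/* at the mark when the consumer cursor is exactly
			 * one byte behind the urgent data cursor
			 */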
			answ = smc_curs_diff(conn->rmb_desc->len,
					     &cons, &urg) == 1;
		}
		break;
	default:
		return -ENOIOCTLCMD;
	}

	return put_user(answ, (int __user *)arg);
}

static ssize_t smc_sendpage(struct socket *sock, struct page *page,
			    int offset, size_t size, int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EPIPE;

	smc = smc_sk(sk);
	lock_sock(sk);
	if (sk->sk_state != SMC_ACTIVE) {
		release_sock(sk);
		goto out;
	}
	release_sock(sk);
	if (smc->use_fallback)
		rc = kernel_sendpage(smc->clcsock, page, offset,
				     size, flags);
	else
		rc = sock_no_sendpage(sock, page, offset, size, flags);

out:
	return rc;
}

/* Map the affected portions of the rmbe into an spd, note the number of bytes
 * to splice in conn->splice_pending, and press 'go'. Delays consumer cursor
 * updates till whenever a respective page has been fully processed.
 * Note that subsequent recv() calls have to wait till all splice() processing
 * has completed.
 */
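/*
 * Hypothetical userspace sketch (smc_fd and pipefd[] assumed to be set up
 * elsewhere): received bytes are moved into a pipe without an extra copy:
 *
 *	splice(smc_fd, NULL, pipefd[1], NULL, 4096, SPLICE_F_NONBLOCK);
 */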
static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
			       struct pipe_inode_info *pipe, size_t len,
			       unsigned int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -ENOTCONN;

	smc = smc_sk(sk);
	lock_sock(sk);

	if (sk->sk_state == SMC_INIT ||
	    sk->sk_state == SMC_LISTEN ||
	    sk->sk_state == SMC_CLOSED)
		goto out;

	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
		rc = 0;
		goto out;
	}

	if (smc->use_fallback) {
		rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
						    pipe, len, flags);
	} else {
		if (*ppos) {
			rc = -ESPIPE;
			goto out;
		}
		if (flags & SPLICE_F_NONBLOCK)
			flags = MSG_DONTWAIT;
		else
			flags = 0;
		rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags);
	}
out:
	release_sock(sk);

	return rc;
}

/* must look like tcp */
static const struct proto_ops smc_sock_ops = {
	.family		= PF_SMC,
	.owner		= THIS_MODULE,
	.release	= smc_release,
	.bind		= smc_bind,
	.connect	= smc_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= smc_accept,
	.getname	= smc_getname,
	.poll		= smc_poll,
	.ioctl		= smc_ioctl,
	.listen		= smc_listen,
	.shutdown	= smc_shutdown,
	.setsockopt	= smc_setsockopt,
	.getsockopt	= smc_getsockopt,
	.sendmsg	= smc_sendmsg,
	.recvmsg	= smc_recvmsg,
	.mmap		= sock_no_mmap,
	.sendpage	= smc_sendpage,
	.splice_read	= smc_splice_read,
};

static int smc_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET;
	struct smc_sock *smc;
	struct sock *sk;
	int rc;

	rc = -ESOCKTNOSUPPORT;
	if (sock->type != SOCK_STREAM)
		goto out;

	rc = -EPROTONOSUPPORT;
	if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6)
		goto out;

	rc = -ENOBUFS;
	sock->ops = &smc_sock_ops;
	sk = smc_sock_alloc(net, sock, protocol);
	if (!sk)
		goto out;

	/* create internal TCP socket for CLC handshake and fallback */
	smc = smc_sk(sk);
	smc->use_fallback = false; /* assume rdma capability first */
	rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
			      &smc->clcsock);
	if (rc) {
		sk_common_release(sk);
		goto out;
	}
	smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
	smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);

out:
	return rc;
}

static const struct net_proto_family smc_sock_family_ops = {
	.family	= PF_SMC,
	.owner	= THIS_MODULE,
	.create	= smc_create,
};

static int __init smc_init(void)
{
	int rc;

	rc = smc_pnet_init();
	if (rc)
		return rc;

	rc = smc_llc_init();
	if (rc) {
		pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = smc_cdc_init();
	if (rc) {
		pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = proto_register(&smc_proto, 1);
	if (rc) {
		pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = proto_register(&smc_proto6, 1);
	if (rc) {
		pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc);
		goto out_proto;
	}

	rc = sock_register(&smc_sock_family_ops);
	if (rc) {
		pr_err("%s: sock_register fails with %d\n", __func__, rc);
		goto out_proto6;
	}
	INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
	INIT_HLIST_HEAD(&smc_v6_hashinfo.ht);

	rc = smc_ib_register_client();
	if (rc) {
		pr_err("%s: ib_register fails with %d\n", __func__, rc);
		goto out_sock;
	}

	static_branch_enable(&tcp_have_smc);
	return 0;

out_sock:
	sock_unregister(PF_SMC);
out_proto6:
	proto_unregister(&smc_proto6);
out_proto:
	proto_unregister(&smc_proto);
out_pnet:
	smc_pnet_exit();
	return rc;
}

static void __exit smc_exit(void)
{
	smc_core_exit();
	static_branch_disable(&tcp_have_smc);
	smc_ib_unregister_client();
	sock_unregister(PF_SMC);
	proto_unregister(&smc_proto6);
	proto_unregister(&smc_proto);
	smc_pnet_exit();
}

module_init(smc_init);
module_exit(smc_exit);

MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
MODULE_DESCRIPTION("smc socket address family");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_SMC);