/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * AF_SMC protocol family socket handler keeping the AF_INET sock address type
 * applies to SOCK_STREAM sockets only
 * offers an alternative communication option for TCP-protocol sockets
 * applicable with RoCE-cards only
 *
 * Initial restrictions:
 *   - support for alternate links postponed
 *
 * Copyright IBM Corp. 2016, 2018
 *
 * Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 *             based on prototype from Frank Blaschka
 */

#define KMSG_COMPONENT "smc"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/module.h>
#include <linux/socket.h>
#include <linux/workqueue.h>
#include <linux/in.h>
#include <linux/sched/signal.h>

#include <net/sock.h>
#include <net/tcp.h>
#include <net/smc.h>
#include <asm/ioctls.h>

#include "smc.h"
#include "smc_clc.h"
#include "smc_llc.h"
#include "smc_cdc.h"
#include "smc_core.h"
#include "smc_ib.h"
#include "smc_pnet.h"
#include "smc_tx.h"
#include "smc_rx.h"
#include "smc_close.h"

static DEFINE_MUTEX(smc_create_lgr_pending);	/* serialize link group
						 * creation
						 */

static void smc_tcp_listen_work(struct work_struct *);

static void smc_set_keepalive(struct sock *sk, int val)
{
	struct smc_sock *smc = smc_sk(sk);

	smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
}

static struct smc_hashinfo smc_v4_hashinfo = {
	.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
};

static struct smc_hashinfo smc_v6_hashinfo = {
	.lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
};

int smc_hash_sk(struct sock *sk)
{
	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
	struct hlist_head *head;

	head = &h->ht;

	write_lock_bh(&h->lock);
	sk_add_node(sk, head);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	write_unlock_bh(&h->lock);

	return 0;
}
EXPORT_SYMBOL_GPL(smc_hash_sk);

void smc_unhash_sk(struct sock *sk)
{
	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;

	write_lock_bh(&h->lock);
	if (sk_del_node_init(sk))
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	write_unlock_bh(&h->lock);
}
EXPORT_SYMBOL_GPL(smc_unhash_sk);

struct proto smc_proto = {
	.name		= "SMC",
	.owner		= THIS_MODULE,
	.keepalive	= smc_set_keepalive,
	.hash		= smc_hash_sk,
	.unhash		= smc_unhash_sk,
	.obj_size	= sizeof(struct smc_sock),
	.h.smc_hash	= &smc_v4_hashinfo,
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto);

struct proto smc_proto6 = {
	.name		= "SMC6",
	.owner		= THIS_MODULE,
	.keepalive	= smc_set_keepalive,
	.hash		= smc_hash_sk,
	.unhash		= smc_unhash_sk,
	.obj_size	= sizeof(struct smc_sock),
	.h.smc_hash	= &smc_v6_hashinfo,
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto6);
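
/* Release path in outline: unless the socket runs in TCP fallback mode,
 * smc_release() first tears down the SMC-specific state (active close,
 * freeing the connection), then drops the internal CLC/TCP socket. The
 * final sock_put() pairs with the initial reference set up by
 * sock_init_data() in smc_sock_alloc().
 */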

static int smc_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = 0;

	if (!sk)
		goto out;

	smc = smc_sk(sk);
	if (sk->sk_state == SMC_LISTEN)
		/* smc_close_non_accepted() is called and acquires
		 * sock lock for child sockets again
		 */
		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
	else
		lock_sock(sk);

	if (!smc->use_fallback) {
		rc = smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	if (smc->clcsock) {
		sock_release(smc->clcsock);
		smc->clcsock = NULL;
	}
	if (smc->use_fallback) {
		sock_put(sk); /* passive closing */
		sk->sk_state = SMC_CLOSED;
		sk->sk_state_change(sk);
	}

	/* detach socket */
	sock_orphan(sk);
	sock->sk = NULL;
	if (!smc->use_fallback && sk->sk_state == SMC_CLOSED)
		smc_conn_free(&smc->conn);
	release_sock(sk);

	sk->sk_prot->unhash(sk);
	sock_put(sk); /* final sock_put */
out:
	return rc;
}

static void smc_destruct(struct sock *sk)
{
	if (sk->sk_state != SMC_CLOSED)
		return;
	if (!sock_flag(sk, SOCK_DEAD))
		return;

	sk_refcnt_debug_dec(sk);
}

static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
				   int protocol)
{
	struct smc_sock *smc;
	struct proto *prot;
	struct sock *sk;

	prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto;
	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
	if (!sk)
		return NULL;

	sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
	sk->sk_state = SMC_INIT;
	sk->sk_destruct = smc_destruct;
	sk->sk_protocol = protocol;
	smc = smc_sk(sk);
	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
	INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
	INIT_LIST_HEAD(&smc->accept_q);
	spin_lock_init(&smc->accept_q_lock);
	spin_lock_init(&smc->conn.send_lock);
	sk->sk_prot->hash(sk);
	sk_refcnt_debug_inc(sk);

	return sk;
}

static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
		    int addr_len)
{
	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);

	/* replicate tests from inet_bind(), to be safe wrt. future changes */
	rc = -EINVAL;
	if (addr_len < sizeof(struct sockaddr_in))
		goto out;

	rc = -EAFNOSUPPORT;
	if (addr->sin_family != AF_INET &&
	    addr->sin_family != AF_INET6 &&
	    addr->sin_family != AF_UNSPEC)
		goto out;
	/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
	if (addr->sin_family == AF_UNSPEC &&
	    addr->sin_addr.s_addr != htonl(INADDR_ANY))
		goto out;

	lock_sock(sk);

	/* Check if socket is already active */
	rc = -EINVAL;
	if (sk->sk_state != SMC_INIT)
		goto out_rel;

	smc->clcsock->sk->sk_reuse = sk->sk_reuse;
	rc = kernel_bind(smc->clcsock, uaddr, addr_len);

out_rel:
	release_sock(sk);
out:
	return rc;
}

static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
				   unsigned long mask)
{
	/* options we don't get control of via setsockopt */
	nsk->sk_type = osk->sk_type;
	nsk->sk_sndbuf = osk->sk_sndbuf;
	nsk->sk_rcvbuf = osk->sk_rcvbuf;
	nsk->sk_sndtimeo = osk->sk_sndtimeo;
	nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
	nsk->sk_mark = osk->sk_mark;
	nsk->sk_priority = osk->sk_priority;
	nsk->sk_rcvlowat = osk->sk_rcvlowat;
	nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
	nsk->sk_err = osk->sk_err;

	nsk->sk_flags &= ~mask;
	nsk->sk_flags |= osk->sk_flags & mask;
}

#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_BROADCAST) | \
			     (1UL << SOCK_TIMESTAMP) | \
			     (1UL << SOCK_DBG) | \
			     (1UL << SOCK_RCVTSTAMP) | \
			     (1UL << SOCK_RCVTSTAMPNS) | \
			     (1UL << SOCK_LOCALROUTE) | \
			     (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
			     (1UL << SOCK_RXQ_OVFL) | \
			     (1UL << SOCK_WIFI_STATUS) | \
			     (1UL << SOCK_NOFCS) | \
			     (1UL << SOCK_FILTER_LOCKED))
/* copy only relevant settings and flags of SOL_SOCKET level from smc to
 * clc socket (since smc is not called for these options from net/core)
 */
static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
{
	smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
}

#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_DBG))
/* copy only settings and flags relevant for smc from clc to smc socket */
static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
{
	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
}
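
/* Note on the masks above: only the flag bits named in the mask move
 * between the paired sockets. For SK_FLAGS_CLC_TO_SMC, for instance, the
 * copy in smc_copy_sock_settings() amounts to
 *
 *	nsk->sk_flags = (nsk->sk_flags & ~SK_FLAGS_CLC_TO_SMC) |
 *			(osk->sk_flags & SK_FLAGS_CLC_TO_SMC);
 *
 * so all other flag bits of the destination socket stay untouched.
 */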

/* register a new rmb, optionally send confirm_rkey msg to register with peer */
static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc,
		       bool conf_rkey)
{
	/* register memory region for new rmb */
	if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) {
		rmb_desc->regerr = 1;
		return -EFAULT;
	}
	if (!conf_rkey)
		return 0;
	/* exchange confirm_rkey msg with peer */
	if (smc_llc_do_confirm_rkey(link, rmb_desc)) {
		rmb_desc->regerr = 1;
		return -EFAULT;
	}
	return 0;
}

static int smc_clnt_conf_first_link(struct smc_sock *smc)
{
	struct net *net = sock_net(smc->clcsock->sk);
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];
	/* receive CONFIRM LINK request from server over RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	if (link->llc_confirm_rc)
		return SMC_CLC_DECL_RMBE_EC;

	rc = smc_ib_modify_qp_rts(link);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	smc_wr_remember_qp_attr(link);

	if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
		return SMC_CLC_DECL_INTERR;

	/* send CONFIRM LINK response over RoCE fabric */
	rc = smc_llc_send_confirm_link(link,
				       link->smcibdev->mac[link->ibport - 1],
				       &link->smcibdev->gid[link->ibport - 1],
				       SMC_LLC_RESP);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	/* receive ADD LINK request from server over RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(&link->llc_add,
							 SMC_LLC_WAIT_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	/* send add link reject message, only one link supported for now */
	rc = smc_llc_send_add_link(link,
				   link->smcibdev->mac[link->ibport - 1],
				   &link->smcibdev->gid[link->ibport - 1],
				   SMC_LLC_RESP);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);

	return 0;
}
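
/* The accept/confirm message carries the peer's RMB element (RMBE) index
 * and its compressed size. Since same-sized RMBEs of a link group share
 * one registered region, local RDMA writes presumably have to target the
 * peer buffer at offset tx_off = rmbe_size * (peer_rmbe_idx - 1), which
 * is what the computation below stores.
 */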

static void smc_conn_save_peer_info(struct smc_sock *smc,
				    struct smc_clc_msg_accept_confirm *clc)
{
	int bufsize = smc_uncompress_bufsize(clc->rmbe_size);

	smc->conn.peer_rmbe_idx = clc->rmbe_idx;
	smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
	smc->conn.peer_rmbe_size = bufsize;
	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
	smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
}

static void smc_link_save_peer_info(struct smc_link *link,
				    struct smc_clc_msg_accept_confirm *clc)
{
	link->peer_qpn = ntoh24(clc->qpn);
	memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
	memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
	link->peer_psn = ntoh24(clc->psn);
	link->peer_mtu = clc->qp_mtu;
}

/* fall back during connect */
static int smc_connect_fallback(struct smc_sock *smc)
{
	smc->use_fallback = true;
	smc_copy_sock_settings_to_clc(smc);
	if (smc->sk.sk_state == SMC_INIT)
		smc->sk.sk_state = SMC_ACTIVE;
	return 0;
}

/* decline and fall back during connect */
static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
{
	int rc;

	if (reason_code < 0) /* error, fallback is not possible */
		return reason_code;
	if (reason_code != SMC_CLC_DECL_REPLY) {
		rc = smc_clc_send_decline(smc, reason_code);
		if (rc < 0)
			return rc;
	}
	return smc_connect_fallback(smc);
}

/* abort connecting */
static int smc_connect_abort(struct smc_sock *smc, int reason_code,
			     int local_contact)
{
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
	smc_conn_free(&smc->conn);
	if (reason_code < 0 && smc->sk.sk_state == SMC_INIT)
		sock_put(&smc->sk); /* passive closing */
	return reason_code;
}
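
/* Convention used throughout the connect path: a negative reason_code is
 * a kernel errno and ends the handshake without a CLC DECLINE message,
 * while a positive reason_code is an SMC_CLC_DECL_* value that is sent to
 * the peer before falling back to plain TCP (see
 * smc_connect_decline_fallback() above).
 */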

/* check if there is a rdma device available for this connection. */
/* called for connect and listen */
static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev,
			  u8 *ibport)
{
	int reason_code = 0;

	/* PNET table look up: search active ib_device and port
	 * within same PNETID that also contains the ethernet device
	 * used for the internal TCP socket
	 */
	smc_pnet_find_roce_resource(smc->clcsock->sk, ibdev, ibport);
	if (!(*ibdev))
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */

	return reason_code;
}

/* CLC handshake during connect */
static int smc_connect_clc(struct smc_sock *smc,
			   struct smc_clc_msg_accept_confirm *aclc,
			   struct smc_ib_device *ibdev, u8 ibport)
{
	int rc = 0;

	/* do inband token exchange */
	rc = smc_clc_send_proposal(smc, ibdev, ibport);
	if (rc)
		return rc;
	/* receive SMC Accept CLC message */
	return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT);
}

/* setup for RDMA connection of client */
static int smc_connect_rdma(struct smc_sock *smc,
			    struct smc_clc_msg_accept_confirm *aclc,
			    struct smc_ib_device *ibdev, u8 ibport)
{
	int local_contact = SMC_FIRST_CONTACT;
	struct smc_link *link;
	int reason_code = 0;

	mutex_lock(&smc_create_lgr_pending);
	local_contact = smc_conn_create(smc, ibdev, ibport, &aclc->lcl,
					aclc->hdr.flag);
	if (local_contact < 0) {
		if (local_contact == -ENOMEM)
			reason_code = SMC_CLC_DECL_MEM; /* insufficient memory */
		else if (local_contact == -ENOLINK)
			reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
		else
			reason_code = SMC_CLC_DECL_INTERR; /* other error */
		return smc_connect_abort(smc, reason_code, 0);
	}
	link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	smc_conn_save_peer_info(smc, aclc);

	/* create send buffer and rmb */
	if (smc_buf_create(smc))
		return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact);

	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, aclc);

	if (smc_rmb_rtoken_handling(&smc->conn, aclc))
		return smc_connect_abort(smc, SMC_CLC_DECL_INTERR,
					 local_contact);

	smc_close_init(smc);
	smc_rx_init(smc);

	if (local_contact == SMC_FIRST_CONTACT) {
		if (smc_ib_ready_link(link))
			return smc_connect_abort(smc, SMC_CLC_DECL_INTERR,
						 local_contact);
	} else {
		if (!smc->conn.rmb_desc->reused &&
		    smc_reg_rmb(link, smc->conn.rmb_desc, true))
			return smc_connect_abort(smc, SMC_CLC_DECL_INTERR,
						 local_contact);
	}
	smc_rmb_sync_sg_for_device(&smc->conn);

	reason_code = smc_clc_send_confirm(smc);
	if (reason_code)
		return smc_connect_abort(smc, reason_code, local_contact);

	smc_tx_init(smc);

	if (local_contact == SMC_FIRST_CONTACT) {
		/* QP confirmation over RoCE fabric */
		reason_code = smc_clnt_conf_first_link(smc);
		if (reason_code)
			return smc_connect_abort(smc, reason_code,
						 local_contact);
	}
	mutex_unlock(&smc_create_lgr_pending);

	smc_copy_sock_settings_to_clc(smc);
	if (smc->sk.sk_state == SMC_INIT)
		smc->sk.sk_state = SMC_ACTIVE;

	return 0;
}

/* perform steps before actually connecting */
static int __smc_connect(struct smc_sock *smc)
{
	struct smc_clc_msg_accept_confirm aclc;
	struct smc_ib_device *ibdev;
	int rc = 0;
	u8 ibport;

	sock_hold(&smc->sk); /* sock put in passive closing */

	if (smc->use_fallback)
		return smc_connect_fallback(smc);

	/* if peer has not signalled SMC-capability, fall back */
	if (!tcp_sk(smc->clcsock->sk)->syn_smc)
		return smc_connect_fallback(smc);

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(smc))
		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC);

	/* check if a RDMA device is available; if not, fall back */
	if (smc_check_rdma(smc, &ibdev, &ibport))
		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR);

	/* perform CLC handshake */
	rc = smc_connect_clc(smc, &aclc, ibdev, ibport);
	if (rc)
		return smc_connect_decline_fallback(smc, rc);

	/* connect using rdma */
	rc = smc_connect_rdma(smc, &aclc, ibdev, ibport);
	if (rc)
		return smc_connect_decline_fallback(smc, rc);

	return 0;
}
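
/* From user space, SMC is requested by creating the socket with the SMC
 * family; the address passed to connect()/bind() stays an ordinary
 * AF_INET/AF_INET6 address. A minimal, illustrative client sketch
 * (port and address are made-up examples):
 *
 *	int fd = socket(AF_SMC, SOCK_STREAM, SMCPROTO_SMC);
 *	struct sockaddr_in sin = {
 *		.sin_family = AF_INET,
 *		.sin_port = htons(12345),
 *	};
 *	inet_pton(AF_INET, "192.0.2.1", &sin.sin_addr);
 *	connect(fd, (struct sockaddr *)&sin, sizeof(sin));
 *
 * If the peer or the fabric cannot do SMC-R, the connection transparently
 * falls back to TCP via the internal clcsock.
 */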

static int smc_connect(struct socket *sock, struct sockaddr *addr,
		       int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EINVAL;

	smc = smc_sk(sk);

	/* separate smc parameter checking to be safe */
	if (alen < sizeof(addr->sa_family))
		goto out_err;
	if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
		goto out_err;

	lock_sock(sk);
	switch (sk->sk_state) {
	default:
		goto out;
	case SMC_ACTIVE:
		rc = -EISCONN;
		goto out;
	case SMC_INIT:
		rc = 0;
		break;
	}

	smc_copy_sock_settings_to_clc(smc);
	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
	rc = kernel_connect(smc->clcsock, addr, alen, flags);
	if (rc)
		goto out;

	rc = __smc_connect(smc);
	if (rc < 0)
		goto out;
	else
		rc = 0; /* success cases including fallback */

out:
	release_sock(sk);
out_err:
	return rc;
}

static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
{
	struct socket *new_clcsock = NULL;
	struct sock *lsk = &lsmc->sk;
	struct sock *new_sk;
	int rc;

	release_sock(lsk);
	new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
	if (!new_sk) {
		rc = -ENOMEM;
		lsk->sk_err = ENOMEM;
		*new_smc = NULL;
		lock_sock(lsk);
		goto out;
	}
	*new_smc = smc_sk(new_sk);

	rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
	lock_sock(lsk);
	if (rc < 0)
		lsk->sk_err = -rc;
	if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
		if (new_clcsock)
			sock_release(new_clcsock);
		new_sk->sk_state = SMC_CLOSED;
		sock_set_flag(new_sk, SOCK_DEAD);
		new_sk->sk_prot->unhash(new_sk);
		sock_put(new_sk); /* final */
		*new_smc = NULL;
		goto out;
	}

	(*new_smc)->clcsock = new_clcsock;
out:
	return rc;
}

/* add a just created sock to the accept queue of the listen sock as
 * candidate for a following socket accept call from user space
 */
static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
{
	struct smc_sock *par = smc_sk(parent);

	sock_hold(sk); /* sock_put in smc_accept_unlink() */
	spin_lock(&par->accept_q_lock);
	list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
	spin_unlock(&par->accept_q_lock);
	sk_acceptq_added(parent);
}

/* remove a socket from the accept queue of its parental listening socket */
static void smc_accept_unlink(struct sock *sk)
{
	struct smc_sock *par = smc_sk(sk)->listen_smc;

	spin_lock(&par->accept_q_lock);
	list_del_init(&smc_sk(sk)->accept_q);
	spin_unlock(&par->accept_q_lock);
	sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
	sock_put(sk); /* sock_hold in smc_accept_enqueue */
}
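
/* Reference counting around the accept queue: smc_accept_enqueue() takes
 * one hold per queued child, dropped again in smc_accept_unlink(); on top
 * of that the child keeps the "passive closing" hold taken in
 * smc_tcp_listen_work() until it reaches SMC_CLOSED.
 */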

/* remove a sock from the accept queue to bind it to a new socket created
 * for a socket accept call from user space
 */
struct sock *smc_accept_dequeue(struct sock *parent,
				struct socket *new_sock)
{
	struct smc_sock *isk, *n;
	struct sock *new_sk;

	list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
		new_sk = (struct sock *)isk;

		smc_accept_unlink(new_sk);
		if (new_sk->sk_state == SMC_CLOSED) {
			if (isk->clcsock) {
				sock_release(isk->clcsock);
				isk->clcsock = NULL;
			}
			new_sk->sk_prot->unhash(new_sk);
			sock_put(new_sk); /* final */
			continue;
		}
		if (new_sock)
			sock_graft(new_sk, new_sock);
		return new_sk;
	}
	return NULL;
}

/* clean up for a created but never accepted sock */
void smc_close_non_accepted(struct sock *sk)
{
	struct smc_sock *smc = smc_sk(sk);

	lock_sock(sk);
	if (!sk->sk_lingertime)
		/* wait for peer closing */
		sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
	if (!smc->use_fallback) {
		smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	if (smc->clcsock) {
		struct socket *tcp;

		tcp = smc->clcsock;
		smc->clcsock = NULL;
		sock_release(tcp);
	}
	if (smc->use_fallback) {
		sock_put(sk); /* passive closing */
		sk->sk_state = SMC_CLOSED;
	} else {
		if (sk->sk_state == SMC_CLOSED)
			smc_conn_free(&smc->conn);
	}
	release_sock(sk);
	sk->sk_prot->unhash(sk);
	sock_put(sk); /* final sock_put */
}

static int smc_serv_conf_first_link(struct smc_sock *smc)
{
	struct net *net = sock_net(smc->clcsock->sk);
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];

	if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
		return SMC_CLC_DECL_INTERR;

	/* send CONFIRM LINK request to client over the RoCE fabric */
	rc = smc_llc_send_confirm_link(link,
				       link->smcibdev->mac[link->ibport - 1],
				       &link->smcibdev->gid[link->ibport - 1],
				       SMC_LLC_REQ);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	/* receive CONFIRM LINK response from client over the RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm_resp,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	if (link->llc_confirm_resp_rc)
		return SMC_CLC_DECL_RMBE_EC;

	/* send ADD LINK request to client over the RoCE fabric */
	rc = smc_llc_send_add_link(link,
				   link->smcibdev->mac[link->ibport - 1],
				   &link->smcibdev->gid[link->ibport - 1],
				   SMC_LLC_REQ);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	/* receive ADD LINK response from client over the RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp,
							 SMC_LLC_WAIT_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);

	return 0;
}
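
/* The server side above mirrors smc_clnt_conf_first_link(): it registers
 * the RMB, sends CONFIRM LINK as request rather than response, and then
 * runs the ADD LINK exchange, which the client currently always rejects
 * since only a single link is supported. The helpers below form the tail
 * of the listen worker: they either enqueue the new socket to the
 * parent's accept queue or clean it up again.
 */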

/* listen worker: finish */
static void smc_listen_out(struct smc_sock *new_smc)
{
	struct smc_sock *lsmc = new_smc->listen_smc;
	struct sock *newsmcsk = &new_smc->sk;

	lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
	if (lsmc->sk.sk_state == SMC_LISTEN) {
		smc_accept_enqueue(&lsmc->sk, newsmcsk);
	} else { /* no longer listening */
		smc_close_non_accepted(newsmcsk);
	}
	release_sock(&lsmc->sk);

	/* Wake up accept */
	lsmc->sk.sk_data_ready(&lsmc->sk);
	sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
}

/* listen worker: finish in state connected */
static void smc_listen_out_connected(struct smc_sock *new_smc)
{
	struct sock *newsmcsk = &new_smc->sk;

	sk_refcnt_debug_inc(newsmcsk);
	if (newsmcsk->sk_state == SMC_INIT)
		newsmcsk->sk_state = SMC_ACTIVE;

	smc_listen_out(new_smc);
}

/* listen worker: finish in error state */
static void smc_listen_out_err(struct smc_sock *new_smc)
{
	struct sock *newsmcsk = &new_smc->sk;

	if (newsmcsk->sk_state == SMC_INIT)
		sock_put(&new_smc->sk); /* passive closing */
	newsmcsk->sk_state = SMC_CLOSED;
	smc_conn_free(&new_smc->conn);

	smc_listen_out(new_smc);
}

/* listen worker: decline and fall back if possible */
static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
			       int local_contact)
{
	/* RDMA setup failed, switch back to TCP */
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(new_smc->conn.lgr);
	if (reason_code < 0) { /* error, no fallback possible */
		smc_listen_out_err(new_smc);
		return;
	}
	smc_conn_free(&new_smc->conn);
	new_smc->use_fallback = true;
	if (reason_code && reason_code != SMC_CLC_DECL_REPLY) {
		if (smc_clc_send_decline(new_smc, reason_code) < 0) {
			smc_listen_out_err(new_smc);
			return;
		}
	}
	smc_listen_out_connected(new_smc);
}

/* listen worker: check prefixes */
static int smc_listen_rdma_check(struct smc_sock *new_smc,
				 struct smc_clc_msg_proposal *pclc)
{
	struct smc_clc_msg_proposal_prefix *pclc_prfx;
	struct socket *newclcsock = new_smc->clcsock;

	pclc_prfx = smc_clc_proposal_get_prefix(pclc);
	if (smc_clc_prfx_match(newclcsock, pclc_prfx))
		return SMC_CLC_DECL_CNFERR;

	return 0;
}

/* listen worker: initialize connection and buffers */
static int smc_listen_rdma_init(struct smc_sock *new_smc,
				struct smc_clc_msg_proposal *pclc,
				struct smc_ib_device *ibdev, u8 ibport,
				int *local_contact)
{
	/* allocate connection / link group */
	*local_contact = smc_conn_create(new_smc, ibdev, ibport, &pclc->lcl, 0);
	if (*local_contact < 0) {
		if (*local_contact == -ENOMEM)
			return SMC_CLC_DECL_MEM; /* insufficient memory */
		return SMC_CLC_DECL_INTERR; /* other error */
	}

	/* create send buffer and rmb */
	if (smc_buf_create(new_smc))
		return SMC_CLC_DECL_MEM;

	return 0;
}

/* listen worker: register buffers */
static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact)
{
	struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	if (local_contact != SMC_FIRST_CONTACT) {
		if (!new_smc->conn.rmb_desc->reused) {
			if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true))
				return SMC_CLC_DECL_INTERR;
		}
	}
	smc_rmb_sync_sg_for_device(&new_smc->conn);

	return 0;
}
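
/* Lock discipline in the listen worker: smc_create_lgr_pending is taken
 * in smc_listen_work() before the link group is created and is held
 * across the whole CLC accept/confirm exchange, so that a second
 * connection cannot race the creation of the same link group; the
 * decline path below releases it before declining.
 */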

/* listen worker: finish RDMA setup */
static void smc_listen_rdma_finish(struct smc_sock *new_smc,
				   struct smc_clc_msg_accept_confirm *cclc,
				   int local_contact)
{
	struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
	int reason_code = 0;

	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, cclc);

	if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) {
		reason_code = SMC_CLC_DECL_INTERR;
		goto decline;
	}

	if (local_contact == SMC_FIRST_CONTACT) {
		if (smc_ib_ready_link(link)) {
			reason_code = SMC_CLC_DECL_INTERR;
			goto decline;
		}
		/* QP confirmation over RoCE fabric */
		reason_code = smc_serv_conf_first_link(new_smc);
		if (reason_code)
			goto decline;
	}
	return;

decline:
	mutex_unlock(&smc_create_lgr_pending);
	smc_listen_decline(new_smc, reason_code, local_contact);
}

/* setup for RDMA connection of server */
static void smc_listen_work(struct work_struct *work)
{
	struct smc_sock *new_smc = container_of(work, struct smc_sock,
						smc_listen_work);
	struct socket *newclcsock = new_smc->clcsock;
	struct smc_clc_msg_accept_confirm cclc;
	struct smc_clc_msg_proposal *pclc;
	struct smc_ib_device *ibdev;
	u8 buf[SMC_CLC_MAX_LEN];
	int local_contact = 0;
	int reason_code = 0;
	int rc = 0;
	u8 ibport;

	if (new_smc->use_fallback) {
		smc_listen_out_connected(new_smc);
		return;
	}

	/* check if peer is smc capable */
	if (!tcp_sk(newclcsock->sk)->syn_smc) {
		new_smc->use_fallback = true;
		smc_listen_out_connected(new_smc);
		return;
	}

	/* do inband token exchange -
	 * wait for and receive SMC Proposal CLC message
	 */
	pclc = (struct smc_clc_msg_proposal *)&buf;
	reason_code = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN,
				       SMC_CLC_PROPOSAL);
	if (reason_code) {
		smc_listen_decline(new_smc, reason_code, 0);
		return;
	}

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(new_smc)) {
		smc_listen_decline(new_smc, SMC_CLC_DECL_IPSEC, 0);
		return;
	}

	mutex_lock(&smc_create_lgr_pending);
	smc_close_init(new_smc);
	smc_rx_init(new_smc);
	smc_tx_init(new_smc);

	/* check if RDMA is available */
	if (smc_check_rdma(new_smc, &ibdev, &ibport) ||
	    smc_listen_rdma_check(new_smc, pclc) ||
	    smc_listen_rdma_init(new_smc, pclc, ibdev, ibport,
				 &local_contact) ||
	    smc_listen_rdma_reg(new_smc, local_contact)) {
		/* SMC not supported, decline */
		mutex_unlock(&smc_create_lgr_pending);
		smc_listen_decline(new_smc, SMC_CLC_DECL_CNFERR, local_contact);
		return;
	}

	/* send SMC Accept CLC message */
	rc = smc_clc_send_accept(new_smc, local_contact);
	if (rc) {
		mutex_unlock(&smc_create_lgr_pending);
		smc_listen_decline(new_smc, rc, local_contact);
		return;
	}

	/* receive SMC Confirm CLC message */
	reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
				       SMC_CLC_CONFIRM);
	if (reason_code) {
		mutex_unlock(&smc_create_lgr_pending);
		smc_listen_decline(new_smc, reason_code, local_contact);
		return;
	}

	/* finish worker */
	smc_listen_rdma_finish(new_smc, &cclc, local_contact);
	smc_conn_save_peer_info(new_smc, &cclc);
	mutex_unlock(&smc_create_lgr_pending);
	smc_listen_out_connected(new_smc);
}

static void smc_tcp_listen_work(struct work_struct *work)
{
	struct smc_sock *lsmc = container_of(work, struct smc_sock,
					     tcp_listen_work);
	struct sock *lsk = &lsmc->sk;
	struct smc_sock *new_smc;
	int rc = 0;

	lock_sock(lsk);
	while (lsk->sk_state == SMC_LISTEN) {
		rc = smc_clcsock_accept(lsmc, &new_smc);
		if (rc)
			goto out;
		if (!new_smc)
			continue;

		new_smc->listen_smc = lsmc;
		new_smc->use_fallback = lsmc->use_fallback;
		sock_hold(lsk); /* sock_put in smc_listen_work */
		INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
		smc_copy_sock_settings_to_smc(new_smc);
		sock_hold(&new_smc->sk); /* sock_put in passive closing */
		if (!schedule_work(&new_smc->smc_listen_work))
			sock_put(&new_smc->sk);
	}

out:
	release_sock(lsk);
	sock_put(&lsmc->sk); /* sock_hold in smc_listen */
}
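
/* Listening always happens on the internal clcsock; setting syn_smc makes
 * the TCP layer advertise SMC capability in the SYN's experimental option
 * so that incoming peers can request SMC. The actual accept loop runs
 * from the tcp_listen_work scheduled at the end of smc_listen().
 */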

static int smc_listen(struct socket *sock, int backlog)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);
	lock_sock(sk);

	rc = -EINVAL;
	if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
		goto out;

	rc = 0;
	if (sk->sk_state == SMC_LISTEN) {
		sk->sk_max_ack_backlog = backlog;
		goto out;
	}
	/* some socket options are handled in core, so we could not apply
	 * them to the clc socket -- copy smc socket options to clc socket
	 */
	smc_copy_sock_settings_to_clc(smc);
	if (!smc->use_fallback)
		tcp_sk(smc->clcsock->sk)->syn_smc = 1;

	rc = kernel_listen(smc->clcsock, backlog);
	if (rc)
		goto out;
	sk->sk_max_ack_backlog = backlog;
	sk->sk_ack_backlog = 0;
	sk->sk_state = SMC_LISTEN;
	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
	sock_hold(sk); /* sock_put in smc_tcp_listen_work() */
	if (!schedule_work(&smc->tcp_listen_work))
		sock_put(sk);

out:
	release_sock(sk);
	return rc;
}

static int smc_accept(struct socket *sock, struct socket *new_sock,
		      int flags, bool kern)
{
	struct sock *sk = sock->sk, *nsk;
	DECLARE_WAITQUEUE(wait, current);
	struct smc_sock *lsmc;
	long timeo;
	int rc = 0;

	lsmc = smc_sk(sk);
	sock_hold(sk); /* sock_put below */
	lock_sock(sk);

	if (lsmc->sk.sk_state != SMC_LISTEN) {
		rc = -EINVAL;
		release_sock(sk);
		goto out;
	}

	/* Wait for an incoming connection */
	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
	add_wait_queue_exclusive(sk_sleep(sk), &wait);
	while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (!timeo) {
			rc = -EAGAIN;
			break;
		}
		release_sock(sk);
		timeo = schedule_timeout(timeo);
		/* wakeup by sk_data_ready in smc_listen_work() */
		sched_annotate_sleep();
		lock_sock(sk);
		if (signal_pending(current)) {
			rc = sock_intr_errno(timeo);
			break;
		}
	}
	set_current_state(TASK_RUNNING);
	remove_wait_queue(sk_sleep(sk), &wait);

	if (!rc)
		rc = sock_error(nsk);
	release_sock(sk);
	if (rc)
		goto out;

	if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) {
		/* wait till data arrives on the socket */
		timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept *
					 MSEC_PER_SEC);
		if (smc_sk(nsk)->use_fallback) {
			struct sock *clcsk = smc_sk(nsk)->clcsock->sk;

			lock_sock(clcsk);
			if (skb_queue_empty(&clcsk->sk_receive_queue))
				sk_wait_data(clcsk, &timeo, NULL);
			release_sock(clcsk);
		} else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) {
			lock_sock(nsk);
			smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available);
			release_sock(nsk);
		}
	}

out:
	sock_put(sk); /* sock_hold above */
	return rc;
}

static int smc_getname(struct socket *sock, struct sockaddr *addr,
		       int peer)
{
	struct smc_sock *smc;

	if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
	    (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
		return -ENOTCONN;

	smc = smc_sk(sock->sk);

	return smc->clcsock->ops->getname(smc->clcsock, addr, peer);
}
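
/* sendmsg with MSG_FASTOPEN can only be satisfied by TCP fast open, which
 * SMC cannot provide: it is honoured by switching a still unconnected
 * socket (SMC_INIT) into fallback mode, and rejected with -EINVAL
 * otherwise.
 */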

static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EPIPE;

	smc = smc_sk(sk);
	lock_sock(sk);
	if ((sk->sk_state != SMC_ACTIVE) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
	    (sk->sk_state != SMC_INIT))
		goto out;

	if (msg->msg_flags & MSG_FASTOPEN) {
		if (sk->sk_state == SMC_INIT) {
			smc->use_fallback = true;
		} else {
			rc = -EINVAL;
			goto out;
		}
	}

	if (smc->use_fallback)
		rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
	else
		rc = smc_tx_sendmsg(smc, msg, len);
out:
	release_sock(sk);
	return rc;
}

static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
		       int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -ENOTCONN;

	smc = smc_sk(sk);
	lock_sock(sk);
	if ((sk->sk_state == SMC_INIT) ||
	    (sk->sk_state == SMC_LISTEN) ||
	    (sk->sk_state == SMC_CLOSED))
		goto out;

	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
		rc = 0;
		goto out;
	}

	if (smc->use_fallback) {
		rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
	} else {
		msg->msg_namelen = 0;
		rc = smc_rx_recvmsg(smc, msg, NULL, len, flags);
	}

out:
	release_sock(sk);
	return rc;
}

static __poll_t smc_accept_poll(struct sock *parent)
{
	struct smc_sock *isk = smc_sk(parent);
	__poll_t mask = 0;

	spin_lock(&isk->accept_q_lock);
	if (!list_empty(&isk->accept_q))
		mask = EPOLLIN | EPOLLRDNORM;
	spin_unlock(&isk->accept_q_lock);

	return mask;
}

static __poll_t smc_poll(struct file *file, struct socket *sock,
			 poll_table *wait)
{
	struct sock *sk = sock->sk;
	__poll_t mask = 0;
	struct smc_sock *smc;
	int rc;

	if (!sk)
		return EPOLLNVAL;

	smc = smc_sk(sock->sk);
	sock_hold(sk);
	lock_sock(sk);
	if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
		/* delegate to CLC child sock */
		release_sock(sk);
		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
		lock_sock(sk);
		sk->sk_err = smc->clcsock->sk->sk_err;
		if (sk->sk_err) {
			mask |= EPOLLERR;
		} else {
			/* if non-blocking connect finished ... */
			if (sk->sk_state == SMC_INIT &&
			    mask & EPOLLOUT &&
			    smc->clcsock->sk->sk_state != TCP_CLOSE) {
				rc = __smc_connect(smc);
				if (rc < 0)
					mask |= EPOLLERR;
				/* success cases including fallback */
				mask |= EPOLLOUT | EPOLLWRNORM;
			}
		}
	} else {
		if (sk->sk_state != SMC_CLOSED) {
			release_sock(sk);
			sock_poll_wait(file, sk_sleep(sk), wait);
			lock_sock(sk);
		}
		if (sk->sk_err)
			mask |= EPOLLERR;
		if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
		    (sk->sk_state == SMC_CLOSED))
			mask |= EPOLLHUP;
		if (sk->sk_state == SMC_LISTEN) {
			/* woken up by sk_data_ready in smc_listen_work() */
			mask = smc_accept_poll(sk);
		} else {
			if (atomic_read(&smc->conn.sndbuf_space) ||
			    sk->sk_shutdown & SEND_SHUTDOWN) {
				mask |= EPOLLOUT | EPOLLWRNORM;
			} else {
				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
			}
			if (atomic_read(&smc->conn.bytes_to_rcv))
				mask |= EPOLLIN | EPOLLRDNORM;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
			if (sk->sk_state == SMC_APPCLOSEWAIT1)
				mask |= EPOLLIN;
		}
		if (smc->conn.urg_state == SMC_URG_VALID)
			mask |= EPOLLPRI;
	}
	release_sock(sk);
	sock_put(sk);

	return mask;
}
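
/* Note on the "how + 1" mapping at the end of smc_shutdown(): the SHUT_*
 * command constants (SHUT_RD=0, SHUT_WR=1, SHUT_RDWR=2) are off by one
 * against the sk_shutdown bit values (RCV_SHUTDOWN=1, SEND_SHUTDOWN=2,
 * SHUTDOWN_MASK=3), so adding one yields the matching bit pattern.
 */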

static int smc_shutdown(struct socket *sock, int how)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EINVAL;
	int rc1 = 0;

	smc = smc_sk(sk);

	if ((how < SHUT_RD) || (how > SHUT_RDWR))
		return rc;

	lock_sock(sk);

	rc = -ENOTCONN;
	if ((sk->sk_state != SMC_LISTEN) &&
	    (sk->sk_state != SMC_ACTIVE) &&
	    (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
	    (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT2) &&
	    (sk->sk_state != SMC_APPFINCLOSEWAIT))
		goto out;
	if (smc->use_fallback) {
		rc = kernel_sock_shutdown(smc->clcsock, how);
		sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
		if (sk->sk_shutdown == SHUTDOWN_MASK)
			sk->sk_state = SMC_CLOSED;
		goto out;
	}
	switch (how) {
	case SHUT_RDWR:		/* shutdown in both directions */
		rc = smc_close_active(smc);
		break;
	case SHUT_WR:
		rc = smc_close_shutdown_write(smc);
		break;
	case SHUT_RD:
		rc = 0;
		/* nothing more to do because peer is not involved */
		break;
	}
	if (smc->clcsock)
		rc1 = kernel_sock_shutdown(smc->clcsock, how);
	/* map sock_shutdown_cmd constants to sk_shutdown value range */
	sk->sk_shutdown |= how + 1;

out:
	release_sock(sk);
	return rc ? rc : rc1;
}

static int smc_setsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int val, rc;

	smc = smc_sk(sk);

	/* generic setsockopts reaching us here always apply to the
	 * CLC socket
	 */
	rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
					   optval, optlen);
	if (smc->clcsock->sk->sk_err) {
		sk->sk_err = smc->clcsock->sk->sk_err;
		sk->sk_error_report(sk);
	}
	if (rc)
		return rc;

	if (optlen < sizeof(int))
		return rc;
	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	lock_sock(sk);
	switch (optname) {
	case TCP_ULP:
	case TCP_FASTOPEN:
	case TCP_FASTOPEN_CONNECT:
	case TCP_FASTOPEN_KEY:
	case TCP_FASTOPEN_NO_COOKIE:
		/* option not supported by SMC */
		if (sk->sk_state == SMC_INIT) {
			smc->use_fallback = true;
		} else {
			if (!smc->use_fallback)
				rc = -EINVAL;
		}
		break;
	case TCP_NODELAY:
		if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
			if (val && !smc->use_fallback)
				mod_delayed_work(system_wq, &smc->conn.tx_work,
						 0);
		}
		break;
	case TCP_CORK:
		if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
			if (!val && !smc->use_fallback)
				mod_delayed_work(system_wq, &smc->conn.tx_work,
						 0);
		}
		break;
	case TCP_DEFER_ACCEPT:
		smc->sockopt_defer_accept = val;
		break;
	default:
		break;
	}
	release_sock(sk);

	return rc;
}

static int smc_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct smc_sock *smc;

	smc = smc_sk(sock->sk);
	/* socket options apply to the CLC socket */
	return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
					     optval, optlen);
}
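
/* SIOCATMARK below is answered from the connection cursors: the urgent
 * byte sits "at the mark" when the distance from the consumer cursor to
 * the urgent cursor, taken modulo the RMB size via smc_curs_diff(), is
 * exactly one byte.
 */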

static int smc_ioctl(struct socket *sock, unsigned int cmd,
		     unsigned long arg)
{
	union smc_host_cursor cons, urg;
	struct smc_connection *conn;
	struct smc_sock *smc;
	int answ;

	smc = smc_sk(sock->sk);
	conn = &smc->conn;
	if (smc->use_fallback) {
		if (!smc->clcsock)
			return -EBADF;
		return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
	}
	switch (cmd) {
	case SIOCINQ: /* same as FIONREAD */
		if (smc->sk.sk_state == SMC_LISTEN)
			return -EINVAL;
		if (smc->sk.sk_state == SMC_INIT ||
		    smc->sk.sk_state == SMC_CLOSED)
			answ = 0;
		else
			answ = atomic_read(&smc->conn.bytes_to_rcv);
		break;
	case SIOCOUTQ:
		/* output queue size (not sent + not acked) */
		if (smc->sk.sk_state == SMC_LISTEN)
			return -EINVAL;
		if (smc->sk.sk_state == SMC_INIT ||
		    smc->sk.sk_state == SMC_CLOSED)
			answ = 0;
		else
			answ = smc->conn.sndbuf_desc->len -
					atomic_read(&smc->conn.sndbuf_space);
		break;
	case SIOCOUTQNSD:
		/* output queue size (not sent only) */
		if (smc->sk.sk_state == SMC_LISTEN)
			return -EINVAL;
		if (smc->sk.sk_state == SMC_INIT ||
		    smc->sk.sk_state == SMC_CLOSED)
			answ = 0;
		else
			answ = smc_tx_prepared_sends(&smc->conn);
		break;
	case SIOCATMARK:
		if (smc->sk.sk_state == SMC_LISTEN)
			return -EINVAL;
		if (smc->sk.sk_state == SMC_INIT ||
		    smc->sk.sk_state == SMC_CLOSED) {
			answ = 0;
		} else {
			smc_curs_write(&cons,
			       smc_curs_read(&conn->local_tx_ctrl.cons, conn),
				       conn);
			smc_curs_write(&urg,
				       smc_curs_read(&conn->urg_curs, conn),
				       conn);
			answ = smc_curs_diff(conn->rmb_desc->len,
					     &cons, &urg) == 1;
		}
		break;
	default:
		return -ENOIOCTLCMD;
	}

	return put_user(answ, (int __user *)arg);
}

static ssize_t smc_sendpage(struct socket *sock, struct page *page,
			    int offset, size_t size, int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EPIPE;

	smc = smc_sk(sk);
	lock_sock(sk);
	if (sk->sk_state != SMC_ACTIVE) {
		release_sock(sk);
		goto out;
	}
	release_sock(sk);
	if (smc->use_fallback)
		rc = kernel_sendpage(smc->clcsock, page, offset,
				     size, flags);
	else
		rc = sock_no_sendpage(sock, page, offset, size, flags);

out:
	return rc;
}

/* Map the affected portions of the rmbe into an spd, note the number of bytes
 * to splice in conn->splice_pending, and press 'go'. Delays consumer cursor
 * updates till whenever a respective page has been fully processed.
 * Note that subsequent recv() calls have to wait till all splice() processing
 * completed.
 */
static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
			       struct pipe_inode_info *pipe, size_t len,
			       unsigned int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -ENOTCONN;

	smc = smc_sk(sk);
	lock_sock(sk);

	if (sk->sk_state == SMC_INIT ||
	    sk->sk_state == SMC_LISTEN ||
	    sk->sk_state == SMC_CLOSED)
		goto out;

	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
		rc = 0;
		goto out;
	}

	if (smc->use_fallback) {
		rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
						    pipe, len, flags);
	} else {
		if (*ppos) {
			rc = -ESPIPE;
			goto out;
		}
		if (flags & SPLICE_F_NONBLOCK)
			flags = MSG_DONTWAIT;
		else
			flags = 0;
		rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags);
	}
out:
	release_sock(sk);

	return rc;
}

/* must look like tcp */
static const struct proto_ops smc_sock_ops = {
	.family		= PF_SMC,
	.owner		= THIS_MODULE,
	.release	= smc_release,
	.bind		= smc_bind,
	.connect	= smc_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= smc_accept,
	.getname	= smc_getname,
	.poll		= smc_poll,
	.ioctl		= smc_ioctl,
	.listen		= smc_listen,
	.shutdown	= smc_shutdown,
	.setsockopt	= smc_setsockopt,
	.getsockopt	= smc_getsockopt,
	.sendmsg	= smc_sendmsg,
	.recvmsg	= smc_recvmsg,
	.mmap		= sock_no_mmap,
	.sendpage	= smc_sendpage,
	.splice_read	= smc_splice_read,
};
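
/* The SMC protocol value chosen by the application selects the address
 * family of the internal CLC/TCP socket, e.g. (illustrative):
 *
 *	socket(AF_SMC, SOCK_STREAM, SMCPROTO_SMC);	// clcsock is AF_INET
 *	socket(AF_SMC, SOCK_STREAM, SMCPROTO_SMC6);	// clcsock is AF_INET6
 *
 * Any other protocol value is rejected with -EPROTONOSUPPORT below.
 */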

static int smc_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET;
	struct smc_sock *smc;
	struct sock *sk;
	int rc;

	rc = -ESOCKTNOSUPPORT;
	if (sock->type != SOCK_STREAM)
		goto out;

	rc = -EPROTONOSUPPORT;
	if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6)
		goto out;

	rc = -ENOBUFS;
	sock->ops = &smc_sock_ops;
	sk = smc_sock_alloc(net, sock, protocol);
	if (!sk)
		goto out;

	/* create internal TCP socket for CLC handshake and fallback */
	smc = smc_sk(sk);
	smc->use_fallback = false; /* assume rdma capability first */
	rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
			      &smc->clcsock);
	if (rc) {
		sk_common_release(sk);
		goto out;
	}
	smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
	smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);

out:
	return rc;
}

static const struct net_proto_family smc_sock_family_ops = {
	.family	= PF_SMC,
	.owner	= THIS_MODULE,
	.create	= smc_create,
};

static int __init smc_init(void)
{
	int rc;

	rc = smc_pnet_init();
	if (rc)
		return rc;

	rc = smc_llc_init();
	if (rc) {
		pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = smc_cdc_init();
	if (rc) {
		pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = proto_register(&smc_proto, 1);
	if (rc) {
		pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = proto_register(&smc_proto6, 1);
	if (rc) {
		pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc);
		goto out_proto;
	}

	rc = sock_register(&smc_sock_family_ops);
	if (rc) {
		pr_err("%s: sock_register fails with %d\n", __func__, rc);
		goto out_proto6;
	}
	INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
	INIT_HLIST_HEAD(&smc_v6_hashinfo.ht);

	rc = smc_ib_register_client();
	if (rc) {
		pr_err("%s: ib_register fails with %d\n", __func__, rc);
		goto out_sock;
	}

	static_branch_enable(&tcp_have_smc);
	return 0;

out_sock:
	sock_unregister(PF_SMC);
out_proto6:
	proto_unregister(&smc_proto6);
out_proto:
	proto_unregister(&smc_proto);
out_pnet:
	smc_pnet_exit();
	return rc;
}

static void __exit smc_exit(void)
{
	smc_core_exit();
	static_branch_disable(&tcp_have_smc);
	smc_ib_unregister_client();
	sock_unregister(PF_SMC);
	proto_unregister(&smc_proto6);
	proto_unregister(&smc_proto);
	smc_pnet_exit();
}

module_init(smc_init);
module_exit(smc_exit);

MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
MODULE_DESCRIPTION("smc socket address family");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_SMC);