/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * AF_SMC protocol family socket handler keeping the AF_INET sock address type
 * applies to SOCK_STREAM sockets only
 * offers an alternative communication option for TCP-protocol sockets
 * applicable with RoCE-cards only
 *
 * Initial restrictions:
 *   - support for alternate links postponed
 *   - partial support for non-blocking sockets only
 *   - support for urgent data postponed
 *
 * Copyright IBM Corp. 2016, 2018
 *
 * Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 *             based on prototype from Frank Blaschka
 */

#define KMSG_COMPONENT "smc"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/module.h>
#include <linux/socket.h>
#include <linux/workqueue.h>
#include <linux/in.h>
#include <linux/sched/signal.h>

#include <net/sock.h>
#include <net/tcp.h>
#include <net/smc.h>

#include "smc.h"
#include "smc_clc.h"
#include "smc_llc.h"
#include "smc_cdc.h"
#include "smc_core.h"
#include "smc_ib.h"
#include "smc_pnet.h"
#include "smc_tx.h"
#include "smc_rx.h"
#include "smc_close.h"

static DEFINE_MUTEX(smc_create_lgr_pending);	/* serialize link group
						 * creation
						 */

struct smc_lgr_list smc_lgr_list = {	/* established link groups */
	.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
	.list = LIST_HEAD_INIT(smc_lgr_list.list),
};

static void smc_tcp_listen_work(struct work_struct *);

static void smc_set_keepalive(struct sock *sk, int val)
{
	struct smc_sock *smc = smc_sk(sk);

	smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
}

static struct smc_hashinfo smc_v4_hashinfo = {
	.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
};

static struct smc_hashinfo smc_v6_hashinfo = {
	.lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
};

int smc_hash_sk(struct sock *sk)
{
	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
	struct hlist_head *head;

	head = &h->ht;

	write_lock_bh(&h->lock);
	sk_add_node(sk, head);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	write_unlock_bh(&h->lock);

	return 0;
}
EXPORT_SYMBOL_GPL(smc_hash_sk);

void smc_unhash_sk(struct sock *sk)
{
	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;

	write_lock_bh(&h->lock);
	if (sk_del_node_init(sk))
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	write_unlock_bh(&h->lock);
}
EXPORT_SYMBOL_GPL(smc_unhash_sk);

struct proto smc_proto = {
	.name		= "SMC",
	.owner		= THIS_MODULE,
	.keepalive	= smc_set_keepalive,
	.hash		= smc_hash_sk,
	.unhash		= smc_unhash_sk,
	.obj_size	= sizeof(struct smc_sock),
	.h.smc_hash	= &smc_v4_hashinfo,
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto);

struct proto smc_proto6 = {
	.name		= "SMC6",
	.owner		= THIS_MODULE,
	.keepalive	= smc_set_keepalive,
	.hash		= smc_hash_sk,
	.unhash		= smc_unhash_sk,
	.obj_size	= sizeof(struct smc_sock),
	.h.smc_hash	= &smc_v6_hashinfo,
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto6);
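/* release an SMC socket: run the active close (or plain TCP release when in
 * fallback mode), detach and release the internal CLC/TCP socket, and drop
 * the final socket reference
 */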
static int smc_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = 0;

	if (!sk)
		goto out;

	smc = smc_sk(sk);
	if (sk->sk_state == SMC_LISTEN)
		/* smc_close_non_accepted() is called and acquires
		 * sock lock for child sockets again
		 */
		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
	else
		lock_sock(sk);

	if (!smc->use_fallback) {
		rc = smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	if (smc->clcsock) {
		sock_release(smc->clcsock);
		smc->clcsock = NULL;
	}
	if (smc->use_fallback) {
		sock_put(sk); /* passive closing */
		sk->sk_state = SMC_CLOSED;
		sk->sk_state_change(sk);
	}

	/* detach socket */
	sock_orphan(sk);
	sock->sk = NULL;
	if (!smc->use_fallback && sk->sk_state == SMC_CLOSED)
		smc_conn_free(&smc->conn);
	release_sock(sk);

	sk->sk_prot->unhash(sk);
	sock_put(sk); /* final sock_put */
out:
	return rc;
}

static void smc_destruct(struct sock *sk)
{
	if (sk->sk_state != SMC_CLOSED)
		return;
	if (!sock_flag(sk, SOCK_DEAD))
		return;

	sk_refcnt_debug_dec(sk);
}

static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
				   int protocol)
{
	struct smc_sock *smc;
	struct proto *prot;
	struct sock *sk;

	prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto;
	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
	if (!sk)
		return NULL;

	sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
	sk->sk_state = SMC_INIT;
	sk->sk_destruct = smc_destruct;
	sk->sk_protocol = protocol;
	smc = smc_sk(sk);
	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
	INIT_LIST_HEAD(&smc->accept_q);
	spin_lock_init(&smc->accept_q_lock);
	sk->sk_prot->hash(sk);
	sk_refcnt_debug_inc(sk);

	return sk;
}
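/* bind the SMC socket by delegating to the internal CLC/TCP socket, after
 * replicating the address sanity checks that inet_bind() would perform
 */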
static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
		    int addr_len)
{
	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);

	/* replicate tests from inet_bind(), to be safe wrt. future changes */
	rc = -EINVAL;
	if (addr_len < sizeof(struct sockaddr_in))
		goto out;

	rc = -EAFNOSUPPORT;
	if (addr->sin_family != AF_INET &&
	    addr->sin_family != AF_INET6 &&
	    addr->sin_family != AF_UNSPEC)
		goto out;
	/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
	if (addr->sin_family == AF_UNSPEC &&
	    addr->sin_addr.s_addr != htonl(INADDR_ANY))
		goto out;

	lock_sock(sk);

	/* Check if socket is already active */
	rc = -EINVAL;
	if (sk->sk_state != SMC_INIT)
		goto out_rel;

	smc->clcsock->sk->sk_reuse = sk->sk_reuse;
	rc = kernel_bind(smc->clcsock, uaddr, addr_len);

out_rel:
	release_sock(sk);
out:
	return rc;
}

static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
				   unsigned long mask)
{
	/* options we don't get control over via setsockopt */
	nsk->sk_type = osk->sk_type;
	nsk->sk_sndbuf = osk->sk_sndbuf;
	nsk->sk_rcvbuf = osk->sk_rcvbuf;
	nsk->sk_sndtimeo = osk->sk_sndtimeo;
	nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
	nsk->sk_mark = osk->sk_mark;
	nsk->sk_priority = osk->sk_priority;
	nsk->sk_rcvlowat = osk->sk_rcvlowat;
	nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
	nsk->sk_err = osk->sk_err;

	nsk->sk_flags &= ~mask;
	nsk->sk_flags |= osk->sk_flags & mask;
}

#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_BROADCAST) | \
			     (1UL << SOCK_TIMESTAMP) | \
			     (1UL << SOCK_DBG) | \
			     (1UL << SOCK_RCVTSTAMP) | \
			     (1UL << SOCK_RCVTSTAMPNS) | \
			     (1UL << SOCK_LOCALROUTE) | \
			     (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
			     (1UL << SOCK_RXQ_OVFL) | \
			     (1UL << SOCK_WIFI_STATUS) | \
			     (1UL << SOCK_NOFCS) | \
			     (1UL << SOCK_FILTER_LOCKED))
/* copy only relevant settings and flags of SOL_SOCKET level from smc to
 * clc socket (since smc is not called for these options from net/core)
 */
static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
{
	smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
}

#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_DBG))
/* copy only settings and flags relevant for smc from clc to smc socket */
static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
{
	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
}
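/* client side of the LLC handshake on the first link of a new link group:
 * wait for the server's CONFIRM LINK request, move the queue pair to RTS,
 * register the receive buffer, answer with a CONFIRM LINK response, and
 * reject the subsequent ADD LINK request (only a single link is supported
 * so far)
 */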
static int smc_clnt_conf_first_link(struct smc_sock *smc)
{
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];
	/* receive CONFIRM LINK request from server over RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	if (link->llc_confirm_rc)
		return SMC_CLC_DECL_RMBE_EC;

	rc = smc_ib_modify_qp_rts(link);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	smc_wr_remember_qp_attr(link);

	rc = smc_wr_reg_send(link,
			     smc->conn.rmb_desc->mr_rx[SMC_SINGLE_LINK]);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	/* send CONFIRM LINK response over RoCE fabric */
	rc = smc_llc_send_confirm_link(link,
				       link->smcibdev->mac[link->ibport - 1],
				       &link->smcibdev->gid[link->ibport - 1],
				       SMC_LLC_RESP);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	/* receive ADD LINK request from server over RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(&link->llc_add,
							 SMC_LLC_WAIT_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	/* send add link reject message, only one link supported for now */
	rc = smc_llc_send_add_link(link,
				   link->smcibdev->mac[link->ibport - 1],
				   &link->smcibdev->gid[link->ibport - 1],
				   SMC_LLC_RESP);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	link->state = SMC_LNK_ACTIVE;

	return 0;
}

static void smc_conn_save_peer_info(struct smc_sock *smc,
				    struct smc_clc_msg_accept_confirm *clc)
{
	smc->conn.peer_conn_idx = clc->conn_idx;
	smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
	smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size);
	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
}

static void smc_link_save_peer_info(struct smc_link *link,
				    struct smc_clc_msg_accept_confirm *clc)
{
	link->peer_qpn = ntoh24(clc->qpn);
	memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
	memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
	link->peer_psn = ntoh24(clc->psn);
	link->peer_mtu = clc->qp_mtu;
}
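/* Client side of the CLC (Connection Layer Control, RFC 7609) handshake,
 * run over the internal TCP socket after connect():
 *   propose SMC -> wait for SMC Accept -> create or reuse link group and
 *   buffers -> send SMC Confirm -> (first contact only) confirm the link.
 * Any failure before the confirm step declines back to plain TCP.
 */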
/* setup for RDMA connection of client */
static int smc_connect_rdma(struct smc_sock *smc)
{
	struct smc_clc_msg_accept_confirm aclc;
	int local_contact = SMC_FIRST_CONTACT;
	struct smc_ib_device *smcibdev;
	struct smc_link *link;
	u8 srv_first_contact;
	int reason_code = 0;
	int rc = 0;
	u8 ibport;

	sock_hold(&smc->sk); /* sock put in passive closing */

	if (!tcp_sk(smc->clcsock->sk)->syn_smc) {
		/* peer has not signalled SMC-capability */
		smc->use_fallback = true;
		goto out_connected;
	}

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(smc)) {
		reason_code = SMC_CLC_DECL_IPSEC;
		goto decline_rdma;
	}

	/* PNET table look up: search active ib_device and port
	 * within same PNETID that also contains the ethernet device
	 * used for the internal TCP socket
	 */
	smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport);
	if (!smcibdev) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* do inband token exchange */
	reason_code = smc_clc_send_proposal(smc, smcibdev, ibport);
	if (reason_code < 0) {
		rc = reason_code;
		goto out_err;
	}
	if (reason_code > 0) /* configuration error */
		goto decline_rdma;
	/* receive SMC Accept CLC message */
	reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc),
				       SMC_CLC_ACCEPT);
	if (reason_code < 0) {
		rc = reason_code;
		goto out_err;
	}
	if (reason_code > 0)
		goto decline_rdma;

	srv_first_contact = aclc.hdr.flag;
	mutex_lock(&smc_create_lgr_pending);
	local_contact = smc_conn_create(smc, smcibdev, ibport, &aclc.lcl,
					srv_first_contact);
	if (local_contact < 0) {
		rc = local_contact;
		if (rc == -ENOMEM)
			reason_code = SMC_CLC_DECL_MEM; /* insufficient memory */
		else if (rc == -ENOLINK)
			reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
		goto decline_rdma_unlock;
	}
	link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	smc_conn_save_peer_info(smc, &aclc);

	/* create send buffer and rmb */
	rc = smc_buf_create(smc);
	if (rc) {
		reason_code = SMC_CLC_DECL_MEM;
		goto decline_rdma_unlock;
	}

	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, &aclc);

	rc = smc_rmb_rtoken_handling(&smc->conn, &aclc);
	if (rc) {
		reason_code = SMC_CLC_DECL_INTERR;
		goto decline_rdma_unlock;
	}

	smc_close_init(smc);
	smc_rx_init(smc);

	if (local_contact == SMC_FIRST_CONTACT) {
		rc = smc_ib_ready_link(link);
		if (rc) {
			reason_code = SMC_CLC_DECL_INTERR;
			goto decline_rdma_unlock;
		}
	} else {
		struct smc_buf_desc *buf_desc = smc->conn.rmb_desc;

		if (!buf_desc->reused) {
			/* register memory region for new rmb */
			rc = smc_wr_reg_send(link,
					     buf_desc->mr_rx[SMC_SINGLE_LINK]);
			if (rc) {
				reason_code = SMC_CLC_DECL_INTERR;
				goto decline_rdma_unlock;
			}
		}
	}
	smc_rmb_sync_sg_for_device(&smc->conn);

	rc = smc_clc_send_confirm(smc);
	if (rc)
		goto out_err_unlock;

	if (local_contact == SMC_FIRST_CONTACT) {
		/* QP confirmation over RoCE fabric */
		reason_code = smc_clnt_conf_first_link(smc);
		if (reason_code < 0) {
			rc = reason_code;
			goto out_err_unlock;
		}
		if (reason_code > 0)
			goto decline_rdma_unlock;
	}

	mutex_unlock(&smc_create_lgr_pending);
	smc_tx_init(smc);

out_connected:
	smc_copy_sock_settings_to_clc(smc);
	if (smc->sk.sk_state == SMC_INIT)
		smc->sk.sk_state = SMC_ACTIVE;

	return rc ? rc : local_contact;

decline_rdma_unlock:
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
	smc_conn_free(&smc->conn);
decline_rdma:
	/* RDMA setup failed, switch back to TCP */
	smc->use_fallback = true;
	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
		rc = smc_clc_send_decline(smc, reason_code);
		if (rc < 0)
			goto out_err;
	}
	goto out_connected;

out_err_unlock:
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
	smc_conn_free(&smc->conn);
out_err:
	if (smc->sk.sk_state == SMC_INIT)
		sock_put(&smc->sk); /* passive closing */
	return rc;
}

static int smc_connect(struct socket *sock, struct sockaddr *addr,
		       int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EINVAL;

	smc = smc_sk(sk);

	/* separate smc parameter checking to be safe */
	if (alen < sizeof(addr->sa_family))
		goto out_err;
	if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
		goto out_err;

	lock_sock(sk);
	switch (sk->sk_state) {
	default:
		goto out;
	case SMC_ACTIVE:
		rc = -EISCONN;
		goto out;
	case SMC_INIT:
		rc = 0;
		break;
	}

	smc_copy_sock_settings_to_clc(smc);
	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
	rc = kernel_connect(smc->clcsock, addr, alen, flags);
	if (rc)
		goto out;

	/* setup RDMA connection */
	rc = smc_connect_rdma(smc);
	if (rc < 0)
		goto out;
	else
		rc = 0; /* success cases including fallback */

out:
	release_sock(sk);
out_err:
	return rc;
}
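/* accept one pending connection on the internal CLC/TCP listen socket and
 * wrap it in a freshly allocated SMC socket; the listen sock lock is
 * dropped around the blocking kernel_accept() call
 */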
static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
{
	struct socket *new_clcsock = NULL;
	struct sock *lsk = &lsmc->sk;
	struct sock *new_sk;
	int rc;

	release_sock(lsk);
	new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
	if (!new_sk) {
		rc = -ENOMEM;
		lsk->sk_err = ENOMEM;
		*new_smc = NULL;
		lock_sock(lsk);
		goto out;
	}
	*new_smc = smc_sk(new_sk);

	rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
	lock_sock(lsk);
	if (rc < 0)
		lsk->sk_err = -rc;
	if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
		if (new_clcsock)
			sock_release(new_clcsock);
		new_sk->sk_state = SMC_CLOSED;
		sock_set_flag(new_sk, SOCK_DEAD);
		new_sk->sk_prot->unhash(new_sk);
		sock_put(new_sk); /* final */
		*new_smc = NULL;
		goto out;
	}

	(*new_smc)->clcsock = new_clcsock;
out:
	return rc;
}

/* add a just created sock to the accept queue of the listen sock as
 * candidate for a following socket accept call from user space
 */
static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
{
	struct smc_sock *par = smc_sk(parent);

	sock_hold(sk); /* sock_put in smc_accept_unlink() */
	spin_lock(&par->accept_q_lock);
	list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
	spin_unlock(&par->accept_q_lock);
	sk_acceptq_added(parent);
}

/* remove a socket from the accept queue of its parental listening socket */
static void smc_accept_unlink(struct sock *sk)
{
	struct smc_sock *par = smc_sk(sk)->listen_smc;

	spin_lock(&par->accept_q_lock);
	list_del_init(&smc_sk(sk)->accept_q);
	spin_unlock(&par->accept_q_lock);
	sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
	sock_put(sk); /* sock_hold in smc_accept_enqueue */
}

/* remove a sock from the accept queue to bind it to a new socket created
 * for a socket accept call from user space
 */
struct sock *smc_accept_dequeue(struct sock *parent,
				struct socket *new_sock)
{
	struct smc_sock *isk, *n;
	struct sock *new_sk;

	list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
		new_sk = (struct sock *)isk;

		smc_accept_unlink(new_sk);
		if (new_sk->sk_state == SMC_CLOSED) {
			if (isk->clcsock) {
				sock_release(isk->clcsock);
				isk->clcsock = NULL;
			}
			new_sk->sk_prot->unhash(new_sk);
			sock_put(new_sk); /* final */
			continue;
		}
		if (new_sock)
			sock_graft(new_sk, new_sock);
		return new_sk;
	}
	return NULL;
}

/* clean up for a created but never accepted sock */
void smc_close_non_accepted(struct sock *sk)
{
	struct smc_sock *smc = smc_sk(sk);

	lock_sock(sk);
	if (!sk->sk_lingertime)
		/* wait for peer closing */
		sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
	if (!smc->use_fallback) {
		smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	if (smc->clcsock) {
		struct socket *tcp;

		tcp = smc->clcsock;
		smc->clcsock = NULL;
		sock_release(tcp);
	}
	if (smc->use_fallback) {
		sock_put(sk); /* passive closing */
		sk->sk_state = SMC_CLOSED;
	} else {
		if (sk->sk_state == SMC_CLOSED)
			smc_conn_free(&smc->conn);
	}
	release_sock(sk);
	sk->sk_prot->unhash(sk);
	sock_put(sk); /* final sock_put */
}
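/* server side of the LLC handshake on the first link of a new link group:
 * register the receive buffer, send the CONFIRM LINK request, wait for the
 * client's response, then exchange the ADD LINK messages (only a single
 * link is supported so far) and activate the link
 */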
static int smc_serv_conf_first_link(struct smc_sock *smc)
{
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];

	rc = smc_wr_reg_send(link,
			     smc->conn.rmb_desc->mr_rx[SMC_SINGLE_LINK]);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	/* send CONFIRM LINK request to client over the RoCE fabric */
	rc = smc_llc_send_confirm_link(link,
				       link->smcibdev->mac[link->ibport - 1],
				       &link->smcibdev->gid[link->ibport - 1],
				       SMC_LLC_REQ);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	/* receive CONFIRM LINK response from client over the RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm_resp,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	if (link->llc_confirm_resp_rc)
		return SMC_CLC_DECL_RMBE_EC;

	/* send ADD LINK request to client over the RoCE fabric */
	rc = smc_llc_send_add_link(link,
				   link->smcibdev->mac[link->ibport - 1],
				   &link->smcibdev->gid[link->ibport - 1],
				   SMC_LLC_REQ);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	/* receive ADD LINK response from client over the RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp,
							 SMC_LLC_WAIT_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	link->state = SMC_LNK_ACTIVE;

	return 0;
}
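/* Server side of the CLC handshake, run from a work item per incoming
 * connection: wait for the SMC Proposal, match the proposed prefixes and
 * the PNET table entry, send SMC Accept, wait for SMC Confirm, and (first
 * contact only) run the LLC link confirmation above. On any decline the
 * child socket falls back to plain TCP before it is queued for accept().
 */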
/* setup for RDMA connection of server */
static void smc_listen_work(struct work_struct *work)
{
	struct smc_sock *new_smc = container_of(work, struct smc_sock,
						smc_listen_work);
	struct smc_clc_msg_proposal_prefix *pclc_prfx;
	struct socket *newclcsock = new_smc->clcsock;
	struct smc_sock *lsmc = new_smc->listen_smc;
	struct smc_clc_msg_accept_confirm cclc;
	int local_contact = SMC_REUSE_CONTACT;
	struct sock *newsmcsk = &new_smc->sk;
	struct smc_clc_msg_proposal *pclc;
	struct smc_ib_device *smcibdev;
	u8 buf[SMC_CLC_MAX_LEN];
	struct smc_link *link;
	int reason_code = 0;
	int rc = 0;
	u8 ibport;

	/* check if peer is smc capable */
	if (!tcp_sk(newclcsock->sk)->syn_smc) {
		new_smc->use_fallback = true;
		goto out_connected;
	}

	/* do inband token exchange -
	 * wait for and receive SMC Proposal CLC message
	 */
	reason_code = smc_clc_wait_msg(new_smc, &buf, sizeof(buf),
				       SMC_CLC_PROPOSAL);
	if (reason_code < 0)
		goto out_err;
	if (reason_code > 0)
		goto decline_rdma;

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(new_smc)) {
		reason_code = SMC_CLC_DECL_IPSEC;
		goto decline_rdma;
	}

	/* PNET table look up: search active ib_device and port
	 * within same PNETID that also contains the ethernet device
	 * used for the internal TCP socket
	 */
	smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport);
	if (!smcibdev) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	pclc = (struct smc_clc_msg_proposal *)&buf;
	pclc_prfx = smc_clc_proposal_get_prefix(pclc);

	rc = smc_clc_prfx_match(newclcsock, pclc_prfx);
	if (rc) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* allocate connection / link group */
	mutex_lock(&smc_create_lgr_pending);
	local_contact = smc_conn_create(new_smc, smcibdev, ibport, &pclc->lcl,
					0);
	if (local_contact < 0) {
		rc = local_contact;
		if (rc == -ENOMEM)
			reason_code = SMC_CLC_DECL_MEM; /* insufficient memory */
		goto decline_rdma_unlock;
	}
	link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	/* create send buffer and rmb */
	rc = smc_buf_create(new_smc);
	if (rc) {
		reason_code = SMC_CLC_DECL_MEM;
		goto decline_rdma_unlock;
	}

	smc_close_init(new_smc);
	smc_rx_init(new_smc);

	if (local_contact != SMC_FIRST_CONTACT) {
		struct smc_buf_desc *buf_desc = new_smc->conn.rmb_desc;

		if (!buf_desc->reused) {
			/* register memory region for new rmb */
			rc = smc_wr_reg_send(link,
					     buf_desc->mr_rx[SMC_SINGLE_LINK]);
			if (rc) {
				reason_code = SMC_CLC_DECL_INTERR;
				goto decline_rdma_unlock;
			}
		}
	}
	smc_rmb_sync_sg_for_device(&new_smc->conn);

	rc = smc_clc_send_accept(new_smc, local_contact);
	if (rc)
		goto out_err_unlock;

	/* receive SMC Confirm CLC message */
	reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
				       SMC_CLC_CONFIRM);
	if (reason_code < 0)
		goto out_err_unlock;
	if (reason_code > 0)
		goto decline_rdma_unlock;
	smc_conn_save_peer_info(new_smc, &cclc);
	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, &cclc);

	rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc);
	if (rc) {
		reason_code = SMC_CLC_DECL_INTERR;
		goto decline_rdma_unlock;
	}

	if (local_contact == SMC_FIRST_CONTACT) {
		rc = smc_ib_ready_link(link);
		if (rc) {
			reason_code = SMC_CLC_DECL_INTERR;
			goto decline_rdma_unlock;
		}
		/* QP confirmation over RoCE fabric */
		reason_code = smc_serv_conf_first_link(new_smc);
		if (reason_code < 0)
			/* peer is not aware of a problem */
			goto out_err_unlock;
		if (reason_code > 0)
			goto decline_rdma_unlock;
	}

	smc_tx_init(new_smc);
	mutex_unlock(&smc_create_lgr_pending);

out_connected:
	sk_refcnt_debug_inc(newsmcsk);
	if (newsmcsk->sk_state == SMC_INIT)
		newsmcsk->sk_state = SMC_ACTIVE;
enqueue:
	lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
	if (lsmc->sk.sk_state == SMC_LISTEN) {
		smc_accept_enqueue(&lsmc->sk, newsmcsk);
	} else { /* no longer listening */
		smc_close_non_accepted(newsmcsk);
	}
	release_sock(&lsmc->sk);

	/* Wake up accept */
	lsmc->sk.sk_data_ready(&lsmc->sk);
	sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
	return;

decline_rdma_unlock:
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(new_smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
decline_rdma:
	/* RDMA setup failed, switch back to TCP */
	smc_conn_free(&new_smc->conn);
	new_smc->use_fallback = true;
	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
		if (smc_clc_send_decline(new_smc, reason_code) < 0)
			goto out_err;
	}
	goto out_connected;

out_err_unlock:
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(new_smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
out_err:
	if (newsmcsk->sk_state == SMC_INIT)
		sock_put(&new_smc->sk); /* passive closing */
	newsmcsk->sk_state = SMC_CLOSED;
	smc_conn_free(&new_smc->conn);
	goto enqueue; /* queue new sock with sk_err set */
}
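/* listen worker: accept TCP connections on the internal CLC socket in a
 * loop and hand each new child over to smc_listen_work() for the CLC
 * handshake
 */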
static void smc_tcp_listen_work(struct work_struct *work)
{
	struct smc_sock *lsmc = container_of(work, struct smc_sock,
					     tcp_listen_work);
	struct sock *lsk = &lsmc->sk;
	struct smc_sock *new_smc;
	int rc = 0;

	lock_sock(lsk);
	while (lsk->sk_state == SMC_LISTEN) {
		rc = smc_clcsock_accept(lsmc, &new_smc);
		if (rc)
			goto out;
		if (!new_smc)
			continue;

		new_smc->listen_smc = lsmc;
		new_smc->use_fallback = false; /* assume rdma capability first */
		sock_hold(lsk); /* sock_put in smc_listen_work */
		INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
		smc_copy_sock_settings_to_smc(new_smc);
		sock_hold(&new_smc->sk); /* sock_put in passive closing */
		if (!schedule_work(&new_smc->smc_listen_work))
			sock_put(&new_smc->sk);
	}

out:
	if (lsmc->clcsock) {
		sock_release(lsmc->clcsock);
		lsmc->clcsock = NULL;
	}
	release_sock(lsk);
	sock_put(&lsmc->sk); /* sock_hold in smc_listen */
}

static int smc_listen(struct socket *sock, int backlog)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);
	lock_sock(sk);

	rc = -EINVAL;
	if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
		goto out;

	rc = 0;
	if (sk->sk_state == SMC_LISTEN) {
		sk->sk_max_ack_backlog = backlog;
		goto out;
	}
	/* some socket options are handled in core, so we cannot apply
	 * them to the clc socket -- copy smc socket options to clc socket
	 */
	smc_copy_sock_settings_to_clc(smc);
	tcp_sk(smc->clcsock->sk)->syn_smc = 1;

	rc = kernel_listen(smc->clcsock, backlog);
	if (rc)
		goto out;
	sk->sk_max_ack_backlog = backlog;
	sk->sk_ack_backlog = 0;
	sk->sk_state = SMC_LISTEN;
	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
	sock_hold(sk); /* sock_hold in tcp_listen_worker */
	if (!schedule_work(&smc->tcp_listen_work))
		sock_put(sk);

out:
	release_sock(sk);
	return rc;
}

static int smc_accept(struct socket *sock, struct socket *new_sock,
		      int flags, bool kern)
{
	struct sock *sk = sock->sk, *nsk;
	DECLARE_WAITQUEUE(wait, current);
	struct smc_sock *lsmc;
	long timeo;
	int rc = 0;

	lsmc = smc_sk(sk);
	sock_hold(sk); /* sock_put below */
	lock_sock(sk);

	if (lsmc->sk.sk_state != SMC_LISTEN) {
		rc = -EINVAL;
		goto out;
	}

	/* Wait for an incoming connection */
	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
	add_wait_queue_exclusive(sk_sleep(sk), &wait);
	while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (!timeo) {
			rc = -EAGAIN;
			break;
		}
		release_sock(sk);
		timeo = schedule_timeout(timeo);
		/* wakeup by sk_data_ready in smc_listen_work() */
		sched_annotate_sleep();
		lock_sock(sk);
		if (signal_pending(current)) {
			rc = sock_intr_errno(timeo);
			break;
		}
	}
	set_current_state(TASK_RUNNING);
	remove_wait_queue(sk_sleep(sk), &wait);

	if (!rc)
		rc = sock_error(nsk);

out:
	release_sock(sk);
	sock_put(sk); /* sock_hold above */
	return rc;
}

static int smc_getname(struct socket *sock, struct sockaddr *addr,
		       int peer)
{
	struct smc_sock *smc;

	if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
	    (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
		return -ENOTCONN;

	smc = smc_sk(sock->sk);

	return smc->clcsock->ops->getname(smc->clcsock, addr, peer);
}
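/* send and receive paths: once the connection is in a usable state, data
 * either flows through the TCP fallback socket or through the SMC
 * send/receive buffers via smc_tx_sendmsg()/smc_rx_recvmsg()
 */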
static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EPIPE;

	smc = smc_sk(sk);
	lock_sock(sk);
	if ((sk->sk_state != SMC_ACTIVE) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
	    (sk->sk_state != SMC_INIT))
		goto out;
	if (smc->use_fallback)
		rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
	else
		rc = smc_tx_sendmsg(smc, msg, len);
out:
	release_sock(sk);
	return rc;
}

static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
		       int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -ENOTCONN;

	smc = smc_sk(sk);
	lock_sock(sk);
	if ((sk->sk_state == SMC_INIT) ||
	    (sk->sk_state == SMC_LISTEN) ||
	    (sk->sk_state == SMC_CLOSED))
		goto out;

	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
		rc = 0;
		goto out;
	}

	if (smc->use_fallback)
		rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
	else
		rc = smc_rx_recvmsg(smc, msg, len, flags);

out:
	release_sock(sk);
	return rc;
}

static __poll_t smc_accept_poll(struct sock *parent)
{
	struct smc_sock *isk = smc_sk(parent);
	__poll_t mask = 0;

	spin_lock(&isk->accept_q_lock);
	if (!list_empty(&isk->accept_q))
		mask = EPOLLIN | EPOLLRDNORM;
	spin_unlock(&isk->accept_q_lock);

	return mask;
}
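/* poll: before the connection is established (or in fallback mode) the
 * CLC/TCP child socket is polled; a completed non-blocking connect is
 * finished here by calling smc_connect_rdma(). Afterwards readiness is
 * derived from the SMC connection state and buffer fill levels.
 */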
static __poll_t smc_poll(struct file *file, struct socket *sock,
			 poll_table *wait)
{
	struct sock *sk = sock->sk;
	__poll_t mask = 0;
	struct smc_sock *smc;
	int rc;

	if (!sk)
		return EPOLLNVAL;

	smc = smc_sk(sock->sk);
	sock_hold(sk);
	lock_sock(sk);
	if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
		/* delegate to CLC child sock */
		release_sock(sk);
		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
		/* if non-blocking connect finished ... */
		lock_sock(sk);
		if ((sk->sk_state == SMC_INIT) && (mask & EPOLLOUT)) {
			sk->sk_err = smc->clcsock->sk->sk_err;
			if (sk->sk_err) {
				mask |= EPOLLERR;
			} else {
				rc = smc_connect_rdma(smc);
				if (rc < 0)
					mask |= EPOLLERR;
				/* success cases including fallback */
				mask |= EPOLLOUT | EPOLLWRNORM;
			}
		}
	} else {
		if (sk->sk_state != SMC_CLOSED) {
			release_sock(sk);
			sock_poll_wait(file, sk_sleep(sk), wait);
			lock_sock(sk);
		}
		if (sk->sk_err)
			mask |= EPOLLERR;
		if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
		    (sk->sk_state == SMC_CLOSED))
			mask |= EPOLLHUP;
		if (sk->sk_state == SMC_LISTEN) {
			/* woken up by sk_data_ready in smc_listen_work() */
			mask = smc_accept_poll(sk);
		} else {
			if (atomic_read(&smc->conn.sndbuf_space) ||
			    sk->sk_shutdown & SEND_SHUTDOWN) {
				mask |= EPOLLOUT | EPOLLWRNORM;
			} else {
				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
			}
			if (atomic_read(&smc->conn.bytes_to_rcv))
				mask |= EPOLLIN | EPOLLRDNORM;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
			if (sk->sk_state == SMC_APPCLOSEWAIT1)
				mask |= EPOLLIN;
		}
	}
	release_sock(sk);
	sock_put(sk);

	return mask;
}
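/* shut down either direction of the connection; in fallback mode this is
 * simply passed through to the CLC/TCP socket, otherwise the SMC close
 * protocol runs first and the TCP shutdown follows
 */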
static int smc_shutdown(struct socket *sock, int how)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EINVAL;
	int rc1 = 0;

	smc = smc_sk(sk);

	if ((how < SHUT_RD) || (how > SHUT_RDWR))
		return rc;

	lock_sock(sk);

	rc = -ENOTCONN;
	if ((sk->sk_state != SMC_LISTEN) &&
	    (sk->sk_state != SMC_ACTIVE) &&
	    (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
	    (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT2) &&
	    (sk->sk_state != SMC_APPFINCLOSEWAIT))
		goto out;
	if (smc->use_fallback) {
		rc = kernel_sock_shutdown(smc->clcsock, how);
		sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
		if (sk->sk_shutdown == SHUTDOWN_MASK)
			sk->sk_state = SMC_CLOSED;
		goto out;
	}
	switch (how) {
	case SHUT_RDWR:		/* shutdown in both directions */
		rc = smc_close_active(smc);
		break;
	case SHUT_WR:
		rc = smc_close_shutdown_write(smc);
		break;
	case SHUT_RD:
		if (sk->sk_state == SMC_LISTEN)
			rc = smc_close_active(smc);
		else
			rc = 0;
		/* nothing more to do because peer is not involved */
		break;
	}
	rc1 = kernel_sock_shutdown(smc->clcsock, how);
	/* map sock_shutdown_cmd constants to sk_shutdown value range */
	sk->sk_shutdown |= how + 1;

out:
	release_sock(sk);
	return rc ? rc : rc1;
}

static int smc_setsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;

	smc = smc_sk(sk);

	/* generic setsockopts reaching us here always apply to the
	 * CLC socket
	 */
	return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
					     optval, optlen);
}

static int smc_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct smc_sock *smc;

	smc = smc_sk(sock->sk);
	/* socket options apply to the CLC socket */
	return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
					     optval, optlen);
}

static int smc_ioctl(struct socket *sock, unsigned int cmd,
		     unsigned long arg)
{
	struct smc_sock *smc;

	smc = smc_sk(sock->sk);
	if (smc->use_fallback)
		return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
	else
		return sock_no_ioctl(sock, cmd, arg);
}

static ssize_t smc_sendpage(struct socket *sock, struct page *page,
			    int offset, size_t size, int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EPIPE;

	smc = smc_sk(sk);
	lock_sock(sk);
	if (sk->sk_state != SMC_ACTIVE)
		goto out;
	if (smc->use_fallback)
		rc = kernel_sendpage(smc->clcsock, page, offset,
				     size, flags);
	else
		rc = sock_no_sendpage(sock, page, offset, size, flags);

out:
	release_sock(sk);
	return rc;
}

static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
			       struct pipe_inode_info *pipe, size_t len,
			       unsigned int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -ENOTCONN;

	smc = smc_sk(sk);
	lock_sock(sk);
	if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
		goto out;
	if (smc->use_fallback) {
		rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
						    pipe, len, flags);
	} else {
		rc = -EOPNOTSUPP;
	}
out:
	release_sock(sk);
	return rc;
}

/* must look like tcp */
static const struct proto_ops smc_sock_ops = {
	.family		= PF_SMC,
	.owner		= THIS_MODULE,
	.release	= smc_release,
	.bind		= smc_bind,
	.connect	= smc_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= smc_accept,
	.getname	= smc_getname,
	.poll		= smc_poll,
	.ioctl		= smc_ioctl,
	.listen		= smc_listen,
	.shutdown	= smc_shutdown,
	.setsockopt	= smc_setsockopt,
	.getsockopt	= smc_getsockopt,
	.sendmsg	= smc_sendmsg,
	.recvmsg	= smc_recvmsg,
	.mmap		= sock_no_mmap,
	.sendpage	= smc_sendpage,
	.splice_read	= smc_splice_read,
};
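/* create a new SMC socket together with its internal CLC/TCP companion
 * socket used for the handshake and the fallback path; from user space
 * such a socket would be obtained e.g. via
 *	socket(AF_SMC, SOCK_STREAM, SMCPROTO_SMC)
 * (or SMCPROTO_SMC6 for IPv6)
 */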
static int smc_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET;
	struct smc_sock *smc;
	struct sock *sk;
	int rc;

	rc = -ESOCKTNOSUPPORT;
	if (sock->type != SOCK_STREAM)
		goto out;

	rc = -EPROTONOSUPPORT;
	if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6)
		goto out;

	rc = -ENOBUFS;
	sock->ops = &smc_sock_ops;
	sk = smc_sock_alloc(net, sock, protocol);
	if (!sk)
		goto out;

	/* create internal TCP socket for CLC handshake and fallback */
	smc = smc_sk(sk);
	smc->use_fallback = false; /* assume rdma capability first */
	rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
			      &smc->clcsock);
	if (rc) {
		sk_common_release(sk);
		goto out;
	}
	smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
	smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);

out:
	return rc;
}

static const struct net_proto_family smc_sock_family_ops = {
	.family	= PF_SMC,
	.owner	= THIS_MODULE,
	.create	= smc_create,
};

static int __init smc_init(void)
{
	int rc;

	rc = smc_pnet_init();
	if (rc)
		return rc;

	rc = smc_llc_init();
	if (rc) {
		pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = smc_cdc_init();
	if (rc) {
		pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = proto_register(&smc_proto, 1);
	if (rc) {
		pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = proto_register(&smc_proto6, 1);
	if (rc) {
		pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc);
		goto out_proto;
	}

	rc = sock_register(&smc_sock_family_ops);
	if (rc) {
		pr_err("%s: sock_register fails with %d\n", __func__, rc);
		goto out_proto6;
	}
	INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
	INIT_HLIST_HEAD(&smc_v6_hashinfo.ht);

	rc = smc_ib_register_client();
	if (rc) {
		pr_err("%s: ib_register fails with %d\n", __func__, rc);
		goto out_sock;
	}

	static_branch_enable(&tcp_have_smc);
	return 0;

out_sock:
	sock_unregister(PF_SMC);
out_proto6:
	proto_unregister(&smc_proto6);
out_proto:
	proto_unregister(&smc_proto);
out_pnet:
	smc_pnet_exit();
	return rc;
}

static void __exit smc_exit(void)
{
	struct smc_link_group *lgr, *lg;
	LIST_HEAD(lgr_freeing_list);

	spin_lock_bh(&smc_lgr_list.lock);
	if (!list_empty(&smc_lgr_list.list))
		list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
	spin_unlock_bh(&smc_lgr_list.lock);
	list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
		list_del_init(&lgr->list);
		cancel_delayed_work_sync(&lgr->free_work);
		smc_lgr_free(lgr); /* free link group */
	}
	static_branch_disable(&tcp_have_smc);
	smc_ib_unregister_client();
	sock_unregister(PF_SMC);
	proto_unregister(&smc_proto6);
	proto_unregister(&smc_proto);
	smc_pnet_exit();
}

module_init(smc_init);
module_exit(smc_exit);

MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
MODULE_DESCRIPTION("smc socket address family");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_SMC);