/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * AF_SMC protocol family socket handler keeping the AF_INET sock address type
 * applies to SOCK_STREAM sockets only
 * offers an alternative communication option for TCP-protocol sockets
 * applicable with RoCE-cards only
 *
 * Initial restrictions:
 *   - support for alternate links postponed
 *   - partial support for non-blocking sockets only
 *   - support for urgent data postponed
 *
 * Copyright IBM Corp. 2016, 2018
 *
 * Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 *             based on prototype from Frank Blaschka
 */

#define KMSG_COMPONENT "smc"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/module.h>
#include <linux/socket.h>
#include <linux/workqueue.h>
#include <linux/in.h>
#include <linux/sched/signal.h>

#include <net/sock.h>
#include <net/tcp.h>
#include <net/smc.h>
#include <asm/ioctls.h>

#include "smc.h"
#include "smc_clc.h"
#include "smc_llc.h"
#include "smc_cdc.h"
#include "smc_core.h"
#include "smc_ib.h"
#include "smc_pnet.h"
#include "smc_tx.h"
#include "smc_rx.h"
#include "smc_close.h"
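
/* Illustrative user-space usage (a sketch, not part of this file): an
 * AF_SMC socket is created like a TCP socket and then driven with the
 * ordinary AF_INET/AF_INET6 address types, e.g.:
 *
 *	int fd = socket(AF_SMC, SOCK_STREAM, SMCPROTO_SMC);
 *	struct sockaddr_in sin = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(12345),	// port chosen for illustration
 *	};
 *	connect(fd, (struct sockaddr *)&sin, sizeof(sin));
 *
 * If the peer is not SMC-capable, the connection transparently falls back
 * to TCP via the internal CLC socket (see smc_connect_fallback() below).
 */
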
static DEFINE_MUTEX(smc_create_lgr_pending);	/* serialize link group
						 * creation
						 */

static void smc_tcp_listen_work(struct work_struct *);

static void smc_set_keepalive(struct sock *sk, int val)
{
	struct smc_sock *smc = smc_sk(sk);

	smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
}

static struct smc_hashinfo smc_v4_hashinfo = {
	.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
};

static struct smc_hashinfo smc_v6_hashinfo = {
	.lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
};

int smc_hash_sk(struct sock *sk)
{
	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
	struct hlist_head *head;

	head = &h->ht;

	write_lock_bh(&h->lock);
	sk_add_node(sk, head);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	write_unlock_bh(&h->lock);

	return 0;
}
EXPORT_SYMBOL_GPL(smc_hash_sk);

void smc_unhash_sk(struct sock *sk)
{
	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;

	write_lock_bh(&h->lock);
	if (sk_del_node_init(sk))
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	write_unlock_bh(&h->lock);
}
EXPORT_SYMBOL_GPL(smc_unhash_sk);

struct proto smc_proto = {
	.name		= "SMC",
	.owner		= THIS_MODULE,
	.keepalive	= smc_set_keepalive,
	.hash		= smc_hash_sk,
	.unhash		= smc_unhash_sk,
	.obj_size	= sizeof(struct smc_sock),
	.h.smc_hash	= &smc_v4_hashinfo,
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto);

struct proto smc_proto6 = {
	.name		= "SMC6",
	.owner		= THIS_MODULE,
	.keepalive	= smc_set_keepalive,
	.hash		= smc_hash_sk,
	.unhash		= smc_unhash_sk,
	.obj_size	= sizeof(struct smc_sock),
	.h.smc_hash	= &smc_v6_hashinfo,
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto6);

static int smc_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = 0;

	if (!sk)
		goto out;

	smc = smc_sk(sk);
	if (sk->sk_state == SMC_LISTEN)
		/* smc_close_non_accepted() is called and acquires
		 * sock lock for child sockets again
		 */
		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
	else
		lock_sock(sk);

	if (!smc->use_fallback) {
		rc = smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	if (smc->clcsock) {
		sock_release(smc->clcsock);
		smc->clcsock = NULL;
	}
	if (smc->use_fallback) {
		sock_put(sk); /* passive closing */
		sk->sk_state = SMC_CLOSED;
		sk->sk_state_change(sk);
	}

	/* detach socket */
	sock_orphan(sk);
	sock->sk = NULL;
	if (!smc->use_fallback && sk->sk_state == SMC_CLOSED)
		smc_conn_free(&smc->conn);
	release_sock(sk);

	sk->sk_prot->unhash(sk);
	sock_put(sk); /* final sock_put */
out:
	return rc;
}

static void smc_destruct(struct sock *sk)
{
	if (sk->sk_state != SMC_CLOSED)
		return;
	if (!sock_flag(sk, SOCK_DEAD))
		return;

	sk_refcnt_debug_dec(sk);
}

static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
				   int protocol)
{
	struct smc_sock *smc;
	struct proto *prot;
	struct sock *sk;

	prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto;
	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
	if (!sk)
		return NULL;

	sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
	sk->sk_state = SMC_INIT;
	sk->sk_destruct = smc_destruct;
	sk->sk_protocol = protocol;
	smc = smc_sk(sk);
	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
	INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
	INIT_LIST_HEAD(&smc->accept_q);
	spin_lock_init(&smc->accept_q_lock);
	spin_lock_init(&smc->conn.send_lock);
	sk->sk_prot->hash(sk);
	sk_refcnt_debug_inc(sk);

	return sk;
}

static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
		    int addr_len)
{
	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);

	/* replicate tests from inet_bind(), to be safe wrt. future changes */
	rc = -EINVAL;
	if (addr_len < sizeof(struct sockaddr_in))
		goto out;

	rc = -EAFNOSUPPORT;
	if (addr->sin_family != AF_INET &&
	    addr->sin_family != AF_INET6 &&
	    addr->sin_family != AF_UNSPEC)
		goto out;
	/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
	if (addr->sin_family == AF_UNSPEC &&
	    addr->sin_addr.s_addr != htonl(INADDR_ANY))
		goto out;

	lock_sock(sk);

	/* Check if socket is already active */
	rc = -EINVAL;
	if (sk->sk_state != SMC_INIT)
		goto out_rel;

	smc->clcsock->sk->sk_reuse = sk->sk_reuse;
	rc = kernel_bind(smc->clcsock, uaddr, addr_len);

out_rel:
	release_sock(sk);
out:
	return rc;
}

static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
				   unsigned long mask)
{
	/* options we don't get control via setsockopt for */
	nsk->sk_type = osk->sk_type;
	nsk->sk_sndbuf = osk->sk_sndbuf;
	nsk->sk_rcvbuf = osk->sk_rcvbuf;
	nsk->sk_sndtimeo = osk->sk_sndtimeo;
	nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
	nsk->sk_mark = osk->sk_mark;
	nsk->sk_priority = osk->sk_priority;
	nsk->sk_rcvlowat = osk->sk_rcvlowat;
	nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
	nsk->sk_err = osk->sk_err;

	nsk->sk_flags &= ~mask;
	nsk->sk_flags |= osk->sk_flags & mask;
}

#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_BROADCAST) | \
			     (1UL << SOCK_TIMESTAMP) | \
			     (1UL << SOCK_DBG) | \
			     (1UL << SOCK_RCVTSTAMP) | \
			     (1UL << SOCK_RCVTSTAMPNS) | \
			     (1UL << SOCK_LOCALROUTE) | \
			     (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
			     (1UL << SOCK_RXQ_OVFL) | \
			     (1UL << SOCK_WIFI_STATUS) | \
			     (1UL << SOCK_NOFCS) | \
			     (1UL << SOCK_FILTER_LOCKED))
/* copy only relevant settings and flags of SOL_SOCKET level from smc to
 * clc socket (since smc is not called for these options from net/core)
 */
static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
{
	smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
}

#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_DBG))
/* copy only settings and flags relevant for smc from clc to smc socket */
static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
{
	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
}

/* register a new rmb, optionally send confirm_rkey msg to register with peer */
static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc,
		       bool conf_rkey)
{
	/* register memory region for new rmb */
	if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) {
		rmb_desc->regerr = 1;
		return -EFAULT;
	}
	if (!conf_rkey)
		return 0;
	/* exchange confirm_rkey msg with peer */
	if (smc_llc_do_confirm_rkey(link, rmb_desc)) {
		rmb_desc->regerr = 1;
		return -EFAULT;
	}
	return 0;
}
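
/* client side of the LLC handshake confirming the first (and only) link
 * of a new link group over the RoCE fabric
 */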
static int smc_clnt_conf_first_link(struct smc_sock *smc)
{
	struct net *net = sock_net(smc->clcsock->sk);
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];
	/* receive CONFIRM LINK request from server over RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	if (link->llc_confirm_rc)
		return SMC_CLC_DECL_RMBE_EC;

	rc = smc_ib_modify_qp_rts(link);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	smc_wr_remember_qp_attr(link);

	if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
		return SMC_CLC_DECL_INTERR;

	/* send CONFIRM LINK response over RoCE fabric */
	rc = smc_llc_send_confirm_link(link,
				       link->smcibdev->mac[link->ibport - 1],
				       &link->smcibdev->gid[link->ibport - 1],
				       SMC_LLC_RESP);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	/* receive ADD LINK request from server over RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(&link->llc_add,
							 SMC_LLC_WAIT_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	/* send add link reject message, only one link supported for now */
	rc = smc_llc_send_add_link(link,
				   link->smcibdev->mac[link->ibport - 1],
				   &link->smcibdev->gid[link->ibport - 1],
				   SMC_LLC_RESP);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);

	return 0;
}

static void smc_conn_save_peer_info(struct smc_sock *smc,
				    struct smc_clc_msg_accept_confirm *clc)
{
	int bufsize = smc_uncompress_bufsize(clc->rmbe_size);

	smc->conn.peer_rmbe_idx = clc->rmbe_idx;
	smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
	smc->conn.peer_rmbe_size = bufsize;
	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
	smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
}

static void smc_link_save_peer_info(struct smc_link *link,
				    struct smc_clc_msg_accept_confirm *clc)
{
	link->peer_qpn = ntoh24(clc->qpn);
	memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
	memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
	link->peer_psn = ntoh24(clc->psn);
	link->peer_mtu = clc->qp_mtu;
}

/* fall back during connect */
static int smc_connect_fallback(struct smc_sock *smc)
{
	smc->use_fallback = true;
	smc_copy_sock_settings_to_clc(smc);
	if (smc->sk.sk_state == SMC_INIT)
		smc->sk.sk_state = SMC_ACTIVE;
	return 0;
}

/* decline and fall back during connect */
static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
{
	int rc;

	if (reason_code < 0) /* error, fallback is not possible */
		return reason_code;
	if (reason_code != SMC_CLC_DECL_REPLY) {
		rc = smc_clc_send_decline(smc, reason_code);
		if (rc < 0)
			return rc;
	}
	return smc_connect_fallback(smc);
}

/* abort connecting */
static int smc_connect_abort(struct smc_sock *smc, int reason_code,
			     int local_contact)
{
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
	smc_conn_free(&smc->conn);
	if (reason_code < 0 && smc->sk.sk_state == SMC_INIT)
		sock_put(&smc->sk); /* passive closing */
	return reason_code;
}

/* check if there is an RDMA device available for this connection. */
/* called for connect and listen */
static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev,
			  u8 *ibport)
{
	int reason_code = 0;

	/* PNET table look up: search active ib_device and port
	 * within same PNETID that also contains the ethernet device
	 * used for the internal TCP socket
	 */
	smc_pnet_find_roce_resource(smc->clcsock->sk, ibdev, ibport);
	if (!(*ibdev))
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */

	return reason_code;
}

/* CLC handshake during connect */
static int smc_connect_clc(struct smc_sock *smc,
			   struct smc_clc_msg_accept_confirm *aclc,
			   struct smc_ib_device *ibdev, u8 ibport)
{
	int rc = 0;

	/* do inband token exchange */
	rc = smc_clc_send_proposal(smc, ibdev, ibport);
	if (rc)
		return rc;
	/* receive SMC Accept CLC message */
	return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT);
}

/* setup for RDMA connection of client */
static int smc_connect_rdma(struct smc_sock *smc,
			    struct smc_clc_msg_accept_confirm *aclc,
			    struct smc_ib_device *ibdev, u8 ibport)
{
	int local_contact = SMC_FIRST_CONTACT;
	struct smc_link *link;
	int reason_code = 0;

	mutex_lock(&smc_create_lgr_pending);
	local_contact = smc_conn_create(smc, ibdev, ibport, &aclc->lcl,
					aclc->hdr.flag);
	if (local_contact < 0) {
		if (local_contact == -ENOMEM)
			reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
		else if (local_contact == -ENOLINK)
			reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
		else
			reason_code = SMC_CLC_DECL_INTERR; /* other error */
		return smc_connect_abort(smc, reason_code, 0);
	}
	link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	smc_conn_save_peer_info(smc, aclc);

	/* create send buffer and rmb */
	if (smc_buf_create(smc))
		return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact);

	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, aclc);

	if (smc_rmb_rtoken_handling(&smc->conn, aclc))
		return smc_connect_abort(smc, SMC_CLC_DECL_INTERR,
					 local_contact);

	smc_close_init(smc);
	smc_rx_init(smc);

	if (local_contact == SMC_FIRST_CONTACT) {
		if (smc_ib_ready_link(link))
			return smc_connect_abort(smc, SMC_CLC_DECL_INTERR,
						 local_contact);
	} else {
		if (!smc->conn.rmb_desc->reused &&
		    smc_reg_rmb(link, smc->conn.rmb_desc, true))
			return smc_connect_abort(smc, SMC_CLC_DECL_INTERR,
						 local_contact);
	}
	smc_rmb_sync_sg_for_device(&smc->conn);

	reason_code = smc_clc_send_confirm(smc);
	if (reason_code)
		return smc_connect_abort(smc, reason_code, local_contact);

	smc_tx_init(smc);

	if (local_contact == SMC_FIRST_CONTACT) {
		/* QP confirmation over RoCE fabric */
		reason_code = smc_clnt_conf_first_link(smc);
		if (reason_code)
			return smc_connect_abort(smc, reason_code,
						 local_contact);
	}
	mutex_unlock(&smc_create_lgr_pending);

	smc_copy_sock_settings_to_clc(smc);
	if (smc->sk.sk_state == SMC_INIT)
		smc->sk.sk_state = SMC_ACTIVE;

	return 0;
}

/* perform steps before actually connecting */
static int __smc_connect(struct smc_sock *smc)
{
	struct smc_clc_msg_accept_confirm aclc;
	struct smc_ib_device *ibdev;
	int rc = 0;
	u8 ibport;

	sock_hold(&smc->sk); /* sock put in passive closing */

	if (smc->use_fallback)
		return smc_connect_fallback(smc);

	/* if peer has not signalled SMC-capability, fall back */
	if (!tcp_sk(smc->clcsock->sk)->syn_smc)
		return smc_connect_fallback(smc);

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(smc))
		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC);

	/* check if an RDMA device is available; if not, fall back */
	if (smc_check_rdma(smc, &ibdev, &ibport))
		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR);

	/* perform CLC handshake */
	rc = smc_connect_clc(smc, &aclc, ibdev, ibport);
	if (rc)
		return smc_connect_decline_fallback(smc, rc);

	/* connect using rdma */
	rc = smc_connect_rdma(smc, &aclc, ibdev, ibport);
	if (rc)
		return smc_connect_decline_fallback(smc, rc);

	return 0;
}

static int smc_connect(struct socket *sock, struct sockaddr *addr,
		       int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EINVAL;

	smc = smc_sk(sk);

	/* separate smc parameter checking to be safe */
	if (alen < sizeof(addr->sa_family))
		goto out_err;
	if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
		goto out_err;

	lock_sock(sk);
	switch (sk->sk_state) {
	default:
		goto out;
	case SMC_ACTIVE:
		rc = -EISCONN;
		goto out;
	case SMC_INIT:
		rc = 0;
		break;
	}

	smc_copy_sock_settings_to_clc(smc);
	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
	rc = kernel_connect(smc->clcsock, addr, alen, flags);
	if (rc)
		goto out;

	rc = __smc_connect(smc);
	if (rc < 0)
		goto out;
	else
		rc = 0; /* success cases including fallback */

out:
	release_sock(sk);
out_err:
	return rc;
}
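
/* accept a new connection on the internal CLC socket and allocate the
 * SMC socket that will represent it; the listen sock lock is dropped
 * while blocking in kernel_accept()
 */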
static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
{
	struct socket *new_clcsock = NULL;
	struct sock *lsk = &lsmc->sk;
	struct sock *new_sk;
	int rc;

	release_sock(lsk);
	new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
	if (!new_sk) {
		rc = -ENOMEM;
		lsk->sk_err = ENOMEM;
		*new_smc = NULL;
		lock_sock(lsk);
		goto out;
	}
	*new_smc = smc_sk(new_sk);

	rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
	lock_sock(lsk);
	if (rc < 0)
		lsk->sk_err = -rc;
	if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
		if (new_clcsock)
			sock_release(new_clcsock);
		new_sk->sk_state = SMC_CLOSED;
		sock_set_flag(new_sk, SOCK_DEAD);
		new_sk->sk_prot->unhash(new_sk);
		sock_put(new_sk); /* final */
		*new_smc = NULL;
		goto out;
	}

	(*new_smc)->clcsock = new_clcsock;
out:
	return rc;
}

/* add a just created sock to the accept queue of the listen sock as
 * candidate for a following socket accept call from user space
 */
static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
{
	struct smc_sock *par = smc_sk(parent);

	sock_hold(sk); /* sock_put in smc_accept_unlink() */
	spin_lock(&par->accept_q_lock);
	list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
	spin_unlock(&par->accept_q_lock);
	sk_acceptq_added(parent);
}

/* remove a socket from the accept queue of its parental listening socket */
static void smc_accept_unlink(struct sock *sk)
{
	struct smc_sock *par = smc_sk(sk)->listen_smc;

	spin_lock(&par->accept_q_lock);
	list_del_init(&smc_sk(sk)->accept_q);
	spin_unlock(&par->accept_q_lock);
	sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
	sock_put(sk); /* sock_hold in smc_accept_enqueue */
}

/* remove a sock from the accept queue to bind it to a new socket created
 * for a socket accept call from user space
 */
struct sock *smc_accept_dequeue(struct sock *parent,
				struct socket *new_sock)
{
	struct smc_sock *isk, *n;
	struct sock *new_sk;

	list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
		new_sk = (struct sock *)isk;

		smc_accept_unlink(new_sk);
		if (new_sk->sk_state == SMC_CLOSED) {
			if (isk->clcsock) {
				sock_release(isk->clcsock);
				isk->clcsock = NULL;
			}
			new_sk->sk_prot->unhash(new_sk);
			sock_put(new_sk); /* final */
			continue;
		}
		if (new_sock)
			sock_graft(new_sk, new_sock);
		return new_sk;
	}
	return NULL;
}

/* clean up for a created but never accepted sock */
void smc_close_non_accepted(struct sock *sk)
{
	struct smc_sock *smc = smc_sk(sk);

	lock_sock(sk);
	if (!sk->sk_lingertime)
		/* wait for peer closing */
		sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
	if (!smc->use_fallback) {
		smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	if (smc->clcsock) {
		struct socket *tcp;

		tcp = smc->clcsock;
		smc->clcsock = NULL;
		sock_release(tcp);
	}
	if (smc->use_fallback) {
		sock_put(sk); /* passive closing */
		sk->sk_state = SMC_CLOSED;
	} else {
		if (sk->sk_state == SMC_CLOSED)
			smc_conn_free(&smc->conn);
	}
	release_sock(sk);
	sk->sk_prot->unhash(sk);
	sock_put(sk); /* final sock_put */
}
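
/* server side of the LLC handshake confirming the first (and only) link
 * of a new link group over the RoCE fabric
 */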
static int smc_serv_conf_first_link(struct smc_sock *smc)
{
	struct net *net = sock_net(smc->clcsock->sk);
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];

	if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
		return SMC_CLC_DECL_INTERR;

	/* send CONFIRM LINK request to client over the RoCE fabric */
	rc = smc_llc_send_confirm_link(link,
				       link->smcibdev->mac[link->ibport - 1],
				       &link->smcibdev->gid[link->ibport - 1],
				       SMC_LLC_REQ);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	/* receive CONFIRM LINK response from client over the RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm_resp,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	if (link->llc_confirm_resp_rc)
		return SMC_CLC_DECL_RMBE_EC;

	/* send ADD LINK request to client over the RoCE fabric */
	rc = smc_llc_send_add_link(link,
				   link->smcibdev->mac[link->ibport - 1],
				   &link->smcibdev->gid[link->ibport - 1],
				   SMC_LLC_REQ);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	/* receive ADD LINK response from client over the RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp,
							 SMC_LLC_WAIT_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);

	return 0;
}

/* listen worker: finish */
static void smc_listen_out(struct smc_sock *new_smc)
{
	struct smc_sock *lsmc = new_smc->listen_smc;
	struct sock *newsmcsk = &new_smc->sk;

	lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
	if (lsmc->sk.sk_state == SMC_LISTEN) {
		smc_accept_enqueue(&lsmc->sk, newsmcsk);
	} else { /* no longer listening */
		smc_close_non_accepted(newsmcsk);
	}
	release_sock(&lsmc->sk);

	/* Wake up accept */
	lsmc->sk.sk_data_ready(&lsmc->sk);
	sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
}

/* listen worker: finish in state connected */
static void smc_listen_out_connected(struct smc_sock *new_smc)
{
	struct sock *newsmcsk = &new_smc->sk;

	sk_refcnt_debug_inc(newsmcsk);
	if (newsmcsk->sk_state == SMC_INIT)
		newsmcsk->sk_state = SMC_ACTIVE;

	smc_listen_out(new_smc);
}

/* listen worker: finish in error state */
static void smc_listen_out_err(struct smc_sock *new_smc)
{
	struct sock *newsmcsk = &new_smc->sk;

	if (newsmcsk->sk_state == SMC_INIT)
		sock_put(&new_smc->sk); /* passive closing */
	newsmcsk->sk_state = SMC_CLOSED;
	smc_conn_free(&new_smc->conn);

	smc_listen_out(new_smc);
}

/* listen worker: decline and fall back if possible */
static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
			       int local_contact)
{
	/* RDMA setup failed, switch back to TCP */
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(new_smc->conn.lgr);
	if (reason_code < 0) { /* error, no fallback possible */
		smc_listen_out_err(new_smc);
		return;
	}
	smc_conn_free(&new_smc->conn);
	new_smc->use_fallback = true;
	if (reason_code && reason_code != SMC_CLC_DECL_REPLY) {
		if (smc_clc_send_decline(new_smc, reason_code) < 0) {
			smc_listen_out_err(new_smc);
			return;
		}
	}
	smc_listen_out_connected(new_smc);
}

/* listen worker: check prefixes */
static int smc_listen_rdma_check(struct smc_sock *new_smc,
				 struct smc_clc_msg_proposal *pclc)
{
	struct smc_clc_msg_proposal_prefix *pclc_prfx;
	struct socket *newclcsock = new_smc->clcsock;

	pclc_prfx = smc_clc_proposal_get_prefix(pclc);
	if (smc_clc_prfx_match(newclcsock, pclc_prfx))
		return SMC_CLC_DECL_CNFERR;

	return 0;
}

/* listen worker: initialize connection and buffers */
static int smc_listen_rdma_init(struct smc_sock *new_smc,
				struct smc_clc_msg_proposal *pclc,
				struct smc_ib_device *ibdev, u8 ibport,
				int *local_contact)
{
	/* allocate connection / link group */
	*local_contact = smc_conn_create(new_smc, ibdev, ibport, &pclc->lcl, 0);
	if (*local_contact < 0) {
		if (*local_contact == -ENOMEM)
			return SMC_CLC_DECL_MEM;/* insufficient memory*/
		return SMC_CLC_DECL_INTERR; /* other error */
	}

	/* create send buffer and rmb */
	if (smc_buf_create(new_smc))
		return SMC_CLC_DECL_MEM;

	return 0;
}

/* listen worker: register buffers */
static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact)
{
	struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	if (local_contact != SMC_FIRST_CONTACT) {
		if (!new_smc->conn.rmb_desc->reused) {
			if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true))
				return SMC_CLC_DECL_INTERR;
		}
	}
	smc_rmb_sync_sg_for_device(&new_smc->conn);

	return 0;
}

/* listen worker: finish RDMA setup */
static void smc_listen_rdma_finish(struct smc_sock *new_smc,
				   struct smc_clc_msg_accept_confirm *cclc,
				   int local_contact)
{
	struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
	int reason_code = 0;

	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, cclc);

	if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) {
		reason_code = SMC_CLC_DECL_INTERR;
		goto decline;
	}

	if (local_contact == SMC_FIRST_CONTACT) {
		if (smc_ib_ready_link(link)) {
			reason_code = SMC_CLC_DECL_INTERR;
			goto decline;
		}
		/* QP confirmation over RoCE fabric */
		reason_code = smc_serv_conf_first_link(new_smc);
		if (reason_code)
			goto decline;
	}
	return;

decline:
	mutex_unlock(&smc_create_lgr_pending);
	smc_listen_decline(new_smc, reason_code, local_contact);
}

/* setup for RDMA connection of server */
static void smc_listen_work(struct work_struct *work)
{
	struct smc_sock *new_smc = container_of(work, struct smc_sock,
						smc_listen_work);
	struct socket *newclcsock = new_smc->clcsock;
	struct smc_clc_msg_accept_confirm cclc;
	struct smc_clc_msg_proposal *pclc;
	struct smc_ib_device *ibdev;
	u8 buf[SMC_CLC_MAX_LEN];
	int local_contact = 0;
	int reason_code = 0;
	int rc = 0;
	u8 ibport;

	if (new_smc->use_fallback) {
		smc_listen_out_connected(new_smc);
		return;
	}

	/* check if peer is smc capable */
	if (!tcp_sk(newclcsock->sk)->syn_smc) {
		new_smc->use_fallback = true;
		smc_listen_out_connected(new_smc);
		return;
	}

	/* do inband token exchange -
	 * wait for and receive SMC Proposal CLC message
	 */
	pclc = (struct smc_clc_msg_proposal *)&buf;
	reason_code = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN,
				       SMC_CLC_PROPOSAL);
	if (reason_code) {
		smc_listen_decline(new_smc, reason_code, 0);
		return;
	}

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(new_smc)) {
		smc_listen_decline(new_smc, SMC_CLC_DECL_IPSEC, 0);
		return;
	}

	mutex_lock(&smc_create_lgr_pending);
	smc_close_init(new_smc);
	smc_rx_init(new_smc);
	smc_tx_init(new_smc);

	/* check if RDMA is available */
	if (smc_check_rdma(new_smc, &ibdev, &ibport) ||
	    smc_listen_rdma_check(new_smc, pclc) ||
	    smc_listen_rdma_init(new_smc, pclc, ibdev, ibport,
				 &local_contact) ||
	    smc_listen_rdma_reg(new_smc, local_contact)) {
		/* SMC not supported, decline */
		mutex_unlock(&smc_create_lgr_pending);
		smc_listen_decline(new_smc, SMC_CLC_DECL_CNFERR, local_contact);
		return;
	}

	/* send SMC Accept CLC message */
	rc = smc_clc_send_accept(new_smc, local_contact);
	if (rc) {
		mutex_unlock(&smc_create_lgr_pending);
		smc_listen_decline(new_smc, rc, local_contact);
		return;
	}

	/* receive SMC Confirm CLC message */
	reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
				       SMC_CLC_CONFIRM);
	if (reason_code) {
		mutex_unlock(&smc_create_lgr_pending);
		smc_listen_decline(new_smc, reason_code, local_contact);
		return;
	}

	/* finish worker */
	smc_listen_rdma_finish(new_smc, &cclc, local_contact);
	smc_conn_save_peer_info(new_smc, &cclc);
	mutex_unlock(&smc_create_lgr_pending);
	smc_listen_out_connected(new_smc);
}
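
/* worker on the listening SMC socket: keep accepting connections on the
 * internal CLC socket and schedule an smc_listen_work instance for each
 */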
static void smc_tcp_listen_work(struct work_struct *work)
{
	struct smc_sock *lsmc = container_of(work, struct smc_sock,
					     tcp_listen_work);
	struct sock *lsk = &lsmc->sk;
	struct smc_sock *new_smc;
	int rc = 0;

	lock_sock(lsk);
	while (lsk->sk_state == SMC_LISTEN) {
		rc = smc_clcsock_accept(lsmc, &new_smc);
		if (rc)
			goto out;
		if (!new_smc)
			continue;

		new_smc->listen_smc = lsmc;
		new_smc->use_fallback = lsmc->use_fallback;
		sock_hold(lsk); /* sock_put in smc_listen_work */
		INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
		smc_copy_sock_settings_to_smc(new_smc);
		sock_hold(&new_smc->sk); /* sock_put in passive closing */
		if (!schedule_work(&new_smc->smc_listen_work))
			sock_put(&new_smc->sk);
	}

out:
	release_sock(lsk);
	sock_put(&lsmc->sk); /* sock_hold in smc_listen */
}

static int smc_listen(struct socket *sock, int backlog)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);
	lock_sock(sk);

	rc = -EINVAL;
	if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
		goto out;

	rc = 0;
	if (sk->sk_state == SMC_LISTEN) {
		sk->sk_max_ack_backlog = backlog;
		goto out;
	}
	/* some socket options are handled in core, so we could not apply
	 * them to the clc socket -- copy smc socket options to clc socket
	 */
	smc_copy_sock_settings_to_clc(smc);
	if (!smc->use_fallback)
		tcp_sk(smc->clcsock->sk)->syn_smc = 1;

	rc = kernel_listen(smc->clcsock, backlog);
	if (rc)
		goto out;
	sk->sk_max_ack_backlog = backlog;
	sk->sk_ack_backlog = 0;
	sk->sk_state = SMC_LISTEN;
	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
	sock_hold(sk); /* sock_hold in tcp_listen_worker */
	if (!schedule_work(&smc->tcp_listen_work))
		sock_put(sk);

out:
	release_sock(sk);
	return rc;
}
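
/* wait for a connection on the accept queue of the listening SMC socket
 * and graft it onto the socket provided by the accept() caller
 */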
static int smc_accept(struct socket *sock, struct socket *new_sock,
		      int flags, bool kern)
{
	struct sock *sk = sock->sk, *nsk;
	DECLARE_WAITQUEUE(wait, current);
	struct smc_sock *lsmc;
	long timeo;
	int rc = 0;

	lsmc = smc_sk(sk);
	sock_hold(sk); /* sock_put below */
	lock_sock(sk);

	if (lsmc->sk.sk_state != SMC_LISTEN) {
		rc = -EINVAL;
		release_sock(sk);
		goto out;
	}

	/* Wait for an incoming connection */
	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
	add_wait_queue_exclusive(sk_sleep(sk), &wait);
	while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (!timeo) {
			rc = -EAGAIN;
			break;
		}
		release_sock(sk);
		timeo = schedule_timeout(timeo);
		/* wakeup by sk_data_ready in smc_listen_work() */
		sched_annotate_sleep();
		lock_sock(sk);
		if (signal_pending(current)) {
			rc = sock_intr_errno(timeo);
			break;
		}
	}
	set_current_state(TASK_RUNNING);
	remove_wait_queue(sk_sleep(sk), &wait);

	if (!rc)
		rc = sock_error(nsk);
	release_sock(sk);
	if (rc)
		goto out;

	if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) {
		/* wait till data arrives on the socket */
		timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept *
					 MSEC_PER_SEC);
		if (smc_sk(nsk)->use_fallback) {
			struct sock *clcsk = smc_sk(nsk)->clcsock->sk;

			lock_sock(clcsk);
			if (skb_queue_empty(&clcsk->sk_receive_queue))
				sk_wait_data(clcsk, &timeo, NULL);
			release_sock(clcsk);
		} else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) {
			lock_sock(nsk);
			smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available);
			release_sock(nsk);
		}
	}

out:
	sock_put(sk); /* sock_hold above */
	return rc;
}

static int smc_getname(struct socket *sock, struct sockaddr *addr,
		       int peer)
{
	struct smc_sock *smc;

	if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
	    (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
		return -ENOTCONN;

	smc = smc_sk(sock->sk);

	return smc->clcsock->ops->getname(smc->clcsock, addr, peer);
}

static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EPIPE;

	smc = smc_sk(sk);
	lock_sock(sk);
	if ((sk->sk_state != SMC_ACTIVE) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
	    (sk->sk_state != SMC_INIT))
		goto out;

	if (msg->msg_flags & MSG_FASTOPEN) {
		if (sk->sk_state == SMC_INIT) {
			smc->use_fallback = true;
		} else {
			rc = -EINVAL;
			goto out;
		}
	}

	if (smc->use_fallback)
		rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
	else
		rc = smc_tx_sendmsg(smc, msg, len);
out:
	release_sock(sk);
	return rc;
}

static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
		       int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -ENOTCONN;

	smc = smc_sk(sk);
	lock_sock(sk);
	if ((sk->sk_state == SMC_INIT) ||
	    (sk->sk_state == SMC_LISTEN) ||
	    (sk->sk_state == SMC_CLOSED))
		goto out;

	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
		rc = 0;
		goto out;
	}

	if (smc->use_fallback) {
		rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
	} else {
		msg->msg_namelen = 0;
		rc = smc_rx_recvmsg(smc, msg, NULL, len, flags);
	}

out:
	release_sock(sk);
	return rc;
}

static __poll_t smc_accept_poll(struct sock *parent)
{
	struct smc_sock *isk = smc_sk(parent);
	__poll_t mask = 0;

	spin_lock(&isk->accept_q_lock);
	if (!list_empty(&isk->accept_q))
		mask = EPOLLIN | EPOLLRDNORM;
	spin_unlock(&isk->accept_q_lock);

	return mask;
}
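
/* poll an SMC socket: while the connection is not decided or with fallback
 * active, delegate to the internal CLC socket; otherwise derive the mask
 * from the SMC connection state
 */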
static __poll_t smc_poll(struct file *file, struct socket *sock,
			 poll_table *wait)
{
	struct sock *sk = sock->sk;
	__poll_t mask = 0;
	struct smc_sock *smc;
	int rc;

	if (!sk)
		return EPOLLNVAL;

	smc = smc_sk(sock->sk);
	sock_hold(sk);
	lock_sock(sk);
	if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
		/* delegate to CLC child sock */
		release_sock(sk);
		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
		lock_sock(sk);
		sk->sk_err = smc->clcsock->sk->sk_err;
		if (sk->sk_err) {
			mask |= EPOLLERR;
		} else {
			/* if non-blocking connect finished ... */
			if (sk->sk_state == SMC_INIT &&
			    mask & EPOLLOUT &&
			    smc->clcsock->sk->sk_state != TCP_CLOSE) {
				rc = __smc_connect(smc);
				if (rc < 0)
					mask |= EPOLLERR;
				/* success cases including fallback */
				mask |= EPOLLOUT | EPOLLWRNORM;
			}
		}
	} else {
		if (sk->sk_state != SMC_CLOSED) {
			release_sock(sk);
			sock_poll_wait(file, sk_sleep(sk), wait);
			lock_sock(sk);
		}
		if (sk->sk_err)
			mask |= EPOLLERR;
		if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
		    (sk->sk_state == SMC_CLOSED))
			mask |= EPOLLHUP;
		if (sk->sk_state == SMC_LISTEN) {
			/* woken up by sk_data_ready in smc_listen_work() */
			mask = smc_accept_poll(sk);
		} else {
			if (atomic_read(&smc->conn.sndbuf_space) ||
			    sk->sk_shutdown & SEND_SHUTDOWN) {
				mask |= EPOLLOUT | EPOLLWRNORM;
			} else {
				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
			}
			if (atomic_read(&smc->conn.bytes_to_rcv))
				mask |= EPOLLIN | EPOLLRDNORM;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
			if (sk->sk_state == SMC_APPCLOSEWAIT1)
				mask |= EPOLLIN;
		}

	}
	release_sock(sk);
	sock_put(sk);

	return mask;
}

static int smc_shutdown(struct socket *sock, int how)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EINVAL;
	int rc1 = 0;

	smc = smc_sk(sk);

	if ((how < SHUT_RD) || (how > SHUT_RDWR))
		return rc;

	lock_sock(sk);

	rc = -ENOTCONN;
	if ((sk->sk_state != SMC_LISTEN) &&
	    (sk->sk_state != SMC_ACTIVE) &&
	    (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
	    (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT2) &&
	    (sk->sk_state != SMC_APPFINCLOSEWAIT))
		goto out;
	if (smc->use_fallback) {
		rc = kernel_sock_shutdown(smc->clcsock, how);
		sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
		if (sk->sk_shutdown == SHUTDOWN_MASK)
			sk->sk_state = SMC_CLOSED;
		goto out;
	}
	switch (how) {
	case SHUT_RDWR:		/* shutdown in both directions */
		rc = smc_close_active(smc);
		break;
	case SHUT_WR:
		rc = smc_close_shutdown_write(smc);
		break;
	case SHUT_RD:
		rc = 0;
		/* nothing more to do because peer is not involved */
		break;
	}
	if (smc->clcsock)
		rc1 = kernel_sock_shutdown(smc->clcsock, how);
	/* map sock_shutdown_cmd constants to sk_shutdown value range */
	sk->sk_shutdown |= how + 1;

out:
	release_sock(sk);
	return rc ? rc : rc1;
}
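
/* forward setsockopt() to the internal CLC socket and mirror the few
 * TCP options that influence the SMC connection itself
 */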
static int smc_setsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int val, rc;

	smc = smc_sk(sk);

	/* generic setsockopts reaching us here always apply to the
	 * CLC socket
	 */
	rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
					   optval, optlen);
	if (smc->clcsock->sk->sk_err) {
		sk->sk_err = smc->clcsock->sk->sk_err;
		sk->sk_error_report(sk);
	}
	if (rc)
		return rc;

	if (optlen < sizeof(int))
		return rc;
	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	lock_sock(sk);
	switch (optname) {
	case TCP_ULP:
	case TCP_FASTOPEN:
	case TCP_FASTOPEN_CONNECT:
	case TCP_FASTOPEN_KEY:
	case TCP_FASTOPEN_NO_COOKIE:
		/* option not supported by SMC */
		if (sk->sk_state == SMC_INIT) {
			smc->use_fallback = true;
		} else {
			if (!smc->use_fallback)
				rc = -EINVAL;
		}
		break;
	case TCP_NODELAY:
		if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
			if (val && !smc->use_fallback)
				mod_delayed_work(system_wq, &smc->conn.tx_work,
						 0);
		}
		break;
	case TCP_CORK:
		if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
			if (!val && !smc->use_fallback)
				mod_delayed_work(system_wq, &smc->conn.tx_work,
						 0);
		}
		break;
	case TCP_DEFER_ACCEPT:
		smc->sockopt_defer_accept = val;
		break;
	default:
		break;
	}
	release_sock(sk);

	return rc;
}

static int smc_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct smc_sock *smc;

	smc = smc_sk(sock->sk);
	/* socket options apply to the CLC socket */
	return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
					     optval, optlen);
}
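
/* answer queue-size ioctls from the SMC connection state; with fallback
 * active, pass them through to the internal CLC socket
 */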
static int smc_ioctl(struct socket *sock, unsigned int cmd,
		     unsigned long arg)
{
	struct smc_sock *smc;
	int answ;

	smc = smc_sk(sock->sk);
	if (smc->use_fallback) {
		if (!smc->clcsock)
			return -EBADF;
		return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
	}
	switch (cmd) {
	case SIOCINQ: /* same as FIONREAD */
		if (smc->sk.sk_state == SMC_LISTEN)
			return -EINVAL;
		answ = atomic_read(&smc->conn.bytes_to_rcv);
		break;
	case SIOCOUTQ:
		/* output queue size (not sent + not acked) */
		if (smc->sk.sk_state == SMC_LISTEN)
			return -EINVAL;
		answ = smc->conn.sndbuf_desc->len -
		       atomic_read(&smc->conn.sndbuf_space);
		break;
	case SIOCOUTQNSD:
		/* output queue size (not sent only) */
		if (smc->sk.sk_state == SMC_LISTEN)
			return -EINVAL;
		answ = smc_tx_prepared_sends(&smc->conn);
		break;
	default:
		return -ENOIOCTLCMD;
	}

	return put_user(answ, (int __user *)arg);
}

static ssize_t smc_sendpage(struct socket *sock, struct page *page,
			    int offset, size_t size, int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EPIPE;

	smc = smc_sk(sk);
	lock_sock(sk);
	if (sk->sk_state != SMC_ACTIVE) {
		release_sock(sk);
		goto out;
	}
	release_sock(sk);
	if (smc->use_fallback)
		rc = kernel_sendpage(smc->clcsock, page, offset,
				     size, flags);
	else
		rc = sock_no_sendpage(sock, page, offset, size, flags);

out:
	return rc;
}

/* Map the affected portions of the rmbe into an spd, note the number of bytes
 * to splice in conn->splice_pending, and press 'go'. Delays consumer cursor
 * updates till whenever a respective page has been fully processed.
 * Note that subsequent recv() calls have to wait till all splice() processing
 * completed.
 */
static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
			       struct pipe_inode_info *pipe, size_t len,
			       unsigned int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -ENOTCONN;

	smc = smc_sk(sk);
	lock_sock(sk);

	if (sk->sk_state == SMC_INIT ||
	    sk->sk_state == SMC_LISTEN ||
	    sk->sk_state == SMC_CLOSED)
		goto out;

	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
		rc = 0;
		goto out;
	}

	if (smc->use_fallback) {
		rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
						    pipe, len, flags);
	} else {
		if (*ppos) {
			rc = -ESPIPE;
			goto out;
		}
		if (flags & SPLICE_F_NONBLOCK)
			flags = MSG_DONTWAIT;
		else
			flags = 0;
		rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags);
	}
out:
	release_sock(sk);

	return rc;
}

/* must look like tcp */
static const struct proto_ops smc_sock_ops = {
	.family		= PF_SMC,
	.owner		= THIS_MODULE,
	.release	= smc_release,
	.bind		= smc_bind,
	.connect	= smc_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= smc_accept,
	.getname	= smc_getname,
	.poll		= smc_poll,
	.ioctl		= smc_ioctl,
	.listen		= smc_listen,
	.shutdown	= smc_shutdown,
	.setsockopt	= smc_setsockopt,
	.getsockopt	= smc_getsockopt,
	.sendmsg	= smc_sendmsg,
	.recvmsg	= smc_recvmsg,
	.mmap		= sock_no_mmap,
	.sendpage	= smc_sendpage,
	.splice_read	= smc_splice_read,
};
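
/* create a new SMC socket: SMCPROTO_SMC selects IPv4, SMCPROTO_SMC6 IPv6
 * addressing for the internal CLC socket
 */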
static int smc_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET;
	struct smc_sock *smc;
	struct sock *sk;
	int rc;

	rc = -ESOCKTNOSUPPORT;
	if (sock->type != SOCK_STREAM)
		goto out;

	rc = -EPROTONOSUPPORT;
	if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6)
		goto out;

	rc = -ENOBUFS;
	sock->ops = &smc_sock_ops;
	sk = smc_sock_alloc(net, sock, protocol);
	if (!sk)
		goto out;

	/* create internal TCP socket for CLC handshake and fallback */
	smc = smc_sk(sk);
	smc->use_fallback = false; /* assume rdma capability first */
	rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
			      &smc->clcsock);
	if (rc) {
		sk_common_release(sk);
		goto out;
	}
	smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
	smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);

out:
	return rc;
}

static const struct net_proto_family smc_sock_family_ops = {
	.family	= PF_SMC,
	.owner	= THIS_MODULE,
	.create	= smc_create,
};

static int __init smc_init(void)
{
	int rc;

	rc = smc_pnet_init();
	if (rc)
		return rc;

	rc = smc_llc_init();
	if (rc) {
		pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = smc_cdc_init();
	if (rc) {
		pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = proto_register(&smc_proto, 1);
	if (rc) {
		pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = proto_register(&smc_proto6, 1);
	if (rc) {
		pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc);
		goto out_proto;
	}

	rc = sock_register(&smc_sock_family_ops);
	if (rc) {
		pr_err("%s: sock_register fails with %d\n", __func__, rc);
		goto out_proto6;
	}
	INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
	INIT_HLIST_HEAD(&smc_v6_hashinfo.ht);

	rc = smc_ib_register_client();
	if (rc) {
		pr_err("%s: ib_register fails with %d\n", __func__, rc);
		goto out_sock;
	}

	static_branch_enable(&tcp_have_smc);
	return 0;

out_sock:
	sock_unregister(PF_SMC);
out_proto6:
	proto_unregister(&smc_proto6);
out_proto:
	proto_unregister(&smc_proto);
out_pnet:
	smc_pnet_exit();
	return rc;
}

static void __exit smc_exit(void)
{
	smc_core_exit();
	static_branch_disable(&tcp_have_smc);
	smc_ib_unregister_client();
	sock_unregister(PF_SMC);
	proto_unregister(&smc_proto6);
	proto_unregister(&smc_proto);
	smc_pnet_exit();
}

module_init(smc_init);
module_exit(smc_exit);

MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
MODULE_DESCRIPTION("smc socket address family");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_SMC);