// SPDX-License-Identifier: GPL-2.0
/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * Basic Transport Functions exploiting Infiniband API
 *
 * Copyright IBM Corp. 2016
 *
 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
 */

#include <linux/socket.h>
#include <linux/if_vlan.h>
#include <linux/random.h>
#include <linux/workqueue.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <rdma/ib_verbs.h>

#include "smc.h"
#include "smc_clc.h"
#include "smc_core.h"
#include "smc_ib.h"
#include "smc_wr.h"
#include "smc_llc.h"
#include "smc_cdc.h"
#include "smc_close.h"
#include "smc_ism.h"

#define SMC_LGR_NUM_INCR	256
#define SMC_LGR_FREE_DELAY_SERV	(600 * HZ)
#define SMC_LGR_FREE_DELAY_CLNT	(SMC_LGR_FREE_DELAY_SERV + 10 * HZ)

static struct smc_lgr_list smc_lgr_list = {	/* established link groups */
	.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
	.list = LIST_HEAD_INIT(smc_lgr_list.list),
	.num = 0,
};

static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
			 struct smc_buf_desc *buf_desc);

static void smc_lgr_schedule_free_work(struct smc_link_group *lgr)
{
	/* client link group creation always follows the server link group
	 * creation. For client use a somewhat higher removal delay time,
	 * otherwise there is a risk of out-of-sync link groups.
	 */
	mod_delayed_work(system_wq, &lgr->free_work,
			 (!lgr->is_smcd && lgr->role == SMC_CLNT) ?
			 SMC_LGR_FREE_DELAY_CLNT : SMC_LGR_FREE_DELAY_SERV);
}

/* Register connection's alert token in our lookup structure.
 * To use rbtrees we have to implement our own insert core.
 * Requires @conns_lock
 * @conn	connection to register
 */
static void smc_lgr_add_alert_token(struct smc_connection *conn)
{
	struct rb_node **link, *parent = NULL;
	u32 token = conn->alert_token_local;

	link = &conn->lgr->conns_all.rb_node;
	while (*link) {
		struct smc_connection *cur = rb_entry(*link,
					struct smc_connection, alert_node);

		parent = *link;
		if (cur->alert_token_local > token)
			link = &parent->rb_left;
		else
			link = &parent->rb_right;
	}
	/* Put the new node there */
	rb_link_node(&conn->alert_node, parent, link);
	rb_insert_color(&conn->alert_node, &conn->lgr->conns_all);
}

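/* For reference, the lookup side of this tree keeps the same ordering as the
 * insert above: descend left while the stored token is greater than the one
 * searched for, right otherwise, until an exact match or NULL is reached.
 * A minimal sketch of such a lookup (illustrative only; smc_lgr_find_conn(),
 * used below in smc_lgr_register_conn(), is the authoritative helper):
 *
 *	struct rb_node *node = lgr->conns_all.rb_node;
 *
 *	while (node) {
 *		struct smc_connection *cur = rb_entry(node,
 *					struct smc_connection, alert_node);
 *
 *		if (cur->alert_token_local > token)
 *			node = node->rb_left;
 *		else if (cur->alert_token_local < token)
 *			node = node->rb_right;
 *		else
 *			return cur;	// token found
 *	}
 *	return NULL;			// no such token in this link group
 */
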
/* Register connection in link group by assigning an alert token
 * registered in a search tree.
 * Requires @conns_lock
 * Note that '0' is a reserved value and not assigned.
 */
static void smc_lgr_register_conn(struct smc_connection *conn)
{
	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
	static atomic_t nexttoken = ATOMIC_INIT(0);

	/* find a new alert_token_local value not yet used by some connection
	 * in this link group
	 */
	sock_hold(&smc->sk); /* sock_put in smc_lgr_unregister_conn() */
	while (!conn->alert_token_local) {
		conn->alert_token_local = atomic_inc_return(&nexttoken);
		if (smc_lgr_find_conn(conn->alert_token_local, conn->lgr))
			conn->alert_token_local = 0;
	}
	smc_lgr_add_alert_token(conn);
	conn->lgr->conns_num++;
}

/* Unregister connection and reset the alert token of the given connection
 */
static void __smc_lgr_unregister_conn(struct smc_connection *conn)
{
	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
	struct smc_link_group *lgr = conn->lgr;

	rb_erase(&conn->alert_node, &lgr->conns_all);
	lgr->conns_num--;
	conn->alert_token_local = 0;
	conn->lgr = NULL;
	sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */
}

/* Unregister connection and trigger lgr freeing if applicable
 */
static void smc_lgr_unregister_conn(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;
	int reduced = 0;

	write_lock_bh(&lgr->conns_lock);
	if (conn->alert_token_local) {
		reduced = 1;
		__smc_lgr_unregister_conn(conn);
	}
	write_unlock_bh(&lgr->conns_lock);
	if (!reduced || lgr->conns_num)
		return;
	smc_lgr_schedule_free_work(lgr);
}

static void smc_lgr_free_work(struct work_struct *work)
{
	struct smc_link_group *lgr = container_of(to_delayed_work(work),
						  struct smc_link_group,
						  free_work);
	bool conns;

	spin_lock_bh(&smc_lgr_list.lock);
	if (list_empty(&lgr->list))
		goto free;
	read_lock_bh(&lgr->conns_lock);
	conns = RB_EMPTY_ROOT(&lgr->conns_all);
	read_unlock_bh(&lgr->conns_lock);
	if (!conns) { /* number of lgr connections is no longer zero */
		spin_unlock_bh(&smc_lgr_list.lock);
		return;
	}
	list_del_init(&lgr->list); /* remove from smc_lgr_list */
free:
	spin_unlock_bh(&smc_lgr_list.lock);
	if (!delayed_work_pending(&lgr->free_work)) {
		if (!lgr->is_smcd &&
		    lgr->lnk[SMC_SINGLE_LINK].state != SMC_LNK_INACTIVE)
			smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
		smc_lgr_free(lgr);
	}
}

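/* Link group lifetime in brief: an lgr is created on first contact
 * (smc_lgr_create() below), connections attach via smc_lgr_register_conn()
 * and detach via smc_lgr_unregister_conn(). When the last connection goes
 * away, freeing is only *scheduled* (smc_lgr_schedule_free_work()); the
 * delayed smc_lgr_free_work() re-checks the connection tree and keeps the
 * lgr alive if it has been reused in the meantime, otherwise it removes the
 * lgr from smc_lgr_list and frees it.
 */
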
/* create a new SMC link group */
static int smc_lgr_create(struct smc_sock *smc, bool is_smcd,
			  struct smc_ib_device *smcibdev, u8 ibport,
			  char *peer_systemid, unsigned short vlan_id,
			  struct smcd_dev *smcismdev, u64 peer_gid)
{
	struct smc_link_group *lgr;
	struct smc_link *lnk;
	u8 rndvec[3];
	int rc = 0;
	int i;

	if (is_smcd && vlan_id) {
		rc = smc_ism_get_vlan(smcismdev, vlan_id);
		if (rc)
			goto out;
	}

	lgr = kzalloc(sizeof(*lgr), GFP_KERNEL);
	if (!lgr) {
		rc = -ENOMEM;
		goto out;
	}
	lgr->is_smcd = is_smcd;
	lgr->sync_err = 0;
	lgr->vlan_id = vlan_id;
	rwlock_init(&lgr->sndbufs_lock);
	rwlock_init(&lgr->rmbs_lock);
	rwlock_init(&lgr->conns_lock);
	for (i = 0; i < SMC_RMBE_SIZES; i++) {
		INIT_LIST_HEAD(&lgr->sndbufs[i]);
		INIT_LIST_HEAD(&lgr->rmbs[i]);
	}
	smc_lgr_list.num += SMC_LGR_NUM_INCR;
	memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE);
	INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work);
	lgr->conns_all = RB_ROOT;
	if (is_smcd) {
		/* SMC-D specific settings */
		lgr->peer_gid = peer_gid;
		lgr->smcd = smcismdev;
	} else {
		/* SMC-R specific settings */
		lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
		memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN);

		lnk = &lgr->lnk[SMC_SINGLE_LINK];
		/* initialize link */
		lnk->state = SMC_LNK_ACTIVATING;
		lnk->link_id = SMC_SINGLE_LINK;
		lnk->smcibdev = smcibdev;
		lnk->ibport = ibport;
		lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu;
		if (!smcibdev->initialized)
			smc_ib_setup_per_ibdev(smcibdev);
		get_random_bytes(rndvec, sizeof(rndvec));
		lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) +
				   (rndvec[2] << 16);
		rc = smc_llc_link_init(lnk);
		if (rc)
			goto free_lgr;
		rc = smc_wr_alloc_link_mem(lnk);
		if (rc)
			goto clear_llc_lnk;
		rc = smc_ib_create_protection_domain(lnk);
		if (rc)
			goto free_link_mem;
		rc = smc_ib_create_queue_pair(lnk);
		if (rc)
			goto dealloc_pd;
		rc = smc_wr_create_link(lnk);
		if (rc)
			goto destroy_qp;
	}
	smc->conn.lgr = lgr;
	spin_lock_bh(&smc_lgr_list.lock);
	list_add(&lgr->list, &smc_lgr_list.list);
	spin_unlock_bh(&smc_lgr_list.lock);
	return 0;

destroy_qp:
	smc_ib_destroy_queue_pair(lnk);
dealloc_pd:
	smc_ib_dealloc_protection_domain(lnk);
free_link_mem:
	smc_wr_free_link_mem(lnk);
clear_llc_lnk:
	smc_llc_link_clear(lnk);
free_lgr:
	kfree(lgr);
out:
	return rc;
}

static void smc_buf_unuse(struct smc_connection *conn)
{
	if (conn->sndbuf_desc)
		conn->sndbuf_desc->used = 0;
	if (conn->rmb_desc) {
		if (!conn->rmb_desc->regerr) {
			conn->rmb_desc->reused = 1;
			conn->rmb_desc->used = 0;
		} else {
			/* buf registration failed, reuse not possible */
			struct smc_link_group *lgr = conn->lgr;

			write_lock_bh(&lgr->rmbs_lock);
			list_del(&conn->rmb_desc->list);
			write_unlock_bh(&lgr->rmbs_lock);

			smc_buf_free(lgr, true, conn->rmb_desc);
		}
	}
}

/* remove a finished connection from its link group */
void smc_conn_free(struct smc_connection *conn)
{
	if (!conn->lgr)
		return;
	if (conn->lgr->is_smcd) {
		smc_ism_unset_conn(conn);
		tasklet_kill(&conn->rx_tsklet);
	} else {
		smc_cdc_tx_dismiss_slots(conn);
	}
	smc_lgr_unregister_conn(conn);
	smc_buf_unuse(conn);
}

static void smc_link_clear(struct smc_link *lnk)
{
	lnk->peer_qpn = 0;
	smc_llc_link_clear(lnk);
	smc_ib_modify_qp_reset(lnk);
	smc_wr_free_link(lnk);
	smc_ib_destroy_queue_pair(lnk);
	smc_ib_dealloc_protection_domain(lnk);
	smc_wr_free_link_mem(lnk);
}

static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb,
			  struct smc_buf_desc *buf_desc)
{
	struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];

	if (is_rmb) {
		if (buf_desc->mr_rx[SMC_SINGLE_LINK])
			smc_ib_put_memory_region(
					buf_desc->mr_rx[SMC_SINGLE_LINK]);
		smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc,
				    DMA_FROM_DEVICE);
	} else {
		smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc,
				    DMA_TO_DEVICE);
	}
	sg_free_table(&buf_desc->sgt[SMC_SINGLE_LINK]);
	if (buf_desc->pages)
		__free_pages(buf_desc->pages, buf_desc->order);
	kfree(buf_desc);
}

static void smcd_buf_free(struct smc_link_group *lgr, bool is_dmb,
			  struct smc_buf_desc *buf_desc)
{
	if (is_dmb) {
		/* restore original buf len */
		buf_desc->len += sizeof(struct smcd_cdc_msg);
		smc_ism_unregister_dmb(lgr->smcd, buf_desc);
	} else {
		kfree(buf_desc->cpu_addr);
	}
	kfree(buf_desc);
}

static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
			 struct smc_buf_desc *buf_desc)
{
	if (lgr->is_smcd)
		smcd_buf_free(lgr, is_rmb, buf_desc);
	else
		smcr_buf_free(lgr, is_rmb, buf_desc);
}

static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb)
{
	struct smc_buf_desc *buf_desc, *bf_desc;
	struct list_head *buf_list;
	int i;

	for (i = 0; i < SMC_RMBE_SIZES; i++) {
		if (is_rmb)
			buf_list = &lgr->rmbs[i];
		else
			buf_list = &lgr->sndbufs[i];
		list_for_each_entry_safe(buf_desc, bf_desc, buf_list,
					 list) {
			list_del(&buf_desc->list);
			smc_buf_free(lgr, is_rmb, buf_desc);
		}
	}
}

static void smc_lgr_free_bufs(struct smc_link_group *lgr)
{
	/* free send buffers */
	__smc_lgr_free_bufs(lgr, false);
	/* free rmbs */
	__smc_lgr_free_bufs(lgr, true);
}

/* remove a link group */
void smc_lgr_free(struct smc_link_group *lgr)
{
	smc_lgr_free_bufs(lgr);
	if (lgr->is_smcd)
		smc_ism_put_vlan(lgr->smcd, lgr->vlan_id);
	else
		smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]);
	kfree(lgr);
}

void smc_lgr_forget(struct smc_link_group *lgr)
{
	spin_lock_bh(&smc_lgr_list.lock);
	/* do not use this link group for new connections */
	if (!list_empty(&lgr->list))
		list_del_init(&lgr->list);
	spin_unlock_bh(&smc_lgr_list.lock);
}

/* terminate linkgroup abnormally */
static void __smc_lgr_terminate(struct smc_link_group *lgr)
{
	struct smc_connection *conn;
	struct smc_sock *smc;
	struct rb_node *node;

	if (lgr->terminating)
		return;	/* lgr already terminating */
	lgr->terminating = 1;
	if (!list_empty(&lgr->list)) /* forget lgr */
		list_del_init(&lgr->list);
	if (!lgr->is_smcd)
		smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);

	write_lock_bh(&lgr->conns_lock);
	node = rb_first(&lgr->conns_all);
	while (node) {
		conn = rb_entry(node, struct smc_connection, alert_node);
		smc = container_of(conn, struct smc_sock, conn);
		sock_hold(&smc->sk); /* sock_put in close work */
		conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
		__smc_lgr_unregister_conn(conn);
		write_unlock_bh(&lgr->conns_lock);
		if (!schedule_work(&conn->close_work))
			sock_put(&smc->sk);
		write_lock_bh(&lgr->conns_lock);
		node = rb_first(&lgr->conns_all);
	}
	write_unlock_bh(&lgr->conns_lock);
	if (!lgr->is_smcd)
		wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait);
	smc_lgr_schedule_free_work(lgr);
}

void smc_lgr_terminate(struct smc_link_group *lgr)
{
	spin_lock_bh(&smc_lgr_list.lock);
	__smc_lgr_terminate(lgr);
	spin_unlock_bh(&smc_lgr_list.lock);
}

/* Called when IB port is terminated */
void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport)
{
	struct smc_link_group *lgr, *l;

	spin_lock_bh(&smc_lgr_list.lock);
	list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
		if (!lgr->is_smcd &&
		    lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev &&
		    lgr->lnk[SMC_SINGLE_LINK].ibport == ibport)
			__smc_lgr_terminate(lgr);
	}
	spin_unlock_bh(&smc_lgr_list.lock);
}

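/* A sketch of how an IB event handler might drive the cleanup above when a
 * RoCE port stops working (illustrative only; the real caller lives in
 * smc_ib.c and its exact flow may differ):
 *
 *	// in a port event worker, after refreshing the port attributes
 *	if (!port_is_still_active)
 *		smc_port_terminate(smcibdev, ibport);
 *
 * All SMC-R link groups using that device/port pair are then terminated and
 * their connections aborted via the close work.
 */
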
/* Called when SMC-D device is terminated or peer is lost */
void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid)
{
	struct smc_link_group *lgr, *l;
	LIST_HEAD(lgr_free_list);

	/* run common cleanup function and build free list */
	spin_lock_bh(&smc_lgr_list.lock);
	list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
		if (lgr->is_smcd && lgr->smcd == dev &&
		    (!peer_gid || lgr->peer_gid == peer_gid) &&
		    !list_empty(&lgr->list)) {
			__smc_lgr_terminate(lgr);
			list_move(&lgr->list, &lgr_free_list);
		}
	}
	spin_unlock_bh(&smc_lgr_list.lock);

	/* cancel the regular free workers and actually free lgrs */
	list_for_each_entry_safe(lgr, l, &lgr_free_list, list) {
		list_del_init(&lgr->list);
		cancel_delayed_work_sync(&lgr->free_work);
		smc_lgr_free(lgr);
	}
}

/* Determine vlan of internal TCP socket.
 * @vlan_id: address to store the determined vlan id into
 */
int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id)
{
	struct dst_entry *dst = sk_dst_get(clcsock->sk);
	struct net_device *ndev;
	int i, nest_lvl, rc = 0;

	*vlan_id = 0;
	if (!dst) {
		rc = -ENOTCONN;
		goto out;
	}
	if (!dst->dev) {
		rc = -ENODEV;
		goto out_rel;
	}

	ndev = dst->dev;
	if (is_vlan_dev(ndev)) {
		*vlan_id = vlan_dev_vlan_id(ndev);
		goto out_rel;
	}

	rtnl_lock();
	nest_lvl = dev_get_nest_level(ndev);
	for (i = 0; i < nest_lvl; i++) {
		struct list_head *lower = &ndev->adj_list.lower;

		if (list_empty(lower))
			break;
		lower = lower->next;
		ndev = (struct net_device *)netdev_lower_get_next(ndev, &lower);
		if (is_vlan_dev(ndev)) {
			*vlan_id = vlan_dev_vlan_id(ndev);
			break;
		}
	}
	rtnl_unlock();

out_rel:
	dst_release(dst);
out:
	return rc;
}

/* determine the link gid matching the vlan id of the link group */
static int smc_link_determine_gid(struct smc_link_group *lgr)
{
	struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
	struct ib_gid_attr gattr;
	union ib_gid gid;
	int i;

	if (!lgr->vlan_id) {
		lnk->gid = lnk->smcibdev->gid[lnk->ibport - 1];
		return 0;
	}

	for (i = 0; i < lnk->smcibdev->pattr[lnk->ibport - 1].gid_tbl_len;
	     i++) {
		if (ib_query_gid(lnk->smcibdev->ibdev, lnk->ibport, i, &gid,
				 &gattr))
			continue;
		if (gattr.ndev) {
			if (is_vlan_dev(gattr.ndev) &&
			    vlan_dev_vlan_id(gattr.ndev) == lgr->vlan_id) {
				lnk->gid = gid;
				dev_put(gattr.ndev);
				return 0;
			}
			dev_put(gattr.ndev);
		}
	}
	return -ENODEV;
}

static bool smcr_lgr_match(struct smc_link_group *lgr,
			   struct smc_clc_msg_local *lcl,
			   enum smc_lgr_role role)
{
	return !memcmp(lgr->peer_systemid, lcl->id_for_peer,
		       SMC_SYSTEMID_LEN) &&
		!memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid,
			SMC_GID_SIZE) &&
		!memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac,
			sizeof(lcl->mac)) &&
		lgr->role == role;
}

static bool smcd_lgr_match(struct smc_link_group *lgr,
			   struct smcd_dev *smcismdev, u64 peer_gid)
{
	return lgr->peer_gid == peer_gid && lgr->smcd == smcismdev;
}

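/* Link group reuse in smc_conn_create() below combines the match helpers
 * above with a few additional checks: besides smcr_lgr_match() resp.
 * smcd_lgr_match(), the candidate lgr must not be marked with a sync error,
 * must carry the same VLAN id as the CLC socket, and - on the server side -
 * must still have room for another connection (SMC_RMBS_PER_LGR_MAX); the
 * connection-count limit is not checked for a client, which follows the
 * server's first-contact decision.
 */
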
/* create a new SMC connection (and a new link group if necessary) */
int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact,
		    struct smc_ib_device *smcibdev, u8 ibport,
		    struct smc_clc_msg_local *lcl, struct smcd_dev *smcd,
		    u64 peer_gid)
{
	struct smc_connection *conn = &smc->conn;
	int local_contact = SMC_FIRST_CONTACT;
	struct smc_link_group *lgr;
	unsigned short vlan_id;
	enum smc_lgr_role role;
	int rc = 0;

	role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
	rc = smc_vlan_by_tcpsk(smc->clcsock, &vlan_id);
	if (rc)
		return rc;

	if ((role == SMC_CLNT) && srv_first_contact)
		/* create new link group as well */
		goto create;

	/* determine if an existing link group can be reused */
	spin_lock_bh(&smc_lgr_list.lock);
	list_for_each_entry(lgr, &smc_lgr_list.list, list) {
		write_lock_bh(&lgr->conns_lock);
		if ((is_smcd ? smcd_lgr_match(lgr, smcd, peer_gid) :
		     smcr_lgr_match(lgr, lcl, role)) &&
		    !lgr->sync_err &&
		    lgr->vlan_id == vlan_id &&
		    (role == SMC_CLNT ||
		     lgr->conns_num < SMC_RMBS_PER_LGR_MAX)) {
			/* link group found */
			local_contact = SMC_REUSE_CONTACT;
			conn->lgr = lgr;
			smc_lgr_register_conn(conn); /* add smc conn to lgr */
			write_unlock_bh(&lgr->conns_lock);
			break;
		}
		write_unlock_bh(&lgr->conns_lock);
	}
	spin_unlock_bh(&smc_lgr_list.lock);

	if (role == SMC_CLNT && !srv_first_contact &&
	    (local_contact == SMC_FIRST_CONTACT)) {
		/* Server reuses a link group, but Client wants to start
		 * a new one
		 * send out_of_sync decline, reason synchr. error
		 */
		return -ENOLINK;
	}

create:
	if (local_contact == SMC_FIRST_CONTACT) {
		rc = smc_lgr_create(smc, is_smcd, smcibdev, ibport,
				    lcl->id_for_peer, vlan_id, smcd, peer_gid);
		if (rc)
			goto out;
		smc_lgr_register_conn(conn); /* add smc conn to lgr */
		if (!is_smcd)
			rc = smc_link_determine_gid(conn->lgr);
	}
	conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
	conn->local_tx_ctrl.len = SMC_WR_TX_SIZE;
	conn->urg_state = SMC_URG_READ;
	if (is_smcd) {
		conn->rx_off = sizeof(struct smcd_cdc_msg);
		smcd_cdc_rx_init(conn); /* init tasklet for this conn */
	}
#ifndef KERNEL_HAS_ATOMIC64
	spin_lock_init(&conn->acurs_lock);
#endif

out:
	return rc ? rc : local_contact;
}

/* convert the RMB size into the compressed notation - minimum 16K.
 * In contrast to plain ilog2, this rounds towards the next power of 2,
 * so the socket application gets at least its desired sndbuf / rcvbuf size.
 */
static u8 smc_compress_bufsize(int size)
{
	u8 compressed;

	if (size <= SMC_BUF_MIN_SIZE)
		return 0;

	size = (size - 1) >> 14;
	compressed = ilog2(size) + 1;
	if (compressed >= SMC_RMBE_SIZES)
		compressed = SMC_RMBE_SIZES - 1;
	return compressed;
}

/* convert the RMB size from compressed notation into integer */
int smc_uncompress_bufsize(u8 compressed)
{
	u32 size;

	size = 0x00000001 << (((int)compressed) + 14);
	return (int)size;
}

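/* Worked examples for the compressed buffer size notation above (assuming
 * SMC_BUF_MIN_SIZE is 16KB and SMC_RMBE_SIZES leaves enough slots):
 *
 *	requested size		smc_compress_bufsize()	smc_uncompress_bufsize()
 *	16384 (16KB or less)	0			16384  (16KB)
 *	65536 (exactly 64KB)	2			65536  (64KB)
 *	65537 (just over 64KB)	3			131072 (128KB)
 *
 * i.e. an exact power of two maps back to itself, anything in between is
 * rounded up to the next power of two, and the result is capped at the
 * largest slot, SMC_RMBE_SIZES - 1.
 */
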
/* try to reuse a sndbuf or rmb description slot for a certain
 * buffer size; if not available, return NULL
 */
static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize,
					     rwlock_t *lock,
					     struct list_head *buf_list)
{
	struct smc_buf_desc *buf_slot;

	read_lock_bh(lock);
	list_for_each_entry(buf_slot, buf_list, list) {
		if (cmpxchg(&buf_slot->used, 0, 1) == 0) {
			read_unlock_bh(lock);
			return buf_slot;
		}
	}
	read_unlock_bh(lock);
	return NULL;
}

/* one of the conditions for announcing a receiver's current window size is
 * that it "results in a minimum increase in the window size of 10% of the
 * receive buffer space" [RFC7609]
 */
static inline int smc_rmb_wnd_update_limit(int rmbe_size)
{
	return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
}

static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr,
						bool is_rmb, int bufsize)
{
	struct smc_buf_desc *buf_desc;
	struct smc_link *lnk;
	int rc;

	/* try to alloc a new buffer */
	buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
	if (!buf_desc)
		return ERR_PTR(-ENOMEM);

	buf_desc->order = get_order(bufsize);
	buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN |
				      __GFP_NOMEMALLOC | __GFP_COMP |
				      __GFP_NORETRY | __GFP_ZERO,
				      buf_desc->order);
	if (!buf_desc->pages) {
		kfree(buf_desc);
		return ERR_PTR(-EAGAIN);
	}
	buf_desc->cpu_addr = (void *)page_address(buf_desc->pages);

	/* build the sg table from the pages */
	lnk = &lgr->lnk[SMC_SINGLE_LINK];
	rc = sg_alloc_table(&buf_desc->sgt[SMC_SINGLE_LINK], 1,
			    GFP_KERNEL);
	if (rc) {
		smc_buf_free(lgr, is_rmb, buf_desc);
		return ERR_PTR(rc);
	}
	sg_set_buf(buf_desc->sgt[SMC_SINGLE_LINK].sgl,
		   buf_desc->cpu_addr, bufsize);

	/* map sg table to DMA address */
	rc = smc_ib_buf_map_sg(lnk->smcibdev, buf_desc,
			       is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
	/* SMC protocol depends on mapping to one DMA address only */
	if (rc != 1) {
		smc_buf_free(lgr, is_rmb, buf_desc);
		return ERR_PTR(-EAGAIN);
	}

	/* create a new memory region for the RMB */
	if (is_rmb) {
		rc = smc_ib_get_memory_region(lnk->roce_pd,
					      IB_ACCESS_REMOTE_WRITE |
					      IB_ACCESS_LOCAL_WRITE,
					      buf_desc);
		if (rc) {
			smc_buf_free(lgr, is_rmb, buf_desc);
			return ERR_PTR(rc);
		}
	}

	buf_desc->len = bufsize;
	return buf_desc;
}

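/* Note on the allocation strategy above: the peer addresses an RMB through a
 * single rtoken (one rkey plus one virtual address, see smc_rtoken_add()
 * below), so the buffer has to map to exactly one DMA address - hence the
 * physically contiguous higher-order allocation and the "rc != 1" check.
 * The __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC flags keep a failed
 * large allocation cheap and quiet; the caller, __smc_buf_create(), simply
 * retries with the next smaller buffer size on -EAGAIN.
 */
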
#define SMCD_DMBE_SIZES		7 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */

static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr,
						bool is_dmb, int bufsize)
{
	struct smc_buf_desc *buf_desc;
	int rc;

	if (smc_compress_bufsize(bufsize) > SMCD_DMBE_SIZES)
		return ERR_PTR(-EAGAIN);

	/* try to alloc a new DMB */
	buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
	if (!buf_desc)
		return ERR_PTR(-ENOMEM);
	if (is_dmb) {
		rc = smc_ism_register_dmb(lgr, bufsize, buf_desc);
		if (rc) {
			kfree(buf_desc);
			return ERR_PTR(-EAGAIN);
		}
		buf_desc->pages = virt_to_page(buf_desc->cpu_addr);
		/* CDC header stored in buf. So, pretend it was smaller */
		buf_desc->len = bufsize - sizeof(struct smcd_cdc_msg);
	} else {
		buf_desc->cpu_addr = kzalloc(bufsize, GFP_KERNEL |
					     __GFP_NOWARN | __GFP_NORETRY |
					     __GFP_NOMEMALLOC);
		if (!buf_desc->cpu_addr) {
			kfree(buf_desc);
			return ERR_PTR(-EAGAIN);
		}
		buf_desc->len = bufsize;
	}
	return buf_desc;
}

static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
{
	struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM);
	struct smc_connection *conn = &smc->conn;
	struct smc_link_group *lgr = conn->lgr;
	struct list_head *buf_list;
	int bufsize, bufsize_short;
	int sk_buf_size;
	rwlock_t *lock;

	if (is_rmb)
		/* use socket recv buffer size (w/o overhead) as start value */
		sk_buf_size = smc->sk.sk_rcvbuf / 2;
	else
		/* use socket send buffer size (w/o overhead) as start value */
		sk_buf_size = smc->sk.sk_sndbuf / 2;

	for (bufsize_short = smc_compress_bufsize(sk_buf_size);
	     bufsize_short >= 0; bufsize_short--) {

		if (is_rmb) {
			lock = &lgr->rmbs_lock;
			buf_list = &lgr->rmbs[bufsize_short];
		} else {
			lock = &lgr->sndbufs_lock;
			buf_list = &lgr->sndbufs[bufsize_short];
		}
		bufsize = smc_uncompress_bufsize(bufsize_short);
		if ((1 << get_order(bufsize)) > SG_MAX_SINGLE_ALLOC)
			continue;

		/* check for reusable slot in the link group */
		buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list);
		if (buf_desc) {
			memset(buf_desc->cpu_addr, 0, bufsize);
			break; /* found reusable slot */
		}

		if (is_smcd)
			buf_desc = smcd_new_buf_create(lgr, is_rmb, bufsize);
		else
			buf_desc = smcr_new_buf_create(lgr, is_rmb, bufsize);

		if (PTR_ERR(buf_desc) == -ENOMEM)
			break;
		if (IS_ERR(buf_desc))
			continue;

		buf_desc->used = 1;
		write_lock_bh(lock);
		list_add(&buf_desc->list, buf_list);
		write_unlock_bh(lock);
		break; /* found */
	}

	if (IS_ERR(buf_desc))
		return -ENOMEM;

	if (is_rmb) {
		conn->rmb_desc = buf_desc;
		conn->rmbe_size_short = bufsize_short;
		smc->sk.sk_rcvbuf = bufsize * 2;
		atomic_set(&conn->bytes_to_rcv, 0);
		conn->rmbe_update_limit =
			smc_rmb_wnd_update_limit(buf_desc->len);
		if (is_smcd)
			smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */
	} else {
		conn->sndbuf_desc = buf_desc;
		smc->sk.sk_sndbuf = bufsize * 2;
		atomic_set(&conn->sndbuf_space, bufsize);
	}
	return 0;
}

void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	if (!conn->lgr || conn->lgr->is_smcd)
		return;
	smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
			       conn->sndbuf_desc, DMA_TO_DEVICE);
}

void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	if (!conn->lgr || conn->lgr->is_smcd)
		return;
	smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
				  conn->sndbuf_desc, DMA_TO_DEVICE);
}

void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	if (!conn->lgr || conn->lgr->is_smcd)
		return;
	smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
			       conn->rmb_desc, DMA_FROM_DEVICE);
}

void smc_rmb_sync_sg_for_device(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	if (!conn->lgr || conn->lgr->is_smcd)
		return;
	smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
				  conn->rmb_desc, DMA_FROM_DEVICE);
}

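/* The four sync_sg helpers above wrap the streaming DMA API for the two
 * buffer directions. The expected usage pattern on the send side is roughly
 * (illustrative sketch; the actual callers are in the tx/rx paths):
 *
 *	smc_sndbuf_sync_sg_for_cpu(conn);	// CPU may now write the sndbuf
 *	// copy user data into conn->sndbuf_desc->cpu_addr ...
 *	smc_sndbuf_sync_sg_for_device(conn);	// hand ownership back before
 *						// posting the RDMA write
 *
 * and symmetrically smc_rmb_sync_sg_for_cpu() before reading received data
 * out of the RMB. For SMC-D there is no RDMA mapping, so all helpers return
 * immediately.
 */
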
/* create the send and receive buffer for an SMC socket;
 * receive buffers are called RMBs;
 * (even though the SMC protocol allows more than one RMB-element per RMB,
 * the Linux implementation uses just one RMB-element per RMB, i.e. uses an
 * extra RMB for every connection in a link group)
 */
int smc_buf_create(struct smc_sock *smc, bool is_smcd)
{
	int rc;

	/* create send buffer */
	rc = __smc_buf_create(smc, is_smcd, false);
	if (rc)
		return rc;
	/* create rmb */
	rc = __smc_buf_create(smc, is_smcd, true);
	if (rc)
		smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc);
	return rc;
}

static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr)
{
	int i;

	for_each_clear_bit(i, lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX) {
		if (!test_and_set_bit(i, lgr->rtokens_used_mask))
			return i;
	}
	return -ENOSPC;
}

/* add a new rtoken from peer */
int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey)
{
	u64 dma_addr = be64_to_cpu(nw_vaddr);
	u32 rkey = ntohl(nw_rkey);
	int i;

	for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
		if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) &&
		    (lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr == dma_addr) &&
		    test_bit(i, lgr->rtokens_used_mask)) {
			/* already in list */
			return i;
		}
	}
	i = smc_rmb_reserve_rtoken_idx(lgr);
	if (i < 0)
		return i;
	lgr->rtokens[i][SMC_SINGLE_LINK].rkey = rkey;
	lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = dma_addr;
	return i;
}

/* delete an rtoken */
int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey)
{
	u32 rkey = ntohl(nw_rkey);
	int i;

	for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
		if (lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey &&
		    test_bit(i, lgr->rtokens_used_mask)) {
			lgr->rtokens[i][SMC_SINGLE_LINK].rkey = 0;
			lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = 0;

			clear_bit(i, lgr->rtokens_used_mask);
			return 0;
		}
	}
	return -ENOENT;
}

/* save rkey and dma_addr received from peer during clc handshake */
int smc_rmb_rtoken_handling(struct smc_connection *conn,
			    struct smc_clc_msg_accept_confirm *clc)
{
	conn->rtoken_idx = smc_rtoken_add(conn->lgr, clc->rmb_dma_addr,
					  clc->rmb_rkey);
	if (conn->rtoken_idx < 0)
		return conn->rtoken_idx;
	return 0;
}

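/* The index returned by smc_rtoken_add() is remembered per connection as
 * conn->rtoken_idx. When data is pushed to the peer, the transmit path can
 * pick up the remote buffer coordinates via (illustrative access pattern):
 *
 *	conn->lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr
 *	conn->lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey
 *
 * which name the peer RMB's virtual address and rkey for RDMA writes.
 */
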
/* Called (from smc_exit) when module is removed */
void smc_core_exit(void)
{
	struct smc_link_group *lgr, *lg;
	LIST_HEAD(lgr_freeing_list);

	spin_lock_bh(&smc_lgr_list.lock);
	if (!list_empty(&smc_lgr_list.list))
		list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
	spin_unlock_bh(&smc_lgr_list.lock);
	list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
		list_del_init(&lgr->list);
		if (!lgr->is_smcd)
			smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
		cancel_delayed_work_sync(&lgr->free_work);
		smc_lgr_free(lgr);	/* free link group */
	}
}