// SPDX-License-Identifier: GPL-2.0
/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * Basic Transport Functions exploiting Infiniband API
 *
 * Copyright IBM Corp. 2016
 *
 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
 */

#include <linux/socket.h>
#include <linux/if_vlan.h>
#include <linux/random.h>
#include <linux/workqueue.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_cache.h>

#include "smc.h"
#include "smc_clc.h"
#include "smc_core.h"
#include "smc_ib.h"
#include "smc_wr.h"
#include "smc_llc.h"
#include "smc_cdc.h"
#include "smc_close.h"
#include "smc_ism.h"

#define SMC_LGR_NUM_INCR		256
#define SMC_LGR_FREE_DELAY_SERV		(600 * HZ)
#define SMC_LGR_FREE_DELAY_CLNT		(SMC_LGR_FREE_DELAY_SERV + 10 * HZ)
#define SMC_LGR_FREE_DELAY_FAST		(8 * HZ)

static struct smc_lgr_list smc_lgr_list = {	/* established link groups */
	.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
	.list = LIST_HEAD_INIT(smc_lgr_list.list),
	.num = 0,
};

static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
			 struct smc_buf_desc *buf_desc);

static void smc_lgr_schedule_free_work(struct smc_link_group *lgr)
{
	/* client link group creation always follows the server link group
	 * creation. For client use a somewhat higher removal delay time,
	 * otherwise there is a risk of out-of-sync link groups.
	 */
	mod_delayed_work(system_wq, &lgr->free_work,
			 (!lgr->is_smcd && lgr->role == SMC_CLNT) ?
			 SMC_LGR_FREE_DELAY_CLNT : SMC_LGR_FREE_DELAY_SERV);
}

void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr)
{
	mod_delayed_work(system_wq, &lgr->free_work, SMC_LGR_FREE_DELAY_FAST);
}

/* Register connection's alert token in our lookup structure.
 * To use rbtrees we have to implement our own insert core.
 * Requires @conns_lock
 * @conn	connection to register
 */
static void smc_lgr_add_alert_token(struct smc_connection *conn)
{
	struct rb_node **link, *parent = NULL;
	u32 token = conn->alert_token_local;

	link = &conn->lgr->conns_all.rb_node;
	while (*link) {
		struct smc_connection *cur = rb_entry(*link,
					struct smc_connection, alert_node);

		parent = *link;
		if (cur->alert_token_local > token)
			link = &parent->rb_left;
		else
			link = &parent->rb_right;
	}
	/* Put the new node there */
	rb_link_node(&conn->alert_node, parent, link);
	rb_insert_color(&conn->alert_node, &conn->lgr->conns_all);
}
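
/* Note: the conns_all tree is keyed by alert_token_local, and
 * smc_lgr_find_conn() walks it with the same ordering for lookups.
 * No duplicate check is needed here because smc_lgr_register_conn()
 * only passes tokens that are not yet in use within the link group.
 */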

/* Register connection in link group by assigning an alert token
 * registered in a search tree.
 * Requires @conns_lock
 * Note that '0' is a reserved value and not assigned.
 */
static void smc_lgr_register_conn(struct smc_connection *conn)
{
	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
	static atomic_t nexttoken = ATOMIC_INIT(0);

	/* find a new alert_token_local value not yet used by some connection
	 * in this link group
	 */
	sock_hold(&smc->sk); /* sock_put in smc_lgr_unregister_conn() */
	while (!conn->alert_token_local) {
		conn->alert_token_local = atomic_inc_return(&nexttoken);
		if (smc_lgr_find_conn(conn->alert_token_local, conn->lgr))
			conn->alert_token_local = 0;
	}
	smc_lgr_add_alert_token(conn);
	conn->lgr->conns_num++;
}

/* Unregister connection and reset the alert token of the given connection
 */
static void __smc_lgr_unregister_conn(struct smc_connection *conn)
{
	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
	struct smc_link_group *lgr = conn->lgr;

	rb_erase(&conn->alert_node, &lgr->conns_all);
	lgr->conns_num--;
	conn->alert_token_local = 0;
	conn->lgr = NULL;
	sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */
}

/* Unregister connection and trigger lgr freeing if applicable
 */
static void smc_lgr_unregister_conn(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;
	int reduced = 0;

	write_lock_bh(&lgr->conns_lock);
	if (conn->alert_token_local) {
		reduced = 1;
		__smc_lgr_unregister_conn(conn);
	}
	write_unlock_bh(&lgr->conns_lock);
	if (!reduced || lgr->conns_num)
		return;
	smc_lgr_schedule_free_work(lgr);
}
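
/* Lifetime note: the sock_hold() taken in smc_lgr_register_conn() keeps
 * the smc socket alive for as long as the connection is reachable via
 * the conns_all tree; __smc_lgr_unregister_conn() drops that reference
 * only after the connection has been removed from the tree.
 */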

/* Send delete link, either as client to request the initiation
 * of the DELETE LINK sequence from server; or as server to
 * initiate the delete processing. See smc_llc_rx_delete_link().
 */
static int smc_link_send_delete(struct smc_link *lnk)
{
	if (lnk->state == SMC_LNK_ACTIVE &&
	    !smc_llc_send_delete_link(lnk, SMC_LLC_REQ, true)) {
		smc_llc_link_deleting(lnk);
		return 0;
	}
	return -ENOTCONN;
}

static void smc_lgr_free_work(struct work_struct *work)
{
	struct smc_link_group *lgr = container_of(to_delayed_work(work),
						  struct smc_link_group,
						  free_work);
	bool conns;

	spin_lock_bh(&smc_lgr_list.lock);
	if (list_empty(&lgr->list))
		goto free;
	read_lock_bh(&lgr->conns_lock);
	conns = RB_EMPTY_ROOT(&lgr->conns_all);
	read_unlock_bh(&lgr->conns_lock);
	if (!conns) { /* number of lgr connections is no longer zero */
		spin_unlock_bh(&smc_lgr_list.lock);
		return;
	}
	list_del_init(&lgr->list); /* remove from smc_lgr_list */
free:
	spin_unlock_bh(&smc_lgr_list.lock);

	if (!lgr->is_smcd && !lgr->terminating) {
		/* try to send del link msg, on error free lgr immediately */
		if (!smc_link_send_delete(&lgr->lnk[SMC_SINGLE_LINK])) {
			/* reschedule in case we never receive a response */
			smc_lgr_schedule_free_work(lgr);
			return;
		}
	}

	if (!delayed_work_pending(&lgr->free_work)) {
		struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];

		if (!lgr->is_smcd && lnk->state != SMC_LNK_INACTIVE)
			smc_llc_link_inactive(lnk);
		smc_lgr_free(lgr);
	}
}
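
/* smc_lgr_create() below sets up the single SMC-R link step by step
 * (LLC state, work request memory, protection domain, queue pair);
 * on error the goto labels tear the partially initialized link down
 * in reverse order of setup.
 */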

/* create a new SMC link group */
static int smc_lgr_create(struct smc_sock *smc, bool is_smcd,
			  struct smc_ib_device *smcibdev, u8 ibport,
			  char *peer_systemid, unsigned short vlan_id,
			  struct smcd_dev *smcismdev, u64 peer_gid)
{
	struct smc_link_group *lgr;
	struct smc_link *lnk;
	u8 rndvec[3];
	int rc = 0;
	int i;

	if (is_smcd && vlan_id) {
		rc = smc_ism_get_vlan(smcismdev, vlan_id);
		if (rc)
			goto out;
	}

	lgr = kzalloc(sizeof(*lgr), GFP_KERNEL);
	if (!lgr) {
		rc = -ENOMEM;
		goto out;
	}
	lgr->is_smcd = is_smcd;
	lgr->sync_err = 0;
	lgr->vlan_id = vlan_id;
	rwlock_init(&lgr->sndbufs_lock);
	rwlock_init(&lgr->rmbs_lock);
	rwlock_init(&lgr->conns_lock);
	for (i = 0; i < SMC_RMBE_SIZES; i++) {
		INIT_LIST_HEAD(&lgr->sndbufs[i]);
		INIT_LIST_HEAD(&lgr->rmbs[i]);
	}
	smc_lgr_list.num += SMC_LGR_NUM_INCR;
	memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE);
	INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work);
	lgr->conns_all = RB_ROOT;
	if (is_smcd) {
		/* SMC-D specific settings */
		lgr->peer_gid = peer_gid;
		lgr->smcd = smcismdev;
	} else {
		/* SMC-R specific settings */
		lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
		memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN);

		lnk = &lgr->lnk[SMC_SINGLE_LINK];
		/* initialize link */
		lnk->state = SMC_LNK_ACTIVATING;
		lnk->link_id = SMC_SINGLE_LINK;
		lnk->smcibdev = smcibdev;
		lnk->ibport = ibport;
		lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu;
		if (!smcibdev->initialized)
			smc_ib_setup_per_ibdev(smcibdev);
		get_random_bytes(rndvec, sizeof(rndvec));
		lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) +
				   (rndvec[2] << 16);
		rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport,
					  vlan_id, lnk->gid, &lnk->sgid_index);
		if (rc)
			goto free_lgr;
		rc = smc_llc_link_init(lnk);
		if (rc)
			goto free_lgr;
		rc = smc_wr_alloc_link_mem(lnk);
		if (rc)
			goto clear_llc_lnk;
		rc = smc_ib_create_protection_domain(lnk);
		if (rc)
			goto free_link_mem;
		rc = smc_ib_create_queue_pair(lnk);
		if (rc)
			goto dealloc_pd;
		rc = smc_wr_create_link(lnk);
		if (rc)
			goto destroy_qp;
	}
	smc->conn.lgr = lgr;
	spin_lock_bh(&smc_lgr_list.lock);
	list_add(&lgr->list, &smc_lgr_list.list);
	spin_unlock_bh(&smc_lgr_list.lock);
	return 0;

destroy_qp:
	smc_ib_destroy_queue_pair(lnk);
dealloc_pd:
	smc_ib_dealloc_protection_domain(lnk);
free_link_mem:
	smc_wr_free_link_mem(lnk);
clear_llc_lnk:
	smc_llc_link_clear(lnk);
free_lgr:
	kfree(lgr);
out:
	return rc;
}

static void smc_buf_unuse(struct smc_connection *conn)
{
	if (conn->sndbuf_desc)
		conn->sndbuf_desc->used = 0;
	if (conn->rmb_desc) {
		if (!conn->rmb_desc->regerr) {
			conn->rmb_desc->reused = 1;
			conn->rmb_desc->used = 0;
		} else {
			/* buf registration failed, reuse not possible */
			struct smc_link_group *lgr = conn->lgr;

			write_lock_bh(&lgr->rmbs_lock);
			list_del(&conn->rmb_desc->list);
			write_unlock_bh(&lgr->rmbs_lock);

			smc_buf_free(lgr, true, conn->rmb_desc);
		}
	}
}

/* remove a finished connection from its link group */
void smc_conn_free(struct smc_connection *conn)
{
	if (!conn->lgr)
		return;
	if (conn->lgr->is_smcd) {
		smc_ism_unset_conn(conn);
		tasklet_kill(&conn->rx_tsklet);
	} else {
		smc_cdc_tx_dismiss_slots(conn);
	}
	smc_lgr_unregister_conn(conn);
	smc_buf_unuse(conn);
}

static void smc_link_clear(struct smc_link *lnk)
{
	lnk->peer_qpn = 0;
	smc_llc_link_clear(lnk);
	smc_ib_modify_qp_reset(lnk);
	smc_wr_free_link(lnk);
	smc_ib_destroy_queue_pair(lnk);
	smc_ib_dealloc_protection_domain(lnk);
	smc_wr_free_link_mem(lnk);
}

static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb,
			  struct smc_buf_desc *buf_desc)
{
	struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];

	if (is_rmb) {
		if (buf_desc->mr_rx[SMC_SINGLE_LINK])
			smc_ib_put_memory_region(
					buf_desc->mr_rx[SMC_SINGLE_LINK]);
		smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc,
				    DMA_FROM_DEVICE);
	} else {
		smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc,
				    DMA_TO_DEVICE);
	}
	sg_free_table(&buf_desc->sgt[SMC_SINGLE_LINK]);
	if (buf_desc->pages)
		__free_pages(buf_desc->pages, buf_desc->order);
	kfree(buf_desc);
}
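
/* SMC-D buffers are not DMA-mapped: freeing a DMB means unregistering
 * it with the ISM device, while an SMC-D send buffer is plain kernel
 * memory and is simply kfree'd.
 */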

static void smcd_buf_free(struct smc_link_group *lgr, bool is_dmb,
			  struct smc_buf_desc *buf_desc)
{
	if (is_dmb) {
		/* restore original buf len */
		buf_desc->len += sizeof(struct smcd_cdc_msg);
		smc_ism_unregister_dmb(lgr->smcd, buf_desc);
	} else {
		kfree(buf_desc->cpu_addr);
	}
	kfree(buf_desc);
}

static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
			 struct smc_buf_desc *buf_desc)
{
	if (lgr->is_smcd)
		smcd_buf_free(lgr, is_rmb, buf_desc);
	else
		smcr_buf_free(lgr, is_rmb, buf_desc);
}

static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb)
{
	struct smc_buf_desc *buf_desc, *bf_desc;
	struct list_head *buf_list;
	int i;

	for (i = 0; i < SMC_RMBE_SIZES; i++) {
		if (is_rmb)
			buf_list = &lgr->rmbs[i];
		else
			buf_list = &lgr->sndbufs[i];
		list_for_each_entry_safe(buf_desc, bf_desc, buf_list,
					 list) {
			list_del(&buf_desc->list);
			smc_buf_free(lgr, is_rmb, buf_desc);
		}
	}
}

static void smc_lgr_free_bufs(struct smc_link_group *lgr)
{
	/* free send buffers */
	__smc_lgr_free_bufs(lgr, false);
	/* free rmbs */
	__smc_lgr_free_bufs(lgr, true);
}

/* remove a link group */
void smc_lgr_free(struct smc_link_group *lgr)
{
	smc_lgr_free_bufs(lgr);
	if (lgr->is_smcd)
		smc_ism_put_vlan(lgr->smcd, lgr->vlan_id);
	else
		smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]);
	kfree(lgr);
}

void smc_lgr_forget(struct smc_link_group *lgr)
{
	spin_lock_bh(&smc_lgr_list.lock);
	/* do not use this link group for new connections */
	if (!list_empty(&lgr->list))
		list_del_init(&lgr->list);
	spin_unlock_bh(&smc_lgr_list.lock);
}

/* terminate link group abnormally */
static void __smc_lgr_terminate(struct smc_link_group *lgr)
{
	struct smc_connection *conn;
	struct smc_sock *smc;
	struct rb_node *node;

	if (lgr->terminating)
		return;	/* lgr already terminating */
	lgr->terminating = 1;
	if (!list_empty(&lgr->list)) /* forget lgr */
		list_del_init(&lgr->list);
	if (!lgr->is_smcd)
		smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);

	write_lock_bh(&lgr->conns_lock);
	node = rb_first(&lgr->conns_all);
	while (node) {
		conn = rb_entry(node, struct smc_connection, alert_node);
		smc = container_of(conn, struct smc_sock, conn);
		sock_hold(&smc->sk); /* sock_put in close work */
		conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
		__smc_lgr_unregister_conn(conn);
		write_unlock_bh(&lgr->conns_lock);
		if (!schedule_work(&conn->close_work))
			sock_put(&smc->sk);
		write_lock_bh(&lgr->conns_lock);
		node = rb_first(&lgr->conns_all);
	}
	write_unlock_bh(&lgr->conns_lock);
	if (!lgr->is_smcd)
		wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait);
	smc_lgr_schedule_free_work(lgr);
}

void smc_lgr_terminate(struct smc_link_group *lgr)
{
	spin_lock_bh(&smc_lgr_list.lock);
	__smc_lgr_terminate(lgr);
	spin_unlock_bh(&smc_lgr_list.lock);
}

/* Called when IB port is terminated */
void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport)
{
	struct smc_link_group *lgr, *l;

	spin_lock_bh(&smc_lgr_list.lock);
	list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
		if (!lgr->is_smcd &&
		    lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev &&
		    lgr->lnk[SMC_SINGLE_LINK].ibport == ibport)
			__smc_lgr_terminate(lgr);
	}
	spin_unlock_bh(&smc_lgr_list.lock);
}
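
/* smc_smcd_terminate() below runs the common termination under
 * smc_lgr_list.lock and collects the affected link groups on a private
 * list; the groups are then freed outside the lock because
 * cancel_delayed_work_sync() may sleep.
 */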

/* Called when SMC-D device is terminated or peer is lost */
void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid)
{
	struct smc_link_group *lgr, *l;
	LIST_HEAD(lgr_free_list);

	/* run common cleanup function and build free list */
	spin_lock_bh(&smc_lgr_list.lock);
	list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
		if (lgr->is_smcd && lgr->smcd == dev &&
		    (!peer_gid || lgr->peer_gid == peer_gid) &&
		    !list_empty(&lgr->list)) {
			__smc_lgr_terminate(lgr);
			list_move(&lgr->list, &lgr_free_list);
		}
	}
	spin_unlock_bh(&smc_lgr_list.lock);

	/* cancel the regular free workers and actually free lgrs */
	list_for_each_entry_safe(lgr, l, &lgr_free_list, list) {
		list_del_init(&lgr->list);
		cancel_delayed_work_sync(&lgr->free_work);
		smc_lgr_free(lgr);
	}
}

/* Determine vlan of internal TCP socket.
 * @vlan_id: address to store the determined vlan id into
 */
int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id)
{
	struct dst_entry *dst = sk_dst_get(clcsock->sk);
	struct net_device *ndev;
	int i, nest_lvl, rc = 0;

	*vlan_id = 0;
	if (!dst) {
		rc = -ENOTCONN;
		goto out;
	}
	if (!dst->dev) {
		rc = -ENODEV;
		goto out_rel;
	}

	ndev = dst->dev;
	if (is_vlan_dev(ndev)) {
		*vlan_id = vlan_dev_vlan_id(ndev);
		goto out_rel;
	}

	rtnl_lock();
	nest_lvl = dev_get_nest_level(ndev);
	for (i = 0; i < nest_lvl; i++) {
		struct list_head *lower = &ndev->adj_list.lower;

		if (list_empty(lower))
			break;
		lower = lower->next;
		ndev = (struct net_device *)netdev_lower_get_next(ndev, &lower);
		if (is_vlan_dev(ndev)) {
			*vlan_id = vlan_dev_vlan_id(ndev);
			break;
		}
	}
	rtnl_unlock();

out_rel:
	dst_release(dst);
out:
	return rc;
}

static bool smcr_lgr_match(struct smc_link_group *lgr,
			   struct smc_clc_msg_local *lcl,
			   enum smc_lgr_role role)
{
	return !memcmp(lgr->peer_systemid, lcl->id_for_peer,
		       SMC_SYSTEMID_LEN) &&
		!memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid,
			SMC_GID_SIZE) &&
		!memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac,
			sizeof(lcl->mac)) &&
		lgr->role == role;
}

static bool smcd_lgr_match(struct smc_link_group *lgr,
			   struct smcd_dev *smcismdev, u64 peer_gid)
{
	return lgr->peer_gid == peer_gid && lgr->smcd == smcismdev;
}
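
/* A connection may join an existing link group only when the match
 * helpers above agree on the peer (system id, GID and MAC for SMC-R;
 * ISM device and GID for SMC-D), the VLAN matches and the group is not
 * in sync_err state. The SMC_RMBS_PER_LGR_MAX cap is enforced on the
 * server side only; a client follows the server's link group choice.
 */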

/* create a new SMC connection (and a new link group if necessary) */
int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact,
		    struct smc_ib_device *smcibdev, u8 ibport,
		    struct smc_clc_msg_local *lcl, struct smcd_dev *smcd,
		    u64 peer_gid)
{
	struct smc_connection *conn = &smc->conn;
	int local_contact = SMC_FIRST_CONTACT;
	struct smc_link_group *lgr;
	unsigned short vlan_id;
	enum smc_lgr_role role;
	int rc = 0;

	role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
	rc = smc_vlan_by_tcpsk(smc->clcsock, &vlan_id);
	if (rc)
		return rc;

	if ((role == SMC_CLNT) && srv_first_contact)
		/* create new link group as well */
		goto create;

	/* determine if an existing link group can be reused */
	spin_lock_bh(&smc_lgr_list.lock);
	list_for_each_entry(lgr, &smc_lgr_list.list, list) {
		write_lock_bh(&lgr->conns_lock);
		if ((is_smcd ? smcd_lgr_match(lgr, smcd, peer_gid) :
		     smcr_lgr_match(lgr, lcl, role)) &&
		    !lgr->sync_err &&
		    lgr->vlan_id == vlan_id &&
		    (role == SMC_CLNT ||
		     lgr->conns_num < SMC_RMBS_PER_LGR_MAX)) {
			/* link group found */
			local_contact = SMC_REUSE_CONTACT;
			conn->lgr = lgr;
			smc_lgr_register_conn(conn); /* add smc conn to lgr */
			write_unlock_bh(&lgr->conns_lock);
			break;
		}
		write_unlock_bh(&lgr->conns_lock);
	}
	spin_unlock_bh(&smc_lgr_list.lock);

	if (role == SMC_CLNT && !srv_first_contact &&
	    (local_contact == SMC_FIRST_CONTACT)) {
		/* Server reuses a link group, but Client wants to start
		 * a new one
		 * send out_of_sync decline, reason synchr. error
		 */
		return -ENOLINK;
	}

create:
	if (local_contact == SMC_FIRST_CONTACT) {
		rc = smc_lgr_create(smc, is_smcd, smcibdev, ibport,
				    lcl->id_for_peer, vlan_id, smcd, peer_gid);
		if (rc)
			goto out;
		smc_lgr_register_conn(conn); /* add smc conn to lgr */
	}
	conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
	conn->local_tx_ctrl.len = SMC_WR_TX_SIZE;
	conn->urg_state = SMC_URG_READ;
	if (is_smcd) {
		conn->rx_off = sizeof(struct smcd_cdc_msg);
		smcd_cdc_rx_init(conn); /* init tasklet for this conn */
	}
#ifndef KERNEL_HAS_ATOMIC64
	spin_lock_init(&conn->acurs_lock);
#endif

out:
	return rc ? rc : local_contact;
}

/* convert the RMB size into the compressed notation - minimum 16K.
 * In contrast to plain ilog2, this rounds towards the next power of 2,
 * so the socket application gets at least its desired sndbuf / rcvbuf size.
 */
static u8 smc_compress_bufsize(int size)
{
	u8 compressed;

	if (size <= SMC_BUF_MIN_SIZE)
		return 0;

	size = (size - 1) >> 14;
	compressed = ilog2(size) + 1;
	if (compressed >= SMC_RMBE_SIZES)
		compressed = SMC_RMBE_SIZES - 1;
	return compressed;
}

/* convert the RMB size from compressed notation into integer */
int smc_uncompress_bufsize(u8 compressed)
{
	u32 size;

	size = 0x00000001 << (((int)compressed) + 14);
	return (int)size;
}
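
/* Worked example of the compressed notation, assuming the usual
 * SMC_BUF_MIN_SIZE of 16KB:
 *	compressed 0 -> 16KB, 1 -> 32KB, 2 -> 64KB, 3 -> 128KB, ...
 * A request of 100000 bytes gives (100000 - 1) >> 14 = 6 and
 * ilog2(6) + 1 = 3, i.e. it is rounded up to a 128KB buffer.
 */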

/* try to reuse a sndbuf or rmb description slot for a certain
 * buffer size; if not available, return NULL
 */
static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize,
					     rwlock_t *lock,
					     struct list_head *buf_list)
{
	struct smc_buf_desc *buf_slot;

	read_lock_bh(lock);
	list_for_each_entry(buf_slot, buf_list, list) {
		if (cmpxchg(&buf_slot->used, 0, 1) == 0) {
			read_unlock_bh(lock);
			return buf_slot;
		}
	}
	read_unlock_bh(lock);
	return NULL;
}

/* one of the conditions for announcing a receiver's current window size is
 * that it "results in a minimum increase in the window size of 10% of the
 * receive buffer space" [RFC7609]
 */
static inline int smc_rmb_wnd_update_limit(int rmbe_size)
{
	return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
}

static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr,
						bool is_rmb, int bufsize)
{
	struct smc_buf_desc *buf_desc;
	struct smc_link *lnk;
	int rc;

	/* try to alloc a new buffer */
	buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
	if (!buf_desc)
		return ERR_PTR(-ENOMEM);

	buf_desc->order = get_order(bufsize);
	buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN |
				      __GFP_NOMEMALLOC | __GFP_COMP |
				      __GFP_NORETRY | __GFP_ZERO,
				      buf_desc->order);
	if (!buf_desc->pages) {
		kfree(buf_desc);
		return ERR_PTR(-EAGAIN);
	}
	buf_desc->cpu_addr = (void *)page_address(buf_desc->pages);

	/* build the sg table from the pages */
	lnk = &lgr->lnk[SMC_SINGLE_LINK];
	rc = sg_alloc_table(&buf_desc->sgt[SMC_SINGLE_LINK], 1,
			    GFP_KERNEL);
	if (rc) {
		smc_buf_free(lgr, is_rmb, buf_desc);
		return ERR_PTR(rc);
	}
	sg_set_buf(buf_desc->sgt[SMC_SINGLE_LINK].sgl,
		   buf_desc->cpu_addr, bufsize);

	/* map sg table to DMA address */
	rc = smc_ib_buf_map_sg(lnk->smcibdev, buf_desc,
			       is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
	/* SMC protocol depends on mapping to one DMA address only */
	if (rc != 1) {
		smc_buf_free(lgr, is_rmb, buf_desc);
		return ERR_PTR(-EAGAIN);
	}

	/* create a new memory region for the RMB */
	if (is_rmb) {
		rc = smc_ib_get_memory_region(lnk->roce_pd,
					      IB_ACCESS_REMOTE_WRITE |
					      IB_ACCESS_LOCAL_WRITE,
					      buf_desc);
		if (rc) {
			smc_buf_free(lgr, is_rmb, buf_desc);
			return ERR_PTR(rc);
		}
	}

	buf_desc->len = bufsize;
	return buf_desc;
}
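
/* Only RMBs get an IB memory region with IB_ACCESS_REMOTE_WRITE, since
 * the peer RDMA-writes directly into them; send buffers only need the
 * local DMA mapping used when posting RDMA writes towards the peer.
 */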

#define SMCD_DMBE_SIZES		7 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */

static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr,
						bool is_dmb, int bufsize)
{
	struct smc_buf_desc *buf_desc;
	int rc;

	if (smc_compress_bufsize(bufsize) > SMCD_DMBE_SIZES)
		return ERR_PTR(-EAGAIN);

	/* try to alloc a new DMB */
	buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
	if (!buf_desc)
		return ERR_PTR(-ENOMEM);
	if (is_dmb) {
		rc = smc_ism_register_dmb(lgr, bufsize, buf_desc);
		if (rc) {
			kfree(buf_desc);
			return ERR_PTR(-EAGAIN);
		}
		buf_desc->pages = virt_to_page(buf_desc->cpu_addr);
		/* CDC header stored in buf. So, pretend it was smaller */
		buf_desc->len = bufsize - sizeof(struct smcd_cdc_msg);
	} else {
		buf_desc->cpu_addr = kzalloc(bufsize, GFP_KERNEL |
					     __GFP_NOWARN | __GFP_NORETRY |
					     __GFP_NOMEMALLOC);
		if (!buf_desc->cpu_addr) {
			kfree(buf_desc);
			return ERR_PTR(-EAGAIN);
		}
		buf_desc->len = bufsize;
	}
	return buf_desc;
}

static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
{
	struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM);
	struct smc_connection *conn = &smc->conn;
	struct smc_link_group *lgr = conn->lgr;
	struct list_head *buf_list;
	int bufsize, bufsize_short;
	int sk_buf_size;
	rwlock_t *lock;

	if (is_rmb)
		/* use socket recv buffer size (w/o overhead) as start value */
		sk_buf_size = smc->sk.sk_rcvbuf / 2;
	else
		/* use socket send buffer size (w/o overhead) as start value */
		sk_buf_size = smc->sk.sk_sndbuf / 2;

	for (bufsize_short = smc_compress_bufsize(sk_buf_size);
	     bufsize_short >= 0; bufsize_short--) {

		if (is_rmb) {
			lock = &lgr->rmbs_lock;
			buf_list = &lgr->rmbs[bufsize_short];
		} else {
			lock = &lgr->sndbufs_lock;
			buf_list = &lgr->sndbufs[bufsize_short];
		}
		bufsize = smc_uncompress_bufsize(bufsize_short);
		if ((1 << get_order(bufsize)) > SG_MAX_SINGLE_ALLOC)
			continue;

		/* check for reusable slot in the link group */
		buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list);
		if (buf_desc) {
			memset(buf_desc->cpu_addr, 0, bufsize);
			break; /* found reusable slot */
		}

		if (is_smcd)
			buf_desc = smcd_new_buf_create(lgr, is_rmb, bufsize);
		else
			buf_desc = smcr_new_buf_create(lgr, is_rmb, bufsize);

		if (PTR_ERR(buf_desc) == -ENOMEM)
			break;
		if (IS_ERR(buf_desc))
			continue;

		buf_desc->used = 1;
		write_lock_bh(lock);
		list_add(&buf_desc->list, buf_list);
		write_unlock_bh(lock);
		break; /* found */
	}

	if (IS_ERR(buf_desc))
		return -ENOMEM;

	if (is_rmb) {
		conn->rmb_desc = buf_desc;
		conn->rmbe_size_short = bufsize_short;
		smc->sk.sk_rcvbuf = bufsize * 2;
		atomic_set(&conn->bytes_to_rcv, 0);
		conn->rmbe_update_limit =
			smc_rmb_wnd_update_limit(buf_desc->len);
		if (is_smcd)
			smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */
	} else {
		conn->sndbuf_desc = buf_desc;
		smc->sk.sk_sndbuf = bufsize * 2;
		atomic_set(&conn->sndbuf_space, bufsize);
	}
	return 0;
}
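
/* The sync_sg helpers below keep CPU and device views of the buffers
 * coherent: sync_for_cpu before the CPU reads received data,
 * sync_for_device after the CPU has filled a send buffer. SMC-D buffers
 * are not DMA-mapped, so all four helpers return early for SMC-D.
 */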

void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	if (!conn->lgr || conn->lgr->is_smcd)
		return;
	smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
			       conn->sndbuf_desc, DMA_TO_DEVICE);
}

void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	if (!conn->lgr || conn->lgr->is_smcd)
		return;
	smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
				  conn->sndbuf_desc, DMA_TO_DEVICE);
}

void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	if (!conn->lgr || conn->lgr->is_smcd)
		return;
	smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
			       conn->rmb_desc, DMA_FROM_DEVICE);
}

void smc_rmb_sync_sg_for_device(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	if (!conn->lgr || conn->lgr->is_smcd)
		return;
	smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
				  conn->rmb_desc, DMA_FROM_DEVICE);
}

/* create the send and receive buffer for an SMC socket;
 * receive buffers are called RMBs;
 * (even though the SMC protocol allows more than one RMB-element per RMB,
 * the Linux implementation uses just one RMB-element per RMB, i.e. uses an
 * extra RMB for every connection in a link group)
 */
int smc_buf_create(struct smc_sock *smc, bool is_smcd)
{
	int rc;

	/* create send buffer */
	rc = __smc_buf_create(smc, is_smcd, false);
	if (rc)
		return rc;
	/* create rmb */
	rc = __smc_buf_create(smc, is_smcd, true);
	if (rc)
		smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc);
	return rc;
}

static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr)
{
	int i;

	for_each_clear_bit(i, lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX) {
		if (!test_and_set_bit(i, lgr->rtokens_used_mask))
			return i;
	}
	return -ENOSPC;
}

/* add a new rtoken from peer */
int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey)
{
	u64 dma_addr = be64_to_cpu(nw_vaddr);
	u32 rkey = ntohl(nw_rkey);
	int i;

	for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
		if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) &&
		    (lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr == dma_addr) &&
		    test_bit(i, lgr->rtokens_used_mask)) {
			/* already in list */
			return i;
		}
	}
	i = smc_rmb_reserve_rtoken_idx(lgr);
	if (i < 0)
		return i;
	lgr->rtokens[i][SMC_SINGLE_LINK].rkey = rkey;
	lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = dma_addr;
	return i;
}

/* delete an rtoken */
int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey)
{
	u32 rkey = ntohl(nw_rkey);
	int i;

	for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
		if (lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey &&
		    test_bit(i, lgr->rtokens_used_mask)) {
			lgr->rtokens[i][SMC_SINGLE_LINK].rkey = 0;
			lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = 0;

			clear_bit(i, lgr->rtokens_used_mask);
			return 0;
		}
	}
	return -ENOENT;
}

/* save rkey and dma_addr received from peer during clc handshake */
int smc_rmb_rtoken_handling(struct smc_connection *conn,
			    struct smc_clc_msg_accept_confirm *clc)
{
	conn->rtoken_idx = smc_rtoken_add(conn->lgr, clc->rmb_dma_addr,
					  clc->rmb_rkey);
	if (conn->rtoken_idx < 0)
		return conn->rtoken_idx;
	return 0;
}

/* Called (from smc_exit) when module is removed */
void smc_core_exit(void)
{
	struct smc_link_group *lgr, *lg;
	LIST_HEAD(lgr_freeing_list);

	spin_lock_bh(&smc_lgr_list.lock);
	if (!list_empty(&smc_lgr_list.list))
		list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
	spin_unlock_bh(&smc_lgr_list.lock);
	list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
		list_del_init(&lgr->list);
		if (!lgr->is_smcd) {
			struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];

			if (lnk->state == SMC_LNK_ACTIVE)
				smc_llc_send_delete_link(lnk, SMC_LLC_REQ,
							 false);
			smc_llc_link_inactive(lnk);
		}
		cancel_delayed_work_sync(&lgr->free_work);
		smc_lgr_free(lgr); /* free link group */
	}
}