/*
 * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <net/ipv6.h>
#include <net/inet6_hashtables.h>
#include <net/addrconf.h>

#include "rds.h"
#include "loop.h"

#define RDS_CONNECTION_HASH_BITS 12
#define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS)
#define RDS_CONNECTION_HASH_MASK (RDS_CONNECTION_HASH_ENTRIES - 1)

/* converting this to RCU is a chore for another day.. */
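/* Global connection table state: rds_conn_hash holds every rds_connection,
 * bucketed by rds_conn_bucket() over the (laddr, faddr) pair.  Insertions and
 * removals happen under rds_conn_lock (which also guards rds_conn_count),
 * while lookups walk the buckets under RCU.  Connection objects themselves
 * come from the rds_conn_slab cache created in rds_conn_init().
 */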
static DEFINE_SPINLOCK(rds_conn_lock);
static unsigned long rds_conn_count;
static struct hlist_head rds_conn_hash[RDS_CONNECTION_HASH_ENTRIES];
static struct kmem_cache *rds_conn_slab;

/* Hash the (local, foreign) address pair of a connection into a bucket. */
static struct hlist_head *rds_conn_bucket(const struct in6_addr *laddr,
                                          const struct in6_addr *faddr)
{
        static u32 rds6_hash_secret __read_mostly;
        static u32 rds_hash_secret __read_mostly;

        u32 lhash, fhash, hash;

        net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret));
        net_get_random_once(&rds6_hash_secret, sizeof(rds6_hash_secret));

        lhash = (__force u32)laddr->s6_addr32[3];
#if IS_ENABLED(CONFIG_IPV6)
        fhash = __ipv6_addr_jhash(faddr, rds6_hash_secret);
#else
        fhash = (__force u32)faddr->s6_addr32[3];
#endif
        hash = __inet_ehashfn(lhash, 0, fhash, 0, rds_hash_secret);

        return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK];
}

#define rds_conn_info_set(var, test, suffix) do {               \
        if (test)                                               \
                var |= RDS_INFO_CONNECTION_FLAG_##suffix;       \
} while (0)

/* rcu read lock must be held or the connection spinlock */
static struct rds_connection *rds_conn_lookup(struct net *net,
                                              struct hlist_head *head,
                                              const struct in6_addr *laddr,
                                              const struct in6_addr *faddr,
                                              struct rds_transport *trans,
                                              int dev_if)
{
        struct rds_connection *conn, *ret = NULL;

        hlist_for_each_entry_rcu(conn, head, c_hash_node) {
                if (ipv6_addr_equal(&conn->c_faddr, faddr) &&
                    ipv6_addr_equal(&conn->c_laddr, laddr) &&
                    conn->c_trans == trans &&
                    net == rds_conn_net(conn) &&
                    conn->c_dev_if == dev_if) {
                        ret = conn;
                        break;
                }
        }
        rdsdebug("returning conn %p for %pI6c -> %pI6c\n", ret,
                 laddr, faddr);
        return ret;
}

/*
 * This is called by transports as they're bringing down a connection.
 * It clears partial message state so that the transport can start sending
 * and receiving over this connection again in the future.  It is up to
 * the transport to have serialized this call with its send and recv.
 */
static void rds_conn_path_reset(struct rds_conn_path *cp)
{
        struct rds_connection *conn = cp->cp_conn;

        rdsdebug("connection %pI6c to %pI6c reset\n",
                 &conn->c_laddr, &conn->c_faddr);

        rds_stats_inc(s_conn_reset);
        rds_send_path_reset(cp);
        cp->cp_flags = 0;

        /* Do not clear next_rx_seq here, else we cannot distinguish
         * retransmitted packets from new packets, and will hand all
         * of them to the application.  That is not consistent with the
         * reliability guarantees of RDS. */
}

static void __rds_conn_path_init(struct rds_connection *conn,
                                 struct rds_conn_path *cp, bool is_outgoing)
{
        spin_lock_init(&cp->cp_lock);
        cp->cp_next_tx_seq = 1;
        init_waitqueue_head(&cp->cp_waitq);
        INIT_LIST_HEAD(&cp->cp_send_queue);
        INIT_LIST_HEAD(&cp->cp_retrans);

        cp->cp_conn = conn;
        atomic_set(&cp->cp_state, RDS_CONN_DOWN);
        cp->cp_send_gen = 0;
        cp->cp_reconnect_jiffies = 0;
        INIT_DELAYED_WORK(&cp->cp_send_w, rds_send_worker);
        INIT_DELAYED_WORK(&cp->cp_recv_w, rds_recv_worker);
        INIT_DELAYED_WORK(&cp->cp_conn_w, rds_connect_worker);
        INIT_WORK(&cp->cp_down_w, rds_shutdown_worker);
        mutex_init(&cp->cp_cm_lock);
        cp->cp_flags = 0;
}
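/* A minimal usage sketch (assuming a transport "trans" has already been
 * chosen for this net/device pair): the active side typically does
 *
 *	conn = rds_conn_create_outgoing(net, &laddr, &faddr, trans,
 *					GFP_KERNEL, dev_if);
 *	if (IS_ERR(conn))
 *		return PTR_ERR(conn);
 *
 * while code handling an incoming connect uses rds_conn_create().  Both are
 * thin wrappers around __rds_conn_create() below.
 */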

/*
 * There is only ever one 'conn' for a given pair of addresses in the
 * system at a time.  They contain messages to be retransmitted and so
 * span the lifetime of the actual underlying transport connections.
 *
 * For now they are not garbage collected once they're created.  They
 * are torn down as the module is removed, if ever.
 */
static struct rds_connection *__rds_conn_create(struct net *net,
                                                const struct in6_addr *laddr,
                                                const struct in6_addr *faddr,
                                                struct rds_transport *trans,
                                                gfp_t gfp,
                                                int is_outgoing,
                                                int dev_if)
{
        struct rds_connection *conn, *parent = NULL;
        struct hlist_head *head = rds_conn_bucket(laddr, faddr);
        struct rds_transport *loop_trans;
        unsigned long flags;
        int ret, i;
        int npaths = (trans->t_mp_capable ? RDS_MPATH_WORKERS : 1);

        rcu_read_lock();
        conn = rds_conn_lookup(net, head, laddr, faddr, trans, dev_if);
        if (conn &&
            conn->c_loopback &&
            conn->c_trans != &rds_loop_transport &&
            ipv6_addr_equal(laddr, faddr) &&
            !is_outgoing) {
                /* This is a looped back IB connection, and we're
                 * called by the code handling the incoming connect.
                 * We need a second connection object into which we
                 * can stick the other QP. */
                parent = conn;
                conn = parent->c_passive;
        }
        rcu_read_unlock();
        if (conn)
                goto out;

        conn = kmem_cache_zalloc(rds_conn_slab, gfp);
        if (!conn) {
                conn = ERR_PTR(-ENOMEM);
                goto out;
        }
        conn->c_path = kcalloc(npaths, sizeof(struct rds_conn_path), gfp);
        if (!conn->c_path) {
                kmem_cache_free(rds_conn_slab, conn);
                conn = ERR_PTR(-ENOMEM);
                goto out;
        }

        INIT_HLIST_NODE(&conn->c_hash_node);
        conn->c_laddr = *laddr;
        conn->c_isv6 = !ipv6_addr_v4mapped(laddr);
        conn->c_faddr = *faddr;
        conn->c_dev_if = dev_if;

#if IS_ENABLED(CONFIG_IPV6)
        /* If the local address is link local, set c_bound_if to be the
         * index used for this connection.  Otherwise, set it to 0 as
         * the socket is not bound to an interface.  c_bound_if is used
         * to look up a socket when a packet is received.
         */
        if (ipv6_addr_type(laddr) & IPV6_ADDR_LINKLOCAL)
                conn->c_bound_if = dev_if;
        else
#endif
                conn->c_bound_if = 0;

        rds_conn_net_set(conn, net);

        ret = rds_cong_get_maps(conn);
        if (ret) {
                kfree(conn->c_path);
                kmem_cache_free(rds_conn_slab, conn);
                conn = ERR_PTR(ret);
                goto out;
        }

        /*
         * This is where a connection becomes loopback.  If *any* RDS sockets
         * can bind to the destination address then we'd rather the messages
         * flow through loopback rather than either transport.
         */
        loop_trans = rds_trans_get_preferred(net, faddr, conn->c_dev_if);
        if (loop_trans) {
                rds_trans_put(loop_trans);
                conn->c_loopback = 1;
                if (is_outgoing && trans->t_prefer_loopback) {
                        /* "outgoing" connection - and the transport
                         * says it wants the connection handled by the
                         * loopback transport.  This is what TCP does.
                         */
                        trans = &rds_loop_transport;
                }
        }

        conn->c_trans = trans;

        init_waitqueue_head(&conn->c_hs_waitq);
        for (i = 0; i < npaths; i++) {
                __rds_conn_path_init(conn, &conn->c_path[i],
                                     is_outgoing);
                conn->c_path[i].cp_index = i;
        }
        rcu_read_lock();
        if (rds_destroy_pending(conn))
                ret = -ENETDOWN;
        else
                ret = trans->conn_alloc(conn, GFP_ATOMIC);
        if (ret) {
                rcu_read_unlock();
                kfree(conn->c_path);
                kmem_cache_free(rds_conn_slab, conn);
                conn = ERR_PTR(ret);
                goto out;
        }

        rdsdebug("allocated conn %p for %pI6c -> %pI6c over %s %s\n",
                 conn, laddr, faddr,
                 strnlen(trans->t_name, sizeof(trans->t_name)) ?
                 trans->t_name : "[unknown]", is_outgoing ? "(outgoing)" : "");

        /*
         * Since we ran without holding the conn lock, someone could
         * have created the same conn (either normal or passive) in the
         * interim.  We check while holding the lock.  If we won, we complete
         * init and return our conn.  If we lost, we roll back and return the
         * other one.
         */
        spin_lock_irqsave(&rds_conn_lock, flags);
        if (parent) {
                /* Creating passive conn */
                if (parent->c_passive) {
                        trans->conn_free(conn->c_path[0].cp_transport_data);
                        kfree(conn->c_path);
                        kmem_cache_free(rds_conn_slab, conn);
                        conn = parent->c_passive;
                } else {
                        parent->c_passive = conn;
                        rds_cong_add_conn(conn);
                        rds_conn_count++;
                }
        } else {
                /* Creating normal conn */
                struct rds_connection *found;

                found = rds_conn_lookup(net, head, laddr, faddr, trans,
                                        dev_if);
                if (found) {
                        struct rds_conn_path *cp;
                        int i;

                        for (i = 0; i < npaths; i++) {
                                cp = &conn->c_path[i];
                                /* The ->conn_alloc invocation may have
                                 * allocated resource for all paths, so all
                                 * of them may have to be freed here.
                                 */
                                if (cp->cp_transport_data)
                                        trans->conn_free(cp->cp_transport_data);
                        }
                        kfree(conn->c_path);
                        kmem_cache_free(rds_conn_slab, conn);
                        conn = found;
                } else {
                        conn->c_my_gen_num = rds_gen_num;
                        conn->c_peer_gen_num = 0;
                        hlist_add_head_rcu(&conn->c_hash_node, head);
                        rds_cong_add_conn(conn);
                        rds_conn_count++;
                }
        }
        spin_unlock_irqrestore(&rds_conn_lock, flags);
        rcu_read_unlock();

out:
        return conn;
}

struct rds_connection *rds_conn_create(struct net *net,
                                       const struct in6_addr *laddr,
                                       const struct in6_addr *faddr,
                                       struct rds_transport *trans, gfp_t gfp,
                                       int dev_if)
{
        return __rds_conn_create(net, laddr, faddr, trans, gfp, 0, dev_if);
}
EXPORT_SYMBOL_GPL(rds_conn_create);

struct rds_connection *rds_conn_create_outgoing(struct net *net,
                                                const struct in6_addr *laddr,
                                                const struct in6_addr *faddr,
                                                struct rds_transport *trans,
                                                gfp_t gfp, int dev_if)
{
        return __rds_conn_create(net, laddr, faddr, trans, gfp, 1, dev_if);
}
EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);

void rds_conn_shutdown(struct rds_conn_path *cp)
{
        struct rds_connection *conn = cp->cp_conn;

        /* shut it down unless it's down already */
        if (!rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
                /*
                 * Quiesce the connection mgmt handlers before we start tearing
                 * things down.  We don't hold the mutex for the entire
                 * duration of the shutdown operation, else we may be
                 * deadlocking with the CM handler.
                 * Instead, the CM event handler is supposed to check
                 * for state DISCONNECTING.
                 */
                mutex_lock(&cp->cp_cm_lock);
                if (!rds_conn_path_transition(cp, RDS_CONN_UP,
                                              RDS_CONN_DISCONNECTING) &&
                    !rds_conn_path_transition(cp, RDS_CONN_ERROR,
                                              RDS_CONN_DISCONNECTING)) {
                        rds_conn_path_error(cp,
                                            "shutdown called in state %d\n",
                                            atomic_read(&cp->cp_state));
                        mutex_unlock(&cp->cp_cm_lock);
                        return;
                }
                mutex_unlock(&cp->cp_cm_lock);

                wait_event(cp->cp_waitq,
                           !test_bit(RDS_IN_XMIT, &cp->cp_flags));
                wait_event(cp->cp_waitq,
                           !test_bit(RDS_RECV_REFILL, &cp->cp_flags));

                conn->c_trans->conn_path_shutdown(cp);
                rds_conn_path_reset(cp);

                if (!rds_conn_path_transition(cp, RDS_CONN_DISCONNECTING,
                                              RDS_CONN_DOWN) &&
                    !rds_conn_path_transition(cp, RDS_CONN_ERROR,
                                              RDS_CONN_DOWN)) {
                        /* This can happen - e.g. when we're in the middle of
                         * tearing down the connection, and someone unloads
                         * the rds module.  Quite reproducible with loopback
                         * connections.  Mostly harmless.
                         *
                         * Note that this also happens with rds-tcp because
                         * we could have triggered rds_conn_path_drop in irq
                         * mode from rds_tcp_state change on the receipt of
                         * a FIN, thus we need to recheck for RDS_CONN_ERROR
                         * here.
                         */
                        rds_conn_path_error(cp, "%s: failed to transition "
                                            "to state DOWN, current state "
                                            "is %d\n", __func__,
                                            atomic_read(&cp->cp_state));
                        return;
                }
        }

        /* Then reconnect if it's still live.
         * The passive side of an IB loopback connection is never added
         * to the conn hash, so we never trigger a reconnect on this
         * conn - the reconnect is always triggered by the active peer. */
        cancel_delayed_work_sync(&cp->cp_conn_w);
        rcu_read_lock();
        if (!hlist_unhashed(&conn->c_hash_node)) {
                rcu_read_unlock();
                rds_queue_reconnect(cp);
        } else {
                rcu_read_unlock();
        }
}

/* destroy a single rds_conn_path.  rds_conn_destroy() iterates over
 * all paths using rds_conn_path_destroy()
 */
static void rds_conn_path_destroy(struct rds_conn_path *cp)
{
        struct rds_message *rm, *rtmp;

        if (!cp->cp_transport_data)
                return;

        /* make sure lingering queued work won't try to ref the conn */
        cancel_delayed_work_sync(&cp->cp_send_w);
        cancel_delayed_work_sync(&cp->cp_recv_w);

        rds_conn_path_drop(cp, true);
        flush_work(&cp->cp_down_w);

        /* tear down queued messages */
        list_for_each_entry_safe(rm, rtmp,
                                 &cp->cp_send_queue,
                                 m_conn_item) {
                list_del_init(&rm->m_conn_item);
                BUG_ON(!list_empty(&rm->m_sock_item));
                rds_message_put(rm);
        }
        if (cp->cp_xmit_rm)
                rds_message_put(cp->cp_xmit_rm);

        WARN_ON(delayed_work_pending(&cp->cp_send_w));
        WARN_ON(delayed_work_pending(&cp->cp_recv_w));
        WARN_ON(delayed_work_pending(&cp->cp_conn_w));
        WARN_ON(work_pending(&cp->cp_down_w));

        cp->cp_conn->c_trans->conn_free(cp->cp_transport_data);
}

/*
 * Stop and free a connection.
 *
 * This can only be used in very limited circumstances.  It assumes that once
 * the conn has been shutdown that no one else is referencing the connection.
 * We can only ensure this in the rmmod path in the current code.
 */
void rds_conn_destroy(struct rds_connection *conn)
{
        unsigned long flags;
        int i;
        struct rds_conn_path *cp;
        int npaths = (conn->c_trans->t_mp_capable ?
                      RDS_MPATH_WORKERS : 1);

        rdsdebug("freeing conn %p for %pI6c -> %pI6c\n",
                 conn, &conn->c_laddr, &conn->c_faddr);

        /* Ensure conn will not be scheduled for reconnect */
        spin_lock_irq(&rds_conn_lock);
        hlist_del_init_rcu(&conn->c_hash_node);
        spin_unlock_irq(&rds_conn_lock);
        synchronize_rcu();

        /* shut the connection down */
        for (i = 0; i < npaths; i++) {
                cp = &conn->c_path[i];
                rds_conn_path_destroy(cp);
                BUG_ON(!list_empty(&cp->cp_retrans));
        }

        /*
         * The congestion maps aren't freed up here.  They're
         * freed by rds_cong_exit() after all the connections
         * have been freed.
         */
        rds_cong_remove_conn(conn);

        kfree(conn->c_path);
        kmem_cache_free(rds_conn_slab, conn);

        spin_lock_irqsave(&rds_conn_lock, flags);
        rds_conn_count--;
        spin_unlock_irqrestore(&rds_conn_lock, flags);
}
EXPORT_SYMBOL_GPL(rds_conn_destroy);

static void __rds_inc_msg_cp(struct rds_incoming *inc,
                             struct rds_info_iterator *iter,
                             void *saddr, void *daddr, int flip, bool isv6)
{
#if IS_ENABLED(CONFIG_IPV6)
        if (isv6)
                rds6_inc_info_copy(inc, iter, saddr, daddr, flip);
        else
#endif
                rds_inc_info_copy(inc, iter, *(__be32 *)saddr,
                                  *(__be32 *)daddr, flip);
}

static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len,
                                      struct rds_info_iterator *iter,
                                      struct rds_info_lengths *lens,
                                      int want_send, bool isv6)
{
        struct hlist_head *head;
        struct list_head *list;
        struct rds_connection *conn;
        struct rds_message *rm;
        unsigned int total = 0;
        unsigned long flags;
        size_t i;
        int j;

        if (isv6)
                len /= sizeof(struct rds6_info_message);
        else
                len /= sizeof(struct rds_info_message);

        rcu_read_lock();

        for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
             i++, head++) {
                hlist_for_each_entry_rcu(conn, head, c_hash_node) {
                        struct rds_conn_path *cp;
                        int npaths;

                        if (!isv6 && conn->c_isv6)
                                continue;

                        npaths = (conn->c_trans->t_mp_capable ?
                                  RDS_MPATH_WORKERS : 1);

                        for (j = 0; j < npaths; j++) {
                                cp = &conn->c_path[j];
                                if (want_send)
                                        list = &cp->cp_send_queue;
                                else
                                        list = &cp->cp_retrans;

                                spin_lock_irqsave(&cp->cp_lock, flags);

                                /* XXX too lazy to maintain counts.. */
                                list_for_each_entry(rm, list, m_conn_item) {
                                        total++;
                                        if (total <= len)
                                                __rds_inc_msg_cp(&rm->m_inc,
                                                                 iter,
                                                                 &conn->c_laddr,
                                                                 &conn->c_faddr,
                                                                 0, isv6);
                                }

                                spin_unlock_irqrestore(&cp->cp_lock, flags);
                        }
                }
        }
        rcu_read_unlock();

        lens->nr = total;
        if (isv6)
                lens->each = sizeof(struct rds6_info_message);
        else
                lens->each = sizeof(struct rds_info_message);
}

static void rds_conn_message_info(struct socket *sock, unsigned int len,
                                  struct rds_info_iterator *iter,
                                  struct rds_info_lengths *lens,
                                  int want_send)
{
        rds_conn_message_info_cmn(sock, len, iter, lens, want_send, false);
}

#if IS_ENABLED(CONFIG_IPV6)
static void rds6_conn_message_info(struct socket *sock, unsigned int len,
                                   struct rds_info_iterator *iter,
                                   struct rds_info_lengths *lens,
                                   int want_send)
{
        rds_conn_message_info_cmn(sock, len, iter, lens, want_send, true);
}
#endif

static void rds_conn_message_info_send(struct socket *sock, unsigned int len,
                                       struct rds_info_iterator *iter,
                                       struct rds_info_lengths *lens)
{
        rds_conn_message_info(sock, len, iter, lens, 1);
}

#if IS_ENABLED(CONFIG_IPV6)
static void rds6_conn_message_info_send(struct socket *sock, unsigned int len,
                                        struct rds_info_iterator *iter,
                                        struct rds_info_lengths *lens)
{
        rds6_conn_message_info(sock, len, iter, lens, 1);
}
#endif

static void rds_conn_message_info_retrans(struct socket *sock,
                                          unsigned int len,
                                          struct rds_info_iterator *iter,
                                          struct rds_info_lengths *lens)
{
        rds_conn_message_info(sock, len, iter, lens, 0);
}

#if IS_ENABLED(CONFIG_IPV6)
static void rds6_conn_message_info_retrans(struct socket *sock,
                                           unsigned int len,
                                           struct rds_info_iterator *iter,
                                           struct rds_info_lengths *lens)
{
        rds6_conn_message_info(sock, len, iter, lens, 0);
}
#endif

void rds_for_each_conn_info(struct socket *sock, unsigned int len,
                            struct rds_info_iterator *iter,
                            struct rds_info_lengths *lens,
                            int (*visitor)(struct rds_connection *, void *),
                            u64 *buffer,
                            size_t item_len)
{
        struct hlist_head *head;
        struct rds_connection *conn;
        size_t i;

        rcu_read_lock();

        lens->nr = 0;
        lens->each = item_len;

        for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
             i++, head++) {
                hlist_for_each_entry_rcu(conn, head, c_hash_node) {

                        /* XXX no c_lock usage.. */
                        if (!visitor(conn, buffer))
                                continue;

                        /* We copy as much as we can fit in the buffer,
                         * but we count all items so that the caller
                         * can resize the buffer.
                         */
                        if (len >= item_len) {
                                rds_info_copy(iter, buffer, item_len);
                                len -= item_len;
                        }
                        lens->nr++;
                }
        }
        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(rds_for_each_conn_info);

static void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
                                    struct rds_info_iterator *iter,
                                    struct rds_info_lengths *lens,
                                    int (*visitor)(struct rds_conn_path *, void *),
                                    u64 *buffer,
                                    size_t item_len)
{
        struct hlist_head *head;
        struct rds_connection *conn;
        size_t i;

        rcu_read_lock();

        lens->nr = 0;
        lens->each = item_len;

        for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
             i++, head++) {
                hlist_for_each_entry_rcu(conn, head, c_hash_node) {
                        struct rds_conn_path *cp;

                        /* XXX We only copy the information from the first
                         * path for now.  The problem is that if there is
                         * more than one underlying path, we cannot report
                         * information on all of them using the existing
                         * API.  For example, there is only one next_tx_seq,
                         * so which path's next_tx_seq should we report?  It
                         * is a bug in the design of MPRDS.
                         */
                        cp = conn->c_path;

                        /* XXX no cp_lock usage.. */
                        if (!visitor(cp, buffer))
                                continue;

                        /* We copy as much as we can fit in the buffer,
                         * but we count all items so that the caller
                         * can resize the buffer.
                         */
                        if (len >= item_len) {
                                rds_info_copy(iter, buffer, item_len);
                                len -= item_len;
                        }
                        lens->nr++;
                }
        }
        rcu_read_unlock();
}

static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
{
        struct rds_info_connection *cinfo = buffer;
        struct rds_connection *conn = cp->cp_conn;

        if (conn->c_isv6)
                return 0;

        cinfo->next_tx_seq = cp->cp_next_tx_seq;
        cinfo->next_rx_seq = cp->cp_next_rx_seq;
        cinfo->laddr = conn->c_laddr.s6_addr32[3];
        cinfo->faddr = conn->c_faddr.s6_addr32[3];
        strncpy(cinfo->transport, conn->c_trans->t_name,
                sizeof(cinfo->transport));
        cinfo->flags = 0;

        rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags),
                          SENDING);
        /* XXX Future: return the state rather than these funky bits */
        rds_conn_info_set(cinfo->flags,
                          atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING,
                          CONNECTING);
        rds_conn_info_set(cinfo->flags,
                          atomic_read(&cp->cp_state) == RDS_CONN_UP,
                          CONNECTED);
        return 1;
}

#if IS_ENABLED(CONFIG_IPV6)
static int rds6_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
{
        struct rds6_info_connection *cinfo6 = buffer;
        struct rds_connection *conn = cp->cp_conn;

        cinfo6->next_tx_seq = cp->cp_next_tx_seq;
        cinfo6->next_rx_seq = cp->cp_next_rx_seq;
        cinfo6->laddr = conn->c_laddr;
        cinfo6->faddr = conn->c_faddr;
        strncpy(cinfo6->transport, conn->c_trans->t_name,
                sizeof(cinfo6->transport));
        cinfo6->flags = 0;

        rds_conn_info_set(cinfo6->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags),
                          SENDING);
        /* XXX Future: return the state rather than these funky bits */
        rds_conn_info_set(cinfo6->flags,
                          atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING,
                          CONNECTING);
        rds_conn_info_set(cinfo6->flags,
                          atomic_read(&cp->cp_state) == RDS_CONN_UP,
                          CONNECTED);
        /* Just return 1 as there is no error case.  This is a helper function
         * for rds_walk_conn_path_info() and it wants a return value.
         */
        return 1;
}
#endif

static void rds_conn_info(struct socket *sock, unsigned int len,
                          struct rds_info_iterator *iter,
                          struct rds_info_lengths *lens)
{
        u64 buffer[(sizeof(struct rds_info_connection) + 7) / 8];

        rds_walk_conn_path_info(sock, len, iter, lens,
                                rds_conn_info_visitor,
                                buffer,
                                sizeof(struct rds_info_connection));
}

#if IS_ENABLED(CONFIG_IPV6)
static void rds6_conn_info(struct socket *sock, unsigned int len,
                           struct rds_info_iterator *iter,
                           struct rds_info_lengths *lens)
{
        u64 buffer[(sizeof(struct rds6_info_connection) + 7) / 8];

        rds_walk_conn_path_info(sock, len, iter, lens,
                                rds6_conn_info_visitor,
                                buffer,
                                sizeof(struct rds6_info_connection));
}
#endif

int rds_conn_init(void)
{
        int ret;

        ret = rds_loop_net_init(); /* register pernet callback */
        if (ret)
                return ret;

        rds_conn_slab = kmem_cache_create("rds_connection",
                                          sizeof(struct rds_connection),
                                          0, 0, NULL);
        if (!rds_conn_slab) {
                rds_loop_net_exit();
                return -ENOMEM;
        }

        rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info);
        rds_info_register_func(RDS_INFO_SEND_MESSAGES,
                               rds_conn_message_info_send);
        rds_info_register_func(RDS_INFO_RETRANS_MESSAGES,
                               rds_conn_message_info_retrans);
#if IS_ENABLED(CONFIG_IPV6)
        rds_info_register_func(RDS6_INFO_CONNECTIONS, rds6_conn_info);
        rds_info_register_func(RDS6_INFO_SEND_MESSAGES,
                               rds6_conn_message_info_send);
        rds_info_register_func(RDS6_INFO_RETRANS_MESSAGES,
                               rds6_conn_message_info_retrans);
#endif
        return 0;
}

void rds_conn_exit(void)
{
        rds_loop_net_exit(); /* unregister pernet callback */
        rds_loop_exit();

        WARN_ON(!hlist_empty(rds_conn_hash));

        kmem_cache_destroy(rds_conn_slab);

        rds_info_deregister_func(RDS_INFO_CONNECTIONS, rds_conn_info);
        rds_info_deregister_func(RDS_INFO_SEND_MESSAGES,
                                 rds_conn_message_info_send);
        rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES,
                                 rds_conn_message_info_retrans);
#if IS_ENABLED(CONFIG_IPV6)
        rds_info_deregister_func(RDS6_INFO_CONNECTIONS, rds6_conn_info);
        rds_info_deregister_func(RDS6_INFO_SEND_MESSAGES,
                                 rds6_conn_message_info_send);
        rds_info_deregister_func(RDS6_INFO_RETRANS_MESSAGES,
                                 rds6_conn_message_info_retrans);
#endif
}

/*
 * Force a disconnect
 */
void rds_conn_path_drop(struct rds_conn_path *cp, bool destroy)
{
        atomic_set(&cp->cp_state, RDS_CONN_ERROR);

        rcu_read_lock();
        if (!destroy && rds_destroy_pending(cp->cp_conn)) {
                rcu_read_unlock();
                return;
        }
        queue_work(rds_wq, &cp->cp_down_w);
        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(rds_conn_path_drop);

void rds_conn_drop(struct rds_connection *conn)
{
        WARN_ON(conn->c_trans->t_mp_capable);
        rds_conn_path_drop(&conn->c_path[0], false);
}
EXPORT_SYMBOL_GPL(rds_conn_drop);

/*
 * If the connection is down, trigger a connect.  However, we may have
 * scheduled a delayed reconnect - in that case we should not interfere.
 */
void rds_conn_path_connect_if_down(struct rds_conn_path *cp)
{
        rcu_read_lock();
        if (rds_destroy_pending(cp->cp_conn)) {
                rcu_read_unlock();
                return;
        }
        if (rds_conn_path_state(cp) == RDS_CONN_DOWN &&
            !test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags))
                queue_delayed_work(rds_wq, &cp->cp_conn_w, 0);
        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(rds_conn_path_connect_if_down);

void rds_conn_connect_if_down(struct rds_connection *conn)
{
        WARN_ON(conn->c_trans->t_mp_capable);
        rds_conn_path_connect_if_down(&conn->c_path[0]);
}
EXPORT_SYMBOL_GPL(rds_conn_connect_if_down);

void
__rds_conn_path_error(struct rds_conn_path *cp, const char *fmt, ...)
{
        va_list ap;

        va_start(ap, fmt);
        vprintk(fmt, ap);
        va_end(ap);

        rds_conn_path_drop(cp, false);
}