1 /* 2 drbd_receiver.c 3 4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg. 5 6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. 7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. 8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. 9 10 drbd is free software; you can redistribute it and/or modify 11 it under the terms of the GNU General Public License as published by 12 the Free Software Foundation; either version 2, or (at your option) 13 any later version. 14 15 drbd is distributed in the hope that it will be useful, 16 but WITHOUT ANY WARRANTY; without even the implied warranty of 17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 GNU General Public License for more details. 19 20 You should have received a copy of the GNU General Public License 21 along with drbd; see the file COPYING. If not, write to 22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 23 */ 24 25 26 #include <linux/module.h> 27 28 #include <linux/uaccess.h> 29 #include <net/sock.h> 30 31 #include <linux/drbd.h> 32 #include <linux/fs.h> 33 #include <linux/file.h> 34 #include <linux/in.h> 35 #include <linux/mm.h> 36 #include <linux/memcontrol.h> 37 #include <linux/mm_inline.h> 38 #include <linux/slab.h> 39 #include <uapi/linux/sched/types.h> 40 #include <linux/sched/signal.h> 41 #include <linux/pkt_sched.h> 42 #define __KERNEL_SYSCALLS__ 43 #include <linux/unistd.h> 44 #include <linux/vmalloc.h> 45 #include <linux/random.h> 46 #include <linux/string.h> 47 #include <linux/scatterlist.h> 48 #include "drbd_int.h" 49 #include "drbd_protocol.h" 50 #include "drbd_req.h" 51 #include "drbd_vli.h" 52 53 #define PRO_FEATURES (DRBD_FF_TRIM|DRBD_FF_THIN_RESYNC|DRBD_FF_WSAME|DRBD_FF_WZEROES) 54 55 struct packet_info { 56 enum drbd_packet cmd; 57 unsigned int size; 58 unsigned int vnr; 59 void *data; 60 }; 61 62 enum finish_epoch { 63 FE_STILL_LIVE, 64 FE_DESTROYED, 65 FE_RECYCLED, 66 }; 67 68 static 
int drbd_do_features(struct drbd_connection *connection); 69 static int drbd_do_auth(struct drbd_connection *connection); 70 static int drbd_disconnected(struct drbd_peer_device *); 71 static void conn_wait_active_ee_empty(struct drbd_connection *connection); 72 static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *, struct drbd_epoch *, enum epoch_event); 73 static int e_end_block(struct drbd_work *, int); 74 75 76 #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) 77 78 /* 79 * some helper functions to deal with single linked page lists, 80 * page->private being our "next" pointer. 81 */ 82 83 /* If at least n pages are linked at head, get n pages off. 84 * Otherwise, don't modify head, and return NULL. 85 * Locking is the responsibility of the caller. 86 */ 87 static struct page *page_chain_del(struct page **head, int n) 88 { 89 struct page *page; 90 struct page *tmp; 91 92 BUG_ON(!n); 93 BUG_ON(!head); 94 95 page = *head; 96 97 if (!page) 98 return NULL; 99 100 while (page) { 101 tmp = page_chain_next(page); 102 if (--n == 0) 103 break; /* found sufficient pages */ 104 if (tmp == NULL) 105 /* insufficient pages, don't use any of them. */ 106 return NULL; 107 page = tmp; 108 } 109 110 /* add end of list marker for the returned list */ 111 set_page_private(page, 0); 112 /* actual return value, and adjustment of head */ 113 page = *head; 114 *head = tmp; 115 return page; 116 } 117 118 /* may be used outside of locks to find the tail of a (usually short) 119 * "private" page chain, before adding it back to a global chain head 120 * with page_chain_add() under a spinlock. 
*/ 121 static struct page *page_chain_tail(struct page *page, int *len) 122 { 123 struct page *tmp; 124 int i = 1; 125 while ((tmp = page_chain_next(page))) 126 ++i, page = tmp; 127 if (len) 128 *len = i; 129 return page; 130 } 131 132 static int page_chain_free(struct page *page) 133 { 134 struct page *tmp; 135 int i = 0; 136 page_chain_for_each_safe(page, tmp) { 137 put_page(page); 138 ++i; 139 } 140 return i; 141 } 142 143 static void page_chain_add(struct page **head, 144 struct page *chain_first, struct page *chain_last) 145 { 146 #if 1 147 struct page *tmp; 148 tmp = page_chain_tail(chain_first, NULL); 149 BUG_ON(tmp != chain_last); 150 #endif 151 152 /* add chain to head */ 153 set_page_private(chain_last, (unsigned long)*head); 154 *head = chain_first; 155 } 156 157 static struct page *__drbd_alloc_pages(struct drbd_device *device, 158 unsigned int number) 159 { 160 struct page *page = NULL; 161 struct page *tmp = NULL; 162 unsigned int i = 0; 163 164 /* Yes, testing drbd_pp_vacant outside the lock is racy. 165 * So what. It saves a spin_lock. */ 166 if (drbd_pp_vacant >= number) { 167 spin_lock(&drbd_pp_lock); 168 page = page_chain_del(&drbd_pp_pool, number); 169 if (page) 170 drbd_pp_vacant -= number; 171 spin_unlock(&drbd_pp_lock); 172 if (page) 173 return page; 174 } 175 176 /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD 177 * "criss-cross" setup, that might cause write-out on some other DRBD, 178 * which in turn might block on the other node at this very place. */ 179 for (i = 0; i < number; i++) { 180 tmp = alloc_page(GFP_TRY); 181 if (!tmp) 182 break; 183 set_page_private(tmp, (unsigned long)page); 184 page = tmp; 185 } 186 187 if (i == number) 188 return page; 189 190 /* Not enough pages immediately available this time. 191 * No need to jump around here, drbd_alloc_pages will retry this 192 * function "soon". 
*/ 193 if (page) { 194 tmp = page_chain_tail(page, NULL); 195 spin_lock(&drbd_pp_lock); 196 page_chain_add(&drbd_pp_pool, page, tmp); 197 drbd_pp_vacant += i; 198 spin_unlock(&drbd_pp_lock); 199 } 200 return NULL; 201 } 202 203 static void reclaim_finished_net_peer_reqs(struct drbd_device *device, 204 struct list_head *to_be_freed) 205 { 206 struct drbd_peer_request *peer_req, *tmp; 207 208 /* The EEs are always appended to the end of the list. Since 209 they are sent in order over the wire, they have to finish 210 in order. As soon as we see the first not finished we can 211 stop to examine the list... */ 212 213 list_for_each_entry_safe(peer_req, tmp, &device->net_ee, w.list) { 214 if (drbd_peer_req_has_active_page(peer_req)) 215 break; 216 list_move(&peer_req->w.list, to_be_freed); 217 } 218 } 219 220 static void drbd_reclaim_net_peer_reqs(struct drbd_device *device) 221 { 222 LIST_HEAD(reclaimed); 223 struct drbd_peer_request *peer_req, *t; 224 225 spin_lock_irq(&device->resource->req_lock); 226 reclaim_finished_net_peer_reqs(device, &reclaimed); 227 spin_unlock_irq(&device->resource->req_lock); 228 list_for_each_entry_safe(peer_req, t, &reclaimed, w.list) 229 drbd_free_net_peer_req(device, peer_req); 230 } 231 232 static void conn_reclaim_net_peer_reqs(struct drbd_connection *connection) 233 { 234 struct drbd_peer_device *peer_device; 235 int vnr; 236 237 rcu_read_lock(); 238 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { 239 struct drbd_device *device = peer_device->device; 240 if (!atomic_read(&device->pp_in_use_by_net)) 241 continue; 242 243 kref_get(&device->kref); 244 rcu_read_unlock(); 245 drbd_reclaim_net_peer_reqs(device); 246 kref_put(&device->kref, drbd_destroy_device); 247 rcu_read_lock(); 248 } 249 rcu_read_unlock(); 250 } 251 252 /** 253 * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled) 254 * @device: DRBD device. 
255 * @number: number of pages requested 256 * @retry: whether to retry, if not enough pages are available right now 257 * 258 * Tries to allocate number pages, first from our own page pool, then from 259 * the kernel. 260 * Possibly retry until DRBD frees sufficient pages somewhere else. 261 * 262 * If this allocation would exceed the max_buffers setting, we throttle 263 * allocation (schedule_timeout) to give the system some room to breathe. 264 * 265 * We do not use max-buffers as hard limit, because it could lead to 266 * congestion and further to a distributed deadlock during online-verify or 267 * (checksum based) resync, if the max-buffers, socket buffer sizes and 268 * resync-rate settings are mis-configured. 269 * 270 * Returns a page chain linked via page->private. 271 */ 272 struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int number, 273 bool retry) 274 { 275 struct drbd_device *device = peer_device->device; 276 struct page *page = NULL; 277 struct net_conf *nc; 278 DEFINE_WAIT(wait); 279 unsigned int mxb; 280 281 rcu_read_lock(); 282 nc = rcu_dereference(peer_device->connection->net_conf); 283 mxb = nc ? nc->max_buffers : 1000000; 284 rcu_read_unlock(); 285 286 if (atomic_read(&device->pp_in_use) < mxb) 287 page = __drbd_alloc_pages(device, number); 288 289 /* Try to keep the fast path fast, but occasionally we need 290 * to reclaim the pages we lended to the network stack. 
*/ 291 if (page && atomic_read(&device->pp_in_use_by_net) > 512) 292 drbd_reclaim_net_peer_reqs(device); 293 294 while (page == NULL) { 295 prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE); 296 297 drbd_reclaim_net_peer_reqs(device); 298 299 if (atomic_read(&device->pp_in_use) < mxb) { 300 page = __drbd_alloc_pages(device, number); 301 if (page) 302 break; 303 } 304 305 if (!retry) 306 break; 307 308 if (signal_pending(current)) { 309 drbd_warn(device, "drbd_alloc_pages interrupted!\n"); 310 break; 311 } 312 313 if (schedule_timeout(HZ/10) == 0) 314 mxb = UINT_MAX; 315 } 316 finish_wait(&drbd_pp_wait, &wait); 317 318 if (page) 319 atomic_add(number, &device->pp_in_use); 320 return page; 321 } 322 323 /* Must not be used from irq, as that may deadlock: see drbd_alloc_pages. 324 * Is also used from inside an other spin_lock_irq(&resource->req_lock); 325 * Either links the page chain back to the global pool, 326 * or returns all pages to the system. */ 327 static void drbd_free_pages(struct drbd_device *device, struct page *page, int is_net) 328 { 329 atomic_t *a = is_net ? &device->pp_in_use_by_net : &device->pp_in_use; 330 int i; 331 332 if (page == NULL) 333 return; 334 335 if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * drbd_minor_count) 336 i = page_chain_free(page); 337 else { 338 struct page *tmp; 339 tmp = page_chain_tail(page, &i); 340 spin_lock(&drbd_pp_lock); 341 page_chain_add(&drbd_pp_pool, page, tmp); 342 drbd_pp_vacant += i; 343 spin_unlock(&drbd_pp_lock); 344 } 345 i = atomic_sub_return(i, a); 346 if (i < 0) 347 drbd_warn(device, "ASSERTION FAILED: %s: %d < 0\n", 348 is_net ? 
"pp_in_use_by_net" : "pp_in_use", i); 349 wake_up(&drbd_pp_wait); 350 } 351 352 /* 353 You need to hold the req_lock: 354 _drbd_wait_ee_list_empty() 355 356 You must not have the req_lock: 357 drbd_free_peer_req() 358 drbd_alloc_peer_req() 359 drbd_free_peer_reqs() 360 drbd_ee_fix_bhs() 361 drbd_finish_peer_reqs() 362 drbd_clear_done_ee() 363 drbd_wait_ee_list_empty() 364 */ 365 366 /* normal: payload_size == request size (bi_size) 367 * w_same: payload_size == logical_block_size 368 * trim: payload_size == 0 */ 369 struct drbd_peer_request * 370 drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector, 371 unsigned int request_size, unsigned int payload_size, gfp_t gfp_mask) __must_hold(local) 372 { 373 struct drbd_device *device = peer_device->device; 374 struct drbd_peer_request *peer_req; 375 struct page *page = NULL; 376 unsigned nr_pages = (payload_size + PAGE_SIZE -1) >> PAGE_SHIFT; 377 378 if (drbd_insert_fault(device, DRBD_FAULT_AL_EE)) 379 return NULL; 380 381 peer_req = mempool_alloc(&drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM); 382 if (!peer_req) { 383 if (!(gfp_mask & __GFP_NOWARN)) 384 drbd_err(device, "%s: allocation failed\n", __func__); 385 return NULL; 386 } 387 388 if (nr_pages) { 389 page = drbd_alloc_pages(peer_device, nr_pages, 390 gfpflags_allow_blocking(gfp_mask)); 391 if (!page) 392 goto fail; 393 } 394 395 memset(peer_req, 0, sizeof(*peer_req)); 396 INIT_LIST_HEAD(&peer_req->w.list); 397 drbd_clear_interval(&peer_req->i); 398 peer_req->i.size = request_size; 399 peer_req->i.sector = sector; 400 peer_req->submit_jif = jiffies; 401 peer_req->peer_device = peer_device; 402 peer_req->pages = page; 403 /* 404 * The block_id is opaque to the receiver. It is not endianness 405 * converted, and sent back to the sender unchanged. 
406 */ 407 peer_req->block_id = id; 408 409 return peer_req; 410 411 fail: 412 mempool_free(peer_req, &drbd_ee_mempool); 413 return NULL; 414 } 415 416 void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req, 417 int is_net) 418 { 419 might_sleep(); 420 if (peer_req->flags & EE_HAS_DIGEST) 421 kfree(peer_req->digest); 422 drbd_free_pages(device, peer_req->pages, is_net); 423 D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0); 424 D_ASSERT(device, drbd_interval_empty(&peer_req->i)); 425 if (!expect(!(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) { 426 peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO; 427 drbd_al_complete_io(device, &peer_req->i); 428 } 429 mempool_free(peer_req, &drbd_ee_mempool); 430 } 431 432 int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list) 433 { 434 LIST_HEAD(work_list); 435 struct drbd_peer_request *peer_req, *t; 436 int count = 0; 437 int is_net = list == &device->net_ee; 438 439 spin_lock_irq(&device->resource->req_lock); 440 list_splice_init(list, &work_list); 441 spin_unlock_irq(&device->resource->req_lock); 442 443 list_for_each_entry_safe(peer_req, t, &work_list, w.list) { 444 __drbd_free_peer_req(device, peer_req, is_net); 445 count++; 446 } 447 return count; 448 } 449 450 /* 451 * See also comments in _req_mod(,BARRIER_ACKED) and receive_Barrier. 452 */ 453 static int drbd_finish_peer_reqs(struct drbd_device *device) 454 { 455 LIST_HEAD(work_list); 456 LIST_HEAD(reclaimed); 457 struct drbd_peer_request *peer_req, *t; 458 int err = 0; 459 460 spin_lock_irq(&device->resource->req_lock); 461 reclaim_finished_net_peer_reqs(device, &reclaimed); 462 list_splice_init(&device->done_ee, &work_list); 463 spin_unlock_irq(&device->resource->req_lock); 464 465 list_for_each_entry_safe(peer_req, t, &reclaimed, w.list) 466 drbd_free_net_peer_req(device, peer_req); 467 468 /* possible callbacks here: 469 * e_end_block, and e_end_resync_block, e_send_superseded. 
470 * all ignore the last argument. 471 */ 472 list_for_each_entry_safe(peer_req, t, &work_list, w.list) { 473 int err2; 474 475 /* list_del not necessary, next/prev members not touched */ 476 err2 = peer_req->w.cb(&peer_req->w, !!err); 477 if (!err) 478 err = err2; 479 drbd_free_peer_req(device, peer_req); 480 } 481 wake_up(&device->ee_wait); 482 483 return err; 484 } 485 486 static void _drbd_wait_ee_list_empty(struct drbd_device *device, 487 struct list_head *head) 488 { 489 DEFINE_WAIT(wait); 490 491 /* avoids spin_lock/unlock 492 * and calling prepare_to_wait in the fast path */ 493 while (!list_empty(head)) { 494 prepare_to_wait(&device->ee_wait, &wait, TASK_UNINTERRUPTIBLE); 495 spin_unlock_irq(&device->resource->req_lock); 496 io_schedule(); 497 finish_wait(&device->ee_wait, &wait); 498 spin_lock_irq(&device->resource->req_lock); 499 } 500 } 501 502 static void drbd_wait_ee_list_empty(struct drbd_device *device, 503 struct list_head *head) 504 { 505 spin_lock_irq(&device->resource->req_lock); 506 _drbd_wait_ee_list_empty(device, head); 507 spin_unlock_irq(&device->resource->req_lock); 508 } 509 510 static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flags) 511 { 512 struct kvec iov = { 513 .iov_base = buf, 514 .iov_len = size, 515 }; 516 struct msghdr msg = { 517 .msg_flags = (flags ? 
flags : MSG_WAITALL | MSG_NOSIGNAL) 518 }; 519 iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, size); 520 return sock_recvmsg(sock, &msg, msg.msg_flags); 521 } 522 523 static int drbd_recv(struct drbd_connection *connection, void *buf, size_t size) 524 { 525 int rv; 526 527 rv = drbd_recv_short(connection->data.socket, buf, size, 0); 528 529 if (rv < 0) { 530 if (rv == -ECONNRESET) 531 drbd_info(connection, "sock was reset by peer\n"); 532 else if (rv != -ERESTARTSYS) 533 drbd_err(connection, "sock_recvmsg returned %d\n", rv); 534 } else if (rv == 0) { 535 if (test_bit(DISCONNECT_SENT, &connection->flags)) { 536 long t; 537 rcu_read_lock(); 538 t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10; 539 rcu_read_unlock(); 540 541 t = wait_event_timeout(connection->ping_wait, connection->cstate < C_WF_REPORT_PARAMS, t); 542 543 if (t) 544 goto out; 545 } 546 drbd_info(connection, "sock was shut down by peer\n"); 547 } 548 549 if (rv != size) 550 conn_request_state(connection, NS(conn, C_BROKEN_PIPE), CS_HARD); 551 552 out: 553 return rv; 554 } 555 556 static int drbd_recv_all(struct drbd_connection *connection, void *buf, size_t size) 557 { 558 int err; 559 560 err = drbd_recv(connection, buf, size); 561 if (err != size) { 562 if (err >= 0) 563 err = -EIO; 564 } else 565 err = 0; 566 return err; 567 } 568 569 static int drbd_recv_all_warn(struct drbd_connection *connection, void *buf, size_t size) 570 { 571 int err; 572 573 err = drbd_recv_all(connection, buf, size); 574 if (err && !signal_pending(current)) 575 drbd_warn(connection, "short read (expected size %d)\n", (int)size); 576 return err; 577 } 578 579 /* quoting tcp(7): 580 * On individual connections, the socket buffer size must be set prior to the 581 * listen(2) or connect(2) calls in order to have it take effect. 582 * This is our wrapper to do so. 
583 */ 584 static void drbd_setbufsize(struct socket *sock, unsigned int snd, 585 unsigned int rcv) 586 { 587 /* open coded SO_SNDBUF, SO_RCVBUF */ 588 if (snd) { 589 sock->sk->sk_sndbuf = snd; 590 sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK; 591 } 592 if (rcv) { 593 sock->sk->sk_rcvbuf = rcv; 594 sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK; 595 } 596 } 597 598 static struct socket *drbd_try_connect(struct drbd_connection *connection) 599 { 600 const char *what; 601 struct socket *sock; 602 struct sockaddr_in6 src_in6; 603 struct sockaddr_in6 peer_in6; 604 struct net_conf *nc; 605 int err, peer_addr_len, my_addr_len; 606 int sndbuf_size, rcvbuf_size, connect_int; 607 int disconnect_on_error = 1; 608 609 rcu_read_lock(); 610 nc = rcu_dereference(connection->net_conf); 611 if (!nc) { 612 rcu_read_unlock(); 613 return NULL; 614 } 615 sndbuf_size = nc->sndbuf_size; 616 rcvbuf_size = nc->rcvbuf_size; 617 connect_int = nc->connect_int; 618 rcu_read_unlock(); 619 620 my_addr_len = min_t(int, connection->my_addr_len, sizeof(src_in6)); 621 memcpy(&src_in6, &connection->my_addr, my_addr_len); 622 623 if (((struct sockaddr *)&connection->my_addr)->sa_family == AF_INET6) 624 src_in6.sin6_port = 0; 625 else 626 ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */ 627 628 peer_addr_len = min_t(int, connection->peer_addr_len, sizeof(src_in6)); 629 memcpy(&peer_in6, &connection->peer_addr, peer_addr_len); 630 631 what = "sock_create_kern"; 632 err = sock_create_kern(&init_net, ((struct sockaddr *)&src_in6)->sa_family, 633 SOCK_STREAM, IPPROTO_TCP, &sock); 634 if (err < 0) { 635 sock = NULL; 636 goto out; 637 } 638 639 sock->sk->sk_rcvtimeo = 640 sock->sk->sk_sndtimeo = connect_int * HZ; 641 drbd_setbufsize(sock, sndbuf_size, rcvbuf_size); 642 643 /* explicitly bind to the configured IP as source IP 644 * for the outgoing connections. 645 * This is needed for multihomed hosts and to be 646 * able to use lo: interfaces for drbd. 
647 * Make sure to use 0 as port number, so linux selects 648 * a free one dynamically. 649 */ 650 what = "bind before connect"; 651 err = sock->ops->bind(sock, (struct sockaddr *) &src_in6, my_addr_len); 652 if (err < 0) 653 goto out; 654 655 /* connect may fail, peer not yet available. 656 * stay C_WF_CONNECTION, don't go Disconnecting! */ 657 disconnect_on_error = 0; 658 what = "connect"; 659 err = sock->ops->connect(sock, (struct sockaddr *) &peer_in6, peer_addr_len, 0); 660 661 out: 662 if (err < 0) { 663 if (sock) { 664 sock_release(sock); 665 sock = NULL; 666 } 667 switch (-err) { 668 /* timeout, busy, signal pending */ 669 case ETIMEDOUT: case EAGAIN: case EINPROGRESS: 670 case EINTR: case ERESTARTSYS: 671 /* peer not (yet) available, network problem */ 672 case ECONNREFUSED: case ENETUNREACH: 673 case EHOSTDOWN: case EHOSTUNREACH: 674 disconnect_on_error = 0; 675 break; 676 default: 677 drbd_err(connection, "%s failed, err = %d\n", what, err); 678 } 679 if (disconnect_on_error) 680 conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); 681 } 682 683 return sock; 684 } 685 686 struct accept_wait_data { 687 struct drbd_connection *connection; 688 struct socket *s_listen; 689 struct completion door_bell; 690 void (*original_sk_state_change)(struct sock *sk); 691 692 }; 693 694 static void drbd_incoming_connection(struct sock *sk) 695 { 696 struct accept_wait_data *ad = sk->sk_user_data; 697 void (*state_change)(struct sock *sk); 698 699 state_change = ad->original_sk_state_change; 700 if (sk->sk_state == TCP_ESTABLISHED) 701 complete(&ad->door_bell); 702 state_change(sk); 703 } 704 705 static int prepare_listen_socket(struct drbd_connection *connection, struct accept_wait_data *ad) 706 { 707 int err, sndbuf_size, rcvbuf_size, my_addr_len; 708 struct sockaddr_in6 my_addr; 709 struct socket *s_listen; 710 struct net_conf *nc; 711 const char *what; 712 713 rcu_read_lock(); 714 nc = rcu_dereference(connection->net_conf); 715 if (!nc) { 716 
rcu_read_unlock(); 717 return -EIO; 718 } 719 sndbuf_size = nc->sndbuf_size; 720 rcvbuf_size = nc->rcvbuf_size; 721 rcu_read_unlock(); 722 723 my_addr_len = min_t(int, connection->my_addr_len, sizeof(struct sockaddr_in6)); 724 memcpy(&my_addr, &connection->my_addr, my_addr_len); 725 726 what = "sock_create_kern"; 727 err = sock_create_kern(&init_net, ((struct sockaddr *)&my_addr)->sa_family, 728 SOCK_STREAM, IPPROTO_TCP, &s_listen); 729 if (err) { 730 s_listen = NULL; 731 goto out; 732 } 733 734 s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ 735 drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size); 736 737 what = "bind before listen"; 738 err = s_listen->ops->bind(s_listen, (struct sockaddr *)&my_addr, my_addr_len); 739 if (err < 0) 740 goto out; 741 742 ad->s_listen = s_listen; 743 write_lock_bh(&s_listen->sk->sk_callback_lock); 744 ad->original_sk_state_change = s_listen->sk->sk_state_change; 745 s_listen->sk->sk_state_change = drbd_incoming_connection; 746 s_listen->sk->sk_user_data = ad; 747 write_unlock_bh(&s_listen->sk->sk_callback_lock); 748 749 what = "listen"; 750 err = s_listen->ops->listen(s_listen, 5); 751 if (err < 0) 752 goto out; 753 754 return 0; 755 out: 756 if (s_listen) 757 sock_release(s_listen); 758 if (err < 0) { 759 if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) { 760 drbd_err(connection, "%s failed, err = %d\n", what, err); 761 conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); 762 } 763 } 764 765 return -EIO; 766 } 767 768 static void unregister_state_change(struct sock *sk, struct accept_wait_data *ad) 769 { 770 write_lock_bh(&sk->sk_callback_lock); 771 sk->sk_state_change = ad->original_sk_state_change; 772 sk->sk_user_data = NULL; 773 write_unlock_bh(&sk->sk_callback_lock); 774 } 775 776 static struct socket *drbd_wait_for_connect(struct drbd_connection *connection, struct accept_wait_data *ad) 777 { 778 int timeo, connect_int, err = 0; 779 struct socket *s_estab = NULL; 780 struct net_conf *nc; 
781 782 rcu_read_lock(); 783 nc = rcu_dereference(connection->net_conf); 784 if (!nc) { 785 rcu_read_unlock(); 786 return NULL; 787 } 788 connect_int = nc->connect_int; 789 rcu_read_unlock(); 790 791 timeo = connect_int * HZ; 792 /* 28.5% random jitter */ 793 timeo += (prandom_u32() & 1) ? timeo / 7 : -timeo / 7; 794 795 err = wait_for_completion_interruptible_timeout(&ad->door_bell, timeo); 796 if (err <= 0) 797 return NULL; 798 799 err = kernel_accept(ad->s_listen, &s_estab, 0); 800 if (err < 0) { 801 if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) { 802 drbd_err(connection, "accept failed, err = %d\n", err); 803 conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); 804 } 805 } 806 807 if (s_estab) 808 unregister_state_change(s_estab->sk, ad); 809 810 return s_estab; 811 } 812 813 static int decode_header(struct drbd_connection *, void *, struct packet_info *); 814 815 static int send_first_packet(struct drbd_connection *connection, struct drbd_socket *sock, 816 enum drbd_packet cmd) 817 { 818 if (!conn_prepare_command(connection, sock)) 819 return -EIO; 820 return conn_send_command(connection, sock, cmd, 0, NULL, 0); 821 } 822 823 static int receive_first_packet(struct drbd_connection *connection, struct socket *sock) 824 { 825 unsigned int header_size = drbd_header_size(connection); 826 struct packet_info pi; 827 struct net_conf *nc; 828 int err; 829 830 rcu_read_lock(); 831 nc = rcu_dereference(connection->net_conf); 832 if (!nc) { 833 rcu_read_unlock(); 834 return -EIO; 835 } 836 sock->sk->sk_rcvtimeo = nc->ping_timeo * 4 * HZ / 10; 837 rcu_read_unlock(); 838 839 err = drbd_recv_short(sock, connection->data.rbuf, header_size, 0); 840 if (err != header_size) { 841 if (err >= 0) 842 err = -EIO; 843 return err; 844 } 845 err = decode_header(connection, connection->data.rbuf, &pi); 846 if (err) 847 return err; 848 return pi.cmd; 849 } 850 851 /** 852 * drbd_socket_okay() - Free the socket if its connection is not okay 853 * @sock: 
pointer to the pointer to the socket. 854 */ 855 static bool drbd_socket_okay(struct socket **sock) 856 { 857 int rr; 858 char tb[4]; 859 860 if (!*sock) 861 return false; 862 863 rr = drbd_recv_short(*sock, tb, 4, MSG_DONTWAIT | MSG_PEEK); 864 865 if (rr > 0 || rr == -EAGAIN) { 866 return true; 867 } else { 868 sock_release(*sock); 869 *sock = NULL; 870 return false; 871 } 872 } 873 874 static bool connection_established(struct drbd_connection *connection, 875 struct socket **sock1, 876 struct socket **sock2) 877 { 878 struct net_conf *nc; 879 int timeout; 880 bool ok; 881 882 if (!*sock1 || !*sock2) 883 return false; 884 885 rcu_read_lock(); 886 nc = rcu_dereference(connection->net_conf); 887 timeout = (nc->sock_check_timeo ?: nc->ping_timeo) * HZ / 10; 888 rcu_read_unlock(); 889 schedule_timeout_interruptible(timeout); 890 891 ok = drbd_socket_okay(sock1); 892 ok = drbd_socket_okay(sock2) && ok; 893 894 return ok; 895 } 896 897 /* Gets called if a connection is established, or if a new minor gets created 898 in a connection */ 899 int drbd_connected(struct drbd_peer_device *peer_device) 900 { 901 struct drbd_device *device = peer_device->device; 902 int err; 903 904 atomic_set(&device->packet_seq, 0); 905 device->peer_seq = 0; 906 907 device->state_mutex = peer_device->connection->agreed_pro_version < 100 ? 908 &peer_device->connection->cstate_mutex : 909 &device->own_state_mutex; 910 911 err = drbd_send_sync_param(peer_device); 912 if (!err) 913 err = drbd_send_sizes(peer_device, 0, 0); 914 if (!err) 915 err = drbd_send_uuids(peer_device); 916 if (!err) 917 err = drbd_send_current_state(peer_device); 918 clear_bit(USE_DEGR_WFC_T, &device->flags); 919 clear_bit(RESIZE_PENDING, &device->flags); 920 atomic_set(&device->ap_in_flight, 0); 921 mod_timer(&device->request_timer, jiffies + HZ); /* just start it here. 
*/ 922 return err; 923 } 924 925 /* 926 * return values: 927 * 1 yes, we have a valid connection 928 * 0 oops, did not work out, please try again 929 * -1 peer talks different language, 930 * no point in trying again, please go standalone. 931 * -2 We do not have a network config... 932 */ 933 static int conn_connect(struct drbd_connection *connection) 934 { 935 struct drbd_socket sock, msock; 936 struct drbd_peer_device *peer_device; 937 struct net_conf *nc; 938 int vnr, timeout, h; 939 bool discard_my_data, ok; 940 enum drbd_state_rv rv; 941 struct accept_wait_data ad = { 942 .connection = connection, 943 .door_bell = COMPLETION_INITIALIZER_ONSTACK(ad.door_bell), 944 }; 945 946 clear_bit(DISCONNECT_SENT, &connection->flags); 947 if (conn_request_state(connection, NS(conn, C_WF_CONNECTION), CS_VERBOSE) < SS_SUCCESS) 948 return -2; 949 950 mutex_init(&sock.mutex); 951 sock.sbuf = connection->data.sbuf; 952 sock.rbuf = connection->data.rbuf; 953 sock.socket = NULL; 954 mutex_init(&msock.mutex); 955 msock.sbuf = connection->meta.sbuf; 956 msock.rbuf = connection->meta.rbuf; 957 msock.socket = NULL; 958 959 /* Assume that the peer only understands protocol 80 until we know better. 
*/ 960 connection->agreed_pro_version = 80; 961 962 if (prepare_listen_socket(connection, &ad)) 963 return 0; 964 965 do { 966 struct socket *s; 967 968 s = drbd_try_connect(connection); 969 if (s) { 970 if (!sock.socket) { 971 sock.socket = s; 972 send_first_packet(connection, &sock, P_INITIAL_DATA); 973 } else if (!msock.socket) { 974 clear_bit(RESOLVE_CONFLICTS, &connection->flags); 975 msock.socket = s; 976 send_first_packet(connection, &msock, P_INITIAL_META); 977 } else { 978 drbd_err(connection, "Logic error in conn_connect()\n"); 979 goto out_release_sockets; 980 } 981 } 982 983 if (connection_established(connection, &sock.socket, &msock.socket)) 984 break; 985 986 retry: 987 s = drbd_wait_for_connect(connection, &ad); 988 if (s) { 989 int fp = receive_first_packet(connection, s); 990 drbd_socket_okay(&sock.socket); 991 drbd_socket_okay(&msock.socket); 992 switch (fp) { 993 case P_INITIAL_DATA: 994 if (sock.socket) { 995 drbd_warn(connection, "initial packet S crossed\n"); 996 sock_release(sock.socket); 997 sock.socket = s; 998 goto randomize; 999 } 1000 sock.socket = s; 1001 break; 1002 case P_INITIAL_META: 1003 set_bit(RESOLVE_CONFLICTS, &connection->flags); 1004 if (msock.socket) { 1005 drbd_warn(connection, "initial packet M crossed\n"); 1006 sock_release(msock.socket); 1007 msock.socket = s; 1008 goto randomize; 1009 } 1010 msock.socket = s; 1011 break; 1012 default: 1013 drbd_warn(connection, "Error receiving initial packet\n"); 1014 sock_release(s); 1015 randomize: 1016 if (prandom_u32() & 1) 1017 goto retry; 1018 } 1019 } 1020 1021 if (connection->cstate <= C_DISCONNECTING) 1022 goto out_release_sockets; 1023 if (signal_pending(current)) { 1024 flush_signals(current); 1025 smp_rmb(); 1026 if (get_t_state(&connection->receiver) == EXITING) 1027 goto out_release_sockets; 1028 } 1029 1030 ok = connection_established(connection, &sock.socket, &msock.socket); 1031 } while (!ok); 1032 1033 if (ad.s_listen) 1034 sock_release(ad.s_listen); 1035 1036 
sock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ 1037 msock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ 1038 1039 sock.socket->sk->sk_allocation = GFP_NOIO; 1040 msock.socket->sk->sk_allocation = GFP_NOIO; 1041 1042 sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK; 1043 msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE; 1044 1045 /* NOT YET ... 1046 * sock.socket->sk->sk_sndtimeo = connection->net_conf->timeout*HZ/10; 1047 * sock.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; 1048 * first set it to the P_CONNECTION_FEATURES timeout, 1049 * which we set to 4x the configured ping_timeout. */ 1050 rcu_read_lock(); 1051 nc = rcu_dereference(connection->net_conf); 1052 1053 sock.socket->sk->sk_sndtimeo = 1054 sock.socket->sk->sk_rcvtimeo = nc->ping_timeo*4*HZ/10; 1055 1056 msock.socket->sk->sk_rcvtimeo = nc->ping_int*HZ; 1057 timeout = nc->timeout * HZ / 10; 1058 discard_my_data = nc->discard_my_data; 1059 rcu_read_unlock(); 1060 1061 msock.socket->sk->sk_sndtimeo = timeout; 1062 1063 /* we don't want delays. 
1064 * we use TCP_CORK where appropriate, though */ 1065 drbd_tcp_nodelay(sock.socket); 1066 drbd_tcp_nodelay(msock.socket); 1067 1068 connection->data.socket = sock.socket; 1069 connection->meta.socket = msock.socket; 1070 connection->last_received = jiffies; 1071 1072 h = drbd_do_features(connection); 1073 if (h <= 0) 1074 return h; 1075 1076 if (connection->cram_hmac_tfm) { 1077 /* drbd_request_state(device, NS(conn, WFAuth)); */ 1078 switch (drbd_do_auth(connection)) { 1079 case -1: 1080 drbd_err(connection, "Authentication of peer failed\n"); 1081 return -1; 1082 case 0: 1083 drbd_err(connection, "Authentication of peer failed, trying again.\n"); 1084 return 0; 1085 } 1086 } 1087 1088 connection->data.socket->sk->sk_sndtimeo = timeout; 1089 connection->data.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; 1090 1091 if (drbd_send_protocol(connection) == -EOPNOTSUPP) 1092 return -1; 1093 1094 /* Prevent a race between resync-handshake and 1095 * being promoted to Primary. 1096 * 1097 * Grab and release the state mutex, so we know that any current 1098 * drbd_set_role() is finished, and any incoming drbd_set_role 1099 * will see the STATE_SENT flag, and wait for it to be cleared. 
1100 */ 1101 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) 1102 mutex_lock(peer_device->device->state_mutex); 1103 1104 /* avoid a race with conn_request_state( C_DISCONNECTING ) */ 1105 spin_lock_irq(&connection->resource->req_lock); 1106 set_bit(STATE_SENT, &connection->flags); 1107 spin_unlock_irq(&connection->resource->req_lock); 1108 1109 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) 1110 mutex_unlock(peer_device->device->state_mutex); 1111 1112 rcu_read_lock(); 1113 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { 1114 struct drbd_device *device = peer_device->device; 1115 kref_get(&device->kref); 1116 rcu_read_unlock(); 1117 1118 if (discard_my_data) 1119 set_bit(DISCARD_MY_DATA, &device->flags); 1120 else 1121 clear_bit(DISCARD_MY_DATA, &device->flags); 1122 1123 drbd_connected(peer_device); 1124 kref_put(&device->kref, drbd_destroy_device); 1125 rcu_read_lock(); 1126 } 1127 rcu_read_unlock(); 1128 1129 rv = conn_request_state(connection, NS(conn, C_WF_REPORT_PARAMS), CS_VERBOSE); 1130 if (rv < SS_SUCCESS || connection->cstate != C_WF_REPORT_PARAMS) { 1131 clear_bit(STATE_SENT, &connection->flags); 1132 return 0; 1133 } 1134 1135 drbd_thread_start(&connection->ack_receiver); 1136 /* opencoded create_singlethread_workqueue(), 1137 * to be able to use format string arguments */ 1138 connection->ack_sender = 1139 alloc_ordered_workqueue("drbd_as_%s", WQ_MEM_RECLAIM, connection->resource->name); 1140 if (!connection->ack_sender) { 1141 drbd_err(connection, "Failed to create workqueue ack_sender\n"); 1142 return 0; 1143 } 1144 1145 mutex_lock(&connection->resource->conf_update); 1146 /* The discard_my_data flag is a single-shot modifier to the next 1147 * connection attempt, the handshake of which is now well underway. 1148 * No need for rcu style copying of the whole struct 1149 * just to clear a single value. 
*/ 1150 connection->net_conf->discard_my_data = 0; 1151 mutex_unlock(&connection->resource->conf_update); 1152 1153 return h; 1154 1155 out_release_sockets: 1156 if (ad.s_listen) 1157 sock_release(ad.s_listen); 1158 if (sock.socket) 1159 sock_release(sock.socket); 1160 if (msock.socket) 1161 sock_release(msock.socket); 1162 return -1; 1163 } 1164 1165 static int decode_header(struct drbd_connection *connection, void *header, struct packet_info *pi) 1166 { 1167 unsigned int header_size = drbd_header_size(connection); 1168 1169 if (header_size == sizeof(struct p_header100) && 1170 *(__be32 *)header == cpu_to_be32(DRBD_MAGIC_100)) { 1171 struct p_header100 *h = header; 1172 if (h->pad != 0) { 1173 drbd_err(connection, "Header padding is not zero\n"); 1174 return -EINVAL; 1175 } 1176 pi->vnr = be16_to_cpu(h->volume); 1177 pi->cmd = be16_to_cpu(h->command); 1178 pi->size = be32_to_cpu(h->length); 1179 } else if (header_size == sizeof(struct p_header95) && 1180 *(__be16 *)header == cpu_to_be16(DRBD_MAGIC_BIG)) { 1181 struct p_header95 *h = header; 1182 pi->cmd = be16_to_cpu(h->command); 1183 pi->size = be32_to_cpu(h->length); 1184 pi->vnr = 0; 1185 } else if (header_size == sizeof(struct p_header80) && 1186 *(__be32 *)header == cpu_to_be32(DRBD_MAGIC)) { 1187 struct p_header80 *h = header; 1188 pi->cmd = be16_to_cpu(h->command); 1189 pi->size = be16_to_cpu(h->length); 1190 pi->vnr = 0; 1191 } else { 1192 drbd_err(connection, "Wrong magic value 0x%08x in protocol version %d\n", 1193 be32_to_cpu(*(__be32 *)header), 1194 connection->agreed_pro_version); 1195 return -EINVAL; 1196 } 1197 pi->data = header + header_size; 1198 return 0; 1199 } 1200 1201 static void drbd_unplug_all_devices(struct drbd_connection *connection) 1202 { 1203 if (current->plug == &connection->receiver_plug) { 1204 blk_finish_plug(&connection->receiver_plug); 1205 blk_start_plug(&connection->receiver_plug); 1206 } /* else: maybe just schedule() ?? 
*/ 1207 } 1208 1209 static int drbd_recv_header(struct drbd_connection *connection, struct packet_info *pi) 1210 { 1211 void *buffer = connection->data.rbuf; 1212 int err; 1213 1214 err = drbd_recv_all_warn(connection, buffer, drbd_header_size(connection)); 1215 if (err) 1216 return err; 1217 1218 err = decode_header(connection, buffer, pi); 1219 connection->last_received = jiffies; 1220 1221 return err; 1222 } 1223 1224 static int drbd_recv_header_maybe_unplug(struct drbd_connection *connection, struct packet_info *pi) 1225 { 1226 void *buffer = connection->data.rbuf; 1227 unsigned int size = drbd_header_size(connection); 1228 int err; 1229 1230 err = drbd_recv_short(connection->data.socket, buffer, size, MSG_NOSIGNAL|MSG_DONTWAIT); 1231 if (err != size) { 1232 /* If we have nothing in the receive buffer now, to reduce 1233 * application latency, try to drain the backend queues as 1234 * quickly as possible, and let remote TCP know what we have 1235 * received so far. */ 1236 if (err == -EAGAIN) { 1237 drbd_tcp_quickack(connection->data.socket); 1238 drbd_unplug_all_devices(connection); 1239 } 1240 if (err > 0) { 1241 buffer += err; 1242 size -= err; 1243 } 1244 err = drbd_recv_all_warn(connection, buffer, size); 1245 if (err) 1246 return err; 1247 } 1248 1249 err = decode_header(connection, connection->data.rbuf, pi); 1250 connection->last_received = jiffies; 1251 1252 return err; 1253 } 1254 /* This is blkdev_issue_flush, but asynchronous. 1255 * We want to submit to all component volumes in parallel, 1256 * then wait for all completions. 
1257 */ 1258 struct issue_flush_context { 1259 atomic_t pending; 1260 int error; 1261 struct completion done; 1262 }; 1263 struct one_flush_context { 1264 struct drbd_device *device; 1265 struct issue_flush_context *ctx; 1266 }; 1267 1268 static void one_flush_endio(struct bio *bio) 1269 { 1270 struct one_flush_context *octx = bio->bi_private; 1271 struct drbd_device *device = octx->device; 1272 struct issue_flush_context *ctx = octx->ctx; 1273 1274 if (bio->bi_status) { 1275 ctx->error = blk_status_to_errno(bio->bi_status); 1276 drbd_info(device, "local disk FLUSH FAILED with status %d\n", bio->bi_status); 1277 } 1278 kfree(octx); 1279 bio_put(bio); 1280 1281 clear_bit(FLUSH_PENDING, &device->flags); 1282 put_ldev(device); 1283 kref_put(&device->kref, drbd_destroy_device); 1284 1285 if (atomic_dec_and_test(&ctx->pending)) 1286 complete(&ctx->done); 1287 } 1288 1289 static void submit_one_flush(struct drbd_device *device, struct issue_flush_context *ctx) 1290 { 1291 struct bio *bio = bio_alloc(GFP_NOIO, 0); 1292 struct one_flush_context *octx = kmalloc(sizeof(*octx), GFP_NOIO); 1293 if (!bio || !octx) { 1294 drbd_warn(device, "Could not allocate a bio, CANNOT ISSUE FLUSH\n"); 1295 /* FIXME: what else can I do now? disconnecting or detaching 1296 * really does not help to improve the state of the world, either. 
1297 */ 1298 kfree(octx); 1299 if (bio) 1300 bio_put(bio); 1301 1302 ctx->error = -ENOMEM; 1303 put_ldev(device); 1304 kref_put(&device->kref, drbd_destroy_device); 1305 return; 1306 } 1307 1308 octx->device = device; 1309 octx->ctx = ctx; 1310 bio_set_dev(bio, device->ldev->backing_bdev); 1311 bio->bi_private = octx; 1312 bio->bi_end_io = one_flush_endio; 1313 bio->bi_opf = REQ_OP_FLUSH | REQ_PREFLUSH; 1314 1315 device->flush_jif = jiffies; 1316 set_bit(FLUSH_PENDING, &device->flags); 1317 atomic_inc(&ctx->pending); 1318 submit_bio(bio); 1319 } 1320 1321 static void drbd_flush(struct drbd_connection *connection) 1322 { 1323 if (connection->resource->write_ordering >= WO_BDEV_FLUSH) { 1324 struct drbd_peer_device *peer_device; 1325 struct issue_flush_context ctx; 1326 int vnr; 1327 1328 atomic_set(&ctx.pending, 1); 1329 ctx.error = 0; 1330 init_completion(&ctx.done); 1331 1332 rcu_read_lock(); 1333 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { 1334 struct drbd_device *device = peer_device->device; 1335 1336 if (!get_ldev(device)) 1337 continue; 1338 kref_get(&device->kref); 1339 rcu_read_unlock(); 1340 1341 submit_one_flush(device, &ctx); 1342 1343 rcu_read_lock(); 1344 } 1345 rcu_read_unlock(); 1346 1347 /* Do we want to add a timeout, 1348 * if disk-timeout is set? */ 1349 if (!atomic_dec_and_test(&ctx.pending)) 1350 wait_for_completion(&ctx.done); 1351 1352 if (ctx.error) { 1353 /* would rather check on EOPNOTSUPP, but that is not reliable. 1354 * don't try again for ANY return value != 0 1355 * if (rv == -EOPNOTSUPP) */ 1356 /* Any error is already reported by bio_endio callback. */ 1357 drbd_bump_write_ordering(connection->resource, NULL, WO_DRAIN_IO); 1358 } 1359 } 1360 } 1361 1362 /** 1363 * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it. 1364 * @device: DRBD device. 1365 * @epoch: Epoch object. 1366 * @ev: Epoch event. 
1367 */ 1368 static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *connection, 1369 struct drbd_epoch *epoch, 1370 enum epoch_event ev) 1371 { 1372 int epoch_size; 1373 struct drbd_epoch *next_epoch; 1374 enum finish_epoch rv = FE_STILL_LIVE; 1375 1376 spin_lock(&connection->epoch_lock); 1377 do { 1378 next_epoch = NULL; 1379 1380 epoch_size = atomic_read(&epoch->epoch_size); 1381 1382 switch (ev & ~EV_CLEANUP) { 1383 case EV_PUT: 1384 atomic_dec(&epoch->active); 1385 break; 1386 case EV_GOT_BARRIER_NR: 1387 set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags); 1388 break; 1389 case EV_BECAME_LAST: 1390 /* nothing to do*/ 1391 break; 1392 } 1393 1394 if (epoch_size != 0 && 1395 atomic_read(&epoch->active) == 0 && 1396 (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) { 1397 if (!(ev & EV_CLEANUP)) { 1398 spin_unlock(&connection->epoch_lock); 1399 drbd_send_b_ack(epoch->connection, epoch->barrier_nr, epoch_size); 1400 spin_lock(&connection->epoch_lock); 1401 } 1402 #if 0 1403 /* FIXME: dec unacked on connection, once we have 1404 * something to count pending connection packets in. 
*/ 1405 if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) 1406 dec_unacked(epoch->connection); 1407 #endif 1408 1409 if (connection->current_epoch != epoch) { 1410 next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list); 1411 list_del(&epoch->list); 1412 ev = EV_BECAME_LAST | (ev & EV_CLEANUP); 1413 connection->epochs--; 1414 kfree(epoch); 1415 1416 if (rv == FE_STILL_LIVE) 1417 rv = FE_DESTROYED; 1418 } else { 1419 epoch->flags = 0; 1420 atomic_set(&epoch->epoch_size, 0); 1421 /* atomic_set(&epoch->active, 0); is already zero */ 1422 if (rv == FE_STILL_LIVE) 1423 rv = FE_RECYCLED; 1424 } 1425 } 1426 1427 if (!next_epoch) 1428 break; 1429 1430 epoch = next_epoch; 1431 } while (1); 1432 1433 spin_unlock(&connection->epoch_lock); 1434 1435 return rv; 1436 } 1437 1438 static enum write_ordering_e 1439 max_allowed_wo(struct drbd_backing_dev *bdev, enum write_ordering_e wo) 1440 { 1441 struct disk_conf *dc; 1442 1443 dc = rcu_dereference(bdev->disk_conf); 1444 1445 if (wo == WO_BDEV_FLUSH && !dc->disk_flushes) 1446 wo = WO_DRAIN_IO; 1447 if (wo == WO_DRAIN_IO && !dc->disk_drain) 1448 wo = WO_NONE; 1449 1450 return wo; 1451 } 1452 1453 /** 1454 * drbd_bump_write_ordering() - Fall back to an other write ordering method 1455 * @connection: DRBD connection. 1456 * @wo: Write ordering method to try. 
1457 */ 1458 void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev, 1459 enum write_ordering_e wo) 1460 { 1461 struct drbd_device *device; 1462 enum write_ordering_e pwo; 1463 int vnr; 1464 static char *write_ordering_str[] = { 1465 [WO_NONE] = "none", 1466 [WO_DRAIN_IO] = "drain", 1467 [WO_BDEV_FLUSH] = "flush", 1468 }; 1469 1470 pwo = resource->write_ordering; 1471 if (wo != WO_BDEV_FLUSH) 1472 wo = min(pwo, wo); 1473 rcu_read_lock(); 1474 idr_for_each_entry(&resource->devices, device, vnr) { 1475 if (get_ldev(device)) { 1476 wo = max_allowed_wo(device->ldev, wo); 1477 if (device->ldev == bdev) 1478 bdev = NULL; 1479 put_ldev(device); 1480 } 1481 } 1482 1483 if (bdev) 1484 wo = max_allowed_wo(bdev, wo); 1485 1486 rcu_read_unlock(); 1487 1488 resource->write_ordering = wo; 1489 if (pwo != resource->write_ordering || wo == WO_BDEV_FLUSH) 1490 drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]); 1491 } 1492 1493 /* 1494 * Mapping "discard" to ZEROOUT with UNMAP does not work for us: 1495 * Drivers have to "announce" q->limits.max_write_zeroes_sectors, or it 1496 * will directly go to fallback mode, submitting normal writes, and 1497 * never even try to UNMAP. 1498 * 1499 * And dm-thin does not do this (yet), mostly because in general it has 1500 * to assume that "skip_block_zeroing" is set. See also: 1501 * https://www.mail-archive.com/dm-devel%40redhat.com/msg07965.html 1502 * https://www.redhat.com/archives/dm-devel/2018-January/msg00271.html 1503 * 1504 * We *may* ignore the discard-zeroes-data setting, if so configured. 1505 * 1506 * Assumption is that this "discard_zeroes_data=0" is only because the backend 1507 * may ignore partial unaligned discards. 1508 * 1509 * LVM/DM thin as of at least 1510 * LVM version: 2.02.115(2)-RHEL7 (2015-01-28) 1511 * Library version: 1.02.93-RHEL7 (2015-01-28) 1512 * Driver version: 4.29.0 1513 * still behaves this way. 
1514 * 1515 * For unaligned (wrt. alignment and granularity) or too small discards, 1516 * we zero-out the initial (and/or) trailing unaligned partial chunks, 1517 * but discard all the aligned full chunks. 1518 * 1519 * At least for LVM/DM thin, with skip_block_zeroing=false, 1520 * the result is effectively "discard_zeroes_data=1". 1521 */ 1522 /* flags: EE_TRIM|EE_ZEROOUT */ 1523 int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, unsigned int nr_sectors, int flags) 1524 { 1525 struct block_device *bdev = device->ldev->backing_bdev; 1526 struct request_queue *q = bdev_get_queue(bdev); 1527 sector_t tmp, nr; 1528 unsigned int max_discard_sectors, granularity; 1529 int alignment; 1530 int err = 0; 1531 1532 if ((flags & EE_ZEROOUT) || !(flags & EE_TRIM)) 1533 goto zero_out; 1534 1535 /* Zero-sector (unknown) and one-sector granularities are the same. */ 1536 granularity = max(q->limits.discard_granularity >> 9, 1U); 1537 alignment = (bdev_discard_alignment(bdev) >> 9) % granularity; 1538 1539 max_discard_sectors = min(q->limits.max_discard_sectors, (1U << 22)); 1540 max_discard_sectors -= max_discard_sectors % granularity; 1541 if (unlikely(!max_discard_sectors)) 1542 goto zero_out; 1543 1544 if (nr_sectors < granularity) 1545 goto zero_out; 1546 1547 tmp = start; 1548 if (sector_div(tmp, granularity) != alignment) { 1549 if (nr_sectors < 2*granularity) 1550 goto zero_out; 1551 /* start + gran - (start + gran - align) % gran */ 1552 tmp = start + granularity - alignment; 1553 tmp = start + granularity - sector_div(tmp, granularity); 1554 1555 nr = tmp - start; 1556 /* don't flag BLKDEV_ZERO_NOUNMAP, we don't know how many 1557 * layers are below us, some may have smaller granularity */ 1558 err |= blkdev_issue_zeroout(bdev, start, nr, GFP_NOIO, 0); 1559 nr_sectors -= nr; 1560 start = tmp; 1561 } 1562 while (nr_sectors >= max_discard_sectors) { 1563 err |= blkdev_issue_discard(bdev, start, max_discard_sectors, GFP_NOIO, 0); 1564 
nr_sectors -= max_discard_sectors; 1565 start += max_discard_sectors; 1566 } 1567 if (nr_sectors) { 1568 /* max_discard_sectors is unsigned int (and a multiple of 1569 * granularity, we made sure of that above already); 1570 * nr is < max_discard_sectors; 1571 * I don't need sector_div here, even though nr is sector_t */ 1572 nr = nr_sectors; 1573 nr -= (unsigned int)nr % granularity; 1574 if (nr) { 1575 err |= blkdev_issue_discard(bdev, start, nr, GFP_NOIO, 0); 1576 nr_sectors -= nr; 1577 start += nr; 1578 } 1579 } 1580 zero_out: 1581 if (nr_sectors) { 1582 err |= blkdev_issue_zeroout(bdev, start, nr_sectors, GFP_NOIO, 1583 (flags & EE_TRIM) ? 0 : BLKDEV_ZERO_NOUNMAP); 1584 } 1585 return err != 0; 1586 } 1587 1588 static bool can_do_reliable_discards(struct drbd_device *device) 1589 { 1590 struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev); 1591 struct disk_conf *dc; 1592 bool can_do; 1593 1594 if (!blk_queue_discard(q)) 1595 return false; 1596 1597 rcu_read_lock(); 1598 dc = rcu_dereference(device->ldev->disk_conf); 1599 can_do = dc->discard_zeroes_if_aligned; 1600 rcu_read_unlock(); 1601 return can_do; 1602 } 1603 1604 static void drbd_issue_peer_discard_or_zero_out(struct drbd_device *device, struct drbd_peer_request *peer_req) 1605 { 1606 /* If the backend cannot discard, or does not guarantee 1607 * read-back zeroes in discarded ranges, we fall back to 1608 * zero-out. Unless configuration specifically requested 1609 * otherwise. 
*/ 1610 if (!can_do_reliable_discards(device)) 1611 peer_req->flags |= EE_ZEROOUT; 1612 1613 if (drbd_issue_discard_or_zero_out(device, peer_req->i.sector, 1614 peer_req->i.size >> 9, peer_req->flags & (EE_ZEROOUT|EE_TRIM))) 1615 peer_req->flags |= EE_WAS_ERROR; 1616 drbd_endio_write_sec_final(peer_req); 1617 } 1618 1619 static void drbd_issue_peer_wsame(struct drbd_device *device, 1620 struct drbd_peer_request *peer_req) 1621 { 1622 struct block_device *bdev = device->ldev->backing_bdev; 1623 sector_t s = peer_req->i.sector; 1624 sector_t nr = peer_req->i.size >> 9; 1625 if (blkdev_issue_write_same(bdev, s, nr, GFP_NOIO, peer_req->pages)) 1626 peer_req->flags |= EE_WAS_ERROR; 1627 drbd_endio_write_sec_final(peer_req); 1628 } 1629 1630 1631 /** 1632 * drbd_submit_peer_request() 1633 * @device: DRBD device. 1634 * @peer_req: peer request 1635 * @rw: flag field, see bio->bi_opf 1636 * 1637 * May spread the pages to multiple bios, 1638 * depending on bio_add_page restrictions. 1639 * 1640 * Returns 0 if all bios have been submitted, 1641 * -ENOMEM if we could not allocate enough bios, 1642 * -ENOSPC (any better suggestion?) if we have not been able to bio_add_page a 1643 * single page to an empty bio (which should never happen and likely indicates 1644 * that the lower level IO stack is in some way broken). This has been observed 1645 * on certain Xen deployments. 1646 */ 1647 /* TODO allocate from our own bio_set. 
*/ 1648 int drbd_submit_peer_request(struct drbd_device *device, 1649 struct drbd_peer_request *peer_req, 1650 const unsigned op, const unsigned op_flags, 1651 const int fault_type) 1652 { 1653 struct bio *bios = NULL; 1654 struct bio *bio; 1655 struct page *page = peer_req->pages; 1656 sector_t sector = peer_req->i.sector; 1657 unsigned data_size = peer_req->i.size; 1658 unsigned n_bios = 0; 1659 unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT; 1660 int err = -ENOMEM; 1661 1662 /* TRIM/DISCARD: for now, always use the helper function 1663 * blkdev_issue_zeroout(..., discard=true). 1664 * It's synchronous, but it does the right thing wrt. bio splitting. 1665 * Correctness first, performance later. Next step is to code an 1666 * asynchronous variant of the same. 1667 */ 1668 if (peer_req->flags & (EE_TRIM|EE_WRITE_SAME|EE_ZEROOUT)) { 1669 /* wait for all pending IO completions, before we start 1670 * zeroing things out. */ 1671 conn_wait_active_ee_empty(peer_req->peer_device->connection); 1672 /* add it to the active list now, 1673 * so we can find it to present it in debugfs */ 1674 peer_req->submit_jif = jiffies; 1675 peer_req->flags |= EE_SUBMITTED; 1676 1677 /* If this was a resync request from receive_rs_deallocated(), 1678 * it is already on the sync_ee list */ 1679 if (list_empty(&peer_req->w.list)) { 1680 spin_lock_irq(&device->resource->req_lock); 1681 list_add_tail(&peer_req->w.list, &device->active_ee); 1682 spin_unlock_irq(&device->resource->req_lock); 1683 } 1684 1685 if (peer_req->flags & (EE_TRIM|EE_ZEROOUT)) 1686 drbd_issue_peer_discard_or_zero_out(device, peer_req); 1687 else /* EE_WRITE_SAME */ 1688 drbd_issue_peer_wsame(device, peer_req); 1689 return 0; 1690 } 1691 1692 /* In most cases, we will only need one bio. But in case the lower 1693 * level restrictions happen to be different at this offset on this 1694 * side than those of the sending peer, we may need to submit the 1695 * request in more than one bio. 
1696 * 1697 * Plain bio_alloc is good enough here, this is no DRBD internally 1698 * generated bio, but a bio allocated on behalf of the peer. 1699 */ 1700 next_bio: 1701 bio = bio_alloc(GFP_NOIO, nr_pages); 1702 if (!bio) { 1703 drbd_err(device, "submit_ee: Allocation of a bio failed (nr_pages=%u)\n", nr_pages); 1704 goto fail; 1705 } 1706 /* > peer_req->i.sector, unless this is the first bio */ 1707 bio->bi_iter.bi_sector = sector; 1708 bio_set_dev(bio, device->ldev->backing_bdev); 1709 bio_set_op_attrs(bio, op, op_flags); 1710 bio->bi_private = peer_req; 1711 bio->bi_end_io = drbd_peer_request_endio; 1712 1713 bio->bi_next = bios; 1714 bios = bio; 1715 ++n_bios; 1716 1717 page_chain_for_each(page) { 1718 unsigned len = min_t(unsigned, data_size, PAGE_SIZE); 1719 if (!bio_add_page(bio, page, len, 0)) 1720 goto next_bio; 1721 data_size -= len; 1722 sector += len >> 9; 1723 --nr_pages; 1724 } 1725 D_ASSERT(device, data_size == 0); 1726 D_ASSERT(device, page == NULL); 1727 1728 atomic_set(&peer_req->pending_bios, n_bios); 1729 /* for debugfs: update timestamp, mark as submitted */ 1730 peer_req->submit_jif = jiffies; 1731 peer_req->flags |= EE_SUBMITTED; 1732 do { 1733 bio = bios; 1734 bios = bios->bi_next; 1735 bio->bi_next = NULL; 1736 1737 drbd_generic_make_request(device, fault_type, bio); 1738 } while (bios); 1739 return 0; 1740 1741 fail: 1742 while (bios) { 1743 bio = bios; 1744 bios = bios->bi_next; 1745 bio_put(bio); 1746 } 1747 return err; 1748 } 1749 1750 static void drbd_remove_epoch_entry_interval(struct drbd_device *device, 1751 struct drbd_peer_request *peer_req) 1752 { 1753 struct drbd_interval *i = &peer_req->i; 1754 1755 drbd_remove_interval(&device->write_requests, i); 1756 drbd_clear_interval(i); 1757 1758 /* Wake up any processes waiting for this peer request to complete. 
*/ 1759 if (i->waiting) 1760 wake_up(&device->misc_wait); 1761 } 1762 1763 static void conn_wait_active_ee_empty(struct drbd_connection *connection) 1764 { 1765 struct drbd_peer_device *peer_device; 1766 int vnr; 1767 1768 rcu_read_lock(); 1769 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { 1770 struct drbd_device *device = peer_device->device; 1771 1772 kref_get(&device->kref); 1773 rcu_read_unlock(); 1774 drbd_wait_ee_list_empty(device, &device->active_ee); 1775 kref_put(&device->kref, drbd_destroy_device); 1776 rcu_read_lock(); 1777 } 1778 rcu_read_unlock(); 1779 } 1780 1781 static int receive_Barrier(struct drbd_connection *connection, struct packet_info *pi) 1782 { 1783 int rv; 1784 struct p_barrier *p = pi->data; 1785 struct drbd_epoch *epoch; 1786 1787 /* FIXME these are unacked on connection, 1788 * not a specific (peer)device. 1789 */ 1790 connection->current_epoch->barrier_nr = p->barrier; 1791 connection->current_epoch->connection = connection; 1792 rv = drbd_may_finish_epoch(connection, connection->current_epoch, EV_GOT_BARRIER_NR); 1793 1794 /* P_BARRIER_ACK may imply that the corresponding extent is dropped from 1795 * the activity log, which means it would not be resynced in case the 1796 * R_PRIMARY crashes now. 1797 * Therefore we must send the barrier_ack after the barrier request was 1798 * completed. */ 1799 switch (connection->resource->write_ordering) { 1800 case WO_NONE: 1801 if (rv == FE_RECYCLED) 1802 return 0; 1803 1804 /* receiver context, in the writeout path of the other node. 
1805 * avoid potential distributed deadlock */ 1806 epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO); 1807 if (epoch) 1808 break; 1809 else 1810 drbd_warn(connection, "Allocation of an epoch failed, slowing down\n"); 1811 /* Fall through */ 1812 1813 case WO_BDEV_FLUSH: 1814 case WO_DRAIN_IO: 1815 conn_wait_active_ee_empty(connection); 1816 drbd_flush(connection); 1817 1818 if (atomic_read(&connection->current_epoch->epoch_size)) { 1819 epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO); 1820 if (epoch) 1821 break; 1822 } 1823 1824 return 0; 1825 default: 1826 drbd_err(connection, "Strangeness in connection->write_ordering %d\n", 1827 connection->resource->write_ordering); 1828 return -EIO; 1829 } 1830 1831 epoch->flags = 0; 1832 atomic_set(&epoch->epoch_size, 0); 1833 atomic_set(&epoch->active, 0); 1834 1835 spin_lock(&connection->epoch_lock); 1836 if (atomic_read(&connection->current_epoch->epoch_size)) { 1837 list_add(&epoch->list, &connection->current_epoch->list); 1838 connection->current_epoch = epoch; 1839 connection->epochs++; 1840 } else { 1841 /* The current_epoch got recycled while we allocated this one... */ 1842 kfree(epoch); 1843 } 1844 spin_unlock(&connection->epoch_lock); 1845 1846 return 0; 1847 } 1848 1849 /* quick wrapper in case payload size != request_size (write same) */ 1850 static void drbd_csum_ee_size(struct crypto_shash *h, 1851 struct drbd_peer_request *r, void *d, 1852 unsigned int payload_size) 1853 { 1854 unsigned int tmp = r->i.size; 1855 r->i.size = payload_size; 1856 drbd_csum_ee(h, r, d); 1857 r->i.size = tmp; 1858 } 1859 1860 /* used from receive_RSDataReply (recv_resync_read) 1861 * and from receive_Data. 1862 * data_size: actual payload ("data in") 1863 * for normal writes that is bi_size. 1864 * for discards, that is zero. 1865 * for write same, it is logical_block_size. 1866 * both trim and write same have the bi_size ("data len to be affected") 1867 * as extra argument in the packet header. 
1868 */ 1869 static struct drbd_peer_request * 1870 read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, 1871 struct packet_info *pi) __must_hold(local) 1872 { 1873 struct drbd_device *device = peer_device->device; 1874 const sector_t capacity = drbd_get_capacity(device->this_bdev); 1875 struct drbd_peer_request *peer_req; 1876 struct page *page; 1877 int digest_size, err; 1878 unsigned int data_size = pi->size, ds; 1879 void *dig_in = peer_device->connection->int_dig_in; 1880 void *dig_vv = peer_device->connection->int_dig_vv; 1881 unsigned long *data; 1882 struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL; 1883 struct p_trim *zeroes = (pi->cmd == P_ZEROES) ? pi->data : NULL; 1884 struct p_trim *wsame = (pi->cmd == P_WSAME) ? pi->data : NULL; 1885 1886 digest_size = 0; 1887 if (!trim && peer_device->connection->peer_integrity_tfm) { 1888 digest_size = crypto_shash_digestsize(peer_device->connection->peer_integrity_tfm); 1889 /* 1890 * FIXME: Receive the incoming digest into the receive buffer 1891 * here, together with its struct p_data? 1892 */ 1893 err = drbd_recv_all_warn(peer_device->connection, dig_in, digest_size); 1894 if (err) 1895 return NULL; 1896 data_size -= digest_size; 1897 } 1898 1899 /* assume request_size == data_size, but special case trim and wsame. 
*/ 1900 ds = data_size; 1901 if (trim) { 1902 if (!expect(data_size == 0)) 1903 return NULL; 1904 ds = be32_to_cpu(trim->size); 1905 } else if (zeroes) { 1906 if (!expect(data_size == 0)) 1907 return NULL; 1908 ds = be32_to_cpu(zeroes->size); 1909 } else if (wsame) { 1910 if (data_size != queue_logical_block_size(device->rq_queue)) { 1911 drbd_err(peer_device, "data size (%u) != drbd logical block size (%u)\n", 1912 data_size, queue_logical_block_size(device->rq_queue)); 1913 return NULL; 1914 } 1915 if (data_size != bdev_logical_block_size(device->ldev->backing_bdev)) { 1916 drbd_err(peer_device, "data size (%u) != backend logical block size (%u)\n", 1917 data_size, bdev_logical_block_size(device->ldev->backing_bdev)); 1918 return NULL; 1919 } 1920 ds = be32_to_cpu(wsame->size); 1921 } 1922 1923 if (!expect(IS_ALIGNED(ds, 512))) 1924 return NULL; 1925 if (trim || wsame || zeroes) { 1926 if (!expect(ds <= (DRBD_MAX_BBIO_SECTORS << 9))) 1927 return NULL; 1928 } else if (!expect(ds <= DRBD_MAX_BIO_SIZE)) 1929 return NULL; 1930 1931 /* even though we trust out peer, 1932 * we sometimes have to double check. */ 1933 if (sector + (ds>>9) > capacity) { 1934 drbd_err(device, "request from peer beyond end of local disk: " 1935 "capacity: %llus < sector: %llus + size: %u\n", 1936 (unsigned long long)capacity, 1937 (unsigned long long)sector, ds); 1938 return NULL; 1939 } 1940 1941 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD 1942 * "criss-cross" setup, that might cause write-out on some other DRBD, 1943 * which in turn might block on the other node at this very place. 
*/ 1944 peer_req = drbd_alloc_peer_req(peer_device, id, sector, ds, data_size, GFP_NOIO); 1945 if (!peer_req) 1946 return NULL; 1947 1948 peer_req->flags |= EE_WRITE; 1949 if (trim) { 1950 peer_req->flags |= EE_TRIM; 1951 return peer_req; 1952 } 1953 if (zeroes) { 1954 peer_req->flags |= EE_ZEROOUT; 1955 return peer_req; 1956 } 1957 if (wsame) 1958 peer_req->flags |= EE_WRITE_SAME; 1959 1960 /* receive payload size bytes into page chain */ 1961 ds = data_size; 1962 page = peer_req->pages; 1963 page_chain_for_each(page) { 1964 unsigned len = min_t(int, ds, PAGE_SIZE); 1965 data = kmap(page); 1966 err = drbd_recv_all_warn(peer_device->connection, data, len); 1967 if (drbd_insert_fault(device, DRBD_FAULT_RECEIVE)) { 1968 drbd_err(device, "Fault injection: Corrupting data on receive\n"); 1969 data[0] = data[0] ^ (unsigned long)-1; 1970 } 1971 kunmap(page); 1972 if (err) { 1973 drbd_free_peer_req(device, peer_req); 1974 return NULL; 1975 } 1976 ds -= len; 1977 } 1978 1979 if (digest_size) { 1980 drbd_csum_ee_size(peer_device->connection->peer_integrity_tfm, peer_req, dig_vv, data_size); 1981 if (memcmp(dig_in, dig_vv, digest_size)) { 1982 drbd_err(device, "Digest integrity check FAILED: %llus +%u\n", 1983 (unsigned long long)sector, data_size); 1984 drbd_free_peer_req(device, peer_req); 1985 return NULL; 1986 } 1987 } 1988 device->recv_cnt += data_size >> 9; 1989 return peer_req; 1990 } 1991 1992 /* drbd_drain_block() just takes a data block 1993 * out of the socket input buffer, and discards it. 
1994 */ 1995 static int drbd_drain_block(struct drbd_peer_device *peer_device, int data_size) 1996 { 1997 struct page *page; 1998 int err = 0; 1999 void *data; 2000 2001 if (!data_size) 2002 return 0; 2003 2004 page = drbd_alloc_pages(peer_device, 1, 1); 2005 2006 data = kmap(page); 2007 while (data_size) { 2008 unsigned int len = min_t(int, data_size, PAGE_SIZE); 2009 2010 err = drbd_recv_all_warn(peer_device->connection, data, len); 2011 if (err) 2012 break; 2013 data_size -= len; 2014 } 2015 kunmap(page); 2016 drbd_free_pages(peer_device->device, page, 0); 2017 return err; 2018 } 2019 2020 static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_request *req, 2021 sector_t sector, int data_size) 2022 { 2023 struct bio_vec bvec; 2024 struct bvec_iter iter; 2025 struct bio *bio; 2026 int digest_size, err, expect; 2027 void *dig_in = peer_device->connection->int_dig_in; 2028 void *dig_vv = peer_device->connection->int_dig_vv; 2029 2030 digest_size = 0; 2031 if (peer_device->connection->peer_integrity_tfm) { 2032 digest_size = crypto_shash_digestsize(peer_device->connection->peer_integrity_tfm); 2033 err = drbd_recv_all_warn(peer_device->connection, dig_in, digest_size); 2034 if (err) 2035 return err; 2036 data_size -= digest_size; 2037 } 2038 2039 /* optimistically update recv_cnt. if receiving fails below, 2040 * we disconnect anyways, and counters will be reset. 
*/ 2041 peer_device->device->recv_cnt += data_size>>9; 2042 2043 bio = req->master_bio; 2044 D_ASSERT(peer_device->device, sector == bio->bi_iter.bi_sector); 2045 2046 bio_for_each_segment(bvec, bio, iter) { 2047 void *mapped = kmap(bvec.bv_page) + bvec.bv_offset; 2048 expect = min_t(int, data_size, bvec.bv_len); 2049 err = drbd_recv_all_warn(peer_device->connection, mapped, expect); 2050 kunmap(bvec.bv_page); 2051 if (err) 2052 return err; 2053 data_size -= expect; 2054 } 2055 2056 if (digest_size) { 2057 drbd_csum_bio(peer_device->connection->peer_integrity_tfm, bio, dig_vv); 2058 if (memcmp(dig_in, dig_vv, digest_size)) { 2059 drbd_err(peer_device, "Digest integrity check FAILED. Broken NICs?\n"); 2060 return -EINVAL; 2061 } 2062 } 2063 2064 D_ASSERT(peer_device->device, data_size == 0); 2065 return 0; 2066 } 2067 2068 /* 2069 * e_end_resync_block() is called in ack_sender context via 2070 * drbd_finish_peer_reqs(). 2071 */ 2072 static int e_end_resync_block(struct drbd_work *w, int unused) 2073 { 2074 struct drbd_peer_request *peer_req = 2075 container_of(w, struct drbd_peer_request, w); 2076 struct drbd_peer_device *peer_device = peer_req->peer_device; 2077 struct drbd_device *device = peer_device->device; 2078 sector_t sector = peer_req->i.sector; 2079 int err; 2080 2081 D_ASSERT(device, drbd_interval_empty(&peer_req->i)); 2082 2083 if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { 2084 drbd_set_in_sync(device, sector, peer_req->i.size); 2085 err = drbd_send_ack(peer_device, P_RS_WRITE_ACK, peer_req); 2086 } else { 2087 /* Record failure to sync */ 2088 drbd_rs_failed_io(device, sector, peer_req->i.size); 2089 2090 err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req); 2091 } 2092 dec_unacked(device); 2093 2094 return err; 2095 } 2096 2097 static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t sector, 2098 struct packet_info *pi) __releases(local) 2099 { 2100 struct drbd_device *device = peer_device->device; 2101 struct 
drbd_peer_request *peer_req; 2102 2103 peer_req = read_in_block(peer_device, ID_SYNCER, sector, pi); 2104 if (!peer_req) 2105 goto fail; 2106 2107 dec_rs_pending(device); 2108 2109 inc_unacked(device); 2110 /* corresponding dec_unacked() in e_end_resync_block() 2111 * respective _drbd_clear_done_ee */ 2112 2113 peer_req->w.cb = e_end_resync_block; 2114 peer_req->submit_jif = jiffies; 2115 2116 spin_lock_irq(&device->resource->req_lock); 2117 list_add_tail(&peer_req->w.list, &device->sync_ee); 2118 spin_unlock_irq(&device->resource->req_lock); 2119 2120 atomic_add(pi->size >> 9, &device->rs_sect_ev); 2121 if (drbd_submit_peer_request(device, peer_req, REQ_OP_WRITE, 0, 2122 DRBD_FAULT_RS_WR) == 0) 2123 return 0; 2124 2125 /* don't care for the reason here */ 2126 drbd_err(device, "submit failed, triggering re-connect\n"); 2127 spin_lock_irq(&device->resource->req_lock); 2128 list_del(&peer_req->w.list); 2129 spin_unlock_irq(&device->resource->req_lock); 2130 2131 drbd_free_peer_req(device, peer_req); 2132 fail: 2133 put_ldev(device); 2134 return -EIO; 2135 } 2136 2137 static struct drbd_request * 2138 find_request(struct drbd_device *device, struct rb_root *root, u64 id, 2139 sector_t sector, bool missing_ok, const char *func) 2140 { 2141 struct drbd_request *req; 2142 2143 /* Request object according to our peer */ 2144 req = (struct drbd_request *)(unsigned long)id; 2145 if (drbd_contains_interval(root, sector, &req->i) && req->i.local) 2146 return req; 2147 if (!missing_ok) { 2148 drbd_err(device, "%s: failed to find request 0x%lx, sector %llus\n", func, 2149 (unsigned long)id, (unsigned long long)sector); 2150 } 2151 return NULL; 2152 } 2153 2154 static int receive_DataReply(struct drbd_connection *connection, struct packet_info *pi) 2155 { 2156 struct drbd_peer_device *peer_device; 2157 struct drbd_device *device; 2158 struct drbd_request *req; 2159 sector_t sector; 2160 int err; 2161 struct p_data *p = pi->data; 2162 2163 peer_device = 
conn_peer_device(connection, pi->vnr); 2164 if (!peer_device) 2165 return -EIO; 2166 device = peer_device->device; 2167 2168 sector = be64_to_cpu(p->sector); 2169 2170 spin_lock_irq(&device->resource->req_lock); 2171 req = find_request(device, &device->read_requests, p->block_id, sector, false, __func__); 2172 spin_unlock_irq(&device->resource->req_lock); 2173 if (unlikely(!req)) 2174 return -EIO; 2175 2176 /* hlist_del(&req->collision) is done in _req_may_be_done, to avoid 2177 * special casing it there for the various failure cases. 2178 * still no race with drbd_fail_pending_reads */ 2179 err = recv_dless_read(peer_device, req, sector, pi->size); 2180 if (!err) 2181 req_mod(req, DATA_RECEIVED); 2182 /* else: nothing. handled from drbd_disconnect... 2183 * I don't think we may complete this just yet 2184 * in case we are "on-disconnect: freeze" */ 2185 2186 return err; 2187 } 2188 2189 static int receive_RSDataReply(struct drbd_connection *connection, struct packet_info *pi) 2190 { 2191 struct drbd_peer_device *peer_device; 2192 struct drbd_device *device; 2193 sector_t sector; 2194 int err; 2195 struct p_data *p = pi->data; 2196 2197 peer_device = conn_peer_device(connection, pi->vnr); 2198 if (!peer_device) 2199 return -EIO; 2200 device = peer_device->device; 2201 2202 sector = be64_to_cpu(p->sector); 2203 D_ASSERT(device, p->block_id == ID_SYNCER); 2204 2205 if (get_ldev(device)) { 2206 /* data is submitted to disk within recv_resync_read. 2207 * corresponding put_ldev done below on error, 2208 * or in drbd_peer_request_endio. 
*/ 2209 err = recv_resync_read(peer_device, sector, pi); 2210 } else { 2211 if (__ratelimit(&drbd_ratelimit_state)) 2212 drbd_err(device, "Can not write resync data to local disk.\n"); 2213 2214 err = drbd_drain_block(peer_device, pi->size); 2215 2216 drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size); 2217 } 2218 2219 atomic_add(pi->size >> 9, &device->rs_sect_in); 2220 2221 return err; 2222 } 2223 2224 static void restart_conflicting_writes(struct drbd_device *device, 2225 sector_t sector, int size) 2226 { 2227 struct drbd_interval *i; 2228 struct drbd_request *req; 2229 2230 drbd_for_each_overlap(i, &device->write_requests, sector, size) { 2231 if (!i->local) 2232 continue; 2233 req = container_of(i, struct drbd_request, i); 2234 if (req->rq_state & RQ_LOCAL_PENDING || 2235 !(req->rq_state & RQ_POSTPONED)) 2236 continue; 2237 /* as it is RQ_POSTPONED, this will cause it to 2238 * be queued on the retry workqueue. */ 2239 __req_mod(req, CONFLICT_RESOLVED, NULL); 2240 } 2241 } 2242 2243 /* 2244 * e_end_block() is called in ack_sender context via drbd_finish_peer_reqs(). 2245 */ 2246 static int e_end_block(struct drbd_work *w, int cancel) 2247 { 2248 struct drbd_peer_request *peer_req = 2249 container_of(w, struct drbd_peer_request, w); 2250 struct drbd_peer_device *peer_device = peer_req->peer_device; 2251 struct drbd_device *device = peer_device->device; 2252 sector_t sector = peer_req->i.sector; 2253 int err = 0, pcmd; 2254 2255 if (peer_req->flags & EE_SEND_WRITE_ACK) { 2256 if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { 2257 pcmd = (device->state.conn >= C_SYNC_SOURCE && 2258 device->state.conn <= C_PAUSED_SYNC_T && 2259 peer_req->flags & EE_MAY_SET_IN_SYNC) ? 
2260 P_RS_WRITE_ACK : P_WRITE_ACK; 2261 err = drbd_send_ack(peer_device, pcmd, peer_req); 2262 if (pcmd == P_RS_WRITE_ACK) 2263 drbd_set_in_sync(device, sector, peer_req->i.size); 2264 } else { 2265 err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req); 2266 /* we expect it to be marked out of sync anyways... 2267 * maybe assert this? */ 2268 } 2269 dec_unacked(device); 2270 } 2271 2272 /* we delete from the conflict detection hash _after_ we sent out the 2273 * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */ 2274 if (peer_req->flags & EE_IN_INTERVAL_TREE) { 2275 spin_lock_irq(&device->resource->req_lock); 2276 D_ASSERT(device, !drbd_interval_empty(&peer_req->i)); 2277 drbd_remove_epoch_entry_interval(device, peer_req); 2278 if (peer_req->flags & EE_RESTART_REQUESTS) 2279 restart_conflicting_writes(device, sector, peer_req->i.size); 2280 spin_unlock_irq(&device->resource->req_lock); 2281 } else 2282 D_ASSERT(device, drbd_interval_empty(&peer_req->i)); 2283 2284 drbd_may_finish_epoch(peer_device->connection, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0)); 2285 2286 return err; 2287 } 2288 2289 static int e_send_ack(struct drbd_work *w, enum drbd_packet ack) 2290 { 2291 struct drbd_peer_request *peer_req = 2292 container_of(w, struct drbd_peer_request, w); 2293 struct drbd_peer_device *peer_device = peer_req->peer_device; 2294 int err; 2295 2296 err = drbd_send_ack(peer_device, ack, peer_req); 2297 dec_unacked(peer_device->device); 2298 2299 return err; 2300 } 2301 2302 static int e_send_superseded(struct drbd_work *w, int unused) 2303 { 2304 return e_send_ack(w, P_SUPERSEDED); 2305 } 2306 2307 static int e_send_retry_write(struct drbd_work *w, int unused) 2308 { 2309 struct drbd_peer_request *peer_req = 2310 container_of(w, struct drbd_peer_request, w); 2311 struct drbd_connection *connection = peer_req->peer_device->connection; 2312 2313 return e_send_ack(w, connection->agreed_pro_version >= 100 ? 
/*
 * seq_greater() - wrap-around aware "a is newer than b" for sequence numbers.
 *
 * Returns true if @a is ahead of @b by 1 .. 2^31-1 steps (modulo 2^32).
 *
 * The subtraction is done in unsigned arithmetic, where wrap-around is well
 * defined, and only the difference is interpreted as signed.  Subtracting
 * two values already converted to s32 can overflow, which is undefined
 * behaviour in ISO C and only worked because of the wrapping build flags.
 */
static bool seq_greater(u32 a, u32 b)
{
	/*
	 * We assume 32-bit wrap-around here.
	 * For 24-bit wrap-around, we would have to shift:
	 *  a <<= 8; b <<= 8;
	 */
	return (s32)(a - b) > 0;
}

/*
 * seq_max() - the newer of two sequence numbers, as defined by seq_greater().
 * Returns @b when neither is newer than the other.
 */
static u32 seq_max(u32 a, u32 b)
{
	return seq_greater(a, b) ? a : b;
}
2378 * 2379 * Note: we don't care for Ack packets overtaking P_DATA packets. 2380 * 2381 * In case packet_seq is larger than device->peer_seq number, there are 2382 * outstanding packets on the msock. We wait for them to arrive. 2383 * In case we are the logically next packet, we update device->peer_seq 2384 * ourselves. Correctly handles 32bit wrap around. 2385 * 2386 * Assume we have a 10 GBit connection, that is about 1<<30 byte per second, 2387 * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds 2388 * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have 2389 * 1<<9 == 512 seconds aka ages for the 32bit wrap around... 2390 * 2391 * returns 0 if we may process the packet, 2392 * -ERESTARTSYS if we were interrupted (by disconnect signal). */ 2393 static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, const u32 peer_seq) 2394 { 2395 struct drbd_device *device = peer_device->device; 2396 DEFINE_WAIT(wait); 2397 long timeout; 2398 int ret = 0, tp; 2399 2400 if (!test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)) 2401 return 0; 2402 2403 spin_lock(&device->peer_seq_lock); 2404 for (;;) { 2405 if (!seq_greater(peer_seq - 1, device->peer_seq)) { 2406 device->peer_seq = seq_max(device->peer_seq, peer_seq); 2407 break; 2408 } 2409 2410 if (signal_pending(current)) { 2411 ret = -ERESTARTSYS; 2412 break; 2413 } 2414 2415 rcu_read_lock(); 2416 tp = rcu_dereference(peer_device->connection->net_conf)->two_primaries; 2417 rcu_read_unlock(); 2418 2419 if (!tp) 2420 break; 2421 2422 /* Only need to wait if two_primaries is enabled */ 2423 prepare_to_wait(&device->seq_wait, &wait, TASK_INTERRUPTIBLE); 2424 spin_unlock(&device->peer_seq_lock); 2425 rcu_read_lock(); 2426 timeout = rcu_dereference(peer_device->connection->net_conf)->ping_timeo*HZ/10; 2427 rcu_read_unlock(); 2428 timeout = schedule_timeout(timeout); 2429 spin_lock(&device->peer_seq_lock); 2430 if (!timeout) { 2431 ret = -ETIMEDOUT; 
2432 drbd_err(device, "Timed out waiting for missing ack packets; disconnecting\n"); 2433 break; 2434 } 2435 } 2436 spin_unlock(&device->peer_seq_lock); 2437 finish_wait(&device->seq_wait, &wait); 2438 return ret; 2439 } 2440 2441 /* see also bio_flags_to_wire() 2442 * DRBD_REQ_*, because we need to semantically map the flags to data packet 2443 * flags and back. We may replicate to other kernel versions. */ 2444 static unsigned long wire_flags_to_bio_flags(u32 dpf) 2445 { 2446 return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) | 2447 (dpf & DP_FUA ? REQ_FUA : 0) | 2448 (dpf & DP_FLUSH ? REQ_PREFLUSH : 0); 2449 } 2450 2451 static unsigned long wire_flags_to_bio_op(u32 dpf) 2452 { 2453 if (dpf & DP_ZEROES) 2454 return REQ_OP_WRITE_ZEROES; 2455 if (dpf & DP_DISCARD) 2456 return REQ_OP_DISCARD; 2457 if (dpf & DP_WSAME) 2458 return REQ_OP_WRITE_SAME; 2459 else 2460 return REQ_OP_WRITE; 2461 } 2462 2463 static void fail_postponed_requests(struct drbd_device *device, sector_t sector, 2464 unsigned int size) 2465 { 2466 struct drbd_interval *i; 2467 2468 repeat: 2469 drbd_for_each_overlap(i, &device->write_requests, sector, size) { 2470 struct drbd_request *req; 2471 struct bio_and_error m; 2472 2473 if (!i->local) 2474 continue; 2475 req = container_of(i, struct drbd_request, i); 2476 if (!(req->rq_state & RQ_POSTPONED)) 2477 continue; 2478 req->rq_state &= ~RQ_POSTPONED; 2479 __req_mod(req, NEG_ACKED, &m); 2480 spin_unlock_irq(&device->resource->req_lock); 2481 if (m.bio) 2482 complete_master_bio(device, &m); 2483 spin_lock_irq(&device->resource->req_lock); 2484 goto repeat; 2485 } 2486 } 2487 2488 static int handle_write_conflicts(struct drbd_device *device, 2489 struct drbd_peer_request *peer_req) 2490 { 2491 struct drbd_connection *connection = peer_req->peer_device->connection; 2492 bool resolve_conflicts = test_bit(RESOLVE_CONFLICTS, &connection->flags); 2493 sector_t sector = peer_req->i.sector; 2494 const unsigned int size = peer_req->i.size; 2495 struct drbd_interval *i; 
2496 bool equal; 2497 int err; 2498 2499 /* 2500 * Inserting the peer request into the write_requests tree will prevent 2501 * new conflicting local requests from being added. 2502 */ 2503 drbd_insert_interval(&device->write_requests, &peer_req->i); 2504 2505 repeat: 2506 drbd_for_each_overlap(i, &device->write_requests, sector, size) { 2507 if (i == &peer_req->i) 2508 continue; 2509 if (i->completed) 2510 continue; 2511 2512 if (!i->local) { 2513 /* 2514 * Our peer has sent a conflicting remote request; this 2515 * should not happen in a two-node setup. Wait for the 2516 * earlier peer request to complete. 2517 */ 2518 err = drbd_wait_misc(device, i); 2519 if (err) 2520 goto out; 2521 goto repeat; 2522 } 2523 2524 equal = i->sector == sector && i->size == size; 2525 if (resolve_conflicts) { 2526 /* 2527 * If the peer request is fully contained within the 2528 * overlapping request, it can be considered overwritten 2529 * and thus superseded; otherwise, it will be retried 2530 * once all overlapping requests have completed. 2531 */ 2532 bool superseded = i->sector <= sector && i->sector + 2533 (i->size >> 9) >= sector + (size >> 9); 2534 2535 if (!equal) 2536 drbd_alert(device, "Concurrent writes detected: " 2537 "local=%llus +%u, remote=%llus +%u, " 2538 "assuming %s came first\n", 2539 (unsigned long long)i->sector, i->size, 2540 (unsigned long long)sector, size, 2541 superseded ? "local" : "remote"); 2542 2543 peer_req->w.cb = superseded ? 
e_send_superseded : 2544 e_send_retry_write; 2545 list_add_tail(&peer_req->w.list, &device->done_ee); 2546 queue_work(connection->ack_sender, &peer_req->peer_device->send_acks_work); 2547 2548 err = -ENOENT; 2549 goto out; 2550 } else { 2551 struct drbd_request *req = 2552 container_of(i, struct drbd_request, i); 2553 2554 if (!equal) 2555 drbd_alert(device, "Concurrent writes detected: " 2556 "local=%llus +%u, remote=%llus +%u\n", 2557 (unsigned long long)i->sector, i->size, 2558 (unsigned long long)sector, size); 2559 2560 if (req->rq_state & RQ_LOCAL_PENDING || 2561 !(req->rq_state & RQ_POSTPONED)) { 2562 /* 2563 * Wait for the node with the discard flag to 2564 * decide if this request has been superseded 2565 * or needs to be retried. 2566 * Requests that have been superseded will 2567 * disappear from the write_requests tree. 2568 * 2569 * In addition, wait for the conflicting 2570 * request to finish locally before submitting 2571 * the conflicting peer request. 2572 */ 2573 err = drbd_wait_misc(device, &req->i); 2574 if (err) { 2575 _conn_request_state(connection, NS(conn, C_TIMEOUT), CS_HARD); 2576 fail_postponed_requests(device, sector, size); 2577 goto out; 2578 } 2579 goto repeat; 2580 } 2581 /* 2582 * Remember to restart the conflicting requests after 2583 * the new peer request has completed. 
/*
 * receive_Data() - handle a mirrored write sent by the peer.
 * @connection: connection the packet arrived on.
 * @pi:         packet header info; pi->data points to the struct p_data
 *              header, pi->size is the payload size still on the socket.
 *
 * Without a usable local disk the payload is drained and a P_NEG_ACK is
 * sent.  Otherwise the payload is read into a peer request, which is
 * assigned to the current epoch, checked against conflicting writes when
 * two_primaries is configured, and finally submitted to the backing device.
 *
 * Returns 0 if the request was submitted, or was queued on done_ee by
 * handle_write_conflicts() (-ENOENT there); -EIO if the volume is unknown or
 * the payload could not be read; otherwise the error of the failing step.
 */
/* mirrored write */
static int receive_Data(struct drbd_connection *connection, struct packet_info *pi)
{
	struct drbd_peer_device *peer_device;
	struct drbd_device *device;
	struct net_conf *nc;
	sector_t sector;
	struct drbd_peer_request *peer_req;
	struct p_data *p = pi->data;
	u32 peer_seq = be32_to_cpu(p->seq_num);
	int op, op_flags;
	u32 dp_flags;
	int err, tp;

	peer_device = conn_peer_device(connection, pi->vnr);
	if (!peer_device)
		return -EIO;
	device = peer_device->device;

	if (!get_ldev(device)) {
		int err2;

		/* No local disk reference: keep the sequence numbers and the
		 * epoch size consistent, NEG_ACK the write, and throw the
		 * payload away.  The first error encountered wins. */
		err = wait_for_and_update_peer_seq(peer_device, peer_seq);
		drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size);
		atomic_inc(&connection->current_epoch->epoch_size);
		err2 = drbd_drain_block(peer_device, pi->size);
		if (!err)
			err = err2;
		return err;
	}

	/*
	 * Corresponding put_ldev done either below (on various errors), or in
	 * drbd_peer_request_endio, if we successfully submit the data at the
	 * end of this function.
	 */

	sector = be64_to_cpu(p->sector);
	peer_req = read_in_block(peer_device, p->block_id, sector, pi);
	if (!peer_req) {
		put_ldev(device);
		return -EIO;
	}

	peer_req->w.cb = e_end_block;
	peer_req->submit_jif = jiffies;
	peer_req->flags |= EE_APPLICATION;

	/* Map the on-wire flags to a bio operation and bio flags, and sanity
	 * check them against the packet type. */
	dp_flags = be32_to_cpu(p->dp_flags);
	op = wire_flags_to_bio_op(dp_flags);
	op_flags = wire_flags_to_bio_flags(dp_flags);
	if (pi->cmd == P_TRIM) {
		D_ASSERT(peer_device, peer_req->i.size > 0);
		D_ASSERT(peer_device, op == REQ_OP_DISCARD);
		D_ASSERT(peer_device, peer_req->pages == NULL);
		/* need to play safe: an older DRBD sender
		 * may mean zero-out while sending P_TRIM. */
		if (0 == (connection->agreed_features & DRBD_FF_WZEROES))
			peer_req->flags |= EE_ZEROOUT;
	} else if (pi->cmd == P_ZEROES) {
		D_ASSERT(peer_device, peer_req->i.size > 0);
		D_ASSERT(peer_device, op == REQ_OP_WRITE_ZEROES);
		D_ASSERT(peer_device, peer_req->pages == NULL);
		/* Do (not) pass down BLKDEV_ZERO_NOUNMAP? */
		if (dp_flags & DP_DISCARD)
			peer_req->flags |= EE_TRIM;
	} else if (peer_req->pages == NULL) {
		/* no payload pages: expected to be an empty flush */
		D_ASSERT(device, peer_req->i.size == 0);
		D_ASSERT(device, dp_flags & DP_FLUSH);
	}

	if (dp_flags & DP_MAY_SET_IN_SYNC)
		peer_req->flags |= EE_MAY_SET_IN_SYNC;

	/* Account this request in the current epoch. */
	spin_lock(&connection->epoch_lock);
	peer_req->epoch = connection->current_epoch;
	atomic_inc(&peer_req->epoch->epoch_size);
	atomic_inc(&peer_req->epoch->active);
	spin_unlock(&connection->epoch_lock);

	rcu_read_lock();
	nc = rcu_dereference(peer_device->connection->net_conf);
	tp = nc->two_primaries;
	/* Below protocol version 100 the ack type is derived here from the
	 * configured wire protocol and or-ed into dp_flags. */
	if (peer_device->connection->agreed_pro_version < 100) {
		switch (nc->wire_protocol) {
		case DRBD_PROT_C:
			dp_flags |= DP_SEND_WRITE_ACK;
			break;
		case DRBD_PROT_B:
			dp_flags |= DP_SEND_RECEIVE_ACK;
			break;
		}
	}
	rcu_read_unlock();

	if (dp_flags & DP_SEND_WRITE_ACK) {
		peer_req->flags |= EE_SEND_WRITE_ACK;
		inc_unacked(device);
		/* corresponding dec_unacked() in e_end_block()
		 * respective _drbd_clear_done_ee */
	}

	if (dp_flags & DP_SEND_RECEIVE_ACK) {
		/* I really don't like it that the receiver thread
		 * sends on the msock, but anyways */
		drbd_send_ack(peer_device, P_RECV_ACK, peer_req);
	}

	if (tp) {
		/* two primaries implies protocol C */
		D_ASSERT(device, dp_flags & DP_SEND_WRITE_ACK);
		peer_req->flags |= EE_IN_INTERVAL_TREE;
		err = wait_for_and_update_peer_seq(peer_device, peer_seq);
		if (err)
			goto out_interrupted;
		spin_lock_irq(&device->resource->req_lock);
		err = handle_write_conflicts(device, peer_req);
		if (err) {
			spin_unlock_irq(&device->resource->req_lock);
			/* -ENOENT: handle_write_conflicts() queued the request
			 * on done_ee to be answered by the ack sender; nothing
			 * to submit, and not an error for the caller. */
			if (err == -ENOENT) {
				put_ldev(device);
				return 0;
			}
			goto out_interrupted;
		}
	} else {
		update_peer_seq(peer_device, peer_seq);
		spin_lock_irq(&device->resource->req_lock);
	}
	/* req_lock is held on both paths at this point. */
	/* TRIM and WRITE_SAME are processed synchronously,
	 * we wait for all pending requests, respectively wait for
	 * active_ee to become empty in drbd_submit_peer_request();
	 * better not add ourselves here. */
	if ((peer_req->flags & (EE_TRIM|EE_WRITE_SAME|EE_ZEROOUT)) == 0)
		list_add_tail(&peer_req->w.list, &device->active_ee);
	spin_unlock_irq(&device->resource->req_lock);

	/* While we are sync target, do not submit a write that overlaps a
	 * resync write still listed on sync_ee. */
	if (device->state.conn == C_SYNC_TARGET)
		wait_event(device->ee_wait, !overlapping_resync_write(device, peer_req));

	if (device->state.pdsk < D_INCONSISTENT) {
		/* In case we have the only disk of the cluster, */
		drbd_set_out_of_sync(device, peer_req->i.sector, peer_req->i.size);
		peer_req->flags &= ~EE_MAY_SET_IN_SYNC;
		drbd_al_begin_io(device, &peer_req->i);
		peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
	}

	err = drbd_submit_peer_request(device, peer_req, op, op_flags,
				       DRBD_FAULT_DT_WR);
	if (!err)
		return 0;

	/* Submit failed: undo the list/interval insertion and the activity
	 * log reference taken above, then fall into the common cleanup. */
	/* don't care for the reason here */
	drbd_err(device, "submit failed, triggering re-connect\n");
	spin_lock_irq(&device->resource->req_lock);
	list_del(&peer_req->w.list);
	drbd_remove_epoch_entry_interval(device, peer_req);
	spin_unlock_irq(&device->resource->req_lock);
	if (peer_req->flags & EE_CALL_AL_COMPLETE_IO) {
		peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
		drbd_al_complete_io(device, &peer_req->i);
	}

out_interrupted:
	/* Give back the epoch reference, the ldev reference and the request. */
	drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT | EV_CLEANUP);
	put_ldev(device);
	drbd_free_peer_req(device, peer_req);
	return err;
}
2769 * 2770 * To decide whether or not the lower device is busy, we use a scheme similar 2771 * to MD RAID is_mddev_idle(): if the partition stats reveal "significant" 2772 * (more than 64 sectors) of activity we cannot account for with our own resync 2773 * activity, it obviously is "busy". 2774 * 2775 * The current sync rate used here uses only the most recent two step marks, 2776 * to have a short time average so we can react faster. 2777 */ 2778 bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector, 2779 bool throttle_if_app_is_waiting) 2780 { 2781 struct lc_element *tmp; 2782 bool throttle = drbd_rs_c_min_rate_throttle(device); 2783 2784 if (!throttle || throttle_if_app_is_waiting) 2785 return throttle; 2786 2787 spin_lock_irq(&device->al_lock); 2788 tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector)); 2789 if (tmp) { 2790 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); 2791 if (test_bit(BME_PRIORITY, &bm_ext->flags)) 2792 throttle = false; 2793 /* Do not slow down if app IO is already waiting for this extent, 2794 * and our progress is necessary for application IO to complete. */ 2795 } 2796 spin_unlock_irq(&device->al_lock); 2797 2798 return throttle; 2799 } 2800 2801 bool drbd_rs_c_min_rate_throttle(struct drbd_device *device) 2802 { 2803 struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk; 2804 unsigned long db, dt, dbdt; 2805 unsigned int c_min_rate; 2806 int curr_events; 2807 2808 rcu_read_lock(); 2809 c_min_rate = rcu_dereference(device->ldev->disk_conf)->c_min_rate; 2810 rcu_read_unlock(); 2811 2812 /* feature disabled? 
*/ 2813 if (c_min_rate == 0) 2814 return false; 2815 2816 curr_events = (int)part_stat_read_accum(&disk->part0, sectors) - 2817 atomic_read(&device->rs_sect_ev); 2818 2819 if (atomic_read(&device->ap_actlog_cnt) 2820 || curr_events - device->rs_last_events > 64) { 2821 unsigned long rs_left; 2822 int i; 2823 2824 device->rs_last_events = curr_events; 2825 2826 /* sync speed average over the last 2*DRBD_SYNC_MARK_STEP, 2827 * approx. */ 2828 i = (device->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS; 2829 2830 if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T) 2831 rs_left = device->ov_left; 2832 else 2833 rs_left = drbd_bm_total_weight(device) - device->rs_failed; 2834 2835 dt = ((long)jiffies - (long)device->rs_mark_time[i]) / HZ; 2836 if (!dt) 2837 dt++; 2838 db = device->rs_mark_left[i] - rs_left; 2839 dbdt = Bit2KB(db/dt); 2840 2841 if (dbdt > c_min_rate) 2842 return true; 2843 } 2844 return false; 2845 } 2846 2847 static int receive_DataRequest(struct drbd_connection *connection, struct packet_info *pi) 2848 { 2849 struct drbd_peer_device *peer_device; 2850 struct drbd_device *device; 2851 sector_t sector; 2852 sector_t capacity; 2853 struct drbd_peer_request *peer_req; 2854 struct digest_info *di = NULL; 2855 int size, verb; 2856 unsigned int fault_type; 2857 struct p_block_req *p = pi->data; 2858 2859 peer_device = conn_peer_device(connection, pi->vnr); 2860 if (!peer_device) 2861 return -EIO; 2862 device = peer_device->device; 2863 capacity = drbd_get_capacity(device->this_bdev); 2864 2865 sector = be64_to_cpu(p->sector); 2866 size = be32_to_cpu(p->blksize); 2867 2868 if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { 2869 drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, 2870 (unsigned long long)sector, size); 2871 return -EINVAL; 2872 } 2873 if (sector + (size>>9) > capacity) { 2874 drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, 2875 (unsigned long 
long)sector, size); 2876 return -EINVAL; 2877 } 2878 2879 if (!get_ldev_if_state(device, D_UP_TO_DATE)) { 2880 verb = 1; 2881 switch (pi->cmd) { 2882 case P_DATA_REQUEST: 2883 drbd_send_ack_rp(peer_device, P_NEG_DREPLY, p); 2884 break; 2885 case P_RS_THIN_REQ: 2886 case P_RS_DATA_REQUEST: 2887 case P_CSUM_RS_REQUEST: 2888 case P_OV_REQUEST: 2889 drbd_send_ack_rp(peer_device, P_NEG_RS_DREPLY , p); 2890 break; 2891 case P_OV_REPLY: 2892 verb = 0; 2893 dec_rs_pending(device); 2894 drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size, ID_IN_SYNC); 2895 break; 2896 default: 2897 BUG(); 2898 } 2899 if (verb && __ratelimit(&drbd_ratelimit_state)) 2900 drbd_err(device, "Can not satisfy peer's read request, " 2901 "no local data.\n"); 2902 2903 /* drain possibly payload */ 2904 return drbd_drain_block(peer_device, pi->size); 2905 } 2906 2907 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD 2908 * "criss-cross" setup, that might cause write-out on some other DRBD, 2909 * which in turn might block on the other node at this very place. */ 2910 peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size, 2911 size, GFP_NOIO); 2912 if (!peer_req) { 2913 put_ldev(device); 2914 return -ENOMEM; 2915 } 2916 2917 switch (pi->cmd) { 2918 case P_DATA_REQUEST: 2919 peer_req->w.cb = w_e_end_data_req; 2920 fault_type = DRBD_FAULT_DT_RD; 2921 /* application IO, don't drbd_rs_begin_io */ 2922 peer_req->flags |= EE_APPLICATION; 2923 goto submit; 2924 2925 case P_RS_THIN_REQ: 2926 /* If at some point in the future we have a smart way to 2927 find out if this data block is completely deallocated, 2928 then we would do something smarter here than reading 2929 the block... 
*/ 2930 peer_req->flags |= EE_RS_THIN_REQ; 2931 /* fall through */ 2932 case P_RS_DATA_REQUEST: 2933 peer_req->w.cb = w_e_end_rsdata_req; 2934 fault_type = DRBD_FAULT_RS_RD; 2935 /* used in the sector offset progress display */ 2936 device->bm_resync_fo = BM_SECT_TO_BIT(sector); 2937 break; 2938 2939 case P_OV_REPLY: 2940 case P_CSUM_RS_REQUEST: 2941 fault_type = DRBD_FAULT_RS_RD; 2942 di = kmalloc(sizeof(*di) + pi->size, GFP_NOIO); 2943 if (!di) 2944 goto out_free_e; 2945 2946 di->digest_size = pi->size; 2947 di->digest = (((char *)di)+sizeof(struct digest_info)); 2948 2949 peer_req->digest = di; 2950 peer_req->flags |= EE_HAS_DIGEST; 2951 2952 if (drbd_recv_all(peer_device->connection, di->digest, pi->size)) 2953 goto out_free_e; 2954 2955 if (pi->cmd == P_CSUM_RS_REQUEST) { 2956 D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89); 2957 peer_req->w.cb = w_e_end_csum_rs_req; 2958 /* used in the sector offset progress display */ 2959 device->bm_resync_fo = BM_SECT_TO_BIT(sector); 2960 /* remember to report stats in drbd_resync_finished */ 2961 device->use_csums = true; 2962 } else if (pi->cmd == P_OV_REPLY) { 2963 /* track progress, we may need to throttle */ 2964 atomic_add(size >> 9, &device->rs_sect_in); 2965 peer_req->w.cb = w_e_end_ov_reply; 2966 dec_rs_pending(device); 2967 /* drbd_rs_begin_io done when we sent this request, 2968 * but accounting still needs to be done. 
*/ 2969 goto submit_for_resync; 2970 } 2971 break; 2972 2973 case P_OV_REQUEST: 2974 if (device->ov_start_sector == ~(sector_t)0 && 2975 peer_device->connection->agreed_pro_version >= 90) { 2976 unsigned long now = jiffies; 2977 int i; 2978 device->ov_start_sector = sector; 2979 device->ov_position = sector; 2980 device->ov_left = drbd_bm_bits(device) - BM_SECT_TO_BIT(sector); 2981 device->rs_total = device->ov_left; 2982 for (i = 0; i < DRBD_SYNC_MARKS; i++) { 2983 device->rs_mark_left[i] = device->ov_left; 2984 device->rs_mark_time[i] = now; 2985 } 2986 drbd_info(device, "Online Verify start sector: %llu\n", 2987 (unsigned long long)sector); 2988 } 2989 peer_req->w.cb = w_e_end_ov_req; 2990 fault_type = DRBD_FAULT_RS_RD; 2991 break; 2992 2993 default: 2994 BUG(); 2995 } 2996 2997 /* Throttle, drbd_rs_begin_io and submit should become asynchronous 2998 * wrt the receiver, but it is not as straightforward as it may seem. 2999 * Various places in the resync start and stop logic assume resync 3000 * requests are processed in order, requeuing this on the worker thread 3001 * introduces a bunch of new code for synchronization between threads. 3002 * 3003 * Unlimited throttling before drbd_rs_begin_io may stall the resync 3004 * "forever", throttling after drbd_rs_begin_io will lock that extent 3005 * for application writes for the same time. For now, just throttle 3006 * here, where the rest of the code expects the receiver to sleep for 3007 * a while, anyways. 3008 */ 3009 3010 /* Throttle before drbd_rs_begin_io, as that locks out application IO; 3011 * this defers syncer requests for some time, before letting at least 3012 * on request through. The resync controller on the receiving side 3013 * will adapt to the incoming rate accordingly. 3014 * 3015 * We cannot throttle here if remote is Primary/SyncTarget: 3016 * we would also throttle its application reads. 3017 * In that case, throttling is done on the SyncTarget only. 
/**
 * drbd_asb_recover_0p  -  Recover after split-brain with no remaining primaries
 * @peer_device: peer device whose connection holds the after_sb_0p policy.
 *
 * Applies the configured after_sb_0p policy.
 *
 * Returns -1 or 1 when a decision was made, using the same sign convention
 * as the explicit policies below (ASB_DISCARD_LOCAL yields -1,
 * ASB_DISCARD_REMOTE yields 1), or -100 when no decision could be made
 * (ASB_DISCONNECT, or a policy that is not valid for this situation).
 */
static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold(local)
{
	struct drbd_device *device = peer_device->device;
	int self, peer, rv = -100;
	unsigned long ch_self, ch_peer;
	enum drbd_after_sb_p after_sb_0p;

	/* Lowest bit of the local and of the peer's bitmap UUID.
	 * NOTE(review): presumably flags which node was primary - confirm. */
	self = device->ldev->md.uuid[UI_BITMAP] & 1;
	peer = device->p_uuid[UI_BITMAP] & 1;

	/* NOTE(review): presumably the amount of changes on the peer and on
	 * this node; only compared against each other and against 0 - confirm. */
	ch_peer = device->p_uuid[UI_SIZE];
	ch_self = device->comm_bm_set;

	rcu_read_lock();
	after_sb_0p = rcu_dereference(peer_device->connection->net_conf)->after_sb_0p;
	rcu_read_unlock();
	switch (after_sb_0p) {
	/* these policies are rejected here as a configuration error */
	case ASB_CONSENSUS:
	case ASB_DISCARD_SECONDARY:
	case ASB_CALL_HELPER:
	case ASB_VIOLENTLY:
		drbd_err(device, "Configuration error.\n");
		break;
	case ASB_DISCONNECT:
		/* no automatic decision; rv stays -100 */
		break;
	case ASB_DISCARD_YOUNGER_PRI:
		if (self == 0 && peer == 1) {
			rv = -1;
			break;
		}
		if (self == 1 && peer == 0) {
			rv =  1;
			break;
		}
		/* Else fall through - to one of the other strategies... */
		/* (only reached with self == peer, so the checks in the next
		 * case cannot match either) */
	case ASB_DISCARD_OLDER_PRI:
		/* mirror image of the decision above */
		if (self == 0 && peer == 1) {
			rv = 1;
			break;
		}
		if (self == 1 && peer == 0) {
			rv = -1;
			break;
		}
		/* Else fall through to one of the other strategies... */
		drbd_warn(device, "Discard younger/older primary did not find a decision\n"
			  "Using discard-least-changes instead\n");
		/* fall through */
	case ASB_DISCARD_ZERO_CHG:
		if (ch_peer == 0 && ch_self == 0) {
			/* neither side changed anything: tie-break on the
			 * RESOLVE_CONFLICTS connection flag */
			rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)
				? -1 : 1;
			break;
		} else {
			if (ch_peer == 0) { rv =  1; break; }
			if (ch_self == 0) { rv = -1; break; }
		}
		/* Both sides have changes.  If zero-changes is the configured
		 * policy, give up (rv stays -100); if we got here by falling
		 * through, continue with least-changes. */
		if (after_sb_0p == ASB_DISCARD_ZERO_CHG)
			break;
		/* else, fall through */
	case ASB_DISCARD_LEAST_CHG:
		if	(ch_self < ch_peer)
			rv = -1;
		else if (ch_self > ch_peer)
			rv =  1;
		else /* ( ch_self == ch_peer ) */
		     /* Well, then use something else. */
			rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)
				? -1 : 1;
		break;
	case ASB_DISCARD_LOCAL:
		rv = -1;
		break;
	case ASB_DISCARD_REMOTE:
		rv =  1;
	}

	return rv;
}
*/ 3186 rv2 = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY)); 3187 if (rv2 != SS_SUCCESS) { 3188 drbd_khelper(device, "pri-lost-after-sb"); 3189 } else { 3190 drbd_warn(device, "Successfully gave up primary role.\n"); 3191 rv = hg; 3192 } 3193 } else 3194 rv = hg; 3195 } 3196 3197 return rv; 3198 } 3199 3200 /** 3201 * drbd_asb_recover_2p - Recover after split-brain with two remaining primaries 3202 */ 3203 static int drbd_asb_recover_2p(struct drbd_peer_device *peer_device) __must_hold(local) 3204 { 3205 struct drbd_device *device = peer_device->device; 3206 int hg, rv = -100; 3207 enum drbd_after_sb_p after_sb_2p; 3208 3209 rcu_read_lock(); 3210 after_sb_2p = rcu_dereference(peer_device->connection->net_conf)->after_sb_2p; 3211 rcu_read_unlock(); 3212 switch (after_sb_2p) { 3213 case ASB_DISCARD_YOUNGER_PRI: 3214 case ASB_DISCARD_OLDER_PRI: 3215 case ASB_DISCARD_LEAST_CHG: 3216 case ASB_DISCARD_LOCAL: 3217 case ASB_DISCARD_REMOTE: 3218 case ASB_CONSENSUS: 3219 case ASB_DISCARD_SECONDARY: 3220 case ASB_DISCARD_ZERO_CHG: 3221 drbd_err(device, "Configuration error.\n"); 3222 break; 3223 case ASB_VIOLENTLY: 3224 rv = drbd_asb_recover_0p(peer_device); 3225 break; 3226 case ASB_DISCONNECT: 3227 break; 3228 case ASB_CALL_HELPER: 3229 hg = drbd_asb_recover_0p(peer_device); 3230 if (hg == -1) { 3231 enum drbd_state_rv rv2; 3232 3233 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE, 3234 * we might be here in C_WF_REPORT_PARAMS which is transient. 3235 * we do not need to wait for the after state change work either. 
*/ 3236 rv2 = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY)); 3237 if (rv2 != SS_SUCCESS) { 3238 drbd_khelper(device, "pri-lost-after-sb"); 3239 } else { 3240 drbd_warn(device, "Successfully gave up primary role.\n"); 3241 rv = hg; 3242 } 3243 } else 3244 rv = hg; 3245 } 3246 3247 return rv; 3248 } 3249 3250 static void drbd_uuid_dump(struct drbd_device *device, char *text, u64 *uuid, 3251 u64 bits, u64 flags) 3252 { 3253 if (!uuid) { 3254 drbd_info(device, "%s uuid info vanished while I was looking!\n", text); 3255 return; 3256 } 3257 drbd_info(device, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n", 3258 text, 3259 (unsigned long long)uuid[UI_CURRENT], 3260 (unsigned long long)uuid[UI_BITMAP], 3261 (unsigned long long)uuid[UI_HISTORY_START], 3262 (unsigned long long)uuid[UI_HISTORY_END], 3263 (unsigned long long)bits, 3264 (unsigned long long)flags); 3265 } 3266 3267 /* 3268 100 after split brain try auto recover 3269 2 C_SYNC_SOURCE set BitMap 3270 1 C_SYNC_SOURCE use BitMap 3271 0 no Sync 3272 -1 C_SYNC_TARGET use BitMap 3273 -2 C_SYNC_TARGET set BitMap 3274 -100 after split brain, disconnect 3275 -1000 unrelated data 3276 -1091 requires proto 91 3277 -1096 requires proto 96 3278 */ 3279 3280 static int drbd_uuid_compare(struct drbd_device *const device, enum drbd_role const peer_role, int *rule_nr) __must_hold(local) 3281 { 3282 struct drbd_peer_device *const peer_device = first_peer_device(device); 3283 struct drbd_connection *const connection = peer_device ? 
peer_device->connection : NULL; 3284 u64 self, peer; 3285 int i, j; 3286 3287 self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1); 3288 peer = device->p_uuid[UI_CURRENT] & ~((u64)1); 3289 3290 *rule_nr = 10; 3291 if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED) 3292 return 0; 3293 3294 *rule_nr = 20; 3295 if ((self == UUID_JUST_CREATED || self == (u64)0) && 3296 peer != UUID_JUST_CREATED) 3297 return -2; 3298 3299 *rule_nr = 30; 3300 if (self != UUID_JUST_CREATED && 3301 (peer == UUID_JUST_CREATED || peer == (u64)0)) 3302 return 2; 3303 3304 if (self == peer) { 3305 int rct, dc; /* roles at crash time */ 3306 3307 if (device->p_uuid[UI_BITMAP] == (u64)0 && device->ldev->md.uuid[UI_BITMAP] != (u64)0) { 3308 3309 if (connection->agreed_pro_version < 91) 3310 return -1091; 3311 3312 if ((device->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) && 3313 (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) { 3314 drbd_info(device, "was SyncSource, missed the resync finished event, corrected myself:\n"); 3315 drbd_uuid_move_history(device); 3316 device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[UI_BITMAP]; 3317 device->ldev->md.uuid[UI_BITMAP] = 0; 3318 3319 drbd_uuid_dump(device, "self", device->ldev->md.uuid, 3320 device->state.disk >= D_NEGOTIATING ? 
drbd_bm_total_weight(device) : 0, 0); 3321 *rule_nr = 34; 3322 } else { 3323 drbd_info(device, "was SyncSource (peer failed to write sync_uuid)\n"); 3324 *rule_nr = 36; 3325 } 3326 3327 return 1; 3328 } 3329 3330 if (device->ldev->md.uuid[UI_BITMAP] == (u64)0 && device->p_uuid[UI_BITMAP] != (u64)0) { 3331 3332 if (connection->agreed_pro_version < 91) 3333 return -1091; 3334 3335 if ((device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_BITMAP] & ~((u64)1)) && 3336 (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1))) { 3337 drbd_info(device, "was SyncTarget, peer missed the resync finished event, corrected peer:\n"); 3338 3339 device->p_uuid[UI_HISTORY_START + 1] = device->p_uuid[UI_HISTORY_START]; 3340 device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_BITMAP]; 3341 device->p_uuid[UI_BITMAP] = 0UL; 3342 3343 drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]); 3344 *rule_nr = 35; 3345 } else { 3346 drbd_info(device, "was SyncTarget (failed to write sync_uuid)\n"); 3347 *rule_nr = 37; 3348 } 3349 3350 return -1; 3351 } 3352 3353 /* Common power [off|failure] */ 3354 rct = (test_bit(CRASHED_PRIMARY, &device->flags) ? 1 : 0) + 3355 (device->p_uuid[UI_FLAGS] & 2); 3356 /* lowest bit is set when we were primary, 3357 * next bit (weight 2) is set when peer was primary */ 3358 *rule_nr = 40; 3359 3360 /* Neither has the "crashed primary" flag set, 3361 * only a replication link hickup. */ 3362 if (rct == 0) 3363 return 0; 3364 3365 /* Current UUID equal and no bitmap uuid; does not necessarily 3366 * mean this was a "simultaneous hard crash", maybe IO was 3367 * frozen, so no UUID-bump happened. 3368 * This is a protocol change, overload DRBD_FF_WSAME as flag 3369 * for "new-enough" peer DRBD version. 
*/ 3370 if (device->state.role == R_PRIMARY || peer_role == R_PRIMARY) { 3371 *rule_nr = 41; 3372 if (!(connection->agreed_features & DRBD_FF_WSAME)) { 3373 drbd_warn(peer_device, "Equivalent unrotated UUIDs, but current primary present.\n"); 3374 return -(0x10000 | PRO_VERSION_MAX | (DRBD_FF_WSAME << 8)); 3375 } 3376 if (device->state.role == R_PRIMARY && peer_role == R_PRIMARY) { 3377 /* At least one has the "crashed primary" bit set, 3378 * both are primary now, but neither has rotated its UUIDs? 3379 * "Can not happen." */ 3380 drbd_err(peer_device, "Equivalent unrotated UUIDs, but both are primary. Can not resolve this.\n"); 3381 return -100; 3382 } 3383 if (device->state.role == R_PRIMARY) 3384 return 1; 3385 return -1; 3386 } 3387 3388 /* Both are secondary. 3389 * Really looks like recovery from simultaneous hard crash. 3390 * Check which had been primary before, and arbitrate. */ 3391 switch (rct) { 3392 case 0: /* !self_pri && !peer_pri */ return 0; /* already handled */ 3393 case 1: /* self_pri && !peer_pri */ return 1; 3394 case 2: /* !self_pri && peer_pri */ return -1; 3395 case 3: /* self_pri && peer_pri */ 3396 dc = test_bit(RESOLVE_CONFLICTS, &connection->flags); 3397 return dc ? -1 : 1; 3398 } 3399 } 3400 3401 *rule_nr = 50; 3402 peer = device->p_uuid[UI_BITMAP] & ~((u64)1); 3403 if (self == peer) 3404 return -1; 3405 3406 *rule_nr = 51; 3407 peer = device->p_uuid[UI_HISTORY_START] & ~((u64)1); 3408 if (self == peer) { 3409 if (connection->agreed_pro_version < 96 ? 3410 (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == 3411 (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) : 3412 peer + UUID_NEW_BM_OFFSET == (device->p_uuid[UI_BITMAP] & ~((u64)1))) { 3413 /* The last P_SYNC_UUID did not get though. Undo the last start of 3414 resync as sync source modifications of the peer's UUIDs. 
*/ 3415 3416 if (connection->agreed_pro_version < 91) 3417 return -1091; 3418 3419 device->p_uuid[UI_BITMAP] = device->p_uuid[UI_HISTORY_START]; 3420 device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_HISTORY_START + 1]; 3421 3422 drbd_info(device, "Lost last syncUUID packet, corrected:\n"); 3423 drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]); 3424 3425 return -1; 3426 } 3427 } 3428 3429 *rule_nr = 60; 3430 self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1); 3431 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) { 3432 peer = device->p_uuid[i] & ~((u64)1); 3433 if (self == peer) 3434 return -2; 3435 } 3436 3437 *rule_nr = 70; 3438 self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1); 3439 peer = device->p_uuid[UI_CURRENT] & ~((u64)1); 3440 if (self == peer) 3441 return 1; 3442 3443 *rule_nr = 71; 3444 self = device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1); 3445 if (self == peer) { 3446 if (connection->agreed_pro_version < 96 ? 3447 (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == 3448 (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) : 3449 self + UUID_NEW_BM_OFFSET == (device->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) { 3450 /* The last P_SYNC_UUID did not get though. Undo the last start of 3451 resync as sync source modifications of our UUIDs. */ 3452 3453 if (connection->agreed_pro_version < 91) 3454 return -1091; 3455 3456 __drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_HISTORY_START]); 3457 __drbd_uuid_set(device, UI_HISTORY_START, device->ldev->md.uuid[UI_HISTORY_START + 1]); 3458 3459 drbd_info(device, "Last syncUUID did not get through, corrected:\n"); 3460 drbd_uuid_dump(device, "self", device->ldev->md.uuid, 3461 device->state.disk >= D_NEGOTIATING ? 
drbd_bm_total_weight(device) : 0, 0); 3462 3463 return 1; 3464 } 3465 } 3466 3467 3468 *rule_nr = 80; 3469 peer = device->p_uuid[UI_CURRENT] & ~((u64)1); 3470 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) { 3471 self = device->ldev->md.uuid[i] & ~((u64)1); 3472 if (self == peer) 3473 return 2; 3474 } 3475 3476 *rule_nr = 90; 3477 self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1); 3478 peer = device->p_uuid[UI_BITMAP] & ~((u64)1); 3479 if (self == peer && self != ((u64)0)) 3480 return 100; 3481 3482 *rule_nr = 100; 3483 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) { 3484 self = device->ldev->md.uuid[i] & ~((u64)1); 3485 for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) { 3486 peer = device->p_uuid[j] & ~((u64)1); 3487 if (self == peer) 3488 return -100; 3489 } 3490 } 3491 3492 return -1000; 3493 } 3494 3495 /* drbd_sync_handshake() returns the new conn state on success, or 3496 CONN_MASK (-1) on failure. 3497 */ 3498 static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device, 3499 enum drbd_role peer_role, 3500 enum drbd_disk_state peer_disk) __must_hold(local) 3501 { 3502 struct drbd_device *device = peer_device->device; 3503 enum drbd_conns rv = C_MASK; 3504 enum drbd_disk_state mydisk; 3505 struct net_conf *nc; 3506 int hg, rule_nr, rr_conflict, tentative, always_asbp; 3507 3508 mydisk = device->state.disk; 3509 if (mydisk == D_NEGOTIATING) 3510 mydisk = device->new_state_tmp.disk; 3511 3512 drbd_info(device, "drbd_sync_handshake:\n"); 3513 3514 spin_lock_irq(&device->ldev->md.uuid_lock); 3515 drbd_uuid_dump(device, "self", device->ldev->md.uuid, device->comm_bm_set, 0); 3516 drbd_uuid_dump(device, "peer", device->p_uuid, 3517 device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]); 3518 3519 hg = drbd_uuid_compare(device, peer_role, &rule_nr); 3520 spin_unlock_irq(&device->ldev->md.uuid_lock); 3521 3522 drbd_info(device, "uuid_compare()=%d by rule %d\n", hg, rule_nr); 3523 3524 if (hg == -1000) { 3525 drbd_alert(device, 
"Unrelated data, aborting!\n"); 3526 return C_MASK; 3527 } 3528 if (hg < -0x10000) { 3529 int proto, fflags; 3530 hg = -hg; 3531 proto = hg & 0xff; 3532 fflags = (hg >> 8) & 0xff; 3533 drbd_alert(device, "To resolve this both sides have to support at least protocol %d and feature flags 0x%x\n", 3534 proto, fflags); 3535 return C_MASK; 3536 } 3537 if (hg < -1000) { 3538 drbd_alert(device, "To resolve this both sides have to support at least protocol %d\n", -hg - 1000); 3539 return C_MASK; 3540 } 3541 3542 if ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) || 3543 (peer_disk == D_INCONSISTENT && mydisk > D_INCONSISTENT)) { 3544 int f = (hg == -100) || abs(hg) == 2; 3545 hg = mydisk > D_INCONSISTENT ? 1 : -1; 3546 if (f) 3547 hg = hg*2; 3548 drbd_info(device, "Becoming sync %s due to disk states.\n", 3549 hg > 0 ? "source" : "target"); 3550 } 3551 3552 if (abs(hg) == 100) 3553 drbd_khelper(device, "initial-split-brain"); 3554 3555 rcu_read_lock(); 3556 nc = rcu_dereference(peer_device->connection->net_conf); 3557 always_asbp = nc->always_asbp; 3558 rr_conflict = nc->rr_conflict; 3559 tentative = nc->tentative; 3560 rcu_read_unlock(); 3561 3562 if (hg == 100 || (hg == -100 && always_asbp)) { 3563 int pcount = (device->state.role == R_PRIMARY) 3564 + (peer_role == R_PRIMARY); 3565 int forced = (hg == -100); 3566 3567 switch (pcount) { 3568 case 0: 3569 hg = drbd_asb_recover_0p(peer_device); 3570 break; 3571 case 1: 3572 hg = drbd_asb_recover_1p(peer_device); 3573 break; 3574 case 2: 3575 hg = drbd_asb_recover_2p(peer_device); 3576 break; 3577 } 3578 if (abs(hg) < 100) { 3579 drbd_warn(device, "Split-Brain detected, %d primaries, " 3580 "automatically solved. Sync from %s node\n", 3581 pcount, (hg < 0) ? 
"peer" : "this"); 3582 if (forced) { 3583 drbd_warn(device, "Doing a full sync, since" 3584 " UUIDs where ambiguous.\n"); 3585 hg = hg*2; 3586 } 3587 } 3588 } 3589 3590 if (hg == -100) { 3591 if (test_bit(DISCARD_MY_DATA, &device->flags) && !(device->p_uuid[UI_FLAGS]&1)) 3592 hg = -1; 3593 if (!test_bit(DISCARD_MY_DATA, &device->flags) && (device->p_uuid[UI_FLAGS]&1)) 3594 hg = 1; 3595 3596 if (abs(hg) < 100) 3597 drbd_warn(device, "Split-Brain detected, manually solved. " 3598 "Sync from %s node\n", 3599 (hg < 0) ? "peer" : "this"); 3600 } 3601 3602 if (hg == -100) { 3603 /* FIXME this log message is not correct if we end up here 3604 * after an attempted attach on a diskless node. 3605 * We just refuse to attach -- well, we drop the "connection" 3606 * to that disk, in a way... */ 3607 drbd_alert(device, "Split-Brain detected but unresolved, dropping connection!\n"); 3608 drbd_khelper(device, "split-brain"); 3609 return C_MASK; 3610 } 3611 3612 if (hg > 0 && mydisk <= D_INCONSISTENT) { 3613 drbd_err(device, "I shall become SyncSource, but I am inconsistent!\n"); 3614 return C_MASK; 3615 } 3616 3617 if (hg < 0 && /* by intention we do not use mydisk here. */ 3618 device->state.role == R_PRIMARY && device->state.disk >= D_CONSISTENT) { 3619 switch (rr_conflict) { 3620 case ASB_CALL_HELPER: 3621 drbd_khelper(device, "pri-lost"); 3622 /* fall through */ 3623 case ASB_DISCONNECT: 3624 drbd_err(device, "I shall become SyncTarget, but I am primary!\n"); 3625 return C_MASK; 3626 case ASB_VIOLENTLY: 3627 drbd_warn(device, "Becoming SyncTarget, violating the stable-data" 3628 "assumption\n"); 3629 } 3630 } 3631 3632 if (tentative || test_bit(CONN_DRY_RUN, &peer_device->connection->flags)) { 3633 if (hg == 0) 3634 drbd_info(device, "dry-run connect: No resync, would become Connected immediately.\n"); 3635 else 3636 drbd_info(device, "dry-run connect: Would become %s, doing a %s resync.", 3637 drbd_conn_str(hg > 0 ? C_SYNC_SOURCE : C_SYNC_TARGET), 3638 abs(hg) >= 2 ? 
"full" : "bit-map based"); 3639 return C_MASK; 3640 } 3641 3642 if (abs(hg) >= 2) { 3643 drbd_info(device, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n"); 3644 if (drbd_bitmap_io(device, &drbd_bmio_set_n_write, "set_n_write from sync_handshake", 3645 BM_LOCKED_SET_ALLOWED)) 3646 return C_MASK; 3647 } 3648 3649 if (hg > 0) { /* become sync source. */ 3650 rv = C_WF_BITMAP_S; 3651 } else if (hg < 0) { /* become sync target */ 3652 rv = C_WF_BITMAP_T; 3653 } else { 3654 rv = C_CONNECTED; 3655 if (drbd_bm_total_weight(device)) { 3656 drbd_info(device, "No resync, but %lu bits in bitmap!\n", 3657 drbd_bm_total_weight(device)); 3658 } 3659 } 3660 3661 return rv; 3662 } 3663 3664 static enum drbd_after_sb_p convert_after_sb(enum drbd_after_sb_p peer) 3665 { 3666 /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */ 3667 if (peer == ASB_DISCARD_REMOTE) 3668 return ASB_DISCARD_LOCAL; 3669 3670 /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */ 3671 if (peer == ASB_DISCARD_LOCAL) 3672 return ASB_DISCARD_REMOTE; 3673 3674 /* everything else is valid if they are equal on both sides. 
*/ 3675 return peer; 3676 } 3677 3678 static int receive_protocol(struct drbd_connection *connection, struct packet_info *pi) 3679 { 3680 struct p_protocol *p = pi->data; 3681 enum drbd_after_sb_p p_after_sb_0p, p_after_sb_1p, p_after_sb_2p; 3682 int p_proto, p_discard_my_data, p_two_primaries, cf; 3683 struct net_conf *nc, *old_net_conf, *new_net_conf = NULL; 3684 char integrity_alg[SHARED_SECRET_MAX] = ""; 3685 struct crypto_shash *peer_integrity_tfm = NULL; 3686 void *int_dig_in = NULL, *int_dig_vv = NULL; 3687 3688 p_proto = be32_to_cpu(p->protocol); 3689 p_after_sb_0p = be32_to_cpu(p->after_sb_0p); 3690 p_after_sb_1p = be32_to_cpu(p->after_sb_1p); 3691 p_after_sb_2p = be32_to_cpu(p->after_sb_2p); 3692 p_two_primaries = be32_to_cpu(p->two_primaries); 3693 cf = be32_to_cpu(p->conn_flags); 3694 p_discard_my_data = cf & CF_DISCARD_MY_DATA; 3695 3696 if (connection->agreed_pro_version >= 87) { 3697 int err; 3698 3699 if (pi->size > sizeof(integrity_alg)) 3700 return -EIO; 3701 err = drbd_recv_all(connection, integrity_alg, pi->size); 3702 if (err) 3703 return err; 3704 integrity_alg[SHARED_SECRET_MAX - 1] = 0; 3705 } 3706 3707 if (pi->cmd != P_PROTOCOL_UPDATE) { 3708 clear_bit(CONN_DRY_RUN, &connection->flags); 3709 3710 if (cf & CF_DRY_RUN) 3711 set_bit(CONN_DRY_RUN, &connection->flags); 3712 3713 rcu_read_lock(); 3714 nc = rcu_dereference(connection->net_conf); 3715 3716 if (p_proto != nc->wire_protocol) { 3717 drbd_err(connection, "incompatible %s settings\n", "protocol"); 3718 goto disconnect_rcu_unlock; 3719 } 3720 3721 if (convert_after_sb(p_after_sb_0p) != nc->after_sb_0p) { 3722 drbd_err(connection, "incompatible %s settings\n", "after-sb-0pri"); 3723 goto disconnect_rcu_unlock; 3724 } 3725 3726 if (convert_after_sb(p_after_sb_1p) != nc->after_sb_1p) { 3727 drbd_err(connection, "incompatible %s settings\n", "after-sb-1pri"); 3728 goto disconnect_rcu_unlock; 3729 } 3730 3731 if (convert_after_sb(p_after_sb_2p) != nc->after_sb_2p) { 3732 drbd_err(connection, 
"incompatible %s settings\n", "after-sb-2pri"); 3733 goto disconnect_rcu_unlock; 3734 } 3735 3736 if (p_discard_my_data && nc->discard_my_data) { 3737 drbd_err(connection, "incompatible %s settings\n", "discard-my-data"); 3738 goto disconnect_rcu_unlock; 3739 } 3740 3741 if (p_two_primaries != nc->two_primaries) { 3742 drbd_err(connection, "incompatible %s settings\n", "allow-two-primaries"); 3743 goto disconnect_rcu_unlock; 3744 } 3745 3746 if (strcmp(integrity_alg, nc->integrity_alg)) { 3747 drbd_err(connection, "incompatible %s settings\n", "data-integrity-alg"); 3748 goto disconnect_rcu_unlock; 3749 } 3750 3751 rcu_read_unlock(); 3752 } 3753 3754 if (integrity_alg[0]) { 3755 int hash_size; 3756 3757 /* 3758 * We can only change the peer data integrity algorithm 3759 * here. Changing our own data integrity algorithm 3760 * requires that we send a P_PROTOCOL_UPDATE packet at 3761 * the same time; otherwise, the peer has no way to 3762 * tell between which packets the algorithm should 3763 * change. 
3764 */ 3765 3766 peer_integrity_tfm = crypto_alloc_shash(integrity_alg, 0, 0); 3767 if (IS_ERR(peer_integrity_tfm)) { 3768 peer_integrity_tfm = NULL; 3769 drbd_err(connection, "peer data-integrity-alg %s not supported\n", 3770 integrity_alg); 3771 goto disconnect; 3772 } 3773 3774 hash_size = crypto_shash_digestsize(peer_integrity_tfm); 3775 int_dig_in = kmalloc(hash_size, GFP_KERNEL); 3776 int_dig_vv = kmalloc(hash_size, GFP_KERNEL); 3777 if (!(int_dig_in && int_dig_vv)) { 3778 drbd_err(connection, "Allocation of buffers for data integrity checking failed\n"); 3779 goto disconnect; 3780 } 3781 } 3782 3783 new_net_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL); 3784 if (!new_net_conf) { 3785 drbd_err(connection, "Allocation of new net_conf failed\n"); 3786 goto disconnect; 3787 } 3788 3789 mutex_lock(&connection->data.mutex); 3790 mutex_lock(&connection->resource->conf_update); 3791 old_net_conf = connection->net_conf; 3792 *new_net_conf = *old_net_conf; 3793 3794 new_net_conf->wire_protocol = p_proto; 3795 new_net_conf->after_sb_0p = convert_after_sb(p_after_sb_0p); 3796 new_net_conf->after_sb_1p = convert_after_sb(p_after_sb_1p); 3797 new_net_conf->after_sb_2p = convert_after_sb(p_after_sb_2p); 3798 new_net_conf->two_primaries = p_two_primaries; 3799 3800 rcu_assign_pointer(connection->net_conf, new_net_conf); 3801 mutex_unlock(&connection->resource->conf_update); 3802 mutex_unlock(&connection->data.mutex); 3803 3804 crypto_free_shash(connection->peer_integrity_tfm); 3805 kfree(connection->int_dig_in); 3806 kfree(connection->int_dig_vv); 3807 connection->peer_integrity_tfm = peer_integrity_tfm; 3808 connection->int_dig_in = int_dig_in; 3809 connection->int_dig_vv = int_dig_vv; 3810 3811 if (strcmp(old_net_conf->integrity_alg, integrity_alg)) 3812 drbd_info(connection, "peer data-integrity-alg: %s\n", 3813 integrity_alg[0] ? 
integrity_alg : "(none)"); 3814 3815 synchronize_rcu(); 3816 kfree(old_net_conf); 3817 return 0; 3818 3819 disconnect_rcu_unlock: 3820 rcu_read_unlock(); 3821 disconnect: 3822 crypto_free_shash(peer_integrity_tfm); 3823 kfree(int_dig_in); 3824 kfree(int_dig_vv); 3825 conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); 3826 return -EIO; 3827 } 3828 3829 /* helper function 3830 * input: alg name, feature name 3831 * return: NULL (alg name was "") 3832 * ERR_PTR(error) if something goes wrong 3833 * or the crypto hash ptr, if it worked out ok. */ 3834 static struct crypto_shash *drbd_crypto_alloc_digest_safe( 3835 const struct drbd_device *device, 3836 const char *alg, const char *name) 3837 { 3838 struct crypto_shash *tfm; 3839 3840 if (!alg[0]) 3841 return NULL; 3842 3843 tfm = crypto_alloc_shash(alg, 0, 0); 3844 if (IS_ERR(tfm)) { 3845 drbd_err(device, "Can not allocate \"%s\" as %s (reason: %ld)\n", 3846 alg, name, PTR_ERR(tfm)); 3847 return tfm; 3848 } 3849 return tfm; 3850 } 3851 3852 static int ignore_remaining_packet(struct drbd_connection *connection, struct packet_info *pi) 3853 { 3854 void *buffer = connection->data.rbuf; 3855 int size = pi->size; 3856 3857 while (size) { 3858 int s = min_t(int, size, DRBD_SOCKET_BUFFER_SIZE); 3859 s = drbd_recv(connection, buffer, s); 3860 if (s <= 0) { 3861 if (s < 0) 3862 return s; 3863 break; 3864 } 3865 size -= s; 3866 } 3867 if (size) 3868 return -EIO; 3869 return 0; 3870 } 3871 3872 /* 3873 * config_unknown_volume - device configuration command for unknown volume 3874 * 3875 * When a device is added to an existing connection, the node on which the 3876 * device is added first will send configuration commands to its peer but the 3877 * peer will not know about the device yet. It will warn and ignore these 3878 * commands. Once the device is added on the second node, the second node will 3879 * send the same device configuration commands, but in the other direction. 
3880 * 3881 * (We can also end up here if drbd is misconfigured.) 3882 */ 3883 static int config_unknown_volume(struct drbd_connection *connection, struct packet_info *pi) 3884 { 3885 drbd_warn(connection, "%s packet received for volume %u, which is not configured locally\n", 3886 cmdname(pi->cmd), pi->vnr); 3887 return ignore_remaining_packet(connection, pi); 3888 } 3889 3890 static int receive_SyncParam(struct drbd_connection *connection, struct packet_info *pi) 3891 { 3892 struct drbd_peer_device *peer_device; 3893 struct drbd_device *device; 3894 struct p_rs_param_95 *p; 3895 unsigned int header_size, data_size, exp_max_sz; 3896 struct crypto_shash *verify_tfm = NULL; 3897 struct crypto_shash *csums_tfm = NULL; 3898 struct net_conf *old_net_conf, *new_net_conf = NULL; 3899 struct disk_conf *old_disk_conf = NULL, *new_disk_conf = NULL; 3900 const int apv = connection->agreed_pro_version; 3901 struct fifo_buffer *old_plan = NULL, *new_plan = NULL; 3902 int fifo_size = 0; 3903 int err; 3904 3905 peer_device = conn_peer_device(connection, pi->vnr); 3906 if (!peer_device) 3907 return config_unknown_volume(connection, pi); 3908 device = peer_device->device; 3909 3910 exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param) 3911 : apv == 88 ? sizeof(struct p_rs_param) 3912 + SHARED_SECRET_MAX 3913 : apv <= 94 ? 
sizeof(struct p_rs_param_89) 3914 : /* apv >= 95 */ sizeof(struct p_rs_param_95); 3915 3916 if (pi->size > exp_max_sz) { 3917 drbd_err(device, "SyncParam packet too long: received %u, expected <= %u bytes\n", 3918 pi->size, exp_max_sz); 3919 return -EIO; 3920 } 3921 3922 if (apv <= 88) { 3923 header_size = sizeof(struct p_rs_param); 3924 data_size = pi->size - header_size; 3925 } else if (apv <= 94) { 3926 header_size = sizeof(struct p_rs_param_89); 3927 data_size = pi->size - header_size; 3928 D_ASSERT(device, data_size == 0); 3929 } else { 3930 header_size = sizeof(struct p_rs_param_95); 3931 data_size = pi->size - header_size; 3932 D_ASSERT(device, data_size == 0); 3933 } 3934 3935 /* initialize verify_alg and csums_alg */ 3936 p = pi->data; 3937 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); 3938 3939 err = drbd_recv_all(peer_device->connection, p, header_size); 3940 if (err) 3941 return err; 3942 3943 mutex_lock(&connection->resource->conf_update); 3944 old_net_conf = peer_device->connection->net_conf; 3945 if (get_ldev(device)) { 3946 new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL); 3947 if (!new_disk_conf) { 3948 put_ldev(device); 3949 mutex_unlock(&connection->resource->conf_update); 3950 drbd_err(device, "Allocation of new disk_conf failed\n"); 3951 return -ENOMEM; 3952 } 3953 3954 old_disk_conf = device->ldev->disk_conf; 3955 *new_disk_conf = *old_disk_conf; 3956 3957 new_disk_conf->resync_rate = be32_to_cpu(p->resync_rate); 3958 } 3959 3960 if (apv >= 88) { 3961 if (apv == 88) { 3962 if (data_size > SHARED_SECRET_MAX || data_size == 0) { 3963 drbd_err(device, "verify-alg of wrong size, " 3964 "peer wants %u, accepting only up to %u byte\n", 3965 data_size, SHARED_SECRET_MAX); 3966 err = -EIO; 3967 goto reconnect; 3968 } 3969 3970 err = drbd_recv_all(peer_device->connection, p->verify_alg, data_size); 3971 if (err) 3972 goto reconnect; 3973 /* we expect NUL terminated string */ 3974 /* but just in case someone tries to be evil */ 3975 
D_ASSERT(device, p->verify_alg[data_size-1] == 0); 3976 p->verify_alg[data_size-1] = 0; 3977 3978 } else /* apv >= 89 */ { 3979 /* we still expect NUL terminated strings */ 3980 /* but just in case someone tries to be evil */ 3981 D_ASSERT(device, p->verify_alg[SHARED_SECRET_MAX-1] == 0); 3982 D_ASSERT(device, p->csums_alg[SHARED_SECRET_MAX-1] == 0); 3983 p->verify_alg[SHARED_SECRET_MAX-1] = 0; 3984 p->csums_alg[SHARED_SECRET_MAX-1] = 0; 3985 } 3986 3987 if (strcmp(old_net_conf->verify_alg, p->verify_alg)) { 3988 if (device->state.conn == C_WF_REPORT_PARAMS) { 3989 drbd_err(device, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n", 3990 old_net_conf->verify_alg, p->verify_alg); 3991 goto disconnect; 3992 } 3993 verify_tfm = drbd_crypto_alloc_digest_safe(device, 3994 p->verify_alg, "verify-alg"); 3995 if (IS_ERR(verify_tfm)) { 3996 verify_tfm = NULL; 3997 goto disconnect; 3998 } 3999 } 4000 4001 if (apv >= 89 && strcmp(old_net_conf->csums_alg, p->csums_alg)) { 4002 if (device->state.conn == C_WF_REPORT_PARAMS) { 4003 drbd_err(device, "Different csums-alg settings. 
me=\"%s\" peer=\"%s\"\n", 4004 old_net_conf->csums_alg, p->csums_alg); 4005 goto disconnect; 4006 } 4007 csums_tfm = drbd_crypto_alloc_digest_safe(device, 4008 p->csums_alg, "csums-alg"); 4009 if (IS_ERR(csums_tfm)) { 4010 csums_tfm = NULL; 4011 goto disconnect; 4012 } 4013 } 4014 4015 if (apv > 94 && new_disk_conf) { 4016 new_disk_conf->c_plan_ahead = be32_to_cpu(p->c_plan_ahead); 4017 new_disk_conf->c_delay_target = be32_to_cpu(p->c_delay_target); 4018 new_disk_conf->c_fill_target = be32_to_cpu(p->c_fill_target); 4019 new_disk_conf->c_max_rate = be32_to_cpu(p->c_max_rate); 4020 4021 fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ; 4022 if (fifo_size != device->rs_plan_s->size) { 4023 new_plan = fifo_alloc(fifo_size); 4024 if (!new_plan) { 4025 drbd_err(device, "kmalloc of fifo_buffer failed"); 4026 put_ldev(device); 4027 goto disconnect; 4028 } 4029 } 4030 } 4031 4032 if (verify_tfm || csums_tfm) { 4033 new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL); 4034 if (!new_net_conf) { 4035 drbd_err(device, "Allocation of new net_conf failed\n"); 4036 goto disconnect; 4037 } 4038 4039 *new_net_conf = *old_net_conf; 4040 4041 if (verify_tfm) { 4042 strcpy(new_net_conf->verify_alg, p->verify_alg); 4043 new_net_conf->verify_alg_len = strlen(p->verify_alg) + 1; 4044 crypto_free_shash(peer_device->connection->verify_tfm); 4045 peer_device->connection->verify_tfm = verify_tfm; 4046 drbd_info(device, "using verify-alg: \"%s\"\n", p->verify_alg); 4047 } 4048 if (csums_tfm) { 4049 strcpy(new_net_conf->csums_alg, p->csums_alg); 4050 new_net_conf->csums_alg_len = strlen(p->csums_alg) + 1; 4051 crypto_free_shash(peer_device->connection->csums_tfm); 4052 peer_device->connection->csums_tfm = csums_tfm; 4053 drbd_info(device, "using csums-alg: \"%s\"\n", p->csums_alg); 4054 } 4055 rcu_assign_pointer(connection->net_conf, new_net_conf); 4056 } 4057 } 4058 4059 if (new_disk_conf) { 4060 rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf); 4061 
put_ldev(device); 4062 } 4063 4064 if (new_plan) { 4065 old_plan = device->rs_plan_s; 4066 rcu_assign_pointer(device->rs_plan_s, new_plan); 4067 } 4068 4069 mutex_unlock(&connection->resource->conf_update); 4070 synchronize_rcu(); 4071 if (new_net_conf) 4072 kfree(old_net_conf); 4073 kfree(old_disk_conf); 4074 kfree(old_plan); 4075 4076 return 0; 4077 4078 reconnect: 4079 if (new_disk_conf) { 4080 put_ldev(device); 4081 kfree(new_disk_conf); 4082 } 4083 mutex_unlock(&connection->resource->conf_update); 4084 return -EIO; 4085 4086 disconnect: 4087 kfree(new_plan); 4088 if (new_disk_conf) { 4089 put_ldev(device); 4090 kfree(new_disk_conf); 4091 } 4092 mutex_unlock(&connection->resource->conf_update); 4093 /* just for completeness: actually not needed, 4094 * as this is not reached if csums_tfm was ok. */ 4095 crypto_free_shash(csums_tfm); 4096 /* but free the verify_tfm again, if csums_tfm did not work out */ 4097 crypto_free_shash(verify_tfm); 4098 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD); 4099 return -EIO; 4100 } 4101 4102 /* warn if the arguments differ by more than 12.5% */ 4103 static void warn_if_differ_considerably(struct drbd_device *device, 4104 const char *s, sector_t a, sector_t b) 4105 { 4106 sector_t d; 4107 if (a == 0 || b == 0) 4108 return; 4109 d = (a > b) ? (a - b) : (b - a); 4110 if (d > (a>>3) || d > (b>>3)) 4111 drbd_warn(device, "Considerable difference in %s: %llus vs. %llus\n", s, 4112 (unsigned long long)a, (unsigned long long)b); 4113 } 4114 4115 static int receive_sizes(struct drbd_connection *connection, struct packet_info *pi) 4116 { 4117 struct drbd_peer_device *peer_device; 4118 struct drbd_device *device; 4119 struct p_sizes *p = pi->data; 4120 struct o_qlim *o = (connection->agreed_features & DRBD_FF_WSAME) ? 
p->qlim : NULL; 4121 enum determine_dev_size dd = DS_UNCHANGED; 4122 sector_t p_size, p_usize, p_csize, my_usize; 4123 sector_t new_size, cur_size; 4124 int ldsc = 0; /* local disk size changed */ 4125 enum dds_flags ddsf; 4126 4127 peer_device = conn_peer_device(connection, pi->vnr); 4128 if (!peer_device) 4129 return config_unknown_volume(connection, pi); 4130 device = peer_device->device; 4131 cur_size = drbd_get_capacity(device->this_bdev); 4132 4133 p_size = be64_to_cpu(p->d_size); 4134 p_usize = be64_to_cpu(p->u_size); 4135 p_csize = be64_to_cpu(p->c_size); 4136 4137 /* just store the peer's disk size for now. 4138 * we still need to figure out whether we accept that. */ 4139 device->p_size = p_size; 4140 4141 if (get_ldev(device)) { 4142 rcu_read_lock(); 4143 my_usize = rcu_dereference(device->ldev->disk_conf)->disk_size; 4144 rcu_read_unlock(); 4145 4146 warn_if_differ_considerably(device, "lower level device sizes", 4147 p_size, drbd_get_max_capacity(device->ldev)); 4148 warn_if_differ_considerably(device, "user requested size", 4149 p_usize, my_usize); 4150 4151 /* if this is the first connect, or an otherwise expected 4152 * param exchange, choose the minimum */ 4153 if (device->state.conn == C_WF_REPORT_PARAMS) 4154 p_usize = min_not_zero(my_usize, p_usize); 4155 4156 /* Never shrink a device with usable data during connect, 4157 * or "attach" on the peer. 4158 * But allow online shrinking if we are connected. */ 4159 new_size = drbd_new_dev_size(device, device->ldev, p_usize, 0); 4160 if (new_size < cur_size && 4161 device->state.disk >= D_OUTDATED && 4162 (device->state.conn < C_CONNECTED || device->state.pdsk == D_DISKLESS)) { 4163 drbd_err(device, "The peer's disk size is too small! 
(%llu < %llu sectors)\n", 4164 (unsigned long long)new_size, (unsigned long long)cur_size); 4165 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD); 4166 put_ldev(device); 4167 return -EIO; 4168 } 4169 4170 if (my_usize != p_usize) { 4171 struct disk_conf *old_disk_conf, *new_disk_conf = NULL; 4172 4173 new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL); 4174 if (!new_disk_conf) { 4175 drbd_err(device, "Allocation of new disk_conf failed\n"); 4176 put_ldev(device); 4177 return -ENOMEM; 4178 } 4179 4180 mutex_lock(&connection->resource->conf_update); 4181 old_disk_conf = device->ldev->disk_conf; 4182 *new_disk_conf = *old_disk_conf; 4183 new_disk_conf->disk_size = p_usize; 4184 4185 rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf); 4186 mutex_unlock(&connection->resource->conf_update); 4187 synchronize_rcu(); 4188 kfree(old_disk_conf); 4189 4190 drbd_info(device, "Peer sets u_size to %lu sectors (old: %lu)\n", 4191 (unsigned long)p_usize, (unsigned long)my_usize); 4192 } 4193 4194 put_ldev(device); 4195 } 4196 4197 device->peer_max_bio_size = be32_to_cpu(p->max_bio_size); 4198 /* Leave drbd_reconsider_queue_parameters() before drbd_determine_dev_size(). 4199 In case we cleared the QUEUE_FLAG_DISCARD from our queue in 4200 drbd_reconsider_queue_parameters(), we can be sure that after 4201 drbd_determine_dev_size() no REQ_DISCARDs are in the queue. */ 4202 4203 ddsf = be16_to_cpu(p->dds_flags); 4204 if (get_ldev(device)) { 4205 drbd_reconsider_queue_parameters(device, device->ldev, o); 4206 dd = drbd_determine_dev_size(device, ddsf, NULL); 4207 put_ldev(device); 4208 if (dd == DS_ERROR) 4209 return -EIO; 4210 drbd_md_sync(device); 4211 } else { 4212 /* 4213 * I am diskless, need to accept the peer's *current* size. 4214 * I must NOT accept the peers backing disk size, 4215 * it may have been larger than mine all along... 
4216 * 4217 * At this point, the peer knows more about my disk, or at 4218 * least about what we last agreed upon, than myself. 4219 * So if his c_size is less than his d_size, the most likely 4220 * reason is that *my* d_size was smaller last time we checked. 4221 * 4222 * However, if he sends a zero current size, 4223 * take his (user-capped or) backing disk size anyways. 4224 * 4225 * Unless of course he does not have a disk himself. 4226 * In which case we ignore this completely. 4227 */ 4228 sector_t new_size = p_csize ?: p_usize ?: p_size; 4229 drbd_reconsider_queue_parameters(device, NULL, o); 4230 if (new_size == 0) { 4231 /* Ignore, peer does not know nothing. */ 4232 } else if (new_size == cur_size) { 4233 /* nothing to do */ 4234 } else if (cur_size != 0 && p_size == 0) { 4235 drbd_warn(device, "Ignored diskless peer device size (peer:%llu != me:%llu sectors)!\n", 4236 (unsigned long long)new_size, (unsigned long long)cur_size); 4237 } else if (new_size < cur_size && device->state.role == R_PRIMARY) { 4238 drbd_err(device, "The peer's device size is too small! 
(%llu < %llu sectors); demote me first!\n", 4239 (unsigned long long)new_size, (unsigned long long)cur_size); 4240 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD); 4241 return -EIO; 4242 } else { 4243 /* I believe the peer, if 4244 * - I don't have a current size myself 4245 * - we agree on the size anyways 4246 * - I do have a current size, am Secondary, 4247 * and he has the only disk 4248 * - I do have a current size, am Primary, 4249 * and he has the only disk, 4250 * which is larger than my current size 4251 */ 4252 drbd_set_my_capacity(device, new_size); 4253 } 4254 } 4255 4256 if (get_ldev(device)) { 4257 if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev)) { 4258 device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev); 4259 ldsc = 1; 4260 } 4261 4262 put_ldev(device); 4263 } 4264 4265 if (device->state.conn > C_WF_REPORT_PARAMS) { 4266 if (be64_to_cpu(p->c_size) != 4267 drbd_get_capacity(device->this_bdev) || ldsc) { 4268 /* we have different sizes, probably peer 4269 * needs to know my new size... 
*/ 4270 drbd_send_sizes(peer_device, 0, ddsf); 4271 } 4272 if (test_and_clear_bit(RESIZE_PENDING, &device->flags) || 4273 (dd == DS_GREW && device->state.conn == C_CONNECTED)) { 4274 if (device->state.pdsk >= D_INCONSISTENT && 4275 device->state.disk >= D_INCONSISTENT) { 4276 if (ddsf & DDSF_NO_RESYNC) 4277 drbd_info(device, "Resync of new storage suppressed with --assume-clean\n"); 4278 else 4279 resync_after_online_grow(device); 4280 } else 4281 set_bit(RESYNC_AFTER_NEG, &device->flags); 4282 } 4283 } 4284 4285 return 0; 4286 } 4287 4288 static int receive_uuids(struct drbd_connection *connection, struct packet_info *pi) 4289 { 4290 struct drbd_peer_device *peer_device; 4291 struct drbd_device *device; 4292 struct p_uuids *p = pi->data; 4293 u64 *p_uuid; 4294 int i, updated_uuids = 0; 4295 4296 peer_device = conn_peer_device(connection, pi->vnr); 4297 if (!peer_device) 4298 return config_unknown_volume(connection, pi); 4299 device = peer_device->device; 4300 4301 p_uuid = kmalloc_array(UI_EXTENDED_SIZE, sizeof(*p_uuid), GFP_NOIO); 4302 if (!p_uuid) { 4303 drbd_err(device, "kmalloc of p_uuid failed\n"); 4304 return false; 4305 } 4306 4307 for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++) 4308 p_uuid[i] = be64_to_cpu(p->uuid[i]); 4309 4310 kfree(device->p_uuid); 4311 device->p_uuid = p_uuid; 4312 4313 if ((device->state.conn < C_CONNECTED || device->state.pdsk == D_DISKLESS) && 4314 device->state.disk < D_INCONSISTENT && 4315 device->state.role == R_PRIMARY && 4316 (device->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) { 4317 drbd_err(device, "Can only connect to data with current UUID=%016llX\n", 4318 (unsigned long long)device->ed_uuid); 4319 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD); 4320 return -EIO; 4321 } 4322 4323 if (get_ldev(device)) { 4324 int skip_initial_sync = 4325 device->state.conn == C_CONNECTED && 4326 peer_device->connection->agreed_pro_version >= 90 && 4327 device->ldev->md.uuid[UI_CURRENT] == 
UUID_JUST_CREATED && 4328 (p_uuid[UI_FLAGS] & 8); 4329 if (skip_initial_sync) { 4330 drbd_info(device, "Accepted new current UUID, preparing to skip initial sync\n"); 4331 drbd_bitmap_io(device, &drbd_bmio_clear_n_write, 4332 "clear_n_write from receive_uuids", 4333 BM_LOCKED_TEST_ALLOWED); 4334 _drbd_uuid_set(device, UI_CURRENT, p_uuid[UI_CURRENT]); 4335 _drbd_uuid_set(device, UI_BITMAP, 0); 4336 _drbd_set_state(_NS2(device, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE), 4337 CS_VERBOSE, NULL); 4338 drbd_md_sync(device); 4339 updated_uuids = 1; 4340 } 4341 put_ldev(device); 4342 } else if (device->state.disk < D_INCONSISTENT && 4343 device->state.role == R_PRIMARY) { 4344 /* I am a diskless primary, the peer just created a new current UUID 4345 for me. */ 4346 updated_uuids = drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]); 4347 } 4348 4349 /* Before we test for the disk state, we should wait until an eventually 4350 ongoing cluster wide state change is finished. That is important if 4351 we are primary and are detaching from our disk. We need to see the 4352 new disk state... */ 4353 mutex_lock(device->state_mutex); 4354 mutex_unlock(device->state_mutex); 4355 if (device->state.conn >= C_CONNECTED && device->state.disk < D_INCONSISTENT) 4356 updated_uuids |= drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]); 4357 4358 if (updated_uuids) 4359 drbd_print_uuids(device, "receiver updated UUIDs to"); 4360 4361 return 0; 4362 } 4363 4364 /** 4365 * convert_state() - Converts the peer's view of the cluster state to our point of view 4366 * @ps: The state as seen by the peer. 
4367 */ 4368 static union drbd_state convert_state(union drbd_state ps) 4369 { 4370 union drbd_state ms; 4371 4372 static enum drbd_conns c_tab[] = { 4373 [C_WF_REPORT_PARAMS] = C_WF_REPORT_PARAMS, 4374 [C_CONNECTED] = C_CONNECTED, 4375 4376 [C_STARTING_SYNC_S] = C_STARTING_SYNC_T, 4377 [C_STARTING_SYNC_T] = C_STARTING_SYNC_S, 4378 [C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */ 4379 [C_VERIFY_S] = C_VERIFY_T, 4380 [C_MASK] = C_MASK, 4381 }; 4382 4383 ms.i = ps.i; 4384 4385 ms.conn = c_tab[ps.conn]; 4386 ms.peer = ps.role; 4387 ms.role = ps.peer; 4388 ms.pdsk = ps.disk; 4389 ms.disk = ps.pdsk; 4390 ms.peer_isp = (ps.aftr_isp | ps.user_isp); 4391 4392 return ms; 4393 } 4394 4395 static int receive_req_state(struct drbd_connection *connection, struct packet_info *pi) 4396 { 4397 struct drbd_peer_device *peer_device; 4398 struct drbd_device *device; 4399 struct p_req_state *p = pi->data; 4400 union drbd_state mask, val; 4401 enum drbd_state_rv rv; 4402 4403 peer_device = conn_peer_device(connection, pi->vnr); 4404 if (!peer_device) 4405 return -EIO; 4406 device = peer_device->device; 4407 4408 mask.i = be32_to_cpu(p->mask); 4409 val.i = be32_to_cpu(p->val); 4410 4411 if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags) && 4412 mutex_is_locked(device->state_mutex)) { 4413 drbd_send_sr_reply(peer_device, SS_CONCURRENT_ST_CHG); 4414 return 0; 4415 } 4416 4417 mask = convert_state(mask); 4418 val = convert_state(val); 4419 4420 rv = drbd_change_state(device, CS_VERBOSE, mask, val); 4421 drbd_send_sr_reply(peer_device, rv); 4422 4423 drbd_md_sync(device); 4424 4425 return 0; 4426 } 4427 4428 static int receive_req_conn_state(struct drbd_connection *connection, struct packet_info *pi) 4429 { 4430 struct p_req_state *p = pi->data; 4431 union drbd_state mask, val; 4432 enum drbd_state_rv rv; 4433 4434 mask.i = be32_to_cpu(p->mask); 4435 val.i = be32_to_cpu(p->val); 4436 4437 if (test_bit(RESOLVE_CONFLICTS, &connection->flags) && 4438 
mutex_is_locked(&connection->cstate_mutex)) { 4439 conn_send_sr_reply(connection, SS_CONCURRENT_ST_CHG); 4440 return 0; 4441 } 4442 4443 mask = convert_state(mask); 4444 val = convert_state(val); 4445 4446 rv = conn_request_state(connection, mask, val, CS_VERBOSE | CS_LOCAL_ONLY | CS_IGN_OUTD_FAIL); 4447 conn_send_sr_reply(connection, rv); 4448 4449 return 0; 4450 } 4451 4452 static int receive_state(struct drbd_connection *connection, struct packet_info *pi) 4453 { 4454 struct drbd_peer_device *peer_device; 4455 struct drbd_device *device; 4456 struct p_state *p = pi->data; 4457 union drbd_state os, ns, peer_state; 4458 enum drbd_disk_state real_peer_disk; 4459 enum chg_state_flags cs_flags; 4460 int rv; 4461 4462 peer_device = conn_peer_device(connection, pi->vnr); 4463 if (!peer_device) 4464 return config_unknown_volume(connection, pi); 4465 device = peer_device->device; 4466 4467 peer_state.i = be32_to_cpu(p->state); 4468 4469 real_peer_disk = peer_state.disk; 4470 if (peer_state.disk == D_NEGOTIATING) { 4471 real_peer_disk = device->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT; 4472 drbd_info(device, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk)); 4473 } 4474 4475 spin_lock_irq(&device->resource->req_lock); 4476 retry: 4477 os = ns = drbd_read_state(device); 4478 spin_unlock_irq(&device->resource->req_lock); 4479 4480 /* If some other part of the code (ack_receiver thread, timeout) 4481 * already decided to close the connection again, 4482 * we must not "re-establish" it here. */ 4483 if (os.conn <= C_TEAR_DOWN) 4484 return -ECONNRESET; 4485 4486 /* If this is the "end of sync" confirmation, usually the peer disk 4487 * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits 4488 * set) resync started in PausedSyncT, or if the timing of pause-/ 4489 * unpause-sync events has been "just right", the peer disk may 4490 * transition from D_CONSISTENT to D_UP_TO_DATE as well. 
4491 */ 4492 if ((os.pdsk == D_INCONSISTENT || os.pdsk == D_CONSISTENT) && 4493 real_peer_disk == D_UP_TO_DATE && 4494 os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) { 4495 /* If we are (becoming) SyncSource, but peer is still in sync 4496 * preparation, ignore its uptodate-ness to avoid flapping, it 4497 * will change to inconsistent once the peer reaches active 4498 * syncing states. 4499 * It may have changed syncer-paused flags, however, so we 4500 * cannot ignore this completely. */ 4501 if (peer_state.conn > C_CONNECTED && 4502 peer_state.conn < C_SYNC_SOURCE) 4503 real_peer_disk = D_INCONSISTENT; 4504 4505 /* if peer_state changes to connected at the same time, 4506 * it explicitly notifies us that it finished resync. 4507 * Maybe we should finish it up, too? */ 4508 else if (os.conn >= C_SYNC_SOURCE && 4509 peer_state.conn == C_CONNECTED) { 4510 if (drbd_bm_total_weight(device) <= device->rs_failed) 4511 drbd_resync_finished(device); 4512 return 0; 4513 } 4514 } 4515 4516 /* explicit verify finished notification, stop sector reached. */ 4517 if (os.conn == C_VERIFY_T && os.disk == D_UP_TO_DATE && 4518 peer_state.conn == C_CONNECTED && real_peer_disk == D_UP_TO_DATE) { 4519 ov_out_of_sync_print(device); 4520 drbd_resync_finished(device); 4521 return 0; 4522 } 4523 4524 /* peer says his disk is inconsistent, while we think it is uptodate, 4525 * and this happens while the peer still thinks we have a sync going on, 4526 * but we think we are already done with the sync. 4527 * We ignore this to avoid flapping pdsk. 4528 * This should not happen, if the peer is a recent version of drbd. 
*/ 4529 if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT && 4530 os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE) 4531 real_peer_disk = D_UP_TO_DATE; 4532 4533 if (ns.conn == C_WF_REPORT_PARAMS) 4534 ns.conn = C_CONNECTED; 4535 4536 if (peer_state.conn == C_AHEAD) 4537 ns.conn = C_BEHIND; 4538 4539 /* TODO: 4540 * if (primary and diskless and peer uuid != effective uuid) 4541 * abort attach on peer; 4542 * 4543 * If this node does not have good data, was already connected, but 4544 * the peer did a late attach only now, trying to "negotiate" with me, 4545 * AND I am currently Primary, possibly frozen, with some specific 4546 * "effective" uuid, this should never be reached, really, because 4547 * we first send the uuids, then the current state. 4548 * 4549 * In this scenario, we already dropped the connection hard 4550 * when we received the unsuitable uuids (receive_uuids(). 4551 * 4552 * Should we want to change this, that is: not drop the connection in 4553 * receive_uuids() already, then we would need to add a branch here 4554 * that aborts the attach of "unsuitable uuids" on the peer in case 4555 * this node is currently Diskless Primary. 
4556 */ 4557 4558 if (device->p_uuid && peer_state.disk >= D_NEGOTIATING && 4559 get_ldev_if_state(device, D_NEGOTIATING)) { 4560 int cr; /* consider resync */ 4561 4562 /* if we established a new connection */ 4563 cr = (os.conn < C_CONNECTED); 4564 /* if we had an established connection 4565 * and one of the nodes newly attaches a disk */ 4566 cr |= (os.conn == C_CONNECTED && 4567 (peer_state.disk == D_NEGOTIATING || 4568 os.disk == D_NEGOTIATING)); 4569 /* if we have both been inconsistent, and the peer has been 4570 * forced to be UpToDate with --force */ 4571 cr |= test_bit(CONSIDER_RESYNC, &device->flags); 4572 /* if we had been plain connected, and the admin requested to 4573 * start a sync by "invalidate" or "invalidate-remote" */ 4574 cr |= (os.conn == C_CONNECTED && 4575 (peer_state.conn >= C_STARTING_SYNC_S && 4576 peer_state.conn <= C_WF_BITMAP_T)); 4577 4578 if (cr) 4579 ns.conn = drbd_sync_handshake(peer_device, peer_state.role, real_peer_disk); 4580 4581 put_ldev(device); 4582 if (ns.conn == C_MASK) { 4583 ns.conn = C_CONNECTED; 4584 if (device->state.disk == D_NEGOTIATING) { 4585 drbd_force_state(device, NS(disk, D_FAILED)); 4586 } else if (peer_state.disk == D_NEGOTIATING) { 4587 drbd_err(device, "Disk attach process on the peer node was aborted.\n"); 4588 peer_state.disk = D_DISKLESS; 4589 real_peer_disk = D_DISKLESS; 4590 } else { 4591 if (test_and_clear_bit(CONN_DRY_RUN, &peer_device->connection->flags)) 4592 return -EIO; 4593 D_ASSERT(device, os.conn == C_WF_REPORT_PARAMS); 4594 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD); 4595 return -EIO; 4596 } 4597 } 4598 } 4599 4600 spin_lock_irq(&device->resource->req_lock); 4601 if (os.i != drbd_read_state(device).i) 4602 goto retry; 4603 clear_bit(CONSIDER_RESYNC, &device->flags); 4604 ns.peer = peer_state.role; 4605 ns.pdsk = real_peer_disk; 4606 ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp); 4607 if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) 
&& ns.disk == D_NEGOTIATING) 4608 ns.disk = device->new_state_tmp.disk; 4609 cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD); 4610 if (ns.pdsk == D_CONSISTENT && drbd_suspended(device) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED && 4611 test_bit(NEW_CUR_UUID, &device->flags)) { 4612 /* Do not allow tl_restart(RESEND) for a rebooted peer. We can only allow this 4613 for temporal network outages! */ 4614 spin_unlock_irq(&device->resource->req_lock); 4615 drbd_err(device, "Aborting Connect, can not thaw IO with an only Consistent peer\n"); 4616 tl_clear(peer_device->connection); 4617 drbd_uuid_new_current(device); 4618 clear_bit(NEW_CUR_UUID, &device->flags); 4619 conn_request_state(peer_device->connection, NS2(conn, C_PROTOCOL_ERROR, susp, 0), CS_HARD); 4620 return -EIO; 4621 } 4622 rv = _drbd_set_state(device, ns, cs_flags, NULL); 4623 ns = drbd_read_state(device); 4624 spin_unlock_irq(&device->resource->req_lock); 4625 4626 if (rv < SS_SUCCESS) { 4627 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD); 4628 return -EIO; 4629 } 4630 4631 if (os.conn > C_WF_REPORT_PARAMS) { 4632 if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED && 4633 peer_state.disk != D_NEGOTIATING ) { 4634 /* we want resync, peer has not yet decided to sync... */ 4635 /* Nowadays only used when forcing a node into primary role and 4636 setting its disk to UpToDate with that */ 4637 drbd_send_uuids(peer_device); 4638 drbd_send_current_state(peer_device); 4639 } 4640 } 4641 4642 clear_bit(DISCARD_MY_DATA, &device->flags); 4643 4644 drbd_md_sync(device); /* update connected indicator, la_size_sect, ... 
*/ 4645 4646 return 0; 4647 } 4648 4649 static int receive_sync_uuid(struct drbd_connection *connection, struct packet_info *pi) 4650 { 4651 struct drbd_peer_device *peer_device; 4652 struct drbd_device *device; 4653 struct p_rs_uuid *p = pi->data; 4654 4655 peer_device = conn_peer_device(connection, pi->vnr); 4656 if (!peer_device) 4657 return -EIO; 4658 device = peer_device->device; 4659 4660 wait_event(device->misc_wait, 4661 device->state.conn == C_WF_SYNC_UUID || 4662 device->state.conn == C_BEHIND || 4663 device->state.conn < C_CONNECTED || 4664 device->state.disk < D_NEGOTIATING); 4665 4666 /* D_ASSERT(device, device->state.conn == C_WF_SYNC_UUID ); */ 4667 4668 /* Here the _drbd_uuid_ functions are right, current should 4669 _not_ be rotated into the history */ 4670 if (get_ldev_if_state(device, D_NEGOTIATING)) { 4671 _drbd_uuid_set(device, UI_CURRENT, be64_to_cpu(p->uuid)); 4672 _drbd_uuid_set(device, UI_BITMAP, 0UL); 4673 4674 drbd_print_uuids(device, "updated sync uuid"); 4675 drbd_start_resync(device, C_SYNC_TARGET); 4676 4677 put_ldev(device); 4678 } else 4679 drbd_err(device, "Ignoring SyncUUID packet!\n"); 4680 4681 return 0; 4682 } 4683 4684 /** 4685 * receive_bitmap_plain 4686 * 4687 * Return 0 when done, 1 when another iteration is needed, and a negative error 4688 * code upon failure. 
4689 */ 4690 static int 4691 receive_bitmap_plain(struct drbd_peer_device *peer_device, unsigned int size, 4692 unsigned long *p, struct bm_xfer_ctx *c) 4693 { 4694 unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - 4695 drbd_header_size(peer_device->connection); 4696 unsigned int num_words = min_t(size_t, data_size / sizeof(*p), 4697 c->bm_words - c->word_offset); 4698 unsigned int want = num_words * sizeof(*p); 4699 int err; 4700 4701 if (want != size) { 4702 drbd_err(peer_device, "%s:want (%u) != size (%u)\n", __func__, want, size); 4703 return -EIO; 4704 } 4705 if (want == 0) 4706 return 0; 4707 err = drbd_recv_all(peer_device->connection, p, want); 4708 if (err) 4709 return err; 4710 4711 drbd_bm_merge_lel(peer_device->device, c->word_offset, num_words, p); 4712 4713 c->word_offset += num_words; 4714 c->bit_offset = c->word_offset * BITS_PER_LONG; 4715 if (c->bit_offset > c->bm_bits) 4716 c->bit_offset = c->bm_bits; 4717 4718 return 1; 4719 } 4720 4721 static enum drbd_bitmap_code dcbp_get_code(struct p_compressed_bm *p) 4722 { 4723 return (enum drbd_bitmap_code)(p->encoding & 0x0f); 4724 } 4725 4726 static int dcbp_get_start(struct p_compressed_bm *p) 4727 { 4728 return (p->encoding & 0x80) != 0; 4729 } 4730 4731 static int dcbp_get_pad_bits(struct p_compressed_bm *p) 4732 { 4733 return (p->encoding >> 4) & 0x7; 4734 } 4735 4736 /** 4737 * recv_bm_rle_bits 4738 * 4739 * Return 0 when done, 1 when another iteration is needed, and a negative error 4740 * code upon failure. 
4741 */ 4742 static int 4743 recv_bm_rle_bits(struct drbd_peer_device *peer_device, 4744 struct p_compressed_bm *p, 4745 struct bm_xfer_ctx *c, 4746 unsigned int len) 4747 { 4748 struct bitstream bs; 4749 u64 look_ahead; 4750 u64 rl; 4751 u64 tmp; 4752 unsigned long s = c->bit_offset; 4753 unsigned long e; 4754 int toggle = dcbp_get_start(p); 4755 int have; 4756 int bits; 4757 4758 bitstream_init(&bs, p->code, len, dcbp_get_pad_bits(p)); 4759 4760 bits = bitstream_get_bits(&bs, &look_ahead, 64); 4761 if (bits < 0) 4762 return -EIO; 4763 4764 for (have = bits; have > 0; s += rl, toggle = !toggle) { 4765 bits = vli_decode_bits(&rl, look_ahead); 4766 if (bits <= 0) 4767 return -EIO; 4768 4769 if (toggle) { 4770 e = s + rl -1; 4771 if (e >= c->bm_bits) { 4772 drbd_err(peer_device, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e); 4773 return -EIO; 4774 } 4775 _drbd_bm_set_bits(peer_device->device, s, e); 4776 } 4777 4778 if (have < bits) { 4779 drbd_err(peer_device, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n", 4780 have, bits, look_ahead, 4781 (unsigned int)(bs.cur.b - p->code), 4782 (unsigned int)bs.buf_len); 4783 return -EIO; 4784 } 4785 /* if we consumed all 64 bits, assign 0; >> 64 is "undefined"; */ 4786 if (likely(bits < 64)) 4787 look_ahead >>= bits; 4788 else 4789 look_ahead = 0; 4790 have -= bits; 4791 4792 bits = bitstream_get_bits(&bs, &tmp, 64 - have); 4793 if (bits < 0) 4794 return -EIO; 4795 look_ahead |= tmp << have; 4796 have += bits; 4797 } 4798 4799 c->bit_offset = s; 4800 bm_xfer_ctx_bit_to_word_offset(c); 4801 4802 return (s != c->bm_bits); 4803 } 4804 4805 /** 4806 * decode_bitmap_c 4807 * 4808 * Return 0 when done, 1 when another iteration is needed, and a negative error 4809 * code upon failure. 
4810 */ 4811 static int 4812 decode_bitmap_c(struct drbd_peer_device *peer_device, 4813 struct p_compressed_bm *p, 4814 struct bm_xfer_ctx *c, 4815 unsigned int len) 4816 { 4817 if (dcbp_get_code(p) == RLE_VLI_Bits) 4818 return recv_bm_rle_bits(peer_device, p, c, len - sizeof(*p)); 4819 4820 /* other variants had been implemented for evaluation, 4821 * but have been dropped as this one turned out to be "best" 4822 * during all our tests. */ 4823 4824 drbd_err(peer_device, "receive_bitmap_c: unknown encoding %u\n", p->encoding); 4825 conn_request_state(peer_device->connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD); 4826 return -EIO; 4827 } 4828 4829 void INFO_bm_xfer_stats(struct drbd_device *device, 4830 const char *direction, struct bm_xfer_ctx *c) 4831 { 4832 /* what would it take to transfer it "plaintext" */ 4833 unsigned int header_size = drbd_header_size(first_peer_device(device)->connection); 4834 unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - header_size; 4835 unsigned int plain = 4836 header_size * (DIV_ROUND_UP(c->bm_words, data_size) + 1) + 4837 c->bm_words * sizeof(unsigned long); 4838 unsigned int total = c->bytes[0] + c->bytes[1]; 4839 unsigned int r; 4840 4841 /* total can not be zero. but just in case: */ 4842 if (total == 0) 4843 return; 4844 4845 /* don't report if not compressed */ 4846 if (total >= plain) 4847 return; 4848 4849 /* total < plain. check for overflow, still */ 4850 r = (total > UINT_MAX/1000) ? 
(total / (plain/1000)) 4851 : (1000 * total / plain); 4852 4853 if (r > 1000) 4854 r = 1000; 4855 4856 r = 1000 - r; 4857 drbd_info(device, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), " 4858 "total %u; compression: %u.%u%%\n", 4859 direction, 4860 c->bytes[1], c->packets[1], 4861 c->bytes[0], c->packets[0], 4862 total, r/10, r % 10); 4863 } 4864 4865 /* Since we are processing the bitfield from lower addresses to higher, 4866 it does not matter if the process it in 32 bit chunks or 64 bit 4867 chunks as long as it is little endian. (Understand it as byte stream, 4868 beginning with the lowest byte...) If we would use big endian 4869 we would need to process it from the highest address to the lowest, 4870 in order to be agnostic to the 32 vs 64 bits issue. 4871 4872 returns 0 on failure, 1 if we successfully received it. */ 4873 static int receive_bitmap(struct drbd_connection *connection, struct packet_info *pi) 4874 { 4875 struct drbd_peer_device *peer_device; 4876 struct drbd_device *device; 4877 struct bm_xfer_ctx c; 4878 int err; 4879 4880 peer_device = conn_peer_device(connection, pi->vnr); 4881 if (!peer_device) 4882 return -EIO; 4883 device = peer_device->device; 4884 4885 drbd_bm_lock(device, "receive bitmap", BM_LOCKED_SET_ALLOWED); 4886 /* you are supposed to send additional out-of-sync information 4887 * if you actually set bits during this phase */ 4888 4889 c = (struct bm_xfer_ctx) { 4890 .bm_bits = drbd_bm_bits(device), 4891 .bm_words = drbd_bm_words(device), 4892 }; 4893 4894 for(;;) { 4895 if (pi->cmd == P_BITMAP) 4896 err = receive_bitmap_plain(peer_device, pi->size, pi->data, &c); 4897 else if (pi->cmd == P_COMPRESSED_BITMAP) { 4898 /* MAYBE: sanity check that we speak proto >= 90, 4899 * and the feature is enabled! 
*/ 4900 struct p_compressed_bm *p = pi->data; 4901 4902 if (pi->size > DRBD_SOCKET_BUFFER_SIZE - drbd_header_size(connection)) { 4903 drbd_err(device, "ReportCBitmap packet too large\n"); 4904 err = -EIO; 4905 goto out; 4906 } 4907 if (pi->size <= sizeof(*p)) { 4908 drbd_err(device, "ReportCBitmap packet too small (l:%u)\n", pi->size); 4909 err = -EIO; 4910 goto out; 4911 } 4912 err = drbd_recv_all(peer_device->connection, p, pi->size); 4913 if (err) 4914 goto out; 4915 err = decode_bitmap_c(peer_device, p, &c, pi->size); 4916 } else { 4917 drbd_warn(device, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", pi->cmd); 4918 err = -EIO; 4919 goto out; 4920 } 4921 4922 c.packets[pi->cmd == P_BITMAP]++; 4923 c.bytes[pi->cmd == P_BITMAP] += drbd_header_size(connection) + pi->size; 4924 4925 if (err <= 0) { 4926 if (err < 0) 4927 goto out; 4928 break; 4929 } 4930 err = drbd_recv_header(peer_device->connection, pi); 4931 if (err) 4932 goto out; 4933 } 4934 4935 INFO_bm_xfer_stats(device, "receive", &c); 4936 4937 if (device->state.conn == C_WF_BITMAP_T) { 4938 enum drbd_state_rv rv; 4939 4940 err = drbd_send_bitmap(device); 4941 if (err) 4942 goto out; 4943 /* Omit CS_ORDERED with this state transition to avoid deadlocks. 
*/ 4944 rv = _drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); 4945 D_ASSERT(device, rv == SS_SUCCESS); 4946 } else if (device->state.conn != C_WF_BITMAP_S) { 4947 /* admin may have requested C_DISCONNECTING, 4948 * other threads may have noticed network errors */ 4949 drbd_info(device, "unexpected cstate (%s) in receive_bitmap\n", 4950 drbd_conn_str(device->state.conn)); 4951 } 4952 err = 0; 4953 4954 out: 4955 drbd_bm_unlock(device); 4956 if (!err && device->state.conn == C_WF_BITMAP_S) 4957 drbd_start_resync(device, C_SYNC_SOURCE); 4958 return err; 4959 } 4960 4961 static int receive_skip(struct drbd_connection *connection, struct packet_info *pi) 4962 { 4963 drbd_warn(connection, "skipping unknown optional packet type %d, l: %d!\n", 4964 pi->cmd, pi->size); 4965 4966 return ignore_remaining_packet(connection, pi); 4967 } 4968 4969 static int receive_UnplugRemote(struct drbd_connection *connection, struct packet_info *pi) 4970 { 4971 /* Make sure we've acked all the TCP data associated 4972 * with the data requests being unplugged */ 4973 drbd_tcp_quickack(connection->data.socket); 4974 4975 return 0; 4976 } 4977 4978 static int receive_out_of_sync(struct drbd_connection *connection, struct packet_info *pi) 4979 { 4980 struct drbd_peer_device *peer_device; 4981 struct drbd_device *device; 4982 struct p_block_desc *p = pi->data; 4983 4984 peer_device = conn_peer_device(connection, pi->vnr); 4985 if (!peer_device) 4986 return -EIO; 4987 device = peer_device->device; 4988 4989 switch (device->state.conn) { 4990 case C_WF_SYNC_UUID: 4991 case C_WF_BITMAP_T: 4992 case C_BEHIND: 4993 break; 4994 default: 4995 drbd_err(device, "ASSERT FAILED cstate = %s, expected: WFSyncUUID|WFBitMapT|Behind\n", 4996 drbd_conn_str(device->state.conn)); 4997 } 4998 4999 drbd_set_out_of_sync(device, be64_to_cpu(p->sector), be32_to_cpu(p->blksize)); 5000 5001 return 0; 5002 } 5003 5004 static int receive_rs_deallocated(struct drbd_connection *connection, struct 
packet_info *pi) 5005 { 5006 struct drbd_peer_device *peer_device; 5007 struct p_block_desc *p = pi->data; 5008 struct drbd_device *device; 5009 sector_t sector; 5010 int size, err = 0; 5011 5012 peer_device = conn_peer_device(connection, pi->vnr); 5013 if (!peer_device) 5014 return -EIO; 5015 device = peer_device->device; 5016 5017 sector = be64_to_cpu(p->sector); 5018 size = be32_to_cpu(p->blksize); 5019 5020 dec_rs_pending(device); 5021 5022 if (get_ldev(device)) { 5023 struct drbd_peer_request *peer_req; 5024 const int op = REQ_OP_WRITE_ZEROES; 5025 5026 peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER, sector, 5027 size, 0, GFP_NOIO); 5028 if (!peer_req) { 5029 put_ldev(device); 5030 return -ENOMEM; 5031 } 5032 5033 peer_req->w.cb = e_end_resync_block; 5034 peer_req->submit_jif = jiffies; 5035 peer_req->flags |= EE_TRIM; 5036 5037 spin_lock_irq(&device->resource->req_lock); 5038 list_add_tail(&peer_req->w.list, &device->sync_ee); 5039 spin_unlock_irq(&device->resource->req_lock); 5040 5041 atomic_add(pi->size >> 9, &device->rs_sect_ev); 5042 err = drbd_submit_peer_request(device, peer_req, op, 0, DRBD_FAULT_RS_WR); 5043 5044 if (err) { 5045 spin_lock_irq(&device->resource->req_lock); 5046 list_del(&peer_req->w.list); 5047 spin_unlock_irq(&device->resource->req_lock); 5048 5049 drbd_free_peer_req(device, peer_req); 5050 put_ldev(device); 5051 err = 0; 5052 goto fail; 5053 } 5054 5055 inc_unacked(device); 5056 5057 /* No put_ldev() here. 
Gets called in drbd_endio_write_sec_final(), 5058 as well as drbd_rs_complete_io() */ 5059 } else { 5060 fail: 5061 drbd_rs_complete_io(device, sector); 5062 drbd_send_ack_ex(peer_device, P_NEG_ACK, sector, size, ID_SYNCER); 5063 } 5064 5065 atomic_add(size >> 9, &device->rs_sect_in); 5066 5067 return err; 5068 } 5069 5070 struct data_cmd { 5071 int expect_payload; 5072 unsigned int pkt_size; 5073 int (*fn)(struct drbd_connection *, struct packet_info *); 5074 }; 5075 5076 static struct data_cmd drbd_cmd_handler[] = { 5077 [P_DATA] = { 1, sizeof(struct p_data), receive_Data }, 5078 [P_DATA_REPLY] = { 1, sizeof(struct p_data), receive_DataReply }, 5079 [P_RS_DATA_REPLY] = { 1, sizeof(struct p_data), receive_RSDataReply } , 5080 [P_BARRIER] = { 0, sizeof(struct p_barrier), receive_Barrier } , 5081 [P_BITMAP] = { 1, 0, receive_bitmap } , 5082 [P_COMPRESSED_BITMAP] = { 1, 0, receive_bitmap } , 5083 [P_UNPLUG_REMOTE] = { 0, 0, receive_UnplugRemote }, 5084 [P_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, 5085 [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, 5086 [P_SYNC_PARAM] = { 1, 0, receive_SyncParam }, 5087 [P_SYNC_PARAM89] = { 1, 0, receive_SyncParam }, 5088 [P_PROTOCOL] = { 1, sizeof(struct p_protocol), receive_protocol }, 5089 [P_UUIDS] = { 0, sizeof(struct p_uuids), receive_uuids }, 5090 [P_SIZES] = { 0, sizeof(struct p_sizes), receive_sizes }, 5091 [P_STATE] = { 0, sizeof(struct p_state), receive_state }, 5092 [P_STATE_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_state }, 5093 [P_SYNC_UUID] = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid }, 5094 [P_OV_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, 5095 [P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest }, 5096 [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest }, 5097 [P_RS_THIN_REQ] = { 0, sizeof(struct p_block_req), receive_DataRequest }, 5098 [P_DELAY_PROBE] = { 0, sizeof(struct 
p_delay_probe93), receive_skip }, 5099 [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync }, 5100 [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state }, 5101 [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol }, 5102 [P_TRIM] = { 0, sizeof(struct p_trim), receive_Data }, 5103 [P_ZEROES] = { 0, sizeof(struct p_trim), receive_Data }, 5104 [P_RS_DEALLOCATED] = { 0, sizeof(struct p_block_desc), receive_rs_deallocated }, 5105 [P_WSAME] = { 1, sizeof(struct p_wsame), receive_Data }, 5106 }; 5107 5108 static void drbdd(struct drbd_connection *connection) 5109 { 5110 struct packet_info pi; 5111 size_t shs; /* sub header size */ 5112 int err; 5113 5114 while (get_t_state(&connection->receiver) == RUNNING) { 5115 struct data_cmd const *cmd; 5116 5117 drbd_thread_current_set_cpu(&connection->receiver); 5118 update_receiver_timing_details(connection, drbd_recv_header_maybe_unplug); 5119 if (drbd_recv_header_maybe_unplug(connection, &pi)) 5120 goto err_out; 5121 5122 cmd = &drbd_cmd_handler[pi.cmd]; 5123 if (unlikely(pi.cmd >= ARRAY_SIZE(drbd_cmd_handler) || !cmd->fn)) { 5124 drbd_err(connection, "Unexpected data packet %s (0x%04x)", 5125 cmdname(pi.cmd), pi.cmd); 5126 goto err_out; 5127 } 5128 5129 shs = cmd->pkt_size; 5130 if (pi.cmd == P_SIZES && connection->agreed_features & DRBD_FF_WSAME) 5131 shs += sizeof(struct o_qlim); 5132 if (pi.size > shs && !cmd->expect_payload) { 5133 drbd_err(connection, "No payload expected %s l:%d\n", 5134 cmdname(pi.cmd), pi.size); 5135 goto err_out; 5136 } 5137 if (pi.size < shs) { 5138 drbd_err(connection, "%s: unexpected packet size, expected:%d received:%d\n", 5139 cmdname(pi.cmd), (int)shs, pi.size); 5140 goto err_out; 5141 } 5142 5143 if (shs) { 5144 update_receiver_timing_details(connection, drbd_recv_all_warn); 5145 err = drbd_recv_all_warn(connection, pi.data, shs); 5146 if (err) 5147 goto err_out; 5148 pi.size -= shs; 5149 } 5150 5151 
update_receiver_timing_details(connection, cmd->fn); 5152 err = cmd->fn(connection, &pi); 5153 if (err) { 5154 drbd_err(connection, "error receiving %s, e: %d l: %d!\n", 5155 cmdname(pi.cmd), err, pi.size); 5156 goto err_out; 5157 } 5158 } 5159 return; 5160 5161 err_out: 5162 conn_request_state(connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD); 5163 } 5164 5165 static void conn_disconnect(struct drbd_connection *connection) 5166 { 5167 struct drbd_peer_device *peer_device; 5168 enum drbd_conns oc; 5169 int vnr; 5170 5171 if (connection->cstate == C_STANDALONE) 5172 return; 5173 5174 /* We are about to start the cleanup after connection loss. 5175 * Make sure drbd_make_request knows about that. 5176 * Usually we should be in some network failure state already, 5177 * but just in case we are not, we fix it up here. 5178 */ 5179 conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD); 5180 5181 /* ack_receiver does not clean up anything. it must not interfere, either */ 5182 drbd_thread_stop(&connection->ack_receiver); 5183 if (connection->ack_sender) { 5184 destroy_workqueue(connection->ack_sender); 5185 connection->ack_sender = NULL; 5186 } 5187 drbd_free_sock(connection); 5188 5189 rcu_read_lock(); 5190 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { 5191 struct drbd_device *device = peer_device->device; 5192 kref_get(&device->kref); 5193 rcu_read_unlock(); 5194 drbd_disconnected(peer_device); 5195 kref_put(&device->kref, drbd_destroy_device); 5196 rcu_read_lock(); 5197 } 5198 rcu_read_unlock(); 5199 5200 if (!list_empty(&connection->current_epoch->list)) 5201 drbd_err(connection, "ASSERTION FAILED: connection->current_epoch->list not empty\n"); 5202 /* ok, no more ee's on the fly, it is safe to reset the epoch_size */ 5203 atomic_set(&connection->current_epoch->epoch_size, 0); 5204 connection->send.seen_any_write_yet = false; 5205 5206 drbd_info(connection, "Connection closed\n"); 5207 5208 if (conn_highest_role(connection) == 
R_PRIMARY && conn_highest_pdsk(connection) >= D_UNKNOWN) 5209 conn_try_outdate_peer_async(connection); 5210 5211 spin_lock_irq(&connection->resource->req_lock); 5212 oc = connection->cstate; 5213 if (oc >= C_UNCONNECTED) 5214 _conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE); 5215 5216 spin_unlock_irq(&connection->resource->req_lock); 5217 5218 if (oc == C_DISCONNECTING) 5219 conn_request_state(connection, NS(conn, C_STANDALONE), CS_VERBOSE | CS_HARD); 5220 } 5221 5222 static int drbd_disconnected(struct drbd_peer_device *peer_device) 5223 { 5224 struct drbd_device *device = peer_device->device; 5225 unsigned int i; 5226 5227 /* wait for current activity to cease. */ 5228 spin_lock_irq(&device->resource->req_lock); 5229 _drbd_wait_ee_list_empty(device, &device->active_ee); 5230 _drbd_wait_ee_list_empty(device, &device->sync_ee); 5231 _drbd_wait_ee_list_empty(device, &device->read_ee); 5232 spin_unlock_irq(&device->resource->req_lock); 5233 5234 /* We do not have data structures that would allow us to 5235 * get the rs_pending_cnt down to 0 again. 5236 * * On C_SYNC_TARGET we do not have any data structures describing 5237 * the pending RSDataRequest's we have sent. 5238 * * On C_SYNC_SOURCE there is no data structure that tracks 5239 * the P_RS_DATA_REPLY blocks that we sent to the SyncTarget. 5240 * And no, it is not the sum of the reference counts in the 5241 * resync_LRU. The resync_LRU tracks the whole operation including 5242 * the disk-IO, while the rs_pending_cnt only tracks the blocks 5243 * on the fly. */ 5244 drbd_rs_cancel_all(device); 5245 device->rs_total = 0; 5246 device->rs_failed = 0; 5247 atomic_set(&device->rs_pending_cnt, 0); 5248 wake_up(&device->misc_wait); 5249 5250 del_timer_sync(&device->resync_timer); 5251 resync_timer_fn(&device->resync_timer); 5252 5253 /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier, 5254 * w_make_resync_request etc. 
which may still be on the worker queue 5255 * to be "canceled" */ 5256 drbd_flush_workqueue(&peer_device->connection->sender_work); 5257 5258 drbd_finish_peer_reqs(device); 5259 5260 /* This second workqueue flush is necessary, since drbd_finish_peer_reqs() 5261 might have issued a work again. The one before drbd_finish_peer_reqs() is 5262 necessary to reclain net_ee in drbd_finish_peer_reqs(). */ 5263 drbd_flush_workqueue(&peer_device->connection->sender_work); 5264 5265 /* need to do it again, drbd_finish_peer_reqs() may have populated it 5266 * again via drbd_try_clear_on_disk_bm(). */ 5267 drbd_rs_cancel_all(device); 5268 5269 kfree(device->p_uuid); 5270 device->p_uuid = NULL; 5271 5272 if (!drbd_suspended(device)) 5273 tl_clear(peer_device->connection); 5274 5275 drbd_md_sync(device); 5276 5277 if (get_ldev(device)) { 5278 drbd_bitmap_io(device, &drbd_bm_write_copy_pages, 5279 "write from disconnected", BM_LOCKED_CHANGE_ALLOWED); 5280 put_ldev(device); 5281 } 5282 5283 /* tcp_close and release of sendpage pages can be deferred. I don't 5284 * want to use SO_LINGER, because apparently it can be deferred for 5285 * more than 20 seconds (longest time I checked). 5286 * 5287 * Actually we don't care for exactly when the network stack does its 5288 * put_page(), but release our reference on these pages right here. 
5289 */ 5290 i = drbd_free_peer_reqs(device, &device->net_ee); 5291 if (i) 5292 drbd_info(device, "net_ee not empty, killed %u entries\n", i); 5293 i = atomic_read(&device->pp_in_use_by_net); 5294 if (i) 5295 drbd_info(device, "pp_in_use_by_net = %d, expected 0\n", i); 5296 i = atomic_read(&device->pp_in_use); 5297 if (i) 5298 drbd_info(device, "pp_in_use = %d, expected 0\n", i); 5299 5300 D_ASSERT(device, list_empty(&device->read_ee)); 5301 D_ASSERT(device, list_empty(&device->active_ee)); 5302 D_ASSERT(device, list_empty(&device->sync_ee)); 5303 D_ASSERT(device, list_empty(&device->done_ee)); 5304 5305 return 0; 5306 } 5307 5308 /* 5309 * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version 5310 * we can agree on is stored in agreed_pro_version. 5311 * 5312 * feature flags and the reserved array should be enough room for future 5313 * enhancements of the handshake protocol, and possible plugins... 5314 * 5315 * for now, they are expected to be zero, but ignored. 5316 */ 5317 static int drbd_send_features(struct drbd_connection *connection) 5318 { 5319 struct drbd_socket *sock; 5320 struct p_connection_features *p; 5321 5322 sock = &connection->data; 5323 p = conn_prepare_command(connection, sock); 5324 if (!p) 5325 return -EIO; 5326 memset(p, 0, sizeof(*p)); 5327 p->protocol_min = cpu_to_be32(PRO_VERSION_MIN); 5328 p->protocol_max = cpu_to_be32(PRO_VERSION_MAX); 5329 p->feature_flags = cpu_to_be32(PRO_FEATURES); 5330 return conn_send_command(connection, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0); 5331 } 5332 5333 /* 5334 * return values: 5335 * 1 yes, we have a valid connection 5336 * 0 oops, did not work out, please try again 5337 * -1 peer talks different language, 5338 * no point in trying again, please go standalone. 5339 */ 5340 static int drbd_do_features(struct drbd_connection *connection) 5341 { 5342 /* ASSERT current == connection->receiver ... 
*/ 5343 struct p_connection_features *p; 5344 const int expect = sizeof(struct p_connection_features); 5345 struct packet_info pi; 5346 int err; 5347 5348 err = drbd_send_features(connection); 5349 if (err) 5350 return 0; 5351 5352 err = drbd_recv_header(connection, &pi); 5353 if (err) 5354 return 0; 5355 5356 if (pi.cmd != P_CONNECTION_FEATURES) { 5357 drbd_err(connection, "expected ConnectionFeatures packet, received: %s (0x%04x)\n", 5358 cmdname(pi.cmd), pi.cmd); 5359 return -1; 5360 } 5361 5362 if (pi.size != expect) { 5363 drbd_err(connection, "expected ConnectionFeatures length: %u, received: %u\n", 5364 expect, pi.size); 5365 return -1; 5366 } 5367 5368 p = pi.data; 5369 err = drbd_recv_all_warn(connection, p, expect); 5370 if (err) 5371 return 0; 5372 5373 p->protocol_min = be32_to_cpu(p->protocol_min); 5374 p->protocol_max = be32_to_cpu(p->protocol_max); 5375 if (p->protocol_max == 0) 5376 p->protocol_max = p->protocol_min; 5377 5378 if (PRO_VERSION_MAX < p->protocol_min || 5379 PRO_VERSION_MIN > p->protocol_max) 5380 goto incompat; 5381 5382 connection->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max); 5383 connection->agreed_features = PRO_FEATURES & be32_to_cpu(p->feature_flags); 5384 5385 drbd_info(connection, "Handshake successful: " 5386 "Agreed network protocol version %d\n", connection->agreed_pro_version); 5387 5388 drbd_info(connection, "Feature flags enabled on protocol level: 0x%x%s%s%s%s.\n", 5389 connection->agreed_features, 5390 connection->agreed_features & DRBD_FF_TRIM ? " TRIM" : "", 5391 connection->agreed_features & DRBD_FF_THIN_RESYNC ? " THIN_RESYNC" : "", 5392 connection->agreed_features & DRBD_FF_WSAME ? " WRITE_SAME" : "", 5393 connection->agreed_features & DRBD_FF_WZEROES ? " WRITE_ZEROES" : 5394 connection->agreed_features ? 
"" : " none"); 5395 5396 return 1; 5397 5398 incompat: 5399 drbd_err(connection, "incompatible DRBD dialects: " 5400 "I support %d-%d, peer supports %d-%d\n", 5401 PRO_VERSION_MIN, PRO_VERSION_MAX, 5402 p->protocol_min, p->protocol_max); 5403 return -1; 5404 } 5405 5406 #if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE) 5407 static int drbd_do_auth(struct drbd_connection *connection) 5408 { 5409 drbd_err(connection, "This kernel was build without CONFIG_CRYPTO_HMAC.\n"); 5410 drbd_err(connection, "You need to disable 'cram-hmac-alg' in drbd.conf.\n"); 5411 return -1; 5412 } 5413 #else 5414 #define CHALLENGE_LEN 64 5415 5416 /* Return value: 5417 1 - auth succeeded, 5418 0 - failed, try again (network error), 5419 -1 - auth failed, don't try again. 5420 */ 5421 5422 static int drbd_do_auth(struct drbd_connection *connection) 5423 { 5424 struct drbd_socket *sock; 5425 char my_challenge[CHALLENGE_LEN]; /* 64 Bytes... */ 5426 char *response = NULL; 5427 char *right_response = NULL; 5428 char *peers_ch = NULL; 5429 unsigned int key_len; 5430 char secret[SHARED_SECRET_MAX]; /* 64 byte */ 5431 unsigned int resp_size; 5432 SHASH_DESC_ON_STACK(desc, connection->cram_hmac_tfm); 5433 struct packet_info pi; 5434 struct net_conf *nc; 5435 int err, rv; 5436 5437 /* FIXME: Put the challenge/response into the preallocated socket buffer. 
*/ 5438 5439 rcu_read_lock(); 5440 nc = rcu_dereference(connection->net_conf); 5441 key_len = strlen(nc->shared_secret); 5442 memcpy(secret, nc->shared_secret, key_len); 5443 rcu_read_unlock(); 5444 5445 desc->tfm = connection->cram_hmac_tfm; 5446 5447 rv = crypto_shash_setkey(connection->cram_hmac_tfm, (u8 *)secret, key_len); 5448 if (rv) { 5449 drbd_err(connection, "crypto_shash_setkey() failed with %d\n", rv); 5450 rv = -1; 5451 goto fail; 5452 } 5453 5454 get_random_bytes(my_challenge, CHALLENGE_LEN); 5455 5456 sock = &connection->data; 5457 if (!conn_prepare_command(connection, sock)) { 5458 rv = 0; 5459 goto fail; 5460 } 5461 rv = !conn_send_command(connection, sock, P_AUTH_CHALLENGE, 0, 5462 my_challenge, CHALLENGE_LEN); 5463 if (!rv) 5464 goto fail; 5465 5466 err = drbd_recv_header(connection, &pi); 5467 if (err) { 5468 rv = 0; 5469 goto fail; 5470 } 5471 5472 if (pi.cmd != P_AUTH_CHALLENGE) { 5473 drbd_err(connection, "expected AuthChallenge packet, received: %s (0x%04x)\n", 5474 cmdname(pi.cmd), pi.cmd); 5475 rv = -1; 5476 goto fail; 5477 } 5478 5479 if (pi.size > CHALLENGE_LEN * 2) { 5480 drbd_err(connection, "expected AuthChallenge payload too big.\n"); 5481 rv = -1; 5482 goto fail; 5483 } 5484 5485 if (pi.size < CHALLENGE_LEN) { 5486 drbd_err(connection, "AuthChallenge payload too small.\n"); 5487 rv = -1; 5488 goto fail; 5489 } 5490 5491 peers_ch = kmalloc(pi.size, GFP_NOIO); 5492 if (peers_ch == NULL) { 5493 drbd_err(connection, "kmalloc of peers_ch failed\n"); 5494 rv = -1; 5495 goto fail; 5496 } 5497 5498 err = drbd_recv_all_warn(connection, peers_ch, pi.size); 5499 if (err) { 5500 rv = 0; 5501 goto fail; 5502 } 5503 5504 if (!memcmp(my_challenge, peers_ch, CHALLENGE_LEN)) { 5505 drbd_err(connection, "Peer presented the same challenge!\n"); 5506 rv = -1; 5507 goto fail; 5508 } 5509 5510 resp_size = crypto_shash_digestsize(connection->cram_hmac_tfm); 5511 response = kmalloc(resp_size, GFP_NOIO); 5512 if (response == NULL) { 5513 drbd_err(connection, 
"kmalloc of response failed\n"); 5514 rv = -1; 5515 goto fail; 5516 } 5517 5518 rv = crypto_shash_digest(desc, peers_ch, pi.size, response); 5519 if (rv) { 5520 drbd_err(connection, "crypto_hash_digest() failed with %d\n", rv); 5521 rv = -1; 5522 goto fail; 5523 } 5524 5525 if (!conn_prepare_command(connection, sock)) { 5526 rv = 0; 5527 goto fail; 5528 } 5529 rv = !conn_send_command(connection, sock, P_AUTH_RESPONSE, 0, 5530 response, resp_size); 5531 if (!rv) 5532 goto fail; 5533 5534 err = drbd_recv_header(connection, &pi); 5535 if (err) { 5536 rv = 0; 5537 goto fail; 5538 } 5539 5540 if (pi.cmd != P_AUTH_RESPONSE) { 5541 drbd_err(connection, "expected AuthResponse packet, received: %s (0x%04x)\n", 5542 cmdname(pi.cmd), pi.cmd); 5543 rv = 0; 5544 goto fail; 5545 } 5546 5547 if (pi.size != resp_size) { 5548 drbd_err(connection, "expected AuthResponse payload of wrong size\n"); 5549 rv = 0; 5550 goto fail; 5551 } 5552 5553 err = drbd_recv_all_warn(connection, response , resp_size); 5554 if (err) { 5555 rv = 0; 5556 goto fail; 5557 } 5558 5559 right_response = kmalloc(resp_size, GFP_NOIO); 5560 if (right_response == NULL) { 5561 drbd_err(connection, "kmalloc of right_response failed\n"); 5562 rv = -1; 5563 goto fail; 5564 } 5565 5566 rv = crypto_shash_digest(desc, my_challenge, CHALLENGE_LEN, 5567 right_response); 5568 if (rv) { 5569 drbd_err(connection, "crypto_hash_digest() failed with %d\n", rv); 5570 rv = -1; 5571 goto fail; 5572 } 5573 5574 rv = !memcmp(response, right_response, resp_size); 5575 5576 if (rv) 5577 drbd_info(connection, "Peer authenticated using %d bytes HMAC\n", 5578 resp_size); 5579 else 5580 rv = -1; 5581 5582 fail: 5583 kfree(peers_ch); 5584 kfree(response); 5585 kfree(right_response); 5586 shash_desc_zero(desc); 5587 5588 return rv; 5589 } 5590 #endif 5591 5592 int drbd_receiver(struct drbd_thread *thi) 5593 { 5594 struct drbd_connection *connection = thi->connection; 5595 int h; 5596 5597 drbd_info(connection, "receiver (re)started\n"); 
5598 5599 do { 5600 h = conn_connect(connection); 5601 if (h == 0) { 5602 conn_disconnect(connection); 5603 schedule_timeout_interruptible(HZ); 5604 } 5605 if (h == -1) { 5606 drbd_warn(connection, "Discarding network configuration.\n"); 5607 conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); 5608 } 5609 } while (h == 0); 5610 5611 if (h > 0) { 5612 blk_start_plug(&connection->receiver_plug); 5613 drbdd(connection); 5614 blk_finish_plug(&connection->receiver_plug); 5615 } 5616 5617 conn_disconnect(connection); 5618 5619 drbd_info(connection, "receiver terminated\n"); 5620 return 0; 5621 } 5622 5623 /* ********* acknowledge sender ******** */ 5624 5625 static int got_conn_RqSReply(struct drbd_connection *connection, struct packet_info *pi) 5626 { 5627 struct p_req_state_reply *p = pi->data; 5628 int retcode = be32_to_cpu(p->retcode); 5629 5630 if (retcode >= SS_SUCCESS) { 5631 set_bit(CONN_WD_ST_CHG_OKAY, &connection->flags); 5632 } else { 5633 set_bit(CONN_WD_ST_CHG_FAIL, &connection->flags); 5634 drbd_err(connection, "Requested state change failed by peer: %s (%d)\n", 5635 drbd_set_st_err_str(retcode), retcode); 5636 } 5637 wake_up(&connection->ping_wait); 5638 5639 return 0; 5640 } 5641 5642 static int got_RqSReply(struct drbd_connection *connection, struct packet_info *pi) 5643 { 5644 struct drbd_peer_device *peer_device; 5645 struct drbd_device *device; 5646 struct p_req_state_reply *p = pi->data; 5647 int retcode = be32_to_cpu(p->retcode); 5648 5649 peer_device = conn_peer_device(connection, pi->vnr); 5650 if (!peer_device) 5651 return -EIO; 5652 device = peer_device->device; 5653 5654 if (test_bit(CONN_WD_ST_CHG_REQ, &connection->flags)) { 5655 D_ASSERT(device, connection->agreed_pro_version < 100); 5656 return got_conn_RqSReply(connection, pi); 5657 } 5658 5659 if (retcode >= SS_SUCCESS) { 5660 set_bit(CL_ST_CHG_SUCCESS, &device->flags); 5661 } else { 5662 set_bit(CL_ST_CHG_FAIL, &device->flags); 5663 drbd_err(device, "Requested state change 
failed by peer: %s (%d)\n", 5664 drbd_set_st_err_str(retcode), retcode); 5665 } 5666 wake_up(&device->state_wait); 5667 5668 return 0; 5669 } 5670 5671 static int got_Ping(struct drbd_connection *connection, struct packet_info *pi) 5672 { 5673 return drbd_send_ping_ack(connection); 5674 5675 } 5676 5677 static int got_PingAck(struct drbd_connection *connection, struct packet_info *pi) 5678 { 5679 /* restore idle timeout */ 5680 connection->meta.socket->sk->sk_rcvtimeo = connection->net_conf->ping_int*HZ; 5681 if (!test_and_set_bit(GOT_PING_ACK, &connection->flags)) 5682 wake_up(&connection->ping_wait); 5683 5684 return 0; 5685 } 5686 5687 static int got_IsInSync(struct drbd_connection *connection, struct packet_info *pi) 5688 { 5689 struct drbd_peer_device *peer_device; 5690 struct drbd_device *device; 5691 struct p_block_ack *p = pi->data; 5692 sector_t sector = be64_to_cpu(p->sector); 5693 int blksize = be32_to_cpu(p->blksize); 5694 5695 peer_device = conn_peer_device(connection, pi->vnr); 5696 if (!peer_device) 5697 return -EIO; 5698 device = peer_device->device; 5699 5700 D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89); 5701 5702 update_peer_seq(peer_device, be32_to_cpu(p->seq_num)); 5703 5704 if (get_ldev(device)) { 5705 drbd_rs_complete_io(device, sector); 5706 drbd_set_in_sync(device, sector, blksize); 5707 /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */ 5708 device->rs_same_csum += (blksize >> BM_BLOCK_SHIFT); 5709 put_ldev(device); 5710 } 5711 dec_rs_pending(device); 5712 atomic_add(blksize >> 9, &device->rs_sect_in); 5713 5714 return 0; 5715 } 5716 5717 static int 5718 validate_req_change_req_state(struct drbd_device *device, u64 id, sector_t sector, 5719 struct rb_root *root, const char *func, 5720 enum drbd_req_event what, bool missing_ok) 5721 { 5722 struct drbd_request *req; 5723 struct bio_and_error m; 5724 5725 spin_lock_irq(&device->resource->req_lock); 5726 req = find_request(device, root, id, sector, 
missing_ok, func); 5727 if (unlikely(!req)) { 5728 spin_unlock_irq(&device->resource->req_lock); 5729 return -EIO; 5730 } 5731 __req_mod(req, what, &m); 5732 spin_unlock_irq(&device->resource->req_lock); 5733 5734 if (m.bio) 5735 complete_master_bio(device, &m); 5736 return 0; 5737 } 5738 5739 static int got_BlockAck(struct drbd_connection *connection, struct packet_info *pi) 5740 { 5741 struct drbd_peer_device *peer_device; 5742 struct drbd_device *device; 5743 struct p_block_ack *p = pi->data; 5744 sector_t sector = be64_to_cpu(p->sector); 5745 int blksize = be32_to_cpu(p->blksize); 5746 enum drbd_req_event what; 5747 5748 peer_device = conn_peer_device(connection, pi->vnr); 5749 if (!peer_device) 5750 return -EIO; 5751 device = peer_device->device; 5752 5753 update_peer_seq(peer_device, be32_to_cpu(p->seq_num)); 5754 5755 if (p->block_id == ID_SYNCER) { 5756 drbd_set_in_sync(device, sector, blksize); 5757 dec_rs_pending(device); 5758 return 0; 5759 } 5760 switch (pi->cmd) { 5761 case P_RS_WRITE_ACK: 5762 what = WRITE_ACKED_BY_PEER_AND_SIS; 5763 break; 5764 case P_WRITE_ACK: 5765 what = WRITE_ACKED_BY_PEER; 5766 break; 5767 case P_RECV_ACK: 5768 what = RECV_ACKED_BY_PEER; 5769 break; 5770 case P_SUPERSEDED: 5771 what = CONFLICT_RESOLVED; 5772 break; 5773 case P_RETRY_WRITE: 5774 what = POSTPONE_WRITE; 5775 break; 5776 default: 5777 BUG(); 5778 } 5779 5780 return validate_req_change_req_state(device, p->block_id, sector, 5781 &device->write_requests, __func__, 5782 what, false); 5783 } 5784 5785 static int got_NegAck(struct drbd_connection *connection, struct packet_info *pi) 5786 { 5787 struct drbd_peer_device *peer_device; 5788 struct drbd_device *device; 5789 struct p_block_ack *p = pi->data; 5790 sector_t sector = be64_to_cpu(p->sector); 5791 int size = be32_to_cpu(p->blksize); 5792 int err; 5793 5794 peer_device = conn_peer_device(connection, pi->vnr); 5795 if (!peer_device) 5796 return -EIO; 5797 device = peer_device->device; 5798 5799 
update_peer_seq(peer_device, be32_to_cpu(p->seq_num)); 5800 5801 if (p->block_id == ID_SYNCER) { 5802 dec_rs_pending(device); 5803 drbd_rs_failed_io(device, sector, size); 5804 return 0; 5805 } 5806 5807 err = validate_req_change_req_state(device, p->block_id, sector, 5808 &device->write_requests, __func__, 5809 NEG_ACKED, true); 5810 if (err) { 5811 /* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs. 5812 The master bio might already be completed, therefore the 5813 request is no longer in the collision hash. */ 5814 /* In Protocol B we might already have got a P_RECV_ACK 5815 but then get a P_NEG_ACK afterwards. */ 5816 drbd_set_out_of_sync(device, sector, size); 5817 } 5818 return 0; 5819 } 5820 5821 static int got_NegDReply(struct drbd_connection *connection, struct packet_info *pi) 5822 { 5823 struct drbd_peer_device *peer_device; 5824 struct drbd_device *device; 5825 struct p_block_ack *p = pi->data; 5826 sector_t sector = be64_to_cpu(p->sector); 5827 5828 peer_device = conn_peer_device(connection, pi->vnr); 5829 if (!peer_device) 5830 return -EIO; 5831 device = peer_device->device; 5832 5833 update_peer_seq(peer_device, be32_to_cpu(p->seq_num)); 5834 5835 drbd_err(device, "Got NegDReply; Sector %llus, len %u.\n", 5836 (unsigned long long)sector, be32_to_cpu(p->blksize)); 5837 5838 return validate_req_change_req_state(device, p->block_id, sector, 5839 &device->read_requests, __func__, 5840 NEG_ACKED, false); 5841 } 5842 5843 static int got_NegRSDReply(struct drbd_connection *connection, struct packet_info *pi) 5844 { 5845 struct drbd_peer_device *peer_device; 5846 struct drbd_device *device; 5847 sector_t sector; 5848 int size; 5849 struct p_block_ack *p = pi->data; 5850 5851 peer_device = conn_peer_device(connection, pi->vnr); 5852 if (!peer_device) 5853 return -EIO; 5854 device = peer_device->device; 5855 5856 sector = be64_to_cpu(p->sector); 5857 size = be32_to_cpu(p->blksize); 5858 5859 update_peer_seq(peer_device, be32_to_cpu(p->seq_num)); 5860 5861 
dec_rs_pending(device); 5862 5863 if (get_ldev_if_state(device, D_FAILED)) { 5864 drbd_rs_complete_io(device, sector); 5865 switch (pi->cmd) { 5866 case P_NEG_RS_DREPLY: 5867 drbd_rs_failed_io(device, sector, size); 5868 case P_RS_CANCEL: 5869 break; 5870 default: 5871 BUG(); 5872 } 5873 put_ldev(device); 5874 } 5875 5876 return 0; 5877 } 5878 5879 static int got_BarrierAck(struct drbd_connection *connection, struct packet_info *pi) 5880 { 5881 struct p_barrier_ack *p = pi->data; 5882 struct drbd_peer_device *peer_device; 5883 int vnr; 5884 5885 tl_release(connection, p->barrier, be32_to_cpu(p->set_size)); 5886 5887 rcu_read_lock(); 5888 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { 5889 struct drbd_device *device = peer_device->device; 5890 5891 if (device->state.conn == C_AHEAD && 5892 atomic_read(&device->ap_in_flight) == 0 && 5893 !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &device->flags)) { 5894 device->start_resync_timer.expires = jiffies + HZ; 5895 add_timer(&device->start_resync_timer); 5896 } 5897 } 5898 rcu_read_unlock(); 5899 5900 return 0; 5901 } 5902 5903 static int got_OVResult(struct drbd_connection *connection, struct packet_info *pi) 5904 { 5905 struct drbd_peer_device *peer_device; 5906 struct drbd_device *device; 5907 struct p_block_ack *p = pi->data; 5908 struct drbd_device_work *dw; 5909 sector_t sector; 5910 int size; 5911 5912 peer_device = conn_peer_device(connection, pi->vnr); 5913 if (!peer_device) 5914 return -EIO; 5915 device = peer_device->device; 5916 5917 sector = be64_to_cpu(p->sector); 5918 size = be32_to_cpu(p->blksize); 5919 5920 update_peer_seq(peer_device, be32_to_cpu(p->seq_num)); 5921 5922 if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC) 5923 drbd_ov_out_of_sync_found(device, sector, size); 5924 else 5925 ov_out_of_sync_print(device); 5926 5927 if (!get_ldev(device)) 5928 return 0; 5929 5930 drbd_rs_complete_io(device, sector); 5931 dec_rs_pending(device); 5932 5933 --device->ov_left; 5934 5935 /* let's advance 
progress step marks only for every other megabyte */ 5936 if ((device->ov_left & 0x200) == 0x200) 5937 drbd_advance_rs_marks(device, device->ov_left); 5938 5939 if (device->ov_left == 0) { 5940 dw = kmalloc(sizeof(*dw), GFP_NOIO); 5941 if (dw) { 5942 dw->w.cb = w_ov_finished; 5943 dw->device = device; 5944 drbd_queue_work(&peer_device->connection->sender_work, &dw->w); 5945 } else { 5946 drbd_err(device, "kmalloc(dw) failed."); 5947 ov_out_of_sync_print(device); 5948 drbd_resync_finished(device); 5949 } 5950 } 5951 put_ldev(device); 5952 return 0; 5953 } 5954 5955 static int got_skip(struct drbd_connection *connection, struct packet_info *pi) 5956 { 5957 return 0; 5958 } 5959 5960 struct meta_sock_cmd { 5961 size_t pkt_size; 5962 int (*fn)(struct drbd_connection *connection, struct packet_info *); 5963 }; 5964 5965 static void set_rcvtimeo(struct drbd_connection *connection, bool ping_timeout) 5966 { 5967 long t; 5968 struct net_conf *nc; 5969 5970 rcu_read_lock(); 5971 nc = rcu_dereference(connection->net_conf); 5972 t = ping_timeout ? 
nc->ping_timeo : nc->ping_int; 5973 rcu_read_unlock(); 5974 5975 t *= HZ; 5976 if (ping_timeout) 5977 t /= 10; 5978 5979 connection->meta.socket->sk->sk_rcvtimeo = t; 5980 } 5981 5982 static void set_ping_timeout(struct drbd_connection *connection) 5983 { 5984 set_rcvtimeo(connection, 1); 5985 } 5986 5987 static void set_idle_timeout(struct drbd_connection *connection) 5988 { 5989 set_rcvtimeo(connection, 0); 5990 } 5991 5992 static struct meta_sock_cmd ack_receiver_tbl[] = { 5993 [P_PING] = { 0, got_Ping }, 5994 [P_PING_ACK] = { 0, got_PingAck }, 5995 [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, 5996 [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, 5997 [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, 5998 [P_SUPERSEDED] = { sizeof(struct p_block_ack), got_BlockAck }, 5999 [P_NEG_ACK] = { sizeof(struct p_block_ack), got_NegAck }, 6000 [P_NEG_DREPLY] = { sizeof(struct p_block_ack), got_NegDReply }, 6001 [P_NEG_RS_DREPLY] = { sizeof(struct p_block_ack), got_NegRSDReply }, 6002 [P_OV_RESULT] = { sizeof(struct p_block_ack), got_OVResult }, 6003 [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck }, 6004 [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply }, 6005 [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync }, 6006 [P_DELAY_PROBE] = { sizeof(struct p_delay_probe93), got_skip }, 6007 [P_RS_CANCEL] = { sizeof(struct p_block_ack), got_NegRSDReply }, 6008 [P_CONN_ST_CHG_REPLY]={ sizeof(struct p_req_state_reply), got_conn_RqSReply }, 6009 [P_RETRY_WRITE] = { sizeof(struct p_block_ack), got_BlockAck }, 6010 }; 6011 6012 int drbd_ack_receiver(struct drbd_thread *thi) 6013 { 6014 struct drbd_connection *connection = thi->connection; 6015 struct meta_sock_cmd *cmd = NULL; 6016 struct packet_info pi; 6017 unsigned long pre_recv_jif; 6018 int rv; 6019 void *buf = connection->meta.rbuf; 6020 int received = 0; 6021 unsigned int header_size = drbd_header_size(connection); 6022 int expect = 
header_size; 6023 bool ping_timeout_active = false; 6024 struct sched_param param = { .sched_priority = 2 }; 6025 6026 rv = sched_setscheduler(current, SCHED_RR, ¶m); 6027 if (rv < 0) 6028 drbd_err(connection, "drbd_ack_receiver: ERROR set priority, ret=%d\n", rv); 6029 6030 while (get_t_state(thi) == RUNNING) { 6031 drbd_thread_current_set_cpu(thi); 6032 6033 conn_reclaim_net_peer_reqs(connection); 6034 6035 if (test_and_clear_bit(SEND_PING, &connection->flags)) { 6036 if (drbd_send_ping(connection)) { 6037 drbd_err(connection, "drbd_send_ping has failed\n"); 6038 goto reconnect; 6039 } 6040 set_ping_timeout(connection); 6041 ping_timeout_active = true; 6042 } 6043 6044 pre_recv_jif = jiffies; 6045 rv = drbd_recv_short(connection->meta.socket, buf, expect-received, 0); 6046 6047 /* Note: 6048 * -EINTR (on meta) we got a signal 6049 * -EAGAIN (on meta) rcvtimeo expired 6050 * -ECONNRESET other side closed the connection 6051 * -ERESTARTSYS (on data) we got a signal 6052 * rv < 0 other than above: unexpected error! 6053 * rv == expected: full header or command 6054 * rv < expected: "woken" by signal during receive 6055 * rv == 0 : "connection shut down by peer" 6056 */ 6057 if (likely(rv > 0)) { 6058 received += rv; 6059 buf += rv; 6060 } else if (rv == 0) { 6061 if (test_bit(DISCONNECT_SENT, &connection->flags)) { 6062 long t; 6063 rcu_read_lock(); 6064 t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10; 6065 rcu_read_unlock(); 6066 6067 t = wait_event_timeout(connection->ping_wait, 6068 connection->cstate < C_WF_REPORT_PARAMS, 6069 t); 6070 if (t) 6071 break; 6072 } 6073 drbd_err(connection, "meta connection shut down by peer.\n"); 6074 goto reconnect; 6075 } else if (rv == -EAGAIN) { 6076 /* If the data socket received something meanwhile, 6077 * that is good enough: peer is still alive. 
*/ 6078 if (time_after(connection->last_received, pre_recv_jif)) 6079 continue; 6080 if (ping_timeout_active) { 6081 drbd_err(connection, "PingAck did not arrive in time.\n"); 6082 goto reconnect; 6083 } 6084 set_bit(SEND_PING, &connection->flags); 6085 continue; 6086 } else if (rv == -EINTR) { 6087 /* maybe drbd_thread_stop(): the while condition will notice. 6088 * maybe woken for send_ping: we'll send a ping above, 6089 * and change the rcvtimeo */ 6090 flush_signals(current); 6091 continue; 6092 } else { 6093 drbd_err(connection, "sock_recvmsg returned %d\n", rv); 6094 goto reconnect; 6095 } 6096 6097 if (received == expect && cmd == NULL) { 6098 if (decode_header(connection, connection->meta.rbuf, &pi)) 6099 goto reconnect; 6100 cmd = &ack_receiver_tbl[pi.cmd]; 6101 if (pi.cmd >= ARRAY_SIZE(ack_receiver_tbl) || !cmd->fn) { 6102 drbd_err(connection, "Unexpected meta packet %s (0x%04x)\n", 6103 cmdname(pi.cmd), pi.cmd); 6104 goto disconnect; 6105 } 6106 expect = header_size + cmd->pkt_size; 6107 if (pi.size != expect - header_size) { 6108 drbd_err(connection, "Wrong packet size on meta (c: %d, l: %d)\n", 6109 pi.cmd, pi.size); 6110 goto reconnect; 6111 } 6112 } 6113 if (received == expect) { 6114 bool err; 6115 6116 err = cmd->fn(connection, &pi); 6117 if (err) { 6118 drbd_err(connection, "%ps failed\n", cmd->fn); 6119 goto reconnect; 6120 } 6121 6122 connection->last_received = jiffies; 6123 6124 if (cmd == &ack_receiver_tbl[P_PING_ACK]) { 6125 set_idle_timeout(connection); 6126 ping_timeout_active = false; 6127 } 6128 6129 buf = connection->meta.rbuf; 6130 received = 0; 6131 expect = header_size; 6132 cmd = NULL; 6133 } 6134 } 6135 6136 if (0) { 6137 reconnect: 6138 conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD); 6139 conn_md_sync(connection); 6140 } 6141 if (0) { 6142 disconnect: 6143 conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); 6144 } 6145 6146 drbd_info(connection, "ack_receiver terminated\n"); 6147 6148 return 
0; 6149 } 6150 6151 void drbd_send_acks_wf(struct work_struct *ws) 6152 { 6153 struct drbd_peer_device *peer_device = 6154 container_of(ws, struct drbd_peer_device, send_acks_work); 6155 struct drbd_connection *connection = peer_device->connection; 6156 struct drbd_device *device = peer_device->device; 6157 struct net_conf *nc; 6158 int tcp_cork, err; 6159 6160 rcu_read_lock(); 6161 nc = rcu_dereference(connection->net_conf); 6162 tcp_cork = nc->tcp_cork; 6163 rcu_read_unlock(); 6164 6165 if (tcp_cork) 6166 drbd_tcp_cork(connection->meta.socket); 6167 6168 err = drbd_finish_peer_reqs(device); 6169 kref_put(&device->kref, drbd_destroy_device); 6170 /* get is in drbd_endio_write_sec_final(). That is necessary to keep the 6171 struct work_struct send_acks_work alive, which is in the peer_device object */ 6172 6173 if (err) { 6174 conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD); 6175 return; 6176 } 6177 6178 if (tcp_cork) 6179 drbd_tcp_uncork(connection->meta.socket); 6180 6181 return; 6182 } 6183