1 /* 2 drbd_receiver.c 3 4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg. 5 6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. 7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. 8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. 9 10 drbd is free software; you can redistribute it and/or modify 11 it under the terms of the GNU General Public License as published by 12 the Free Software Foundation; either version 2, or (at your option) 13 any later version. 14 15 drbd is distributed in the hope that it will be useful, 16 but WITHOUT ANY WARRANTY; without even the implied warranty of 17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 GNU General Public License for more details. 19 20 You should have received a copy of the GNU General Public License 21 along with drbd; see the file COPYING. If not, write to 22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 23 */ 24 25 26 #include <linux/module.h> 27 28 #include <linux/uaccess.h> 29 #include <net/sock.h> 30 31 #include <linux/drbd.h> 32 #include <linux/fs.h> 33 #include <linux/file.h> 34 #include <linux/in.h> 35 #include <linux/mm.h> 36 #include <linux/memcontrol.h> 37 #include <linux/mm_inline.h> 38 #include <linux/slab.h> 39 #include <uapi/linux/sched/types.h> 40 #include <linux/sched/signal.h> 41 #include <linux/pkt_sched.h> 42 #define __KERNEL_SYSCALLS__ 43 #include <linux/unistd.h> 44 #include <linux/vmalloc.h> 45 #include <linux/random.h> 46 #include <linux/string.h> 47 #include <linux/scatterlist.h> 48 #include "drbd_int.h" 49 #include "drbd_protocol.h" 50 #include "drbd_req.h" 51 #include "drbd_vli.h" 52 53 #define PRO_FEATURES (DRBD_FF_TRIM|DRBD_FF_THIN_RESYNC|DRBD_FF_WSAME) 54 55 struct packet_info { 56 enum drbd_packet cmd; 57 unsigned int size; 58 unsigned int vnr; 59 void *data; 60 }; 61 62 enum finish_epoch { 63 FE_STILL_LIVE, 64 FE_DESTROYED, 65 FE_RECYCLED, 66 }; 67 68 static int 
drbd_do_features(struct drbd_connection *connection); 69 static int drbd_do_auth(struct drbd_connection *connection); 70 static int drbd_disconnected(struct drbd_peer_device *); 71 static void conn_wait_active_ee_empty(struct drbd_connection *connection); 72 static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *, struct drbd_epoch *, enum epoch_event); 73 static int e_end_block(struct drbd_work *, int); 74 75 76 #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) 77 78 /* 79 * some helper functions to deal with single linked page lists, 80 * page->private being our "next" pointer. 81 */ 82 83 /* If at least n pages are linked at head, get n pages off. 84 * Otherwise, don't modify head, and return NULL. 85 * Locking is the responsibility of the caller. 86 */ 87 static struct page *page_chain_del(struct page **head, int n) 88 { 89 struct page *page; 90 struct page *tmp; 91 92 BUG_ON(!n); 93 BUG_ON(!head); 94 95 page = *head; 96 97 if (!page) 98 return NULL; 99 100 while (page) { 101 tmp = page_chain_next(page); 102 if (--n == 0) 103 break; /* found sufficient pages */ 104 if (tmp == NULL) 105 /* insufficient pages, don't use any of them. */ 106 return NULL; 107 page = tmp; 108 } 109 110 /* add end of list marker for the returned list */ 111 set_page_private(page, 0); 112 /* actual return value, and adjustment of head */ 113 page = *head; 114 *head = tmp; 115 return page; 116 } 117 118 /* may be used outside of locks to find the tail of a (usually short) 119 * "private" page chain, before adding it back to a global chain head 120 * with page_chain_add() under a spinlock. 
*/ 121 static struct page *page_chain_tail(struct page *page, int *len) 122 { 123 struct page *tmp; 124 int i = 1; 125 while ((tmp = page_chain_next(page))) 126 ++i, page = tmp; 127 if (len) 128 *len = i; 129 return page; 130 } 131 132 static int page_chain_free(struct page *page) 133 { 134 struct page *tmp; 135 int i = 0; 136 page_chain_for_each_safe(page, tmp) { 137 put_page(page); 138 ++i; 139 } 140 return i; 141 } 142 143 static void page_chain_add(struct page **head, 144 struct page *chain_first, struct page *chain_last) 145 { 146 #if 1 147 struct page *tmp; 148 tmp = page_chain_tail(chain_first, NULL); 149 BUG_ON(tmp != chain_last); 150 #endif 151 152 /* add chain to head */ 153 set_page_private(chain_last, (unsigned long)*head); 154 *head = chain_first; 155 } 156 157 static struct page *__drbd_alloc_pages(struct drbd_device *device, 158 unsigned int number) 159 { 160 struct page *page = NULL; 161 struct page *tmp = NULL; 162 unsigned int i = 0; 163 164 /* Yes, testing drbd_pp_vacant outside the lock is racy. 165 * So what. It saves a spin_lock. */ 166 if (drbd_pp_vacant >= number) { 167 spin_lock(&drbd_pp_lock); 168 page = page_chain_del(&drbd_pp_pool, number); 169 if (page) 170 drbd_pp_vacant -= number; 171 spin_unlock(&drbd_pp_lock); 172 if (page) 173 return page; 174 } 175 176 /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD 177 * "criss-cross" setup, that might cause write-out on some other DRBD, 178 * which in turn might block on the other node at this very place. */ 179 for (i = 0; i < number; i++) { 180 tmp = alloc_page(GFP_TRY); 181 if (!tmp) 182 break; 183 set_page_private(tmp, (unsigned long)page); 184 page = tmp; 185 } 186 187 if (i == number) 188 return page; 189 190 /* Not enough pages immediately available this time. 191 * No need to jump around here, drbd_alloc_pages will retry this 192 * function "soon". 
*/ 193 if (page) { 194 tmp = page_chain_tail(page, NULL); 195 spin_lock(&drbd_pp_lock); 196 page_chain_add(&drbd_pp_pool, page, tmp); 197 drbd_pp_vacant += i; 198 spin_unlock(&drbd_pp_lock); 199 } 200 return NULL; 201 } 202 203 static void reclaim_finished_net_peer_reqs(struct drbd_device *device, 204 struct list_head *to_be_freed) 205 { 206 struct drbd_peer_request *peer_req, *tmp; 207 208 /* The EEs are always appended to the end of the list. Since 209 they are sent in order over the wire, they have to finish 210 in order. As soon as we see the first not finished we can 211 stop to examine the list... */ 212 213 list_for_each_entry_safe(peer_req, tmp, &device->net_ee, w.list) { 214 if (drbd_peer_req_has_active_page(peer_req)) 215 break; 216 list_move(&peer_req->w.list, to_be_freed); 217 } 218 } 219 220 static void drbd_reclaim_net_peer_reqs(struct drbd_device *device) 221 { 222 LIST_HEAD(reclaimed); 223 struct drbd_peer_request *peer_req, *t; 224 225 spin_lock_irq(&device->resource->req_lock); 226 reclaim_finished_net_peer_reqs(device, &reclaimed); 227 spin_unlock_irq(&device->resource->req_lock); 228 list_for_each_entry_safe(peer_req, t, &reclaimed, w.list) 229 drbd_free_net_peer_req(device, peer_req); 230 } 231 232 static void conn_reclaim_net_peer_reqs(struct drbd_connection *connection) 233 { 234 struct drbd_peer_device *peer_device; 235 int vnr; 236 237 rcu_read_lock(); 238 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { 239 struct drbd_device *device = peer_device->device; 240 if (!atomic_read(&device->pp_in_use_by_net)) 241 continue; 242 243 kref_get(&device->kref); 244 rcu_read_unlock(); 245 drbd_reclaim_net_peer_reqs(device); 246 kref_put(&device->kref, drbd_destroy_device); 247 rcu_read_lock(); 248 } 249 rcu_read_unlock(); 250 } 251 252 /** 253 * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled) 254 * @device: DRBD device. 
255 * @number: number of pages requested 256 * @retry: whether to retry, if not enough pages are available right now 257 * 258 * Tries to allocate number pages, first from our own page pool, then from 259 * the kernel. 260 * Possibly retry until DRBD frees sufficient pages somewhere else. 261 * 262 * If this allocation would exceed the max_buffers setting, we throttle 263 * allocation (schedule_timeout) to give the system some room to breathe. 264 * 265 * We do not use max-buffers as hard limit, because it could lead to 266 * congestion and further to a distributed deadlock during online-verify or 267 * (checksum based) resync, if the max-buffers, socket buffer sizes and 268 * resync-rate settings are mis-configured. 269 * 270 * Returns a page chain linked via page->private. 271 */ 272 struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int number, 273 bool retry) 274 { 275 struct drbd_device *device = peer_device->device; 276 struct page *page = NULL; 277 struct net_conf *nc; 278 DEFINE_WAIT(wait); 279 unsigned int mxb; 280 281 rcu_read_lock(); 282 nc = rcu_dereference(peer_device->connection->net_conf); 283 mxb = nc ? nc->max_buffers : 1000000; 284 rcu_read_unlock(); 285 286 if (atomic_read(&device->pp_in_use) < mxb) 287 page = __drbd_alloc_pages(device, number); 288 289 /* Try to keep the fast path fast, but occasionally we need 290 * to reclaim the pages we lended to the network stack. 
*/ 291 if (page && atomic_read(&device->pp_in_use_by_net) > 512) 292 drbd_reclaim_net_peer_reqs(device); 293 294 while (page == NULL) { 295 prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE); 296 297 drbd_reclaim_net_peer_reqs(device); 298 299 if (atomic_read(&device->pp_in_use) < mxb) { 300 page = __drbd_alloc_pages(device, number); 301 if (page) 302 break; 303 } 304 305 if (!retry) 306 break; 307 308 if (signal_pending(current)) { 309 drbd_warn(device, "drbd_alloc_pages interrupted!\n"); 310 break; 311 } 312 313 if (schedule_timeout(HZ/10) == 0) 314 mxb = UINT_MAX; 315 } 316 finish_wait(&drbd_pp_wait, &wait); 317 318 if (page) 319 atomic_add(number, &device->pp_in_use); 320 return page; 321 } 322 323 /* Must not be used from irq, as that may deadlock: see drbd_alloc_pages. 324 * Is also used from inside an other spin_lock_irq(&resource->req_lock); 325 * Either links the page chain back to the global pool, 326 * or returns all pages to the system. */ 327 static void drbd_free_pages(struct drbd_device *device, struct page *page, int is_net) 328 { 329 atomic_t *a = is_net ? &device->pp_in_use_by_net : &device->pp_in_use; 330 int i; 331 332 if (page == NULL) 333 return; 334 335 if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count) 336 i = page_chain_free(page); 337 else { 338 struct page *tmp; 339 tmp = page_chain_tail(page, &i); 340 spin_lock(&drbd_pp_lock); 341 page_chain_add(&drbd_pp_pool, page, tmp); 342 drbd_pp_vacant += i; 343 spin_unlock(&drbd_pp_lock); 344 } 345 i = atomic_sub_return(i, a); 346 if (i < 0) 347 drbd_warn(device, "ASSERTION FAILED: %s: %d < 0\n", 348 is_net ? 
"pp_in_use_by_net" : "pp_in_use", i); 349 wake_up(&drbd_pp_wait); 350 } 351 352 /* 353 You need to hold the req_lock: 354 _drbd_wait_ee_list_empty() 355 356 You must not have the req_lock: 357 drbd_free_peer_req() 358 drbd_alloc_peer_req() 359 drbd_free_peer_reqs() 360 drbd_ee_fix_bhs() 361 drbd_finish_peer_reqs() 362 drbd_clear_done_ee() 363 drbd_wait_ee_list_empty() 364 */ 365 366 /* normal: payload_size == request size (bi_size) 367 * w_same: payload_size == logical_block_size 368 * trim: payload_size == 0 */ 369 struct drbd_peer_request * 370 drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector, 371 unsigned int request_size, unsigned int payload_size, gfp_t gfp_mask) __must_hold(local) 372 { 373 struct drbd_device *device = peer_device->device; 374 struct drbd_peer_request *peer_req; 375 struct page *page = NULL; 376 unsigned nr_pages = (payload_size + PAGE_SIZE -1) >> PAGE_SHIFT; 377 378 if (drbd_insert_fault(device, DRBD_FAULT_AL_EE)) 379 return NULL; 380 381 peer_req = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM); 382 if (!peer_req) { 383 if (!(gfp_mask & __GFP_NOWARN)) 384 drbd_err(device, "%s: allocation failed\n", __func__); 385 return NULL; 386 } 387 388 if (nr_pages) { 389 page = drbd_alloc_pages(peer_device, nr_pages, 390 gfpflags_allow_blocking(gfp_mask)); 391 if (!page) 392 goto fail; 393 } 394 395 memset(peer_req, 0, sizeof(*peer_req)); 396 INIT_LIST_HEAD(&peer_req->w.list); 397 drbd_clear_interval(&peer_req->i); 398 peer_req->i.size = request_size; 399 peer_req->i.sector = sector; 400 peer_req->submit_jif = jiffies; 401 peer_req->peer_device = peer_device; 402 peer_req->pages = page; 403 /* 404 * The block_id is opaque to the receiver. It is not endianness 405 * converted, and sent back to the sender unchanged. 
406 */ 407 peer_req->block_id = id; 408 409 return peer_req; 410 411 fail: 412 mempool_free(peer_req, drbd_ee_mempool); 413 return NULL; 414 } 415 416 void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req, 417 int is_net) 418 { 419 might_sleep(); 420 if (peer_req->flags & EE_HAS_DIGEST) 421 kfree(peer_req->digest); 422 drbd_free_pages(device, peer_req->pages, is_net); 423 D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0); 424 D_ASSERT(device, drbd_interval_empty(&peer_req->i)); 425 if (!expect(!(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) { 426 peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO; 427 drbd_al_complete_io(device, &peer_req->i); 428 } 429 mempool_free(peer_req, drbd_ee_mempool); 430 } 431 432 int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list) 433 { 434 LIST_HEAD(work_list); 435 struct drbd_peer_request *peer_req, *t; 436 int count = 0; 437 int is_net = list == &device->net_ee; 438 439 spin_lock_irq(&device->resource->req_lock); 440 list_splice_init(list, &work_list); 441 spin_unlock_irq(&device->resource->req_lock); 442 443 list_for_each_entry_safe(peer_req, t, &work_list, w.list) { 444 __drbd_free_peer_req(device, peer_req, is_net); 445 count++; 446 } 447 return count; 448 } 449 450 /* 451 * See also comments in _req_mod(,BARRIER_ACKED) and receive_Barrier. 452 */ 453 static int drbd_finish_peer_reqs(struct drbd_device *device) 454 { 455 LIST_HEAD(work_list); 456 LIST_HEAD(reclaimed); 457 struct drbd_peer_request *peer_req, *t; 458 int err = 0; 459 460 spin_lock_irq(&device->resource->req_lock); 461 reclaim_finished_net_peer_reqs(device, &reclaimed); 462 list_splice_init(&device->done_ee, &work_list); 463 spin_unlock_irq(&device->resource->req_lock); 464 465 list_for_each_entry_safe(peer_req, t, &reclaimed, w.list) 466 drbd_free_net_peer_req(device, peer_req); 467 468 /* possible callbacks here: 469 * e_end_block, and e_end_resync_block, e_send_superseded. 
470 * all ignore the last argument. 471 */ 472 list_for_each_entry_safe(peer_req, t, &work_list, w.list) { 473 int err2; 474 475 /* list_del not necessary, next/prev members not touched */ 476 err2 = peer_req->w.cb(&peer_req->w, !!err); 477 if (!err) 478 err = err2; 479 drbd_free_peer_req(device, peer_req); 480 } 481 wake_up(&device->ee_wait); 482 483 return err; 484 } 485 486 static void _drbd_wait_ee_list_empty(struct drbd_device *device, 487 struct list_head *head) 488 { 489 DEFINE_WAIT(wait); 490 491 /* avoids spin_lock/unlock 492 * and calling prepare_to_wait in the fast path */ 493 while (!list_empty(head)) { 494 prepare_to_wait(&device->ee_wait, &wait, TASK_UNINTERRUPTIBLE); 495 spin_unlock_irq(&device->resource->req_lock); 496 io_schedule(); 497 finish_wait(&device->ee_wait, &wait); 498 spin_lock_irq(&device->resource->req_lock); 499 } 500 } 501 502 static void drbd_wait_ee_list_empty(struct drbd_device *device, 503 struct list_head *head) 504 { 505 spin_lock_irq(&device->resource->req_lock); 506 _drbd_wait_ee_list_empty(device, head); 507 spin_unlock_irq(&device->resource->req_lock); 508 } 509 510 static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flags) 511 { 512 struct kvec iov = { 513 .iov_base = buf, 514 .iov_len = size, 515 }; 516 struct msghdr msg = { 517 .msg_flags = (flags ? 
flags : MSG_WAITALL | MSG_NOSIGNAL) 518 }; 519 return kernel_recvmsg(sock, &msg, &iov, 1, size, msg.msg_flags); 520 } 521 522 static int drbd_recv(struct drbd_connection *connection, void *buf, size_t size) 523 { 524 int rv; 525 526 rv = drbd_recv_short(connection->data.socket, buf, size, 0); 527 528 if (rv < 0) { 529 if (rv == -ECONNRESET) 530 drbd_info(connection, "sock was reset by peer\n"); 531 else if (rv != -ERESTARTSYS) 532 drbd_err(connection, "sock_recvmsg returned %d\n", rv); 533 } else if (rv == 0) { 534 if (test_bit(DISCONNECT_SENT, &connection->flags)) { 535 long t; 536 rcu_read_lock(); 537 t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10; 538 rcu_read_unlock(); 539 540 t = wait_event_timeout(connection->ping_wait, connection->cstate < C_WF_REPORT_PARAMS, t); 541 542 if (t) 543 goto out; 544 } 545 drbd_info(connection, "sock was shut down by peer\n"); 546 } 547 548 if (rv != size) 549 conn_request_state(connection, NS(conn, C_BROKEN_PIPE), CS_HARD); 550 551 out: 552 return rv; 553 } 554 555 static int drbd_recv_all(struct drbd_connection *connection, void *buf, size_t size) 556 { 557 int err; 558 559 err = drbd_recv(connection, buf, size); 560 if (err != size) { 561 if (err >= 0) 562 err = -EIO; 563 } else 564 err = 0; 565 return err; 566 } 567 568 static int drbd_recv_all_warn(struct drbd_connection *connection, void *buf, size_t size) 569 { 570 int err; 571 572 err = drbd_recv_all(connection, buf, size); 573 if (err && !signal_pending(current)) 574 drbd_warn(connection, "short read (expected size %d)\n", (int)size); 575 return err; 576 } 577 578 /* quoting tcp(7): 579 * On individual connections, the socket buffer size must be set prior to the 580 * listen(2) or connect(2) calls in order to have it take effect. 581 * This is our wrapper to do so. 
*/
static void drbd_setbufsize(struct socket *sock, unsigned int snd,
		unsigned int rcv)
{
	struct sock *sk = sock->sk;

	/* open coded SO_SNDBUF, SO_RCVBUF; a size of 0 leaves the
	 * respective buffer alone */
	if (snd) {
		sk->sk_sndbuf = snd;
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
	}
	if (rcv) {
		sk->sk_rcvbuf = rcv;
		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
	}
}

/* One attempt at an outgoing TCP connection to the configured peer address.
 * Returns the connected socket, or NULL.  Failures other than "peer not
 * reachable (yet)" style errors are logged; a failure before the connect()
 * call itself additionally requests C_DISCONNECTING, unless the error is
 * one of the transient ones listed in the switch below. */
static struct socket *drbd_try_connect(struct drbd_connection *connection)
{
	const char *what;
	struct socket *sock;
	struct sockaddr_in6 src_in6;
	struct sockaddr_in6 peer_in6;
	struct net_conf *nc;
	int err, peer_addr_len, my_addr_len;
	int sndbuf_size, rcvbuf_size, connect_int;
	int disconnect_on_error = 1;

	/* take a snapshot of the settings we need */
	rcu_read_lock();
	nc = rcu_dereference(connection->net_conf);
	if (!nc) {
		rcu_read_unlock();
		return NULL;
	}
	sndbuf_size = nc->sndbuf_size;
	rcvbuf_size = nc->rcvbuf_size;
	connect_int = nc->connect_int;
	rcu_read_unlock();

	my_addr_len = min_t(int, connection->my_addr_len, sizeof(src_in6));
	memcpy(&src_in6, &connection->my_addr, my_addr_len);

	if (((struct sockaddr *)&connection->my_addr)->sa_family == AF_INET6)
		src_in6.sin6_port = 0;
	else
		((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */

	peer_addr_len = min_t(int, connection->peer_addr_len, sizeof(peer_in6));
	memcpy(&peer_in6, &connection->peer_addr, peer_addr_len);

	what = "sock_create_kern";
	err = sock_create_kern(&init_net, ((struct sockaddr *)&src_in6)->sa_family,
			       SOCK_STREAM, IPPROTO_TCP, &sock);
	if (err < 0) {
		sock = NULL;
		goto out;
	}

	sock->sk->sk_sndtimeo = connect_int * HZ;
	sock->sk->sk_rcvtimeo = sock->sk->sk_sndtimeo;
	drbd_setbufsize(sock, sndbuf_size, rcvbuf_size);

       /* explicitly bind to the configured IP as source IP
	*  for the outgoing connections.
	*  This is needed for multihomed hosts and to be
	*  able to use lo: interfaces for drbd.
	* Make sure to use 0 as port number, so linux selects
	*  a free one dynamically.
	*/
	what = "bind before connect";
	err = sock->ops->bind(sock, (struct sockaddr *) &src_in6, my_addr_len);
	if (err < 0)
		goto out;

	/* connect may fail, peer not yet available.
	 * stay C_WF_CONNECTION, don't go Disconnecting! */
	disconnect_on_error = 0;
	what = "connect";
	err = sock->ops->connect(sock, (struct sockaddr *) &peer_in6, peer_addr_len, 0);

out:
	if (err < 0) {
		if (sock) {
			sock_release(sock);
			sock = NULL;
		}
		switch (-err) {
			/* timeout, busy, signal pending */
		case ETIMEDOUT: case EAGAIN: case EINPROGRESS:
		case EINTR: case ERESTARTSYS:
			/* peer not (yet) available, network problem */
		case ECONNREFUSED: case ENETUNREACH:
		case EHOSTDOWN:    case EHOSTUNREACH:
			disconnect_on_error = 0;
			break;
		default:
			drbd_err(connection, "%s failed, err = %d\n", what, err);
		}
		if (disconnect_on_error)
			conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
	}

	return sock;
}

/* State shared between conn_connect() and the listen socket's state change
 * callback: the callback completes door_bell, drbd_wait_for_connect()
 * waits on it. */
struct accept_wait_data {
	struct drbd_connection *connection;
	struct socket *s_listen;
	struct completion door_bell;
	void (*original_sk_state_change)(struct sock *sk);

};

/* sk_state_change hook installed by prepare_listen_socket(): rings the
 * door bell once the socket reached TCP_ESTABLISHED, then chains to the
 * callback that was installed before. */
static void drbd_incoming_connection(struct sock *sk)
{
	struct accept_wait_data *ad = sk->sk_user_data;
	void (*original)(struct sock *sk) = ad->original_sk_state_change;

	if (sk->sk_state == TCP_ESTABLISHED)
		complete(&ad->door_bell);
	original(sk);
}

/* Create, bind and listen on a socket for the configured local address and
 * hook drbd_incoming_connection() into it.  On success the socket is stored
 * in ad->s_listen and 0 is returned.  Any failure returns -EIO, after the
 * socket has been released. */
static int prepare_listen_socket(struct drbd_connection *connection, struct accept_wait_data *ad)
{
	int err, sndbuf_size, rcvbuf_size, my_addr_len;
	struct sockaddr_in6 my_addr;
	struct socket *s_listen;
	struct net_conf *nc;
	const char *what;

	rcu_read_lock();
	nc = rcu_dereference(connection->net_conf);
	if (!nc) {
		rcu_read_unlock();
		return -EIO;
	}
	sndbuf_size = nc->sndbuf_size;
	rcvbuf_size = nc->rcvbuf_size;
	rcu_read_unlock();

	my_addr_len = min_t(int, connection->my_addr_len, sizeof(struct sockaddr_in6));
	memcpy(&my_addr, &connection->my_addr, my_addr_len);

	what = "sock_create_kern";
	err = sock_create_kern(&init_net, ((struct sockaddr *)&my_addr)->sa_family,
			       SOCK_STREAM, IPPROTO_TCP, &s_listen);
	if (err) {
		s_listen = NULL;
		goto out;
	}

	s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
	drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size);

	what = "bind before listen";
	err = s_listen->ops->bind(s_listen, (struct sockaddr *)&my_addr, my_addr_len);
	if (err < 0)
		goto out;

	/* remember the previous callback, then install ours */
	ad->s_listen = s_listen;
	write_lock_bh(&s_listen->sk->sk_callback_lock);
	ad->original_sk_state_change = s_listen->sk->sk_state_change;
	s_listen->sk->sk_state_change = drbd_incoming_connection;
	s_listen->sk->sk_user_data = ad;
	write_unlock_bh(&s_listen->sk->sk_callback_lock);

	what = "listen";
	err = s_listen->ops->listen(s_listen, 5);
	if (err < 0)
		goto out;

	return 0;
out:
	if (s_listen)
		sock_release(s_listen);
	if (err < 0) {
		if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
			drbd_err(connection, "%s failed, err = %d\n", what, err);
			conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
		}
	}

	return -EIO;
}

/* Undo the callback hook of prepare_listen_socket() on @sk. */
static void unregister_state_change(struct sock *sk, struct accept_wait_data *ad)
{
	write_lock_bh(&sk->sk_callback_lock);
	sk->sk_state_change = ad->original_sk_state_change;
	sk->sk_user_data = NULL;
	write_unlock_bh(&sk->sk_callback_lock);
}

/* Wait (connect_int seconds, plus or minus one seventh) for the door bell
 * and accept one connection on the listen socket.
 * Returns the accepted socket, with the state change hook removed from it,
 * or NULL. */
static struct socket *drbd_wait_for_connect(struct drbd_connection *connection, struct accept_wait_data *ad)
{
	int timeo, connect_int, err = 0;
	struct socket *s_estab = NULL;
	struct net_conf *nc;

	rcu_read_lock();
	nc = rcu_dereference(connection->net_conf);
	if (!nc) {
		rcu_read_unlock();
		return NULL;
	}
	connect_int = nc->connect_int;
	rcu_read_unlock();

	timeo = connect_int * HZ;
	/* 28.5% random jitter */
	timeo += (prandom_u32() & 1) ? timeo / 7 : -timeo / 7;

	/* <= 0 means: timed out, or interrupted */
	err = wait_for_completion_interruptible_timeout(&ad->door_bell, timeo);
	if (err <= 0)
		return NULL;

	err = kernel_accept(ad->s_listen, &s_estab, 0);
	if (err < 0) {
		if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
			drbd_err(connection, "accept failed, err = %d\n", err);
			conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
		}
	}

	if (s_estab)
		unregister_state_change(s_estab->sk, ad);

	return s_estab;
}

static int decode_header(struct drbd_connection *, void *, struct packet_info *);

/* Send the payload-less packet @cmd on @sock; returns -EIO if the command
 * could not be prepared, else the result of conn_send_command(). */
static int send_first_packet(struct drbd_connection *connection, struct drbd_socket *sock,
			     enum drbd_packet cmd)
{
	if (!conn_prepare_command(connection, sock))
		return -EIO;
	return conn_send_command(connection, sock, cmd, 0, NULL, 0);
}

/* Receive and decode one packet header from @sock, using four times the
 * ping timeout as receive timeout.
 * Returns the packet command on success, a negative error code otherwise. */
static int receive_first_packet(struct drbd_connection *connection, struct socket *sock)
{
	unsigned int header_size = drbd_header_size(connection);
	struct packet_info pi;
	struct net_conf *nc;
	int err;

	rcu_read_lock();
	nc = rcu_dereference(connection->net_conf);
	if (!nc) {
		rcu_read_unlock();
		return -EIO;
	}
	sock->sk->sk_rcvtimeo = nc->ping_timeo * 4 * HZ / 10;
	rcu_read_unlock();

	err = drbd_recv_short(sock, connection->data.rbuf, header_size, 0);
	if (err != header_size) {
		/* a short read is reported as -EIO */
		if (err >= 0)
			err = -EIO;
		return err;
	}
	err = decode_header(connection, connection->data.rbuf, &pi);
	if (err)
		return err;
	return pi.cmd;
}

/**
 * drbd_socket_okay() - Free the socket if its connection is not okay
 * @sock:
pointer to the pointer to the socket. 853 */ 854 static bool drbd_socket_okay(struct socket **sock) 855 { 856 int rr; 857 char tb[4]; 858 859 if (!*sock) 860 return false; 861 862 rr = drbd_recv_short(*sock, tb, 4, MSG_DONTWAIT | MSG_PEEK); 863 864 if (rr > 0 || rr == -EAGAIN) { 865 return true; 866 } else { 867 sock_release(*sock); 868 *sock = NULL; 869 return false; 870 } 871 } 872 873 static bool connection_established(struct drbd_connection *connection, 874 struct socket **sock1, 875 struct socket **sock2) 876 { 877 struct net_conf *nc; 878 int timeout; 879 bool ok; 880 881 if (!*sock1 || !*sock2) 882 return false; 883 884 rcu_read_lock(); 885 nc = rcu_dereference(connection->net_conf); 886 timeout = (nc->sock_check_timeo ?: nc->ping_timeo) * HZ / 10; 887 rcu_read_unlock(); 888 schedule_timeout_interruptible(timeout); 889 890 ok = drbd_socket_okay(sock1); 891 ok = drbd_socket_okay(sock2) && ok; 892 893 return ok; 894 } 895 896 /* Gets called if a connection is established, or if a new minor gets created 897 in a connection */ 898 int drbd_connected(struct drbd_peer_device *peer_device) 899 { 900 struct drbd_device *device = peer_device->device; 901 int err; 902 903 atomic_set(&device->packet_seq, 0); 904 device->peer_seq = 0; 905 906 device->state_mutex = peer_device->connection->agreed_pro_version < 100 ? 907 &peer_device->connection->cstate_mutex : 908 &device->own_state_mutex; 909 910 err = drbd_send_sync_param(peer_device); 911 if (!err) 912 err = drbd_send_sizes(peer_device, 0, 0); 913 if (!err) 914 err = drbd_send_uuids(peer_device); 915 if (!err) 916 err = drbd_send_current_state(peer_device); 917 clear_bit(USE_DEGR_WFC_T, &device->flags); 918 clear_bit(RESIZE_PENDING, &device->flags); 919 atomic_set(&device->ap_in_flight, 0); 920 mod_timer(&device->request_timer, jiffies + HZ); /* just start it here. 
*/ 921 return err; 922 } 923 924 /* 925 * return values: 926 * 1 yes, we have a valid connection 927 * 0 oops, did not work out, please try again 928 * -1 peer talks different language, 929 * no point in trying again, please go standalone. 930 * -2 We do not have a network config... 931 */ 932 static int conn_connect(struct drbd_connection *connection) 933 { 934 struct drbd_socket sock, msock; 935 struct drbd_peer_device *peer_device; 936 struct net_conf *nc; 937 int vnr, timeout, h; 938 bool discard_my_data, ok; 939 enum drbd_state_rv rv; 940 struct accept_wait_data ad = { 941 .connection = connection, 942 .door_bell = COMPLETION_INITIALIZER_ONSTACK(ad.door_bell), 943 }; 944 945 clear_bit(DISCONNECT_SENT, &connection->flags); 946 if (conn_request_state(connection, NS(conn, C_WF_CONNECTION), CS_VERBOSE) < SS_SUCCESS) 947 return -2; 948 949 mutex_init(&sock.mutex); 950 sock.sbuf = connection->data.sbuf; 951 sock.rbuf = connection->data.rbuf; 952 sock.socket = NULL; 953 mutex_init(&msock.mutex); 954 msock.sbuf = connection->meta.sbuf; 955 msock.rbuf = connection->meta.rbuf; 956 msock.socket = NULL; 957 958 /* Assume that the peer only understands protocol 80 until we know better. 
*/ 959 connection->agreed_pro_version = 80; 960 961 if (prepare_listen_socket(connection, &ad)) 962 return 0; 963 964 do { 965 struct socket *s; 966 967 s = drbd_try_connect(connection); 968 if (s) { 969 if (!sock.socket) { 970 sock.socket = s; 971 send_first_packet(connection, &sock, P_INITIAL_DATA); 972 } else if (!msock.socket) { 973 clear_bit(RESOLVE_CONFLICTS, &connection->flags); 974 msock.socket = s; 975 send_first_packet(connection, &msock, P_INITIAL_META); 976 } else { 977 drbd_err(connection, "Logic error in conn_connect()\n"); 978 goto out_release_sockets; 979 } 980 } 981 982 if (connection_established(connection, &sock.socket, &msock.socket)) 983 break; 984 985 retry: 986 s = drbd_wait_for_connect(connection, &ad); 987 if (s) { 988 int fp = receive_first_packet(connection, s); 989 drbd_socket_okay(&sock.socket); 990 drbd_socket_okay(&msock.socket); 991 switch (fp) { 992 case P_INITIAL_DATA: 993 if (sock.socket) { 994 drbd_warn(connection, "initial packet S crossed\n"); 995 sock_release(sock.socket); 996 sock.socket = s; 997 goto randomize; 998 } 999 sock.socket = s; 1000 break; 1001 case P_INITIAL_META: 1002 set_bit(RESOLVE_CONFLICTS, &connection->flags); 1003 if (msock.socket) { 1004 drbd_warn(connection, "initial packet M crossed\n"); 1005 sock_release(msock.socket); 1006 msock.socket = s; 1007 goto randomize; 1008 } 1009 msock.socket = s; 1010 break; 1011 default: 1012 drbd_warn(connection, "Error receiving initial packet\n"); 1013 sock_release(s); 1014 randomize: 1015 if (prandom_u32() & 1) 1016 goto retry; 1017 } 1018 } 1019 1020 if (connection->cstate <= C_DISCONNECTING) 1021 goto out_release_sockets; 1022 if (signal_pending(current)) { 1023 flush_signals(current); 1024 smp_rmb(); 1025 if (get_t_state(&connection->receiver) == EXITING) 1026 goto out_release_sockets; 1027 } 1028 1029 ok = connection_established(connection, &sock.socket, &msock.socket); 1030 } while (!ok); 1031 1032 if (ad.s_listen) 1033 sock_release(ad.s_listen); 1034 1035 
sock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ 1036 msock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ 1037 1038 sock.socket->sk->sk_allocation = GFP_NOIO; 1039 msock.socket->sk->sk_allocation = GFP_NOIO; 1040 1041 sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK; 1042 msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE; 1043 1044 /* NOT YET ... 1045 * sock.socket->sk->sk_sndtimeo = connection->net_conf->timeout*HZ/10; 1046 * sock.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; 1047 * first set it to the P_CONNECTION_FEATURES timeout, 1048 * which we set to 4x the configured ping_timeout. */ 1049 rcu_read_lock(); 1050 nc = rcu_dereference(connection->net_conf); 1051 1052 sock.socket->sk->sk_sndtimeo = 1053 sock.socket->sk->sk_rcvtimeo = nc->ping_timeo*4*HZ/10; 1054 1055 msock.socket->sk->sk_rcvtimeo = nc->ping_int*HZ; 1056 timeout = nc->timeout * HZ / 10; 1057 discard_my_data = nc->discard_my_data; 1058 rcu_read_unlock(); 1059 1060 msock.socket->sk->sk_sndtimeo = timeout; 1061 1062 /* we don't want delays. 
1063 * we use TCP_CORK where appropriate, though */ 1064 drbd_tcp_nodelay(sock.socket); 1065 drbd_tcp_nodelay(msock.socket); 1066 1067 connection->data.socket = sock.socket; 1068 connection->meta.socket = msock.socket; 1069 connection->last_received = jiffies; 1070 1071 h = drbd_do_features(connection); 1072 if (h <= 0) 1073 return h; 1074 1075 if (connection->cram_hmac_tfm) { 1076 /* drbd_request_state(device, NS(conn, WFAuth)); */ 1077 switch (drbd_do_auth(connection)) { 1078 case -1: 1079 drbd_err(connection, "Authentication of peer failed\n"); 1080 return -1; 1081 case 0: 1082 drbd_err(connection, "Authentication of peer failed, trying again.\n"); 1083 return 0; 1084 } 1085 } 1086 1087 connection->data.socket->sk->sk_sndtimeo = timeout; 1088 connection->data.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; 1089 1090 if (drbd_send_protocol(connection) == -EOPNOTSUPP) 1091 return -1; 1092 1093 /* Prevent a race between resync-handshake and 1094 * being promoted to Primary. 1095 * 1096 * Grab and release the state mutex, so we know that any current 1097 * drbd_set_role() is finished, and any incoming drbd_set_role 1098 * will see the STATE_SENT flag, and wait for it to be cleared. 
1099 */ 1100 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) 1101 mutex_lock(peer_device->device->state_mutex); 1102 1103 set_bit(STATE_SENT, &connection->flags); 1104 1105 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) 1106 mutex_unlock(peer_device->device->state_mutex); 1107 1108 rcu_read_lock(); 1109 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { 1110 struct drbd_device *device = peer_device->device; 1111 kref_get(&device->kref); 1112 rcu_read_unlock(); 1113 1114 if (discard_my_data) 1115 set_bit(DISCARD_MY_DATA, &device->flags); 1116 else 1117 clear_bit(DISCARD_MY_DATA, &device->flags); 1118 1119 drbd_connected(peer_device); 1120 kref_put(&device->kref, drbd_destroy_device); 1121 rcu_read_lock(); 1122 } 1123 rcu_read_unlock(); 1124 1125 rv = conn_request_state(connection, NS(conn, C_WF_REPORT_PARAMS), CS_VERBOSE); 1126 if (rv < SS_SUCCESS || connection->cstate != C_WF_REPORT_PARAMS) { 1127 clear_bit(STATE_SENT, &connection->flags); 1128 return 0; 1129 } 1130 1131 drbd_thread_start(&connection->ack_receiver); 1132 /* opencoded create_singlethread_workqueue(), 1133 * to be able to use format string arguments */ 1134 connection->ack_sender = 1135 alloc_ordered_workqueue("drbd_as_%s", WQ_MEM_RECLAIM, connection->resource->name); 1136 if (!connection->ack_sender) { 1137 drbd_err(connection, "Failed to create workqueue ack_sender\n"); 1138 return 0; 1139 } 1140 1141 mutex_lock(&connection->resource->conf_update); 1142 /* The discard_my_data flag is a single-shot modifier to the next 1143 * connection attempt, the handshake of which is now well underway. 1144 * No need for rcu style copying of the whole struct 1145 * just to clear a single value. 
*/ 1146 connection->net_conf->discard_my_data = 0; 1147 mutex_unlock(&connection->resource->conf_update); 1148 1149 return h; 1150 1151 out_release_sockets: 1152 if (ad.s_listen) 1153 sock_release(ad.s_listen); 1154 if (sock.socket) 1155 sock_release(sock.socket); 1156 if (msock.socket) 1157 sock_release(msock.socket); 1158 return -1; 1159 } 1160 1161 static int decode_header(struct drbd_connection *connection, void *header, struct packet_info *pi) 1162 { 1163 unsigned int header_size = drbd_header_size(connection); 1164 1165 if (header_size == sizeof(struct p_header100) && 1166 *(__be32 *)header == cpu_to_be32(DRBD_MAGIC_100)) { 1167 struct p_header100 *h = header; 1168 if (h->pad != 0) { 1169 drbd_err(connection, "Header padding is not zero\n"); 1170 return -EINVAL; 1171 } 1172 pi->vnr = be16_to_cpu(h->volume); 1173 pi->cmd = be16_to_cpu(h->command); 1174 pi->size = be32_to_cpu(h->length); 1175 } else if (header_size == sizeof(struct p_header95) && 1176 *(__be16 *)header == cpu_to_be16(DRBD_MAGIC_BIG)) { 1177 struct p_header95 *h = header; 1178 pi->cmd = be16_to_cpu(h->command); 1179 pi->size = be32_to_cpu(h->length); 1180 pi->vnr = 0; 1181 } else if (header_size == sizeof(struct p_header80) && 1182 *(__be32 *)header == cpu_to_be32(DRBD_MAGIC)) { 1183 struct p_header80 *h = header; 1184 pi->cmd = be16_to_cpu(h->command); 1185 pi->size = be16_to_cpu(h->length); 1186 pi->vnr = 0; 1187 } else { 1188 drbd_err(connection, "Wrong magic value 0x%08x in protocol version %d\n", 1189 be32_to_cpu(*(__be32 *)header), 1190 connection->agreed_pro_version); 1191 return -EINVAL; 1192 } 1193 pi->data = header + header_size; 1194 return 0; 1195 } 1196 1197 static int drbd_recv_header(struct drbd_connection *connection, struct packet_info *pi) 1198 { 1199 void *buffer = connection->data.rbuf; 1200 int err; 1201 1202 err = drbd_recv_all_warn(connection, buffer, drbd_header_size(connection)); 1203 if (err) 1204 return err; 1205 1206 err = decode_header(connection, buffer, pi); 1207 
connection->last_received = jiffies; 1208 1209 return err; 1210 } 1211 1212 /* This is blkdev_issue_flush, but asynchronous. 1213 * We want to submit to all component volumes in parallel, 1214 * then wait for all completions. 1215 */ 1216 struct issue_flush_context { 1217 atomic_t pending; 1218 int error; 1219 struct completion done; 1220 }; 1221 struct one_flush_context { 1222 struct drbd_device *device; 1223 struct issue_flush_context *ctx; 1224 }; 1225 1226 void one_flush_endio(struct bio *bio) 1227 { 1228 struct one_flush_context *octx = bio->bi_private; 1229 struct drbd_device *device = octx->device; 1230 struct issue_flush_context *ctx = octx->ctx; 1231 1232 if (bio->bi_status) { 1233 ctx->error = blk_status_to_errno(bio->bi_status); 1234 drbd_info(device, "local disk FLUSH FAILED with status %d\n", bio->bi_status); 1235 } 1236 kfree(octx); 1237 bio_put(bio); 1238 1239 clear_bit(FLUSH_PENDING, &device->flags); 1240 put_ldev(device); 1241 kref_put(&device->kref, drbd_destroy_device); 1242 1243 if (atomic_dec_and_test(&ctx->pending)) 1244 complete(&ctx->done); 1245 } 1246 1247 static void submit_one_flush(struct drbd_device *device, struct issue_flush_context *ctx) 1248 { 1249 struct bio *bio = bio_alloc(GFP_NOIO, 0); 1250 struct one_flush_context *octx = kmalloc(sizeof(*octx), GFP_NOIO); 1251 if (!bio || !octx) { 1252 drbd_warn(device, "Could not allocate a bio, CANNOT ISSUE FLUSH\n"); 1253 /* FIXME: what else can I do now? disconnecting or detaching 1254 * really does not help to improve the state of the world, either. 
1255 */ 1256 kfree(octx); 1257 if (bio) 1258 bio_put(bio); 1259 1260 ctx->error = -ENOMEM; 1261 put_ldev(device); 1262 kref_put(&device->kref, drbd_destroy_device); 1263 return; 1264 } 1265 1266 octx->device = device; 1267 octx->ctx = ctx; 1268 bio->bi_bdev = device->ldev->backing_bdev; 1269 bio->bi_private = octx; 1270 bio->bi_end_io = one_flush_endio; 1271 bio->bi_opf = REQ_OP_FLUSH | REQ_PREFLUSH; 1272 1273 device->flush_jif = jiffies; 1274 set_bit(FLUSH_PENDING, &device->flags); 1275 atomic_inc(&ctx->pending); 1276 submit_bio(bio); 1277 } 1278 1279 static void drbd_flush(struct drbd_connection *connection) 1280 { 1281 if (connection->resource->write_ordering >= WO_BDEV_FLUSH) { 1282 struct drbd_peer_device *peer_device; 1283 struct issue_flush_context ctx; 1284 int vnr; 1285 1286 atomic_set(&ctx.pending, 1); 1287 ctx.error = 0; 1288 init_completion(&ctx.done); 1289 1290 rcu_read_lock(); 1291 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { 1292 struct drbd_device *device = peer_device->device; 1293 1294 if (!get_ldev(device)) 1295 continue; 1296 kref_get(&device->kref); 1297 rcu_read_unlock(); 1298 1299 submit_one_flush(device, &ctx); 1300 1301 rcu_read_lock(); 1302 } 1303 rcu_read_unlock(); 1304 1305 /* Do we want to add a timeout, 1306 * if disk-timeout is set? */ 1307 if (!atomic_dec_and_test(&ctx.pending)) 1308 wait_for_completion(&ctx.done); 1309 1310 if (ctx.error) { 1311 /* would rather check on EOPNOTSUPP, but that is not reliable. 1312 * don't try again for ANY return value != 0 1313 * if (rv == -EOPNOTSUPP) */ 1314 /* Any error is already reported by bio_endio callback. */ 1315 drbd_bump_write_ordering(connection->resource, NULL, WO_DRAIN_IO); 1316 } 1317 } 1318 } 1319 1320 /** 1321 * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it. 1322 * @device: DRBD device. 1323 * @epoch: Epoch object. 1324 * @ev: Epoch event. 
1325 */ 1326 static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *connection, 1327 struct drbd_epoch *epoch, 1328 enum epoch_event ev) 1329 { 1330 int epoch_size; 1331 struct drbd_epoch *next_epoch; 1332 enum finish_epoch rv = FE_STILL_LIVE; 1333 1334 spin_lock(&connection->epoch_lock); 1335 do { 1336 next_epoch = NULL; 1337 1338 epoch_size = atomic_read(&epoch->epoch_size); 1339 1340 switch (ev & ~EV_CLEANUP) { 1341 case EV_PUT: 1342 atomic_dec(&epoch->active); 1343 break; 1344 case EV_GOT_BARRIER_NR: 1345 set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags); 1346 break; 1347 case EV_BECAME_LAST: 1348 /* nothing to do*/ 1349 break; 1350 } 1351 1352 if (epoch_size != 0 && 1353 atomic_read(&epoch->active) == 0 && 1354 (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) { 1355 if (!(ev & EV_CLEANUP)) { 1356 spin_unlock(&connection->epoch_lock); 1357 drbd_send_b_ack(epoch->connection, epoch->barrier_nr, epoch_size); 1358 spin_lock(&connection->epoch_lock); 1359 } 1360 #if 0 1361 /* FIXME: dec unacked on connection, once we have 1362 * something to count pending connection packets in. 
*/ 1363 if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) 1364 dec_unacked(epoch->connection); 1365 #endif 1366 1367 if (connection->current_epoch != epoch) { 1368 next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list); 1369 list_del(&epoch->list); 1370 ev = EV_BECAME_LAST | (ev & EV_CLEANUP); 1371 connection->epochs--; 1372 kfree(epoch); 1373 1374 if (rv == FE_STILL_LIVE) 1375 rv = FE_DESTROYED; 1376 } else { 1377 epoch->flags = 0; 1378 atomic_set(&epoch->epoch_size, 0); 1379 /* atomic_set(&epoch->active, 0); is already zero */ 1380 if (rv == FE_STILL_LIVE) 1381 rv = FE_RECYCLED; 1382 } 1383 } 1384 1385 if (!next_epoch) 1386 break; 1387 1388 epoch = next_epoch; 1389 } while (1); 1390 1391 spin_unlock(&connection->epoch_lock); 1392 1393 return rv; 1394 } 1395 1396 static enum write_ordering_e 1397 max_allowed_wo(struct drbd_backing_dev *bdev, enum write_ordering_e wo) 1398 { 1399 struct disk_conf *dc; 1400 1401 dc = rcu_dereference(bdev->disk_conf); 1402 1403 if (wo == WO_BDEV_FLUSH && !dc->disk_flushes) 1404 wo = WO_DRAIN_IO; 1405 if (wo == WO_DRAIN_IO && !dc->disk_drain) 1406 wo = WO_NONE; 1407 1408 return wo; 1409 } 1410 1411 /** 1412 * drbd_bump_write_ordering() - Fall back to an other write ordering method 1413 * @connection: DRBD connection. 1414 * @wo: Write ordering method to try. 
1415 */ 1416 void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev, 1417 enum write_ordering_e wo) 1418 { 1419 struct drbd_device *device; 1420 enum write_ordering_e pwo; 1421 int vnr; 1422 static char *write_ordering_str[] = { 1423 [WO_NONE] = "none", 1424 [WO_DRAIN_IO] = "drain", 1425 [WO_BDEV_FLUSH] = "flush", 1426 }; 1427 1428 pwo = resource->write_ordering; 1429 if (wo != WO_BDEV_FLUSH) 1430 wo = min(pwo, wo); 1431 rcu_read_lock(); 1432 idr_for_each_entry(&resource->devices, device, vnr) { 1433 if (get_ldev(device)) { 1434 wo = max_allowed_wo(device->ldev, wo); 1435 if (device->ldev == bdev) 1436 bdev = NULL; 1437 put_ldev(device); 1438 } 1439 } 1440 1441 if (bdev) 1442 wo = max_allowed_wo(bdev, wo); 1443 1444 rcu_read_unlock(); 1445 1446 resource->write_ordering = wo; 1447 if (pwo != resource->write_ordering || wo == WO_BDEV_FLUSH) 1448 drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]); 1449 } 1450 1451 static void drbd_issue_peer_discard(struct drbd_device *device, struct drbd_peer_request *peer_req) 1452 { 1453 struct block_device *bdev = device->ldev->backing_bdev; 1454 1455 if (blkdev_issue_zeroout(bdev, peer_req->i.sector, peer_req->i.size >> 9, 1456 GFP_NOIO, 0)) 1457 peer_req->flags |= EE_WAS_ERROR; 1458 1459 drbd_endio_write_sec_final(peer_req); 1460 } 1461 1462 static void drbd_issue_peer_wsame(struct drbd_device *device, 1463 struct drbd_peer_request *peer_req) 1464 { 1465 struct block_device *bdev = device->ldev->backing_bdev; 1466 sector_t s = peer_req->i.sector; 1467 sector_t nr = peer_req->i.size >> 9; 1468 if (blkdev_issue_write_same(bdev, s, nr, GFP_NOIO, peer_req->pages)) 1469 peer_req->flags |= EE_WAS_ERROR; 1470 drbd_endio_write_sec_final(peer_req); 1471 } 1472 1473 1474 /** 1475 * drbd_submit_peer_request() 1476 * @device: DRBD device. 
1477 * @peer_req: peer request 1478 * @rw: flag field, see bio->bi_opf 1479 * 1480 * May spread the pages to multiple bios, 1481 * depending on bio_add_page restrictions. 1482 * 1483 * Returns 0 if all bios have been submitted, 1484 * -ENOMEM if we could not allocate enough bios, 1485 * -ENOSPC (any better suggestion?) if we have not been able to bio_add_page a 1486 * single page to an empty bio (which should never happen and likely indicates 1487 * that the lower level IO stack is in some way broken). This has been observed 1488 * on certain Xen deployments. 1489 */ 1490 /* TODO allocate from our own bio_set. */ 1491 int drbd_submit_peer_request(struct drbd_device *device, 1492 struct drbd_peer_request *peer_req, 1493 const unsigned op, const unsigned op_flags, 1494 const int fault_type) 1495 { 1496 struct bio *bios = NULL; 1497 struct bio *bio; 1498 struct page *page = peer_req->pages; 1499 sector_t sector = peer_req->i.sector; 1500 unsigned data_size = peer_req->i.size; 1501 unsigned n_bios = 0; 1502 unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT; 1503 int err = -ENOMEM; 1504 1505 /* TRIM/DISCARD: for now, always use the helper function 1506 * blkdev_issue_zeroout(..., discard=true). 1507 * It's synchronous, but it does the right thing wrt. bio splitting. 1508 * Correctness first, performance later. Next step is to code an 1509 * asynchronous variant of the same. 1510 */ 1511 if (peer_req->flags & (EE_IS_TRIM|EE_WRITE_SAME)) { 1512 /* wait for all pending IO completions, before we start 1513 * zeroing things out. 
*/ 1514 conn_wait_active_ee_empty(peer_req->peer_device->connection); 1515 /* add it to the active list now, 1516 * so we can find it to present it in debugfs */ 1517 peer_req->submit_jif = jiffies; 1518 peer_req->flags |= EE_SUBMITTED; 1519 1520 /* If this was a resync request from receive_rs_deallocated(), 1521 * it is already on the sync_ee list */ 1522 if (list_empty(&peer_req->w.list)) { 1523 spin_lock_irq(&device->resource->req_lock); 1524 list_add_tail(&peer_req->w.list, &device->active_ee); 1525 spin_unlock_irq(&device->resource->req_lock); 1526 } 1527 1528 if (peer_req->flags & EE_IS_TRIM) 1529 drbd_issue_peer_discard(device, peer_req); 1530 else /* EE_WRITE_SAME */ 1531 drbd_issue_peer_wsame(device, peer_req); 1532 return 0; 1533 } 1534 1535 /* In most cases, we will only need one bio. But in case the lower 1536 * level restrictions happen to be different at this offset on this 1537 * side than those of the sending peer, we may need to submit the 1538 * request in more than one bio. 1539 * 1540 * Plain bio_alloc is good enough here, this is no DRBD internally 1541 * generated bio, but a bio allocated on behalf of the peer. 
1542 */ 1543 next_bio: 1544 bio = bio_alloc(GFP_NOIO, nr_pages); 1545 if (!bio) { 1546 drbd_err(device, "submit_ee: Allocation of a bio failed (nr_pages=%u)\n", nr_pages); 1547 goto fail; 1548 } 1549 /* > peer_req->i.sector, unless this is the first bio */ 1550 bio->bi_iter.bi_sector = sector; 1551 bio->bi_bdev = device->ldev->backing_bdev; 1552 bio_set_op_attrs(bio, op, op_flags); 1553 bio->bi_private = peer_req; 1554 bio->bi_end_io = drbd_peer_request_endio; 1555 1556 bio->bi_next = bios; 1557 bios = bio; 1558 ++n_bios; 1559 1560 page_chain_for_each(page) { 1561 unsigned len = min_t(unsigned, data_size, PAGE_SIZE); 1562 if (!bio_add_page(bio, page, len, 0)) 1563 goto next_bio; 1564 data_size -= len; 1565 sector += len >> 9; 1566 --nr_pages; 1567 } 1568 D_ASSERT(device, data_size == 0); 1569 D_ASSERT(device, page == NULL); 1570 1571 atomic_set(&peer_req->pending_bios, n_bios); 1572 /* for debugfs: update timestamp, mark as submitted */ 1573 peer_req->submit_jif = jiffies; 1574 peer_req->flags |= EE_SUBMITTED; 1575 do { 1576 bio = bios; 1577 bios = bios->bi_next; 1578 bio->bi_next = NULL; 1579 1580 drbd_generic_make_request(device, fault_type, bio); 1581 } while (bios); 1582 return 0; 1583 1584 fail: 1585 while (bios) { 1586 bio = bios; 1587 bios = bios->bi_next; 1588 bio_put(bio); 1589 } 1590 return err; 1591 } 1592 1593 static void drbd_remove_epoch_entry_interval(struct drbd_device *device, 1594 struct drbd_peer_request *peer_req) 1595 { 1596 struct drbd_interval *i = &peer_req->i; 1597 1598 drbd_remove_interval(&device->write_requests, i); 1599 drbd_clear_interval(i); 1600 1601 /* Wake up any processes waiting for this peer request to complete. 
*/ 1602 if (i->waiting) 1603 wake_up(&device->misc_wait); 1604 } 1605 1606 static void conn_wait_active_ee_empty(struct drbd_connection *connection) 1607 { 1608 struct drbd_peer_device *peer_device; 1609 int vnr; 1610 1611 rcu_read_lock(); 1612 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { 1613 struct drbd_device *device = peer_device->device; 1614 1615 kref_get(&device->kref); 1616 rcu_read_unlock(); 1617 drbd_wait_ee_list_empty(device, &device->active_ee); 1618 kref_put(&device->kref, drbd_destroy_device); 1619 rcu_read_lock(); 1620 } 1621 rcu_read_unlock(); 1622 } 1623 1624 static int receive_Barrier(struct drbd_connection *connection, struct packet_info *pi) 1625 { 1626 int rv; 1627 struct p_barrier *p = pi->data; 1628 struct drbd_epoch *epoch; 1629 1630 /* FIXME these are unacked on connection, 1631 * not a specific (peer)device. 1632 */ 1633 connection->current_epoch->barrier_nr = p->barrier; 1634 connection->current_epoch->connection = connection; 1635 rv = drbd_may_finish_epoch(connection, connection->current_epoch, EV_GOT_BARRIER_NR); 1636 1637 /* P_BARRIER_ACK may imply that the corresponding extent is dropped from 1638 * the activity log, which means it would not be resynced in case the 1639 * R_PRIMARY crashes now. 1640 * Therefore we must send the barrier_ack after the barrier request was 1641 * completed. */ 1642 switch (connection->resource->write_ordering) { 1643 case WO_NONE: 1644 if (rv == FE_RECYCLED) 1645 return 0; 1646 1647 /* receiver context, in the writeout path of the other node. 
1648 * avoid potential distributed deadlock */ 1649 epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO); 1650 if (epoch) 1651 break; 1652 else 1653 drbd_warn(connection, "Allocation of an epoch failed, slowing down\n"); 1654 /* Fall through */ 1655 1656 case WO_BDEV_FLUSH: 1657 case WO_DRAIN_IO: 1658 conn_wait_active_ee_empty(connection); 1659 drbd_flush(connection); 1660 1661 if (atomic_read(&connection->current_epoch->epoch_size)) { 1662 epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO); 1663 if (epoch) 1664 break; 1665 } 1666 1667 return 0; 1668 default: 1669 drbd_err(connection, "Strangeness in connection->write_ordering %d\n", 1670 connection->resource->write_ordering); 1671 return -EIO; 1672 } 1673 1674 epoch->flags = 0; 1675 atomic_set(&epoch->epoch_size, 0); 1676 atomic_set(&epoch->active, 0); 1677 1678 spin_lock(&connection->epoch_lock); 1679 if (atomic_read(&connection->current_epoch->epoch_size)) { 1680 list_add(&epoch->list, &connection->current_epoch->list); 1681 connection->current_epoch = epoch; 1682 connection->epochs++; 1683 } else { 1684 /* The current_epoch got recycled while we allocated this one... */ 1685 kfree(epoch); 1686 } 1687 spin_unlock(&connection->epoch_lock); 1688 1689 return 0; 1690 } 1691 1692 /* quick wrapper in case payload size != request_size (write same) */ 1693 static void drbd_csum_ee_size(struct crypto_ahash *h, 1694 struct drbd_peer_request *r, void *d, 1695 unsigned int payload_size) 1696 { 1697 unsigned int tmp = r->i.size; 1698 r->i.size = payload_size; 1699 drbd_csum_ee(h, r, d); 1700 r->i.size = tmp; 1701 } 1702 1703 /* used from receive_RSDataReply (recv_resync_read) 1704 * and from receive_Data. 1705 * data_size: actual payload ("data in") 1706 * for normal writes that is bi_size. 1707 * for discards, that is zero. 1708 * for write same, it is logical_block_size. 1709 * both trim and write same have the bi_size ("data len to be affected") 1710 * as extra argument in the packet header. 
1711 */ 1712 static struct drbd_peer_request * 1713 read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, 1714 struct packet_info *pi) __must_hold(local) 1715 { 1716 struct drbd_device *device = peer_device->device; 1717 const sector_t capacity = drbd_get_capacity(device->this_bdev); 1718 struct drbd_peer_request *peer_req; 1719 struct page *page; 1720 int digest_size, err; 1721 unsigned int data_size = pi->size, ds; 1722 void *dig_in = peer_device->connection->int_dig_in; 1723 void *dig_vv = peer_device->connection->int_dig_vv; 1724 unsigned long *data; 1725 struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL; 1726 struct p_trim *wsame = (pi->cmd == P_WSAME) ? pi->data : NULL; 1727 1728 digest_size = 0; 1729 if (!trim && peer_device->connection->peer_integrity_tfm) { 1730 digest_size = crypto_ahash_digestsize(peer_device->connection->peer_integrity_tfm); 1731 /* 1732 * FIXME: Receive the incoming digest into the receive buffer 1733 * here, together with its struct p_data? 1734 */ 1735 err = drbd_recv_all_warn(peer_device->connection, dig_in, digest_size); 1736 if (err) 1737 return NULL; 1738 data_size -= digest_size; 1739 } 1740 1741 /* assume request_size == data_size, but special case trim and wsame. 
*/ 1742 ds = data_size; 1743 if (trim) { 1744 if (!expect(data_size == 0)) 1745 return NULL; 1746 ds = be32_to_cpu(trim->size); 1747 } else if (wsame) { 1748 if (data_size != queue_logical_block_size(device->rq_queue)) { 1749 drbd_err(peer_device, "data size (%u) != drbd logical block size (%u)\n", 1750 data_size, queue_logical_block_size(device->rq_queue)); 1751 return NULL; 1752 } 1753 if (data_size != bdev_logical_block_size(device->ldev->backing_bdev)) { 1754 drbd_err(peer_device, "data size (%u) != backend logical block size (%u)\n", 1755 data_size, bdev_logical_block_size(device->ldev->backing_bdev)); 1756 return NULL; 1757 } 1758 ds = be32_to_cpu(wsame->size); 1759 } 1760 1761 if (!expect(IS_ALIGNED(ds, 512))) 1762 return NULL; 1763 if (trim || wsame) { 1764 if (!expect(ds <= (DRBD_MAX_BBIO_SECTORS << 9))) 1765 return NULL; 1766 } else if (!expect(ds <= DRBD_MAX_BIO_SIZE)) 1767 return NULL; 1768 1769 /* even though we trust out peer, 1770 * we sometimes have to double check. */ 1771 if (sector + (ds>>9) > capacity) { 1772 drbd_err(device, "request from peer beyond end of local disk: " 1773 "capacity: %llus < sector: %llus + size: %u\n", 1774 (unsigned long long)capacity, 1775 (unsigned long long)sector, ds); 1776 return NULL; 1777 } 1778 1779 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD 1780 * "criss-cross" setup, that might cause write-out on some other DRBD, 1781 * which in turn might block on the other node at this very place. 
*/ 1782 peer_req = drbd_alloc_peer_req(peer_device, id, sector, ds, data_size, GFP_NOIO); 1783 if (!peer_req) 1784 return NULL; 1785 1786 peer_req->flags |= EE_WRITE; 1787 if (trim) { 1788 peer_req->flags |= EE_IS_TRIM; 1789 return peer_req; 1790 } 1791 if (wsame) 1792 peer_req->flags |= EE_WRITE_SAME; 1793 1794 /* receive payload size bytes into page chain */ 1795 ds = data_size; 1796 page = peer_req->pages; 1797 page_chain_for_each(page) { 1798 unsigned len = min_t(int, ds, PAGE_SIZE); 1799 data = kmap(page); 1800 err = drbd_recv_all_warn(peer_device->connection, data, len); 1801 if (drbd_insert_fault(device, DRBD_FAULT_RECEIVE)) { 1802 drbd_err(device, "Fault injection: Corrupting data on receive\n"); 1803 data[0] = data[0] ^ (unsigned long)-1; 1804 } 1805 kunmap(page); 1806 if (err) { 1807 drbd_free_peer_req(device, peer_req); 1808 return NULL; 1809 } 1810 ds -= len; 1811 } 1812 1813 if (digest_size) { 1814 drbd_csum_ee_size(peer_device->connection->peer_integrity_tfm, peer_req, dig_vv, data_size); 1815 if (memcmp(dig_in, dig_vv, digest_size)) { 1816 drbd_err(device, "Digest integrity check FAILED: %llus +%u\n", 1817 (unsigned long long)sector, data_size); 1818 drbd_free_peer_req(device, peer_req); 1819 return NULL; 1820 } 1821 } 1822 device->recv_cnt += data_size >> 9; 1823 return peer_req; 1824 } 1825 1826 /* drbd_drain_block() just takes a data block 1827 * out of the socket input buffer, and discards it. 
1828 */ 1829 static int drbd_drain_block(struct drbd_peer_device *peer_device, int data_size) 1830 { 1831 struct page *page; 1832 int err = 0; 1833 void *data; 1834 1835 if (!data_size) 1836 return 0; 1837 1838 page = drbd_alloc_pages(peer_device, 1, 1); 1839 1840 data = kmap(page); 1841 while (data_size) { 1842 unsigned int len = min_t(int, data_size, PAGE_SIZE); 1843 1844 err = drbd_recv_all_warn(peer_device->connection, data, len); 1845 if (err) 1846 break; 1847 data_size -= len; 1848 } 1849 kunmap(page); 1850 drbd_free_pages(peer_device->device, page, 0); 1851 return err; 1852 } 1853 1854 static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_request *req, 1855 sector_t sector, int data_size) 1856 { 1857 struct bio_vec bvec; 1858 struct bvec_iter iter; 1859 struct bio *bio; 1860 int digest_size, err, expect; 1861 void *dig_in = peer_device->connection->int_dig_in; 1862 void *dig_vv = peer_device->connection->int_dig_vv; 1863 1864 digest_size = 0; 1865 if (peer_device->connection->peer_integrity_tfm) { 1866 digest_size = crypto_ahash_digestsize(peer_device->connection->peer_integrity_tfm); 1867 err = drbd_recv_all_warn(peer_device->connection, dig_in, digest_size); 1868 if (err) 1869 return err; 1870 data_size -= digest_size; 1871 } 1872 1873 /* optimistically update recv_cnt. if receiving fails below, 1874 * we disconnect anyways, and counters will be reset. 
*/ 1875 peer_device->device->recv_cnt += data_size>>9; 1876 1877 bio = req->master_bio; 1878 D_ASSERT(peer_device->device, sector == bio->bi_iter.bi_sector); 1879 1880 bio_for_each_segment(bvec, bio, iter) { 1881 void *mapped = kmap(bvec.bv_page) + bvec.bv_offset; 1882 expect = min_t(int, data_size, bvec.bv_len); 1883 err = drbd_recv_all_warn(peer_device->connection, mapped, expect); 1884 kunmap(bvec.bv_page); 1885 if (err) 1886 return err; 1887 data_size -= expect; 1888 } 1889 1890 if (digest_size) { 1891 drbd_csum_bio(peer_device->connection->peer_integrity_tfm, bio, dig_vv); 1892 if (memcmp(dig_in, dig_vv, digest_size)) { 1893 drbd_err(peer_device, "Digest integrity check FAILED. Broken NICs?\n"); 1894 return -EINVAL; 1895 } 1896 } 1897 1898 D_ASSERT(peer_device->device, data_size == 0); 1899 return 0; 1900 } 1901 1902 /* 1903 * e_end_resync_block() is called in ack_sender context via 1904 * drbd_finish_peer_reqs(). 1905 */ 1906 static int e_end_resync_block(struct drbd_work *w, int unused) 1907 { 1908 struct drbd_peer_request *peer_req = 1909 container_of(w, struct drbd_peer_request, w); 1910 struct drbd_peer_device *peer_device = peer_req->peer_device; 1911 struct drbd_device *device = peer_device->device; 1912 sector_t sector = peer_req->i.sector; 1913 int err; 1914 1915 D_ASSERT(device, drbd_interval_empty(&peer_req->i)); 1916 1917 if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { 1918 drbd_set_in_sync(device, sector, peer_req->i.size); 1919 err = drbd_send_ack(peer_device, P_RS_WRITE_ACK, peer_req); 1920 } else { 1921 /* Record failure to sync */ 1922 drbd_rs_failed_io(device, sector, peer_req->i.size); 1923 1924 err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req); 1925 } 1926 dec_unacked(device); 1927 1928 return err; 1929 } 1930 1931 static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t sector, 1932 struct packet_info *pi) __releases(local) 1933 { 1934 struct drbd_device *device = peer_device->device; 1935 struct 
drbd_peer_request *peer_req; 1936 1937 peer_req = read_in_block(peer_device, ID_SYNCER, sector, pi); 1938 if (!peer_req) 1939 goto fail; 1940 1941 dec_rs_pending(device); 1942 1943 inc_unacked(device); 1944 /* corresponding dec_unacked() in e_end_resync_block() 1945 * respective _drbd_clear_done_ee */ 1946 1947 peer_req->w.cb = e_end_resync_block; 1948 peer_req->submit_jif = jiffies; 1949 1950 spin_lock_irq(&device->resource->req_lock); 1951 list_add_tail(&peer_req->w.list, &device->sync_ee); 1952 spin_unlock_irq(&device->resource->req_lock); 1953 1954 atomic_add(pi->size >> 9, &device->rs_sect_ev); 1955 if (drbd_submit_peer_request(device, peer_req, REQ_OP_WRITE, 0, 1956 DRBD_FAULT_RS_WR) == 0) 1957 return 0; 1958 1959 /* don't care for the reason here */ 1960 drbd_err(device, "submit failed, triggering re-connect\n"); 1961 spin_lock_irq(&device->resource->req_lock); 1962 list_del(&peer_req->w.list); 1963 spin_unlock_irq(&device->resource->req_lock); 1964 1965 drbd_free_peer_req(device, peer_req); 1966 fail: 1967 put_ldev(device); 1968 return -EIO; 1969 } 1970 1971 static struct drbd_request * 1972 find_request(struct drbd_device *device, struct rb_root *root, u64 id, 1973 sector_t sector, bool missing_ok, const char *func) 1974 { 1975 struct drbd_request *req; 1976 1977 /* Request object according to our peer */ 1978 req = (struct drbd_request *)(unsigned long)id; 1979 if (drbd_contains_interval(root, sector, &req->i) && req->i.local) 1980 return req; 1981 if (!missing_ok) { 1982 drbd_err(device, "%s: failed to find request 0x%lx, sector %llus\n", func, 1983 (unsigned long)id, (unsigned long long)sector); 1984 } 1985 return NULL; 1986 } 1987 1988 static int receive_DataReply(struct drbd_connection *connection, struct packet_info *pi) 1989 { 1990 struct drbd_peer_device *peer_device; 1991 struct drbd_device *device; 1992 struct drbd_request *req; 1993 sector_t sector; 1994 int err; 1995 struct p_data *p = pi->data; 1996 1997 peer_device = 
conn_peer_device(connection, pi->vnr); 1998 if (!peer_device) 1999 return -EIO; 2000 device = peer_device->device; 2001 2002 sector = be64_to_cpu(p->sector); 2003 2004 spin_lock_irq(&device->resource->req_lock); 2005 req = find_request(device, &device->read_requests, p->block_id, sector, false, __func__); 2006 spin_unlock_irq(&device->resource->req_lock); 2007 if (unlikely(!req)) 2008 return -EIO; 2009 2010 /* hlist_del(&req->collision) is done in _req_may_be_done, to avoid 2011 * special casing it there for the various failure cases. 2012 * still no race with drbd_fail_pending_reads */ 2013 err = recv_dless_read(peer_device, req, sector, pi->size); 2014 if (!err) 2015 req_mod(req, DATA_RECEIVED); 2016 /* else: nothing. handled from drbd_disconnect... 2017 * I don't think we may complete this just yet 2018 * in case we are "on-disconnect: freeze" */ 2019 2020 return err; 2021 } 2022 2023 static int receive_RSDataReply(struct drbd_connection *connection, struct packet_info *pi) 2024 { 2025 struct drbd_peer_device *peer_device; 2026 struct drbd_device *device; 2027 sector_t sector; 2028 int err; 2029 struct p_data *p = pi->data; 2030 2031 peer_device = conn_peer_device(connection, pi->vnr); 2032 if (!peer_device) 2033 return -EIO; 2034 device = peer_device->device; 2035 2036 sector = be64_to_cpu(p->sector); 2037 D_ASSERT(device, p->block_id == ID_SYNCER); 2038 2039 if (get_ldev(device)) { 2040 /* data is submitted to disk within recv_resync_read. 2041 * corresponding put_ldev done below on error, 2042 * or in drbd_peer_request_endio. 
*/ 2043 err = recv_resync_read(peer_device, sector, pi); 2044 } else { 2045 if (__ratelimit(&drbd_ratelimit_state)) 2046 drbd_err(device, "Can not write resync data to local disk.\n"); 2047 2048 err = drbd_drain_block(peer_device, pi->size); 2049 2050 drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size); 2051 } 2052 2053 atomic_add(pi->size >> 9, &device->rs_sect_in); 2054 2055 return err; 2056 } 2057 2058 static void restart_conflicting_writes(struct drbd_device *device, 2059 sector_t sector, int size) 2060 { 2061 struct drbd_interval *i; 2062 struct drbd_request *req; 2063 2064 drbd_for_each_overlap(i, &device->write_requests, sector, size) { 2065 if (!i->local) 2066 continue; 2067 req = container_of(i, struct drbd_request, i); 2068 if (req->rq_state & RQ_LOCAL_PENDING || 2069 !(req->rq_state & RQ_POSTPONED)) 2070 continue; 2071 /* as it is RQ_POSTPONED, this will cause it to 2072 * be queued on the retry workqueue. */ 2073 __req_mod(req, CONFLICT_RESOLVED, NULL); 2074 } 2075 } 2076 2077 /* 2078 * e_end_block() is called in ack_sender context via drbd_finish_peer_reqs(). 2079 */ 2080 static int e_end_block(struct drbd_work *w, int cancel) 2081 { 2082 struct drbd_peer_request *peer_req = 2083 container_of(w, struct drbd_peer_request, w); 2084 struct drbd_peer_device *peer_device = peer_req->peer_device; 2085 struct drbd_device *device = peer_device->device; 2086 sector_t sector = peer_req->i.sector; 2087 int err = 0, pcmd; 2088 2089 if (peer_req->flags & EE_SEND_WRITE_ACK) { 2090 if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { 2091 pcmd = (device->state.conn >= C_SYNC_SOURCE && 2092 device->state.conn <= C_PAUSED_SYNC_T && 2093 peer_req->flags & EE_MAY_SET_IN_SYNC) ? 
2094 P_RS_WRITE_ACK : P_WRITE_ACK; 2095 err = drbd_send_ack(peer_device, pcmd, peer_req); 2096 if (pcmd == P_RS_WRITE_ACK) 2097 drbd_set_in_sync(device, sector, peer_req->i.size); 2098 } else { 2099 err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req); 2100 /* we expect it to be marked out of sync anyways... 2101 * maybe assert this? */ 2102 } 2103 dec_unacked(device); 2104 } 2105 2106 /* we delete from the conflict detection hash _after_ we sent out the 2107 * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */ 2108 if (peer_req->flags & EE_IN_INTERVAL_TREE) { 2109 spin_lock_irq(&device->resource->req_lock); 2110 D_ASSERT(device, !drbd_interval_empty(&peer_req->i)); 2111 drbd_remove_epoch_entry_interval(device, peer_req); 2112 if (peer_req->flags & EE_RESTART_REQUESTS) 2113 restart_conflicting_writes(device, sector, peer_req->i.size); 2114 spin_unlock_irq(&device->resource->req_lock); 2115 } else 2116 D_ASSERT(device, drbd_interval_empty(&peer_req->i)); 2117 2118 drbd_may_finish_epoch(peer_device->connection, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0)); 2119 2120 return err; 2121 } 2122 2123 static int e_send_ack(struct drbd_work *w, enum drbd_packet ack) 2124 { 2125 struct drbd_peer_request *peer_req = 2126 container_of(w, struct drbd_peer_request, w); 2127 struct drbd_peer_device *peer_device = peer_req->peer_device; 2128 int err; 2129 2130 err = drbd_send_ack(peer_device, ack, peer_req); 2131 dec_unacked(peer_device->device); 2132 2133 return err; 2134 } 2135 2136 static int e_send_superseded(struct drbd_work *w, int unused) 2137 { 2138 return e_send_ack(w, P_SUPERSEDED); 2139 } 2140 2141 static int e_send_retry_write(struct drbd_work *w, int unused) 2142 { 2143 struct drbd_peer_request *peer_req = 2144 container_of(w, struct drbd_peer_request, w); 2145 struct drbd_connection *connection = peer_req->peer_device->connection; 2146 2147 return e_send_ack(w, connection->agreed_pro_version >= 100 ? 
2148 P_RETRY_WRITE : P_SUPERSEDED); 2149 } 2150 2151 static bool seq_greater(u32 a, u32 b) 2152 { 2153 /* 2154 * We assume 32-bit wrap-around here. 2155 * For 24-bit wrap-around, we would have to shift: 2156 * a <<= 8; b <<= 8; 2157 */ 2158 return (s32)a - (s32)b > 0; 2159 } 2160 2161 static u32 seq_max(u32 a, u32 b) 2162 { 2163 return seq_greater(a, b) ? a : b; 2164 } 2165 2166 static void update_peer_seq(struct drbd_peer_device *peer_device, unsigned int peer_seq) 2167 { 2168 struct drbd_device *device = peer_device->device; 2169 unsigned int newest_peer_seq; 2170 2171 if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)) { 2172 spin_lock(&device->peer_seq_lock); 2173 newest_peer_seq = seq_max(device->peer_seq, peer_seq); 2174 device->peer_seq = newest_peer_seq; 2175 spin_unlock(&device->peer_seq_lock); 2176 /* wake up only if we actually changed device->peer_seq */ 2177 if (peer_seq == newest_peer_seq) 2178 wake_up(&device->seq_wait); 2179 } 2180 } 2181 2182 static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2) 2183 { 2184 return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9))); 2185 } 2186 2187 /* maybe change sync_ee into interval trees as well? */ 2188 static bool overlapping_resync_write(struct drbd_device *device, struct drbd_peer_request *peer_req) 2189 { 2190 struct drbd_peer_request *rs_req; 2191 bool rv = false; 2192 2193 spin_lock_irq(&device->resource->req_lock); 2194 list_for_each_entry(rs_req, &device->sync_ee, w.list) { 2195 if (overlaps(peer_req->i.sector, peer_req->i.size, 2196 rs_req->i.sector, rs_req->i.size)) { 2197 rv = true; 2198 break; 2199 } 2200 } 2201 spin_unlock_irq(&device->resource->req_lock); 2202 2203 return rv; 2204 } 2205 2206 /* Called from receive_Data. 2207 * Synchronize packets on sock with packets on msock. 2208 * 2209 * This is here so even when a P_DATA packet traveling via sock overtook an Ack 2210 * packet traveling on msock, they are still processed in the order they have 2211 * been sent. 
2212 * 2213 * Note: we don't care for Ack packets overtaking P_DATA packets. 2214 * 2215 * In case packet_seq is larger than device->peer_seq number, there are 2216 * outstanding packets on the msock. We wait for them to arrive. 2217 * In case we are the logically next packet, we update device->peer_seq 2218 * ourselves. Correctly handles 32bit wrap around. 2219 * 2220 * Assume we have a 10 GBit connection, that is about 1<<30 byte per second, 2221 * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds 2222 * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have 2223 * 1<<9 == 512 seconds aka ages for the 32bit wrap around... 2224 * 2225 * returns 0 if we may process the packet, 2226 * -ERESTARTSYS if we were interrupted (by disconnect signal). */ 2227 static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, const u32 peer_seq) 2228 { 2229 struct drbd_device *device = peer_device->device; 2230 DEFINE_WAIT(wait); 2231 long timeout; 2232 int ret = 0, tp; 2233 2234 if (!test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)) 2235 return 0; 2236 2237 spin_lock(&device->peer_seq_lock); 2238 for (;;) { 2239 if (!seq_greater(peer_seq - 1, device->peer_seq)) { 2240 device->peer_seq = seq_max(device->peer_seq, peer_seq); 2241 break; 2242 } 2243 2244 if (signal_pending(current)) { 2245 ret = -ERESTARTSYS; 2246 break; 2247 } 2248 2249 rcu_read_lock(); 2250 tp = rcu_dereference(peer_device->connection->net_conf)->two_primaries; 2251 rcu_read_unlock(); 2252 2253 if (!tp) 2254 break; 2255 2256 /* Only need to wait if two_primaries is enabled */ 2257 prepare_to_wait(&device->seq_wait, &wait, TASK_INTERRUPTIBLE); 2258 spin_unlock(&device->peer_seq_lock); 2259 rcu_read_lock(); 2260 timeout = rcu_dereference(peer_device->connection->net_conf)->ping_timeo*HZ/10; 2261 rcu_read_unlock(); 2262 timeout = schedule_timeout(timeout); 2263 spin_lock(&device->peer_seq_lock); 2264 if (!timeout) { 2265 ret = -ETIMEDOUT; 
2266 drbd_err(device, "Timed out waiting for missing ack packets; disconnecting\n"); 2267 break; 2268 } 2269 } 2270 spin_unlock(&device->peer_seq_lock); 2271 finish_wait(&device->seq_wait, &wait); 2272 return ret; 2273 } 2274 2275 /* see also bio_flags_to_wire() 2276 * DRBD_REQ_*, because we need to semantically map the flags to data packet 2277 * flags and back. We may replicate to other kernel versions. */ 2278 static unsigned long wire_flags_to_bio_flags(u32 dpf) 2279 { 2280 return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) | 2281 (dpf & DP_FUA ? REQ_FUA : 0) | 2282 (dpf & DP_FLUSH ? REQ_PREFLUSH : 0); 2283 } 2284 2285 static unsigned long wire_flags_to_bio_op(u32 dpf) 2286 { 2287 if (dpf & DP_DISCARD) 2288 return REQ_OP_WRITE_ZEROES; 2289 else 2290 return REQ_OP_WRITE; 2291 } 2292 2293 static void fail_postponed_requests(struct drbd_device *device, sector_t sector, 2294 unsigned int size) 2295 { 2296 struct drbd_interval *i; 2297 2298 repeat: 2299 drbd_for_each_overlap(i, &device->write_requests, sector, size) { 2300 struct drbd_request *req; 2301 struct bio_and_error m; 2302 2303 if (!i->local) 2304 continue; 2305 req = container_of(i, struct drbd_request, i); 2306 if (!(req->rq_state & RQ_POSTPONED)) 2307 continue; 2308 req->rq_state &= ~RQ_POSTPONED; 2309 __req_mod(req, NEG_ACKED, &m); 2310 spin_unlock_irq(&device->resource->req_lock); 2311 if (m.bio) 2312 complete_master_bio(device, &m); 2313 spin_lock_irq(&device->resource->req_lock); 2314 goto repeat; 2315 } 2316 } 2317 2318 static int handle_write_conflicts(struct drbd_device *device, 2319 struct drbd_peer_request *peer_req) 2320 { 2321 struct drbd_connection *connection = peer_req->peer_device->connection; 2322 bool resolve_conflicts = test_bit(RESOLVE_CONFLICTS, &connection->flags); 2323 sector_t sector = peer_req->i.sector; 2324 const unsigned int size = peer_req->i.size; 2325 struct drbd_interval *i; 2326 bool equal; 2327 int err; 2328 2329 /* 2330 * Inserting the peer request into the write_requests tree 
will prevent 2331 * new conflicting local requests from being added. 2332 */ 2333 drbd_insert_interval(&device->write_requests, &peer_req->i); 2334 2335 repeat: 2336 drbd_for_each_overlap(i, &device->write_requests, sector, size) { 2337 if (i == &peer_req->i) 2338 continue; 2339 if (i->completed) 2340 continue; 2341 2342 if (!i->local) { 2343 /* 2344 * Our peer has sent a conflicting remote request; this 2345 * should not happen in a two-node setup. Wait for the 2346 * earlier peer request to complete. 2347 */ 2348 err = drbd_wait_misc(device, i); 2349 if (err) 2350 goto out; 2351 goto repeat; 2352 } 2353 2354 equal = i->sector == sector && i->size == size; 2355 if (resolve_conflicts) { 2356 /* 2357 * If the peer request is fully contained within the 2358 * overlapping request, it can be considered overwritten 2359 * and thus superseded; otherwise, it will be retried 2360 * once all overlapping requests have completed. 2361 */ 2362 bool superseded = i->sector <= sector && i->sector + 2363 (i->size >> 9) >= sector + (size >> 9); 2364 2365 if (!equal) 2366 drbd_alert(device, "Concurrent writes detected: " 2367 "local=%llus +%u, remote=%llus +%u, " 2368 "assuming %s came first\n", 2369 (unsigned long long)i->sector, i->size, 2370 (unsigned long long)sector, size, 2371 superseded ? "local" : "remote"); 2372 2373 peer_req->w.cb = superseded ? 
e_send_superseded : 2374 e_send_retry_write; 2375 list_add_tail(&peer_req->w.list, &device->done_ee); 2376 queue_work(connection->ack_sender, &peer_req->peer_device->send_acks_work); 2377 2378 err = -ENOENT; 2379 goto out; 2380 } else { 2381 struct drbd_request *req = 2382 container_of(i, struct drbd_request, i); 2383 2384 if (!equal) 2385 drbd_alert(device, "Concurrent writes detected: " 2386 "local=%llus +%u, remote=%llus +%u\n", 2387 (unsigned long long)i->sector, i->size, 2388 (unsigned long long)sector, size); 2389 2390 if (req->rq_state & RQ_LOCAL_PENDING || 2391 !(req->rq_state & RQ_POSTPONED)) { 2392 /* 2393 * Wait for the node with the discard flag to 2394 * decide if this request has been superseded 2395 * or needs to be retried. 2396 * Requests that have been superseded will 2397 * disappear from the write_requests tree. 2398 * 2399 * In addition, wait for the conflicting 2400 * request to finish locally before submitting 2401 * the conflicting peer request. 2402 */ 2403 err = drbd_wait_misc(device, &req->i); 2404 if (err) { 2405 _conn_request_state(connection, NS(conn, C_TIMEOUT), CS_HARD); 2406 fail_postponed_requests(device, sector, size); 2407 goto out; 2408 } 2409 goto repeat; 2410 } 2411 /* 2412 * Remember to restart the conflicting requests after 2413 * the new peer request has completed. 
2414 */ 2415 peer_req->flags |= EE_RESTART_REQUESTS; 2416 } 2417 } 2418 err = 0; 2419 2420 out: 2421 if (err) 2422 drbd_remove_epoch_entry_interval(device, peer_req); 2423 return err; 2424 } 2425 2426 /* mirrored write */ 2427 static int receive_Data(struct drbd_connection *connection, struct packet_info *pi) 2428 { 2429 struct drbd_peer_device *peer_device; 2430 struct drbd_device *device; 2431 struct net_conf *nc; 2432 sector_t sector; 2433 struct drbd_peer_request *peer_req; 2434 struct p_data *p = pi->data; 2435 u32 peer_seq = be32_to_cpu(p->seq_num); 2436 int op, op_flags; 2437 u32 dp_flags; 2438 int err, tp; 2439 2440 peer_device = conn_peer_device(connection, pi->vnr); 2441 if (!peer_device) 2442 return -EIO; 2443 device = peer_device->device; 2444 2445 if (!get_ldev(device)) { 2446 int err2; 2447 2448 err = wait_for_and_update_peer_seq(peer_device, peer_seq); 2449 drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size); 2450 atomic_inc(&connection->current_epoch->epoch_size); 2451 err2 = drbd_drain_block(peer_device, pi->size); 2452 if (!err) 2453 err = err2; 2454 return err; 2455 } 2456 2457 /* 2458 * Corresponding put_ldev done either below (on various errors), or in 2459 * drbd_peer_request_endio, if we successfully submit the data at the 2460 * end of this function. 
2461 */ 2462 2463 sector = be64_to_cpu(p->sector); 2464 peer_req = read_in_block(peer_device, p->block_id, sector, pi); 2465 if (!peer_req) { 2466 put_ldev(device); 2467 return -EIO; 2468 } 2469 2470 peer_req->w.cb = e_end_block; 2471 peer_req->submit_jif = jiffies; 2472 peer_req->flags |= EE_APPLICATION; 2473 2474 dp_flags = be32_to_cpu(p->dp_flags); 2475 op = wire_flags_to_bio_op(dp_flags); 2476 op_flags = wire_flags_to_bio_flags(dp_flags); 2477 if (pi->cmd == P_TRIM) { 2478 D_ASSERT(peer_device, peer_req->i.size > 0); 2479 D_ASSERT(peer_device, op == REQ_OP_WRITE_ZEROES); 2480 D_ASSERT(peer_device, peer_req->pages == NULL); 2481 } else if (peer_req->pages == NULL) { 2482 D_ASSERT(device, peer_req->i.size == 0); 2483 D_ASSERT(device, dp_flags & DP_FLUSH); 2484 } 2485 2486 if (dp_flags & DP_MAY_SET_IN_SYNC) 2487 peer_req->flags |= EE_MAY_SET_IN_SYNC; 2488 2489 spin_lock(&connection->epoch_lock); 2490 peer_req->epoch = connection->current_epoch; 2491 atomic_inc(&peer_req->epoch->epoch_size); 2492 atomic_inc(&peer_req->epoch->active); 2493 spin_unlock(&connection->epoch_lock); 2494 2495 rcu_read_lock(); 2496 nc = rcu_dereference(peer_device->connection->net_conf); 2497 tp = nc->two_primaries; 2498 if (peer_device->connection->agreed_pro_version < 100) { 2499 switch (nc->wire_protocol) { 2500 case DRBD_PROT_C: 2501 dp_flags |= DP_SEND_WRITE_ACK; 2502 break; 2503 case DRBD_PROT_B: 2504 dp_flags |= DP_SEND_RECEIVE_ACK; 2505 break; 2506 } 2507 } 2508 rcu_read_unlock(); 2509 2510 if (dp_flags & DP_SEND_WRITE_ACK) { 2511 peer_req->flags |= EE_SEND_WRITE_ACK; 2512 inc_unacked(device); 2513 /* corresponding dec_unacked() in e_end_block() 2514 * respective _drbd_clear_done_ee */ 2515 } 2516 2517 if (dp_flags & DP_SEND_RECEIVE_ACK) { 2518 /* I really don't like it that the receiver thread 2519 * sends on the msock, but anyways */ 2520 drbd_send_ack(peer_device, P_RECV_ACK, peer_req); 2521 } 2522 2523 if (tp) { 2524 /* two primaries implies protocol C */ 2525 D_ASSERT(device, 
dp_flags & DP_SEND_WRITE_ACK); 2526 peer_req->flags |= EE_IN_INTERVAL_TREE; 2527 err = wait_for_and_update_peer_seq(peer_device, peer_seq); 2528 if (err) 2529 goto out_interrupted; 2530 spin_lock_irq(&device->resource->req_lock); 2531 err = handle_write_conflicts(device, peer_req); 2532 if (err) { 2533 spin_unlock_irq(&device->resource->req_lock); 2534 if (err == -ENOENT) { 2535 put_ldev(device); 2536 return 0; 2537 } 2538 goto out_interrupted; 2539 } 2540 } else { 2541 update_peer_seq(peer_device, peer_seq); 2542 spin_lock_irq(&device->resource->req_lock); 2543 } 2544 /* TRIM and WRITE_SAME are processed synchronously, 2545 * we wait for all pending requests, respectively wait for 2546 * active_ee to become empty in drbd_submit_peer_request(); 2547 * better not add ourselves here. */ 2548 if ((peer_req->flags & (EE_IS_TRIM|EE_WRITE_SAME)) == 0) 2549 list_add_tail(&peer_req->w.list, &device->active_ee); 2550 spin_unlock_irq(&device->resource->req_lock); 2551 2552 if (device->state.conn == C_SYNC_TARGET) 2553 wait_event(device->ee_wait, !overlapping_resync_write(device, peer_req)); 2554 2555 if (device->state.pdsk < D_INCONSISTENT) { 2556 /* In case we have the only disk of the cluster, */ 2557 drbd_set_out_of_sync(device, peer_req->i.sector, peer_req->i.size); 2558 peer_req->flags &= ~EE_MAY_SET_IN_SYNC; 2559 drbd_al_begin_io(device, &peer_req->i); 2560 peer_req->flags |= EE_CALL_AL_COMPLETE_IO; 2561 } 2562 2563 err = drbd_submit_peer_request(device, peer_req, op, op_flags, 2564 DRBD_FAULT_DT_WR); 2565 if (!err) 2566 return 0; 2567 2568 /* don't care for the reason here */ 2569 drbd_err(device, "submit failed, triggering re-connect\n"); 2570 spin_lock_irq(&device->resource->req_lock); 2571 list_del(&peer_req->w.list); 2572 drbd_remove_epoch_entry_interval(device, peer_req); 2573 spin_unlock_irq(&device->resource->req_lock); 2574 if (peer_req->flags & EE_CALL_AL_COMPLETE_IO) { 2575 peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO; 2576 drbd_al_complete_io(device, 
&peer_req->i); 2577 } 2578 2579 out_interrupted: 2580 drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT | EV_CLEANUP); 2581 put_ldev(device); 2582 drbd_free_peer_req(device, peer_req); 2583 return err; 2584 } 2585 2586 /* We may throttle resync, if the lower device seems to be busy, 2587 * and current sync rate is above c_min_rate. 2588 * 2589 * To decide whether or not the lower device is busy, we use a scheme similar 2590 * to MD RAID is_mddev_idle(): if the partition stats reveal "significant" 2591 * (more than 64 sectors) of activity we cannot account for with our own resync 2592 * activity, it obviously is "busy". 2593 * 2594 * The current sync rate used here uses only the most recent two step marks, 2595 * to have a short time average so we can react faster. 2596 */ 2597 bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector, 2598 bool throttle_if_app_is_waiting) 2599 { 2600 struct lc_element *tmp; 2601 bool throttle = drbd_rs_c_min_rate_throttle(device); 2602 2603 if (!throttle || throttle_if_app_is_waiting) 2604 return throttle; 2605 2606 spin_lock_irq(&device->al_lock); 2607 tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector)); 2608 if (tmp) { 2609 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); 2610 if (test_bit(BME_PRIORITY, &bm_ext->flags)) 2611 throttle = false; 2612 /* Do not slow down if app IO is already waiting for this extent, 2613 * and our progress is necessary for application IO to complete. */ 2614 } 2615 spin_unlock_irq(&device->al_lock); 2616 2617 return throttle; 2618 } 2619 2620 bool drbd_rs_c_min_rate_throttle(struct drbd_device *device) 2621 { 2622 struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk; 2623 unsigned long db, dt, dbdt; 2624 unsigned int c_min_rate; 2625 int curr_events; 2626 2627 rcu_read_lock(); 2628 c_min_rate = rcu_dereference(device->ldev->disk_conf)->c_min_rate; 2629 rcu_read_unlock(); 2630 2631 /* feature disabled? 
*/ 2632 if (c_min_rate == 0) 2633 return false; 2634 2635 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + 2636 (int)part_stat_read(&disk->part0, sectors[1]) - 2637 atomic_read(&device->rs_sect_ev); 2638 2639 if (atomic_read(&device->ap_actlog_cnt) 2640 || curr_events - device->rs_last_events > 64) { 2641 unsigned long rs_left; 2642 int i; 2643 2644 device->rs_last_events = curr_events; 2645 2646 /* sync speed average over the last 2*DRBD_SYNC_MARK_STEP, 2647 * approx. */ 2648 i = (device->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS; 2649 2650 if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T) 2651 rs_left = device->ov_left; 2652 else 2653 rs_left = drbd_bm_total_weight(device) - device->rs_failed; 2654 2655 dt = ((long)jiffies - (long)device->rs_mark_time[i]) / HZ; 2656 if (!dt) 2657 dt++; 2658 db = device->rs_mark_left[i] - rs_left; 2659 dbdt = Bit2KB(db/dt); 2660 2661 if (dbdt > c_min_rate) 2662 return true; 2663 } 2664 return false; 2665 } 2666 2667 static int receive_DataRequest(struct drbd_connection *connection, struct packet_info *pi) 2668 { 2669 struct drbd_peer_device *peer_device; 2670 struct drbd_device *device; 2671 sector_t sector; 2672 sector_t capacity; 2673 struct drbd_peer_request *peer_req; 2674 struct digest_info *di = NULL; 2675 int size, verb; 2676 unsigned int fault_type; 2677 struct p_block_req *p = pi->data; 2678 2679 peer_device = conn_peer_device(connection, pi->vnr); 2680 if (!peer_device) 2681 return -EIO; 2682 device = peer_device->device; 2683 capacity = drbd_get_capacity(device->this_bdev); 2684 2685 sector = be64_to_cpu(p->sector); 2686 size = be32_to_cpu(p->blksize); 2687 2688 if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { 2689 drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, 2690 (unsigned long long)sector, size); 2691 return -EINVAL; 2692 } 2693 if (sector + (size>>9) > capacity) { 2694 drbd_err(device, "%s:%d: sector: %llus, size: %u\n", 
__FILE__, __LINE__, 2695 (unsigned long long)sector, size); 2696 return -EINVAL; 2697 } 2698 2699 if (!get_ldev_if_state(device, D_UP_TO_DATE)) { 2700 verb = 1; 2701 switch (pi->cmd) { 2702 case P_DATA_REQUEST: 2703 drbd_send_ack_rp(peer_device, P_NEG_DREPLY, p); 2704 break; 2705 case P_RS_THIN_REQ: 2706 case P_RS_DATA_REQUEST: 2707 case P_CSUM_RS_REQUEST: 2708 case P_OV_REQUEST: 2709 drbd_send_ack_rp(peer_device, P_NEG_RS_DREPLY , p); 2710 break; 2711 case P_OV_REPLY: 2712 verb = 0; 2713 dec_rs_pending(device); 2714 drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size, ID_IN_SYNC); 2715 break; 2716 default: 2717 BUG(); 2718 } 2719 if (verb && __ratelimit(&drbd_ratelimit_state)) 2720 drbd_err(device, "Can not satisfy peer's read request, " 2721 "no local data.\n"); 2722 2723 /* drain possibly payload */ 2724 return drbd_drain_block(peer_device, pi->size); 2725 } 2726 2727 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD 2728 * "criss-cross" setup, that might cause write-out on some other DRBD, 2729 * which in turn might block on the other node at this very place. */ 2730 peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size, 2731 size, GFP_NOIO); 2732 if (!peer_req) { 2733 put_ldev(device); 2734 return -ENOMEM; 2735 } 2736 2737 switch (pi->cmd) { 2738 case P_DATA_REQUEST: 2739 peer_req->w.cb = w_e_end_data_req; 2740 fault_type = DRBD_FAULT_DT_RD; 2741 /* application IO, don't drbd_rs_begin_io */ 2742 peer_req->flags |= EE_APPLICATION; 2743 goto submit; 2744 2745 case P_RS_THIN_REQ: 2746 /* If at some point in the future we have a smart way to 2747 find out if this data block is completely deallocated, 2748 then we would do something smarter here than reading 2749 the block... 
*/ 2750 peer_req->flags |= EE_RS_THIN_REQ; 2751 case P_RS_DATA_REQUEST: 2752 peer_req->w.cb = w_e_end_rsdata_req; 2753 fault_type = DRBD_FAULT_RS_RD; 2754 /* used in the sector offset progress display */ 2755 device->bm_resync_fo = BM_SECT_TO_BIT(sector); 2756 break; 2757 2758 case P_OV_REPLY: 2759 case P_CSUM_RS_REQUEST: 2760 fault_type = DRBD_FAULT_RS_RD; 2761 di = kmalloc(sizeof(*di) + pi->size, GFP_NOIO); 2762 if (!di) 2763 goto out_free_e; 2764 2765 di->digest_size = pi->size; 2766 di->digest = (((char *)di)+sizeof(struct digest_info)); 2767 2768 peer_req->digest = di; 2769 peer_req->flags |= EE_HAS_DIGEST; 2770 2771 if (drbd_recv_all(peer_device->connection, di->digest, pi->size)) 2772 goto out_free_e; 2773 2774 if (pi->cmd == P_CSUM_RS_REQUEST) { 2775 D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89); 2776 peer_req->w.cb = w_e_end_csum_rs_req; 2777 /* used in the sector offset progress display */ 2778 device->bm_resync_fo = BM_SECT_TO_BIT(sector); 2779 /* remember to report stats in drbd_resync_finished */ 2780 device->use_csums = true; 2781 } else if (pi->cmd == P_OV_REPLY) { 2782 /* track progress, we may need to throttle */ 2783 atomic_add(size >> 9, &device->rs_sect_in); 2784 peer_req->w.cb = w_e_end_ov_reply; 2785 dec_rs_pending(device); 2786 /* drbd_rs_begin_io done when we sent this request, 2787 * but accounting still needs to be done. 
*/ 2788 goto submit_for_resync; 2789 } 2790 break; 2791 2792 case P_OV_REQUEST: 2793 if (device->ov_start_sector == ~(sector_t)0 && 2794 peer_device->connection->agreed_pro_version >= 90) { 2795 unsigned long now = jiffies; 2796 int i; 2797 device->ov_start_sector = sector; 2798 device->ov_position = sector; 2799 device->ov_left = drbd_bm_bits(device) - BM_SECT_TO_BIT(sector); 2800 device->rs_total = device->ov_left; 2801 for (i = 0; i < DRBD_SYNC_MARKS; i++) { 2802 device->rs_mark_left[i] = device->ov_left; 2803 device->rs_mark_time[i] = now; 2804 } 2805 drbd_info(device, "Online Verify start sector: %llu\n", 2806 (unsigned long long)sector); 2807 } 2808 peer_req->w.cb = w_e_end_ov_req; 2809 fault_type = DRBD_FAULT_RS_RD; 2810 break; 2811 2812 default: 2813 BUG(); 2814 } 2815 2816 /* Throttle, drbd_rs_begin_io and submit should become asynchronous 2817 * wrt the receiver, but it is not as straightforward as it may seem. 2818 * Various places in the resync start and stop logic assume resync 2819 * requests are processed in order, requeuing this on the worker thread 2820 * introduces a bunch of new code for synchronization between threads. 2821 * 2822 * Unlimited throttling before drbd_rs_begin_io may stall the resync 2823 * "forever", throttling after drbd_rs_begin_io will lock that extent 2824 * for application writes for the same time. For now, just throttle 2825 * here, where the rest of the code expects the receiver to sleep for 2826 * a while, anyways. 2827 */ 2828 2829 /* Throttle before drbd_rs_begin_io, as that locks out application IO; 2830 * this defers syncer requests for some time, before letting at least 2831 * on request through. The resync controller on the receiving side 2832 * will adapt to the incoming rate accordingly. 2833 * 2834 * We cannot throttle here if remote is Primary/SyncTarget: 2835 * we would also throttle its application reads. 2836 * In that case, throttling is done on the SyncTarget only. 
2837 */ 2838 2839 /* Even though this may be a resync request, we do add to "read_ee"; 2840 * "sync_ee" is only used for resync WRITEs. 2841 * Add to list early, so debugfs can find this request 2842 * even if we have to sleep below. */ 2843 spin_lock_irq(&device->resource->req_lock); 2844 list_add_tail(&peer_req->w.list, &device->read_ee); 2845 spin_unlock_irq(&device->resource->req_lock); 2846 2847 update_receiver_timing_details(connection, drbd_rs_should_slow_down); 2848 if (device->state.peer != R_PRIMARY 2849 && drbd_rs_should_slow_down(device, sector, false)) 2850 schedule_timeout_uninterruptible(HZ/10); 2851 update_receiver_timing_details(connection, drbd_rs_begin_io); 2852 if (drbd_rs_begin_io(device, sector)) 2853 goto out_free_e; 2854 2855 submit_for_resync: 2856 atomic_add(size >> 9, &device->rs_sect_ev); 2857 2858 submit: 2859 update_receiver_timing_details(connection, drbd_submit_peer_request); 2860 inc_unacked(device); 2861 if (drbd_submit_peer_request(device, peer_req, REQ_OP_READ, 0, 2862 fault_type) == 0) 2863 return 0; 2864 2865 /* don't care for the reason here */ 2866 drbd_err(device, "submit failed, triggering re-connect\n"); 2867 2868 out_free_e: 2869 spin_lock_irq(&device->resource->req_lock); 2870 list_del(&peer_req->w.list); 2871 spin_unlock_irq(&device->resource->req_lock); 2872 /* no drbd_rs_complete_io(), we are dropping the connection anyways */ 2873 2874 put_ldev(device); 2875 drbd_free_peer_req(device, peer_req); 2876 return -EIO; 2877 } 2878 2879 /** 2880 * drbd_asb_recover_0p - Recover after split-brain with no remaining primaries 2881 */ 2882 static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold(local) 2883 { 2884 struct drbd_device *device = peer_device->device; 2885 int self, peer, rv = -100; 2886 unsigned long ch_self, ch_peer; 2887 enum drbd_after_sb_p after_sb_0p; 2888 2889 self = device->ldev->md.uuid[UI_BITMAP] & 1; 2890 peer = device->p_uuid[UI_BITMAP] & 1; 2891 2892 ch_peer = 
device->p_uuid[UI_SIZE]; 2893 ch_self = device->comm_bm_set; 2894 2895 rcu_read_lock(); 2896 after_sb_0p = rcu_dereference(peer_device->connection->net_conf)->after_sb_0p; 2897 rcu_read_unlock(); 2898 switch (after_sb_0p) { 2899 case ASB_CONSENSUS: 2900 case ASB_DISCARD_SECONDARY: 2901 case ASB_CALL_HELPER: 2902 case ASB_VIOLENTLY: 2903 drbd_err(device, "Configuration error.\n"); 2904 break; 2905 case ASB_DISCONNECT: 2906 break; 2907 case ASB_DISCARD_YOUNGER_PRI: 2908 if (self == 0 && peer == 1) { 2909 rv = -1; 2910 break; 2911 } 2912 if (self == 1 && peer == 0) { 2913 rv = 1; 2914 break; 2915 } 2916 /* Else fall through to one of the other strategies... */ 2917 case ASB_DISCARD_OLDER_PRI: 2918 if (self == 0 && peer == 1) { 2919 rv = 1; 2920 break; 2921 } 2922 if (self == 1 && peer == 0) { 2923 rv = -1; 2924 break; 2925 } 2926 /* Else fall through to one of the other strategies... */ 2927 drbd_warn(device, "Discard younger/older primary did not find a decision\n" 2928 "Using discard-least-changes instead\n"); 2929 case ASB_DISCARD_ZERO_CHG: 2930 if (ch_peer == 0 && ch_self == 0) { 2931 rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags) 2932 ? -1 : 1; 2933 break; 2934 } else { 2935 if (ch_peer == 0) { rv = 1; break; } 2936 if (ch_self == 0) { rv = -1; break; } 2937 } 2938 if (after_sb_0p == ASB_DISCARD_ZERO_CHG) 2939 break; 2940 case ASB_DISCARD_LEAST_CHG: 2941 if (ch_self < ch_peer) 2942 rv = -1; 2943 else if (ch_self > ch_peer) 2944 rv = 1; 2945 else /* ( ch_self == ch_peer ) */ 2946 /* Well, then use something else. */ 2947 rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags) 2948 ? 
-1 : 1; 2949 break; 2950 case ASB_DISCARD_LOCAL: 2951 rv = -1; 2952 break; 2953 case ASB_DISCARD_REMOTE: 2954 rv = 1; 2955 } 2956 2957 return rv; 2958 } 2959 2960 /** 2961 * drbd_asb_recover_1p - Recover after split-brain with one remaining primary 2962 */ 2963 static int drbd_asb_recover_1p(struct drbd_peer_device *peer_device) __must_hold(local) 2964 { 2965 struct drbd_device *device = peer_device->device; 2966 int hg, rv = -100; 2967 enum drbd_after_sb_p after_sb_1p; 2968 2969 rcu_read_lock(); 2970 after_sb_1p = rcu_dereference(peer_device->connection->net_conf)->after_sb_1p; 2971 rcu_read_unlock(); 2972 switch (after_sb_1p) { 2973 case ASB_DISCARD_YOUNGER_PRI: 2974 case ASB_DISCARD_OLDER_PRI: 2975 case ASB_DISCARD_LEAST_CHG: 2976 case ASB_DISCARD_LOCAL: 2977 case ASB_DISCARD_REMOTE: 2978 case ASB_DISCARD_ZERO_CHG: 2979 drbd_err(device, "Configuration error.\n"); 2980 break; 2981 case ASB_DISCONNECT: 2982 break; 2983 case ASB_CONSENSUS: 2984 hg = drbd_asb_recover_0p(peer_device); 2985 if (hg == -1 && device->state.role == R_SECONDARY) 2986 rv = hg; 2987 if (hg == 1 && device->state.role == R_PRIMARY) 2988 rv = hg; 2989 break; 2990 case ASB_VIOLENTLY: 2991 rv = drbd_asb_recover_0p(peer_device); 2992 break; 2993 case ASB_DISCARD_SECONDARY: 2994 return device->state.role == R_PRIMARY ? 1 : -1; 2995 case ASB_CALL_HELPER: 2996 hg = drbd_asb_recover_0p(peer_device); 2997 if (hg == -1 && device->state.role == R_PRIMARY) { 2998 enum drbd_state_rv rv2; 2999 3000 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE, 3001 * we might be here in C_WF_REPORT_PARAMS which is transient. 3002 * we do not need to wait for the after state change work either. 
*/ 3003 rv2 = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY)); 3004 if (rv2 != SS_SUCCESS) { 3005 drbd_khelper(device, "pri-lost-after-sb"); 3006 } else { 3007 drbd_warn(device, "Successfully gave up primary role.\n"); 3008 rv = hg; 3009 } 3010 } else 3011 rv = hg; 3012 } 3013 3014 return rv; 3015 } 3016 3017 /** 3018 * drbd_asb_recover_2p - Recover after split-brain with two remaining primaries 3019 */ 3020 static int drbd_asb_recover_2p(struct drbd_peer_device *peer_device) __must_hold(local) 3021 { 3022 struct drbd_device *device = peer_device->device; 3023 int hg, rv = -100; 3024 enum drbd_after_sb_p after_sb_2p; 3025 3026 rcu_read_lock(); 3027 after_sb_2p = rcu_dereference(peer_device->connection->net_conf)->after_sb_2p; 3028 rcu_read_unlock(); 3029 switch (after_sb_2p) { 3030 case ASB_DISCARD_YOUNGER_PRI: 3031 case ASB_DISCARD_OLDER_PRI: 3032 case ASB_DISCARD_LEAST_CHG: 3033 case ASB_DISCARD_LOCAL: 3034 case ASB_DISCARD_REMOTE: 3035 case ASB_CONSENSUS: 3036 case ASB_DISCARD_SECONDARY: 3037 case ASB_DISCARD_ZERO_CHG: 3038 drbd_err(device, "Configuration error.\n"); 3039 break; 3040 case ASB_VIOLENTLY: 3041 rv = drbd_asb_recover_0p(peer_device); 3042 break; 3043 case ASB_DISCONNECT: 3044 break; 3045 case ASB_CALL_HELPER: 3046 hg = drbd_asb_recover_0p(peer_device); 3047 if (hg == -1) { 3048 enum drbd_state_rv rv2; 3049 3050 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE, 3051 * we might be here in C_WF_REPORT_PARAMS which is transient. 3052 * we do not need to wait for the after state change work either. 
*/ 3053 rv2 = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY)); 3054 if (rv2 != SS_SUCCESS) { 3055 drbd_khelper(device, "pri-lost-after-sb"); 3056 } else { 3057 drbd_warn(device, "Successfully gave up primary role.\n"); 3058 rv = hg; 3059 } 3060 } else 3061 rv = hg; 3062 } 3063 3064 return rv; 3065 } 3066 3067 static void drbd_uuid_dump(struct drbd_device *device, char *text, u64 *uuid, 3068 u64 bits, u64 flags) 3069 { 3070 if (!uuid) { 3071 drbd_info(device, "%s uuid info vanished while I was looking!\n", text); 3072 return; 3073 } 3074 drbd_info(device, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n", 3075 text, 3076 (unsigned long long)uuid[UI_CURRENT], 3077 (unsigned long long)uuid[UI_BITMAP], 3078 (unsigned long long)uuid[UI_HISTORY_START], 3079 (unsigned long long)uuid[UI_HISTORY_END], 3080 (unsigned long long)bits, 3081 (unsigned long long)flags); 3082 } 3083 3084 /* 3085 100 after split brain try auto recover 3086 2 C_SYNC_SOURCE set BitMap 3087 1 C_SYNC_SOURCE use BitMap 3088 0 no Sync 3089 -1 C_SYNC_TARGET use BitMap 3090 -2 C_SYNC_TARGET set BitMap 3091 -100 after split brain, disconnect 3092 -1000 unrelated data 3093 -1091 requires proto 91 3094 -1096 requires proto 96 3095 */ 3096 3097 static int drbd_uuid_compare(struct drbd_device *const device, enum drbd_role const peer_role, int *rule_nr) __must_hold(local) 3098 { 3099 struct drbd_peer_device *const peer_device = first_peer_device(device); 3100 struct drbd_connection *const connection = peer_device ? 
/*
 * drbd_uuid_compare - compare our UUID set with the peer's and pick a sync direction
 * @device:    device whose local (ldev->md.uuid) and peer (p_uuid) UUIDs are compared
 * @peer_role: role the peer reported
 * @rule_nr:   out parameter; set to the number of the rule that was being
 *             evaluated when the function returned
 *
 * All comparisons ignore the lowest bit of each UUID.
 * Note that several rules do not only compare: they also repair the local
 * or the peer UUID set in place before returning.
 *
 * Return values:
 *   100	after split brain try auto recover
 *     2	C_SYNC_SOURCE set BitMap
 *     1	C_SYNC_SOURCE use BitMap
 *     0	no Sync
 *    -1	C_SYNC_TARGET use BitMap
 *    -2	C_SYNC_TARGET set BitMap
 *  -100	after split brain, disconnect
 * -1000	unrelated data
 * -1091	requires proto 91
 * -1096	requires proto 96
 * -(0x10000 | PRO_VERSION_MAX | (DRBD_FF_WSAME << 8))
 *		peer lacks a required protocol version / feature flag (rule 41)
 *
 * NOTE(review): connection is NULL when first_peer_device() returns NULL, yet
 * it is dereferenced unconditionally below -- presumably callers guarantee a
 * peer device exists; confirm.
 */
static int drbd_uuid_compare(struct drbd_device *const device, enum drbd_role const peer_role, int *rule_nr) __must_hold(local)
{
	struct drbd_peer_device *const peer_device = first_peer_device(device);
	struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
	u64 self, peer;
	int i, j;

	self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
	peer = device->p_uuid[UI_CURRENT] & ~((u64)1);

	/* Rule 10: both sides are freshly created, nothing to sync. */
	*rule_nr = 10;
	if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED)
		return 0;

	/* Rule 20: only we are fresh (or have no current UUID): full sync as target. */
	*rule_nr = 20;
	if ((self == UUID_JUST_CREATED || self == (u64)0) &&
	     peer != UUID_JUST_CREATED)
		return -2;

	/* Rule 30: only the peer is fresh (or has no current UUID): full sync as source. */
	*rule_nr = 30;
	if (self != UUID_JUST_CREATED &&
	    (peer == UUID_JUST_CREATED || peer == (u64)0))
		return 2;

	if (self == peer) {
		int rct, dc; /* roles at crash time */

		/* Rules 34/36: we still carry a bitmap UUID, the peer does not. */
		if (device->p_uuid[UI_BITMAP] == (u64)0 && device->ldev->md.uuid[UI_BITMAP] != (u64)0) {

			if (connection->agreed_pro_version < 91)
				return -1091;

			if ((device->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
			    (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) {
				drbd_info(device, "was SyncSource, missed the resync finished event, corrected myself:\n");
				/* rotate our bitmap UUID into the history, as the peer already did */
				drbd_uuid_move_history(device);
				device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[UI_BITMAP];
				device->ldev->md.uuid[UI_BITMAP] = 0;

				drbd_uuid_dump(device, "self", device->ldev->md.uuid,
					       device->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(device) : 0, 0);
				*rule_nr = 34;
			} else {
				drbd_info(device, "was SyncSource (peer failed to write sync_uuid)\n");
				*rule_nr = 36;
			}

			return 1;
		}

		/* Rules 35/37: mirror image -- the peer still carries a bitmap UUID, we do not. */
		if (device->ldev->md.uuid[UI_BITMAP] == (u64)0 && device->p_uuid[UI_BITMAP] != (u64)0) {

			if (connection->agreed_pro_version < 91)
				return -1091;

			if ((device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_BITMAP] & ~((u64)1)) &&
			    (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1))) {
				drbd_info(device, "was SyncTarget, peer missed the resync finished event, corrected peer:\n");

				/* rotate the peer's bitmap UUID into our copy of its history */
				device->p_uuid[UI_HISTORY_START + 1] = device->p_uuid[UI_HISTORY_START];
				device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_BITMAP];
				device->p_uuid[UI_BITMAP] = 0UL;

				drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
				*rule_nr = 35;
			} else {
				drbd_info(device, "was SyncTarget (failed to write sync_uuid)\n");
				*rule_nr = 37;
			}

			return -1;
		}

		/* Common power [off|failure] */
		rct = (test_bit(CRASHED_PRIMARY, &device->flags) ? 1 : 0) +
			(device->p_uuid[UI_FLAGS] & 2);
		/* lowest bit is set when we were primary,
		 * next bit (weight 2) is set when peer was primary */
		*rule_nr = 40;

		/* Neither has the "crashed primary" flag set,
		 * only a replication link hiccup. */
		if (rct == 0)
			return 0;

		/* Current UUID equal and no bitmap uuid; does not necessarily
		 * mean this was a "simultaneous hard crash", maybe IO was
		 * frozen, so no UUID-bump happened.
		 * This is a protocol change, overload DRBD_FF_WSAME as flag
		 * for "new-enough" peer DRBD version. */
		if (device->state.role == R_PRIMARY || peer_role == R_PRIMARY) {
			*rule_nr = 41;
			if (!(connection->agreed_features & DRBD_FF_WSAME)) {
				drbd_warn(peer_device, "Equivalent unrotated UUIDs, but current primary present.\n");
				/* encodes required protocol (low byte) and feature flags (next byte);
				 * drbd_sync_handshake() decodes values below -0x10000 accordingly */
				return -(0x10000 | PRO_VERSION_MAX | (DRBD_FF_WSAME << 8));
			}
			if (device->state.role == R_PRIMARY && peer_role == R_PRIMARY) {
				/* At least one has the "crashed primary" bit set,
				 * both are primary now, but neither has rotated its UUIDs?
				 * "Can not happen." */
				drbd_err(peer_device, "Equivalent unrotated UUIDs, but both are primary. Can not resolve this.\n");
				return -100;
			}
			/* exactly one side is primary now: that side becomes sync source */
			if (device->state.role == R_PRIMARY)
				return 1;
			return -1;
		}

		/* Both are secondary.
		 * Really looks like recovery from simultaneous hard crash.
		 * Check which had been primary before, and arbitrate. */
		switch (rct) {
		case 0: /* !self_pri && !peer_pri */ return 0; /* already handled */
		case 1: /*  self_pri && !peer_pri */ return 1;
		case 2: /* !self_pri &&  peer_pri */ return -1;
		case 3: /*  self_pri &&  peer_pri */
			/* tie break on the connection's RESOLVE_CONFLICTS flag */
			dc = test_bit(RESOLVE_CONFLICTS, &connection->flags);
			return dc ? -1 : 1;
		}
	}

	/* Rule 50: our current UUID equals the peer's bitmap UUID. */
	*rule_nr = 50;
	peer = device->p_uuid[UI_BITMAP] & ~((u64)1);
	if (self == peer)
		return -1;

	/* Rule 51: our current UUID equals the peer's newest history UUID. */
	*rule_nr = 51;
	peer = device->p_uuid[UI_HISTORY_START] & ~((u64)1);
	if (self == peer) {
		if (connection->agreed_pro_version < 96 ?
		    (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) ==
		    (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) :
		    peer + UUID_NEW_BM_OFFSET == (device->p_uuid[UI_BITMAP] & ~((u64)1))) {
			/* The last P_SYNC_UUID did not get through. Undo the last start of
			   resync as sync source modifications of the peer's UUIDs. */

			if (connection->agreed_pro_version < 91)
				return -1091;

			device->p_uuid[UI_BITMAP] = device->p_uuid[UI_HISTORY_START];
			device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_HISTORY_START + 1];

			drbd_info(device, "Lost last syncUUID packet, corrected:\n");
			drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);

			return -1;
		}
	}

	/* Rule 60: our current UUID appears somewhere in the peer's history. */
	*rule_nr = 60;
	self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
	for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
		peer = device->p_uuid[i] & ~((u64)1);
		if (self == peer)
			return -2;
	}

	/* Rule 70: our bitmap UUID equals the peer's current UUID. */
	*rule_nr = 70;
	self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
	peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
	if (self == peer)
		return 1;

	/* Rule 71: our newest history UUID equals the peer's current UUID. */
	*rule_nr = 71;
	self = device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
	if (self == peer) {
		if (connection->agreed_pro_version < 96 ?
		    (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) ==
		    (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) :
		    self + UUID_NEW_BM_OFFSET == (device->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) {
			/* The last P_SYNC_UUID did not get through. Undo the last start of
			   resync as sync source modifications of our UUIDs. */

			if (connection->agreed_pro_version < 91)
				return -1091;

			__drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_HISTORY_START]);
			__drbd_uuid_set(device, UI_HISTORY_START, device->ldev->md.uuid[UI_HISTORY_START + 1]);

			drbd_info(device, "Last syncUUID did not get through, corrected:\n");
			drbd_uuid_dump(device, "self", device->ldev->md.uuid,
				       device->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(device) : 0, 0);

			return 1;
		}
	}


	/* Rule 80: the peer's current UUID appears somewhere in our history. */
	*rule_nr = 80;
	peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
	for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
		self = device->ldev->md.uuid[i] & ~((u64)1);
		if (self == peer)
			return 2;
	}

	/* Rule 90: both bitmap UUIDs are equal and non-zero: split brain, try auto recovery. */
	*rule_nr = 90;
	self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
	peer = device->p_uuid[UI_BITMAP] & ~((u64)1);
	if (self == peer && self != ((u64)0))
		return 100;

	/* Rule 100: any common entry in both histories: split brain, disconnect. */
	*rule_nr = 100;
	for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
		self = device->ldev->md.uuid[i] & ~((u64)1);
		for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) {
			peer = device->p_uuid[j] & ~((u64)1);
			if (self == peer)
				return -100;
		}
	}

	/* nothing in common at all */
	return -1000;
}
drbd_bm_total_weight(device) : 0, 0); 3279 3280 return 1; 3281 } 3282 } 3283 3284 3285 *rule_nr = 80; 3286 peer = device->p_uuid[UI_CURRENT] & ~((u64)1); 3287 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) { 3288 self = device->ldev->md.uuid[i] & ~((u64)1); 3289 if (self == peer) 3290 return 2; 3291 } 3292 3293 *rule_nr = 90; 3294 self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1); 3295 peer = device->p_uuid[UI_BITMAP] & ~((u64)1); 3296 if (self == peer && self != ((u64)0)) 3297 return 100; 3298 3299 *rule_nr = 100; 3300 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) { 3301 self = device->ldev->md.uuid[i] & ~((u64)1); 3302 for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) { 3303 peer = device->p_uuid[j] & ~((u64)1); 3304 if (self == peer) 3305 return -100; 3306 } 3307 } 3308 3309 return -1000; 3310 } 3311 3312 /* drbd_sync_handshake() returns the new conn state on success, or 3313 CONN_MASK (-1) on failure. 3314 */ 3315 static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device, 3316 enum drbd_role peer_role, 3317 enum drbd_disk_state peer_disk) __must_hold(local) 3318 { 3319 struct drbd_device *device = peer_device->device; 3320 enum drbd_conns rv = C_MASK; 3321 enum drbd_disk_state mydisk; 3322 struct net_conf *nc; 3323 int hg, rule_nr, rr_conflict, tentative; 3324 3325 mydisk = device->state.disk; 3326 if (mydisk == D_NEGOTIATING) 3327 mydisk = device->new_state_tmp.disk; 3328 3329 drbd_info(device, "drbd_sync_handshake:\n"); 3330 3331 spin_lock_irq(&device->ldev->md.uuid_lock); 3332 drbd_uuid_dump(device, "self", device->ldev->md.uuid, device->comm_bm_set, 0); 3333 drbd_uuid_dump(device, "peer", device->p_uuid, 3334 device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]); 3335 3336 hg = drbd_uuid_compare(device, peer_role, &rule_nr); 3337 spin_unlock_irq(&device->ldev->md.uuid_lock); 3338 3339 drbd_info(device, "uuid_compare()=%d by rule %d\n", hg, rule_nr); 3340 3341 if (hg == -1000) { 3342 drbd_alert(device, "Unrelated data, 
aborting!\n"); 3343 return C_MASK; 3344 } 3345 if (hg < -0x10000) { 3346 int proto, fflags; 3347 hg = -hg; 3348 proto = hg & 0xff; 3349 fflags = (hg >> 8) & 0xff; 3350 drbd_alert(device, "To resolve this both sides have to support at least protocol %d and feature flags 0x%x\n", 3351 proto, fflags); 3352 return C_MASK; 3353 } 3354 if (hg < -1000) { 3355 drbd_alert(device, "To resolve this both sides have to support at least protocol %d\n", -hg - 1000); 3356 return C_MASK; 3357 } 3358 3359 if ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) || 3360 (peer_disk == D_INCONSISTENT && mydisk > D_INCONSISTENT)) { 3361 int f = (hg == -100) || abs(hg) == 2; 3362 hg = mydisk > D_INCONSISTENT ? 1 : -1; 3363 if (f) 3364 hg = hg*2; 3365 drbd_info(device, "Becoming sync %s due to disk states.\n", 3366 hg > 0 ? "source" : "target"); 3367 } 3368 3369 if (abs(hg) == 100) 3370 drbd_khelper(device, "initial-split-brain"); 3371 3372 rcu_read_lock(); 3373 nc = rcu_dereference(peer_device->connection->net_conf); 3374 3375 if (hg == 100 || (hg == -100 && nc->always_asbp)) { 3376 int pcount = (device->state.role == R_PRIMARY) 3377 + (peer_role == R_PRIMARY); 3378 int forced = (hg == -100); 3379 3380 switch (pcount) { 3381 case 0: 3382 hg = drbd_asb_recover_0p(peer_device); 3383 break; 3384 case 1: 3385 hg = drbd_asb_recover_1p(peer_device); 3386 break; 3387 case 2: 3388 hg = drbd_asb_recover_2p(peer_device); 3389 break; 3390 } 3391 if (abs(hg) < 100) { 3392 drbd_warn(device, "Split-Brain detected, %d primaries, " 3393 "automatically solved. Sync from %s node\n", 3394 pcount, (hg < 0) ? 
"peer" : "this"); 3395 if (forced) { 3396 drbd_warn(device, "Doing a full sync, since" 3397 " UUIDs where ambiguous.\n"); 3398 hg = hg*2; 3399 } 3400 } 3401 } 3402 3403 if (hg == -100) { 3404 if (test_bit(DISCARD_MY_DATA, &device->flags) && !(device->p_uuid[UI_FLAGS]&1)) 3405 hg = -1; 3406 if (!test_bit(DISCARD_MY_DATA, &device->flags) && (device->p_uuid[UI_FLAGS]&1)) 3407 hg = 1; 3408 3409 if (abs(hg) < 100) 3410 drbd_warn(device, "Split-Brain detected, manually solved. " 3411 "Sync from %s node\n", 3412 (hg < 0) ? "peer" : "this"); 3413 } 3414 rr_conflict = nc->rr_conflict; 3415 tentative = nc->tentative; 3416 rcu_read_unlock(); 3417 3418 if (hg == -100) { 3419 /* FIXME this log message is not correct if we end up here 3420 * after an attempted attach on a diskless node. 3421 * We just refuse to attach -- well, we drop the "connection" 3422 * to that disk, in a way... */ 3423 drbd_alert(device, "Split-Brain detected but unresolved, dropping connection!\n"); 3424 drbd_khelper(device, "split-brain"); 3425 return C_MASK; 3426 } 3427 3428 if (hg > 0 && mydisk <= D_INCONSISTENT) { 3429 drbd_err(device, "I shall become SyncSource, but I am inconsistent!\n"); 3430 return C_MASK; 3431 } 3432 3433 if (hg < 0 && /* by intention we do not use mydisk here. 
*/ 3434 device->state.role == R_PRIMARY && device->state.disk >= D_CONSISTENT) { 3435 switch (rr_conflict) { 3436 case ASB_CALL_HELPER: 3437 drbd_khelper(device, "pri-lost"); 3438 /* fall through */ 3439 case ASB_DISCONNECT: 3440 drbd_err(device, "I shall become SyncTarget, but I am primary!\n"); 3441 return C_MASK; 3442 case ASB_VIOLENTLY: 3443 drbd_warn(device, "Becoming SyncTarget, violating the stable-data" 3444 "assumption\n"); 3445 } 3446 } 3447 3448 if (tentative || test_bit(CONN_DRY_RUN, &peer_device->connection->flags)) { 3449 if (hg == 0) 3450 drbd_info(device, "dry-run connect: No resync, would become Connected immediately.\n"); 3451 else 3452 drbd_info(device, "dry-run connect: Would become %s, doing a %s resync.", 3453 drbd_conn_str(hg > 0 ? C_SYNC_SOURCE : C_SYNC_TARGET), 3454 abs(hg) >= 2 ? "full" : "bit-map based"); 3455 return C_MASK; 3456 } 3457 3458 if (abs(hg) >= 2) { 3459 drbd_info(device, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n"); 3460 if (drbd_bitmap_io(device, &drbd_bmio_set_n_write, "set_n_write from sync_handshake", 3461 BM_LOCKED_SET_ALLOWED)) 3462 return C_MASK; 3463 } 3464 3465 if (hg > 0) { /* become sync source. */ 3466 rv = C_WF_BITMAP_S; 3467 } else if (hg < 0) { /* become sync target */ 3468 rv = C_WF_BITMAP_T; 3469 } else { 3470 rv = C_CONNECTED; 3471 if (drbd_bm_total_weight(device)) { 3472 drbd_info(device, "No resync, but %lu bits in bitmap!\n", 3473 drbd_bm_total_weight(device)); 3474 } 3475 } 3476 3477 return rv; 3478 } 3479 3480 static enum drbd_after_sb_p convert_after_sb(enum drbd_after_sb_p peer) 3481 { 3482 /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */ 3483 if (peer == ASB_DISCARD_REMOTE) 3484 return ASB_DISCARD_LOCAL; 3485 3486 /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */ 3487 if (peer == ASB_DISCARD_LOCAL) 3488 return ASB_DISCARD_REMOTE; 3489 3490 /* everything else is valid if they are equal on both sides. 
*/ 3491 return peer; 3492 } 3493 3494 static int receive_protocol(struct drbd_connection *connection, struct packet_info *pi) 3495 { 3496 struct p_protocol *p = pi->data; 3497 enum drbd_after_sb_p p_after_sb_0p, p_after_sb_1p, p_after_sb_2p; 3498 int p_proto, p_discard_my_data, p_two_primaries, cf; 3499 struct net_conf *nc, *old_net_conf, *new_net_conf = NULL; 3500 char integrity_alg[SHARED_SECRET_MAX] = ""; 3501 struct crypto_ahash *peer_integrity_tfm = NULL; 3502 void *int_dig_in = NULL, *int_dig_vv = NULL; 3503 3504 p_proto = be32_to_cpu(p->protocol); 3505 p_after_sb_0p = be32_to_cpu(p->after_sb_0p); 3506 p_after_sb_1p = be32_to_cpu(p->after_sb_1p); 3507 p_after_sb_2p = be32_to_cpu(p->after_sb_2p); 3508 p_two_primaries = be32_to_cpu(p->two_primaries); 3509 cf = be32_to_cpu(p->conn_flags); 3510 p_discard_my_data = cf & CF_DISCARD_MY_DATA; 3511 3512 if (connection->agreed_pro_version >= 87) { 3513 int err; 3514 3515 if (pi->size > sizeof(integrity_alg)) 3516 return -EIO; 3517 err = drbd_recv_all(connection, integrity_alg, pi->size); 3518 if (err) 3519 return err; 3520 integrity_alg[SHARED_SECRET_MAX - 1] = 0; 3521 } 3522 3523 if (pi->cmd != P_PROTOCOL_UPDATE) { 3524 clear_bit(CONN_DRY_RUN, &connection->flags); 3525 3526 if (cf & CF_DRY_RUN) 3527 set_bit(CONN_DRY_RUN, &connection->flags); 3528 3529 rcu_read_lock(); 3530 nc = rcu_dereference(connection->net_conf); 3531 3532 if (p_proto != nc->wire_protocol) { 3533 drbd_err(connection, "incompatible %s settings\n", "protocol"); 3534 goto disconnect_rcu_unlock; 3535 } 3536 3537 if (convert_after_sb(p_after_sb_0p) != nc->after_sb_0p) { 3538 drbd_err(connection, "incompatible %s settings\n", "after-sb-0pri"); 3539 goto disconnect_rcu_unlock; 3540 } 3541 3542 if (convert_after_sb(p_after_sb_1p) != nc->after_sb_1p) { 3543 drbd_err(connection, "incompatible %s settings\n", "after-sb-1pri"); 3544 goto disconnect_rcu_unlock; 3545 } 3546 3547 if (convert_after_sb(p_after_sb_2p) != nc->after_sb_2p) { 3548 drbd_err(connection, 
"incompatible %s settings\n", "after-sb-2pri"); 3549 goto disconnect_rcu_unlock; 3550 } 3551 3552 if (p_discard_my_data && nc->discard_my_data) { 3553 drbd_err(connection, "incompatible %s settings\n", "discard-my-data"); 3554 goto disconnect_rcu_unlock; 3555 } 3556 3557 if (p_two_primaries != nc->two_primaries) { 3558 drbd_err(connection, "incompatible %s settings\n", "allow-two-primaries"); 3559 goto disconnect_rcu_unlock; 3560 } 3561 3562 if (strcmp(integrity_alg, nc->integrity_alg)) { 3563 drbd_err(connection, "incompatible %s settings\n", "data-integrity-alg"); 3564 goto disconnect_rcu_unlock; 3565 } 3566 3567 rcu_read_unlock(); 3568 } 3569 3570 if (integrity_alg[0]) { 3571 int hash_size; 3572 3573 /* 3574 * We can only change the peer data integrity algorithm 3575 * here. Changing our own data integrity algorithm 3576 * requires that we send a P_PROTOCOL_UPDATE packet at 3577 * the same time; otherwise, the peer has no way to 3578 * tell between which packets the algorithm should 3579 * change. 
3580 */ 3581 3582 peer_integrity_tfm = crypto_alloc_ahash(integrity_alg, 0, CRYPTO_ALG_ASYNC); 3583 if (IS_ERR(peer_integrity_tfm)) { 3584 peer_integrity_tfm = NULL; 3585 drbd_err(connection, "peer data-integrity-alg %s not supported\n", 3586 integrity_alg); 3587 goto disconnect; 3588 } 3589 3590 hash_size = crypto_ahash_digestsize(peer_integrity_tfm); 3591 int_dig_in = kmalloc(hash_size, GFP_KERNEL); 3592 int_dig_vv = kmalloc(hash_size, GFP_KERNEL); 3593 if (!(int_dig_in && int_dig_vv)) { 3594 drbd_err(connection, "Allocation of buffers for data integrity checking failed\n"); 3595 goto disconnect; 3596 } 3597 } 3598 3599 new_net_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL); 3600 if (!new_net_conf) { 3601 drbd_err(connection, "Allocation of new net_conf failed\n"); 3602 goto disconnect; 3603 } 3604 3605 mutex_lock(&connection->data.mutex); 3606 mutex_lock(&connection->resource->conf_update); 3607 old_net_conf = connection->net_conf; 3608 *new_net_conf = *old_net_conf; 3609 3610 new_net_conf->wire_protocol = p_proto; 3611 new_net_conf->after_sb_0p = convert_after_sb(p_after_sb_0p); 3612 new_net_conf->after_sb_1p = convert_after_sb(p_after_sb_1p); 3613 new_net_conf->after_sb_2p = convert_after_sb(p_after_sb_2p); 3614 new_net_conf->two_primaries = p_two_primaries; 3615 3616 rcu_assign_pointer(connection->net_conf, new_net_conf); 3617 mutex_unlock(&connection->resource->conf_update); 3618 mutex_unlock(&connection->data.mutex); 3619 3620 crypto_free_ahash(connection->peer_integrity_tfm); 3621 kfree(connection->int_dig_in); 3622 kfree(connection->int_dig_vv); 3623 connection->peer_integrity_tfm = peer_integrity_tfm; 3624 connection->int_dig_in = int_dig_in; 3625 connection->int_dig_vv = int_dig_vv; 3626 3627 if (strcmp(old_net_conf->integrity_alg, integrity_alg)) 3628 drbd_info(connection, "peer data-integrity-alg: %s\n", 3629 integrity_alg[0] ? 
integrity_alg : "(none)"); 3630 3631 synchronize_rcu(); 3632 kfree(old_net_conf); 3633 return 0; 3634 3635 disconnect_rcu_unlock: 3636 rcu_read_unlock(); 3637 disconnect: 3638 crypto_free_ahash(peer_integrity_tfm); 3639 kfree(int_dig_in); 3640 kfree(int_dig_vv); 3641 conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); 3642 return -EIO; 3643 } 3644 3645 /* helper function 3646 * input: alg name, feature name 3647 * return: NULL (alg name was "") 3648 * ERR_PTR(error) if something goes wrong 3649 * or the crypto hash ptr, if it worked out ok. */ 3650 static struct crypto_ahash *drbd_crypto_alloc_digest_safe(const struct drbd_device *device, 3651 const char *alg, const char *name) 3652 { 3653 struct crypto_ahash *tfm; 3654 3655 if (!alg[0]) 3656 return NULL; 3657 3658 tfm = crypto_alloc_ahash(alg, 0, CRYPTO_ALG_ASYNC); 3659 if (IS_ERR(tfm)) { 3660 drbd_err(device, "Can not allocate \"%s\" as %s (reason: %ld)\n", 3661 alg, name, PTR_ERR(tfm)); 3662 return tfm; 3663 } 3664 return tfm; 3665 } 3666 3667 static int ignore_remaining_packet(struct drbd_connection *connection, struct packet_info *pi) 3668 { 3669 void *buffer = connection->data.rbuf; 3670 int size = pi->size; 3671 3672 while (size) { 3673 int s = min_t(int, size, DRBD_SOCKET_BUFFER_SIZE); 3674 s = drbd_recv(connection, buffer, s); 3675 if (s <= 0) { 3676 if (s < 0) 3677 return s; 3678 break; 3679 } 3680 size -= s; 3681 } 3682 if (size) 3683 return -EIO; 3684 return 0; 3685 } 3686 3687 /* 3688 * config_unknown_volume - device configuration command for unknown volume 3689 * 3690 * When a device is added to an existing connection, the node on which the 3691 * device is added first will send configuration commands to its peer but the 3692 * peer will not know about the device yet. It will warn and ignore these 3693 * commands. Once the device is added on the second node, the second node will 3694 * send the same device configuration commands, but in the other direction. 
3695 * 3696 * (We can also end up here if drbd is misconfigured.) 3697 */ 3698 static int config_unknown_volume(struct drbd_connection *connection, struct packet_info *pi) 3699 { 3700 drbd_warn(connection, "%s packet received for volume %u, which is not configured locally\n", 3701 cmdname(pi->cmd), pi->vnr); 3702 return ignore_remaining_packet(connection, pi); 3703 } 3704 3705 static int receive_SyncParam(struct drbd_connection *connection, struct packet_info *pi) 3706 { 3707 struct drbd_peer_device *peer_device; 3708 struct drbd_device *device; 3709 struct p_rs_param_95 *p; 3710 unsigned int header_size, data_size, exp_max_sz; 3711 struct crypto_ahash *verify_tfm = NULL; 3712 struct crypto_ahash *csums_tfm = NULL; 3713 struct net_conf *old_net_conf, *new_net_conf = NULL; 3714 struct disk_conf *old_disk_conf = NULL, *new_disk_conf = NULL; 3715 const int apv = connection->agreed_pro_version; 3716 struct fifo_buffer *old_plan = NULL, *new_plan = NULL; 3717 int fifo_size = 0; 3718 int err; 3719 3720 peer_device = conn_peer_device(connection, pi->vnr); 3721 if (!peer_device) 3722 return config_unknown_volume(connection, pi); 3723 device = peer_device->device; 3724 3725 exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param) 3726 : apv == 88 ? sizeof(struct p_rs_param) 3727 + SHARED_SECRET_MAX 3728 : apv <= 94 ? 
sizeof(struct p_rs_param_89) 3729 : /* apv >= 95 */ sizeof(struct p_rs_param_95); 3730 3731 if (pi->size > exp_max_sz) { 3732 drbd_err(device, "SyncParam packet too long: received %u, expected <= %u bytes\n", 3733 pi->size, exp_max_sz); 3734 return -EIO; 3735 } 3736 3737 if (apv <= 88) { 3738 header_size = sizeof(struct p_rs_param); 3739 data_size = pi->size - header_size; 3740 } else if (apv <= 94) { 3741 header_size = sizeof(struct p_rs_param_89); 3742 data_size = pi->size - header_size; 3743 D_ASSERT(device, data_size == 0); 3744 } else { 3745 header_size = sizeof(struct p_rs_param_95); 3746 data_size = pi->size - header_size; 3747 D_ASSERT(device, data_size == 0); 3748 } 3749 3750 /* initialize verify_alg and csums_alg */ 3751 p = pi->data; 3752 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); 3753 3754 err = drbd_recv_all(peer_device->connection, p, header_size); 3755 if (err) 3756 return err; 3757 3758 mutex_lock(&connection->resource->conf_update); 3759 old_net_conf = peer_device->connection->net_conf; 3760 if (get_ldev(device)) { 3761 new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL); 3762 if (!new_disk_conf) { 3763 put_ldev(device); 3764 mutex_unlock(&connection->resource->conf_update); 3765 drbd_err(device, "Allocation of new disk_conf failed\n"); 3766 return -ENOMEM; 3767 } 3768 3769 old_disk_conf = device->ldev->disk_conf; 3770 *new_disk_conf = *old_disk_conf; 3771 3772 new_disk_conf->resync_rate = be32_to_cpu(p->resync_rate); 3773 } 3774 3775 if (apv >= 88) { 3776 if (apv == 88) { 3777 if (data_size > SHARED_SECRET_MAX || data_size == 0) { 3778 drbd_err(device, "verify-alg of wrong size, " 3779 "peer wants %u, accepting only up to %u byte\n", 3780 data_size, SHARED_SECRET_MAX); 3781 err = -EIO; 3782 goto reconnect; 3783 } 3784 3785 err = drbd_recv_all(peer_device->connection, p->verify_alg, data_size); 3786 if (err) 3787 goto reconnect; 3788 /* we expect NUL terminated string */ 3789 /* but just in case someone tries to be evil */ 3790 
D_ASSERT(device, p->verify_alg[data_size-1] == 0); 3791 p->verify_alg[data_size-1] = 0; 3792 3793 } else /* apv >= 89 */ { 3794 /* we still expect NUL terminated strings */ 3795 /* but just in case someone tries to be evil */ 3796 D_ASSERT(device, p->verify_alg[SHARED_SECRET_MAX-1] == 0); 3797 D_ASSERT(device, p->csums_alg[SHARED_SECRET_MAX-1] == 0); 3798 p->verify_alg[SHARED_SECRET_MAX-1] = 0; 3799 p->csums_alg[SHARED_SECRET_MAX-1] = 0; 3800 } 3801 3802 if (strcmp(old_net_conf->verify_alg, p->verify_alg)) { 3803 if (device->state.conn == C_WF_REPORT_PARAMS) { 3804 drbd_err(device, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n", 3805 old_net_conf->verify_alg, p->verify_alg); 3806 goto disconnect; 3807 } 3808 verify_tfm = drbd_crypto_alloc_digest_safe(device, 3809 p->verify_alg, "verify-alg"); 3810 if (IS_ERR(verify_tfm)) { 3811 verify_tfm = NULL; 3812 goto disconnect; 3813 } 3814 } 3815 3816 if (apv >= 89 && strcmp(old_net_conf->csums_alg, p->csums_alg)) { 3817 if (device->state.conn == C_WF_REPORT_PARAMS) { 3818 drbd_err(device, "Different csums-alg settings. 
me=\"%s\" peer=\"%s\"\n", 3819 old_net_conf->csums_alg, p->csums_alg); 3820 goto disconnect; 3821 } 3822 csums_tfm = drbd_crypto_alloc_digest_safe(device, 3823 p->csums_alg, "csums-alg"); 3824 if (IS_ERR(csums_tfm)) { 3825 csums_tfm = NULL; 3826 goto disconnect; 3827 } 3828 } 3829 3830 if (apv > 94 && new_disk_conf) { 3831 new_disk_conf->c_plan_ahead = be32_to_cpu(p->c_plan_ahead); 3832 new_disk_conf->c_delay_target = be32_to_cpu(p->c_delay_target); 3833 new_disk_conf->c_fill_target = be32_to_cpu(p->c_fill_target); 3834 new_disk_conf->c_max_rate = be32_to_cpu(p->c_max_rate); 3835 3836 fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ; 3837 if (fifo_size != device->rs_plan_s->size) { 3838 new_plan = fifo_alloc(fifo_size); 3839 if (!new_plan) { 3840 drbd_err(device, "kmalloc of fifo_buffer failed"); 3841 put_ldev(device); 3842 goto disconnect; 3843 } 3844 } 3845 } 3846 3847 if (verify_tfm || csums_tfm) { 3848 new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL); 3849 if (!new_net_conf) { 3850 drbd_err(device, "Allocation of new net_conf failed\n"); 3851 goto disconnect; 3852 } 3853 3854 *new_net_conf = *old_net_conf; 3855 3856 if (verify_tfm) { 3857 strcpy(new_net_conf->verify_alg, p->verify_alg); 3858 new_net_conf->verify_alg_len = strlen(p->verify_alg) + 1; 3859 crypto_free_ahash(peer_device->connection->verify_tfm); 3860 peer_device->connection->verify_tfm = verify_tfm; 3861 drbd_info(device, "using verify-alg: \"%s\"\n", p->verify_alg); 3862 } 3863 if (csums_tfm) { 3864 strcpy(new_net_conf->csums_alg, p->csums_alg); 3865 new_net_conf->csums_alg_len = strlen(p->csums_alg) + 1; 3866 crypto_free_ahash(peer_device->connection->csums_tfm); 3867 peer_device->connection->csums_tfm = csums_tfm; 3868 drbd_info(device, "using csums-alg: \"%s\"\n", p->csums_alg); 3869 } 3870 rcu_assign_pointer(connection->net_conf, new_net_conf); 3871 } 3872 } 3873 3874 if (new_disk_conf) { 3875 rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf); 3876 
put_ldev(device); 3877 } 3878 3879 if (new_plan) { 3880 old_plan = device->rs_plan_s; 3881 rcu_assign_pointer(device->rs_plan_s, new_plan); 3882 } 3883 3884 mutex_unlock(&connection->resource->conf_update); 3885 synchronize_rcu(); 3886 if (new_net_conf) 3887 kfree(old_net_conf); 3888 kfree(old_disk_conf); 3889 kfree(old_plan); 3890 3891 return 0; 3892 3893 reconnect: 3894 if (new_disk_conf) { 3895 put_ldev(device); 3896 kfree(new_disk_conf); 3897 } 3898 mutex_unlock(&connection->resource->conf_update); 3899 return -EIO; 3900 3901 disconnect: 3902 kfree(new_plan); 3903 if (new_disk_conf) { 3904 put_ldev(device); 3905 kfree(new_disk_conf); 3906 } 3907 mutex_unlock(&connection->resource->conf_update); 3908 /* just for completeness: actually not needed, 3909 * as this is not reached if csums_tfm was ok. */ 3910 crypto_free_ahash(csums_tfm); 3911 /* but free the verify_tfm again, if csums_tfm did not work out */ 3912 crypto_free_ahash(verify_tfm); 3913 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD); 3914 return -EIO; 3915 } 3916 3917 /* warn if the arguments differ by more than 12.5% */ 3918 static void warn_if_differ_considerably(struct drbd_device *device, 3919 const char *s, sector_t a, sector_t b) 3920 { 3921 sector_t d; 3922 if (a == 0 || b == 0) 3923 return; 3924 d = (a > b) ? (a - b) : (b - a); 3925 if (d > (a>>3) || d > (b>>3)) 3926 drbd_warn(device, "Considerable difference in %s: %llus vs. %llus\n", s, 3927 (unsigned long long)a, (unsigned long long)b); 3928 } 3929 3930 static int receive_sizes(struct drbd_connection *connection, struct packet_info *pi) 3931 { 3932 struct drbd_peer_device *peer_device; 3933 struct drbd_device *device; 3934 struct p_sizes *p = pi->data; 3935 struct o_qlim *o = (connection->agreed_features & DRBD_FF_WSAME) ? 
p->qlim : NULL; 3936 enum determine_dev_size dd = DS_UNCHANGED; 3937 sector_t p_size, p_usize, p_csize, my_usize; 3938 int ldsc = 0; /* local disk size changed */ 3939 enum dds_flags ddsf; 3940 3941 peer_device = conn_peer_device(connection, pi->vnr); 3942 if (!peer_device) 3943 return config_unknown_volume(connection, pi); 3944 device = peer_device->device; 3945 3946 p_size = be64_to_cpu(p->d_size); 3947 p_usize = be64_to_cpu(p->u_size); 3948 p_csize = be64_to_cpu(p->c_size); 3949 3950 /* just store the peer's disk size for now. 3951 * we still need to figure out whether we accept that. */ 3952 device->p_size = p_size; 3953 3954 if (get_ldev(device)) { 3955 sector_t new_size, cur_size; 3956 rcu_read_lock(); 3957 my_usize = rcu_dereference(device->ldev->disk_conf)->disk_size; 3958 rcu_read_unlock(); 3959 3960 warn_if_differ_considerably(device, "lower level device sizes", 3961 p_size, drbd_get_max_capacity(device->ldev)); 3962 warn_if_differ_considerably(device, "user requested size", 3963 p_usize, my_usize); 3964 3965 /* if this is the first connect, or an otherwise expected 3966 * param exchange, choose the minimum */ 3967 if (device->state.conn == C_WF_REPORT_PARAMS) 3968 p_usize = min_not_zero(my_usize, p_usize); 3969 3970 /* Never shrink a device with usable data during connect. 3971 But allow online shrinking if we are connected. */ 3972 new_size = drbd_new_dev_size(device, device->ldev, p_usize, 0); 3973 cur_size = drbd_get_capacity(device->this_bdev); 3974 if (new_size < cur_size && 3975 device->state.disk >= D_OUTDATED && 3976 device->state.conn < C_CONNECTED) { 3977 drbd_err(device, "The peer's disk size is too small! 
(%llu < %llu sectors)\n", 3978 (unsigned long long)new_size, (unsigned long long)cur_size); 3979 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD); 3980 put_ldev(device); 3981 return -EIO; 3982 } 3983 3984 if (my_usize != p_usize) { 3985 struct disk_conf *old_disk_conf, *new_disk_conf = NULL; 3986 3987 new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL); 3988 if (!new_disk_conf) { 3989 drbd_err(device, "Allocation of new disk_conf failed\n"); 3990 put_ldev(device); 3991 return -ENOMEM; 3992 } 3993 3994 mutex_lock(&connection->resource->conf_update); 3995 old_disk_conf = device->ldev->disk_conf; 3996 *new_disk_conf = *old_disk_conf; 3997 new_disk_conf->disk_size = p_usize; 3998 3999 rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf); 4000 mutex_unlock(&connection->resource->conf_update); 4001 synchronize_rcu(); 4002 kfree(old_disk_conf); 4003 4004 drbd_info(device, "Peer sets u_size to %lu sectors\n", 4005 (unsigned long)my_usize); 4006 } 4007 4008 put_ldev(device); 4009 } 4010 4011 device->peer_max_bio_size = be32_to_cpu(p->max_bio_size); 4012 /* Leave drbd_reconsider_queue_parameters() before drbd_determine_dev_size(). 4013 In case we cleared the QUEUE_FLAG_DISCARD from our queue in 4014 drbd_reconsider_queue_parameters(), we can be sure that after 4015 drbd_determine_dev_size() no REQ_DISCARDs are in the queue. */ 4016 4017 ddsf = be16_to_cpu(p->dds_flags); 4018 if (get_ldev(device)) { 4019 drbd_reconsider_queue_parameters(device, device->ldev, o); 4020 dd = drbd_determine_dev_size(device, ddsf, NULL); 4021 put_ldev(device); 4022 if (dd == DS_ERROR) 4023 return -EIO; 4024 drbd_md_sync(device); 4025 } else { 4026 /* 4027 * I am diskless, need to accept the peer's *current* size. 4028 * I must NOT accept the peers backing disk size, 4029 * it may have been larger than mine all along... 4030 * 4031 * At this point, the peer knows more about my disk, or at 4032 * least about what we last agreed upon, than myself. 
4033 * So if his c_size is less than his d_size, the most likely 4034 * reason is that *my* d_size was smaller last time we checked. 4035 * 4036 * However, if he sends a zero current size, 4037 * take his (user-capped or) backing disk size anyways. 4038 */ 4039 drbd_reconsider_queue_parameters(device, NULL, o); 4040 drbd_set_my_capacity(device, p_csize ?: p_usize ?: p_size); 4041 } 4042 4043 if (get_ldev(device)) { 4044 if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev)) { 4045 device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev); 4046 ldsc = 1; 4047 } 4048 4049 put_ldev(device); 4050 } 4051 4052 if (device->state.conn > C_WF_REPORT_PARAMS) { 4053 if (be64_to_cpu(p->c_size) != 4054 drbd_get_capacity(device->this_bdev) || ldsc) { 4055 /* we have different sizes, probably peer 4056 * needs to know my new size... */ 4057 drbd_send_sizes(peer_device, 0, ddsf); 4058 } 4059 if (test_and_clear_bit(RESIZE_PENDING, &device->flags) || 4060 (dd == DS_GREW && device->state.conn == C_CONNECTED)) { 4061 if (device->state.pdsk >= D_INCONSISTENT && 4062 device->state.disk >= D_INCONSISTENT) { 4063 if (ddsf & DDSF_NO_RESYNC) 4064 drbd_info(device, "Resync of new storage suppressed with --assume-clean\n"); 4065 else 4066 resync_after_online_grow(device); 4067 } else 4068 set_bit(RESYNC_AFTER_NEG, &device->flags); 4069 } 4070 } 4071 4072 return 0; 4073 } 4074 4075 static int receive_uuids(struct drbd_connection *connection, struct packet_info *pi) 4076 { 4077 struct drbd_peer_device *peer_device; 4078 struct drbd_device *device; 4079 struct p_uuids *p = pi->data; 4080 u64 *p_uuid; 4081 int i, updated_uuids = 0; 4082 4083 peer_device = conn_peer_device(connection, pi->vnr); 4084 if (!peer_device) 4085 return config_unknown_volume(connection, pi); 4086 device = peer_device->device; 4087 4088 p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO); 4089 if (!p_uuid) { 4090 drbd_err(device, "kmalloc of p_uuid failed\n"); 4091 return false; 
4092 } 4093 4094 for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++) 4095 p_uuid[i] = be64_to_cpu(p->uuid[i]); 4096 4097 kfree(device->p_uuid); 4098 device->p_uuid = p_uuid; 4099 4100 if (device->state.conn < C_CONNECTED && 4101 device->state.disk < D_INCONSISTENT && 4102 device->state.role == R_PRIMARY && 4103 (device->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) { 4104 drbd_err(device, "Can only connect to data with current UUID=%016llX\n", 4105 (unsigned long long)device->ed_uuid); 4106 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD); 4107 return -EIO; 4108 } 4109 4110 if (get_ldev(device)) { 4111 int skip_initial_sync = 4112 device->state.conn == C_CONNECTED && 4113 peer_device->connection->agreed_pro_version >= 90 && 4114 device->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && 4115 (p_uuid[UI_FLAGS] & 8); 4116 if (skip_initial_sync) { 4117 drbd_info(device, "Accepted new current UUID, preparing to skip initial sync\n"); 4118 drbd_bitmap_io(device, &drbd_bmio_clear_n_write, 4119 "clear_n_write from receive_uuids", 4120 BM_LOCKED_TEST_ALLOWED); 4121 _drbd_uuid_set(device, UI_CURRENT, p_uuid[UI_CURRENT]); 4122 _drbd_uuid_set(device, UI_BITMAP, 0); 4123 _drbd_set_state(_NS2(device, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE), 4124 CS_VERBOSE, NULL); 4125 drbd_md_sync(device); 4126 updated_uuids = 1; 4127 } 4128 put_ldev(device); 4129 } else if (device->state.disk < D_INCONSISTENT && 4130 device->state.role == R_PRIMARY) { 4131 /* I am a diskless primary, the peer just created a new current UUID 4132 for me. */ 4133 updated_uuids = drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]); 4134 } 4135 4136 /* Before we test for the disk state, we should wait until an eventually 4137 ongoing cluster wide state change is finished. That is important if 4138 we are primary and are detaching from our disk. We need to see the 4139 new disk state... 
*/ 4140 mutex_lock(device->state_mutex); 4141 mutex_unlock(device->state_mutex); 4142 if (device->state.conn >= C_CONNECTED && device->state.disk < D_INCONSISTENT) 4143 updated_uuids |= drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]); 4144 4145 if (updated_uuids) 4146 drbd_print_uuids(device, "receiver updated UUIDs to"); 4147 4148 return 0; 4149 } 4150 4151 /** 4152 * convert_state() - Converts the peer's view of the cluster state to our point of view 4153 * @ps: The state as seen by the peer. 4154 */ 4155 static union drbd_state convert_state(union drbd_state ps) 4156 { 4157 union drbd_state ms; 4158 4159 static enum drbd_conns c_tab[] = { 4160 [C_WF_REPORT_PARAMS] = C_WF_REPORT_PARAMS, 4161 [C_CONNECTED] = C_CONNECTED, 4162 4163 [C_STARTING_SYNC_S] = C_STARTING_SYNC_T, 4164 [C_STARTING_SYNC_T] = C_STARTING_SYNC_S, 4165 [C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */ 4166 [C_VERIFY_S] = C_VERIFY_T, 4167 [C_MASK] = C_MASK, 4168 }; 4169 4170 ms.i = ps.i; 4171 4172 ms.conn = c_tab[ps.conn]; 4173 ms.peer = ps.role; 4174 ms.role = ps.peer; 4175 ms.pdsk = ps.disk; 4176 ms.disk = ps.pdsk; 4177 ms.peer_isp = (ps.aftr_isp | ps.user_isp); 4178 4179 return ms; 4180 } 4181 4182 static int receive_req_state(struct drbd_connection *connection, struct packet_info *pi) 4183 { 4184 struct drbd_peer_device *peer_device; 4185 struct drbd_device *device; 4186 struct p_req_state *p = pi->data; 4187 union drbd_state mask, val; 4188 enum drbd_state_rv rv; 4189 4190 peer_device = conn_peer_device(connection, pi->vnr); 4191 if (!peer_device) 4192 return -EIO; 4193 device = peer_device->device; 4194 4195 mask.i = be32_to_cpu(p->mask); 4196 val.i = be32_to_cpu(p->val); 4197 4198 if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags) && 4199 mutex_is_locked(device->state_mutex)) { 4200 drbd_send_sr_reply(peer_device, SS_CONCURRENT_ST_CHG); 4201 return 0; 4202 } 4203 4204 mask = convert_state(mask); 4205 val = convert_state(val); 4206 4207 rv = drbd_change_state(device, 
CS_VERBOSE, mask, val); 4208 drbd_send_sr_reply(peer_device, rv); 4209 4210 drbd_md_sync(device); 4211 4212 return 0; 4213 } 4214 4215 static int receive_req_conn_state(struct drbd_connection *connection, struct packet_info *pi) 4216 { 4217 struct p_req_state *p = pi->data; 4218 union drbd_state mask, val; 4219 enum drbd_state_rv rv; 4220 4221 mask.i = be32_to_cpu(p->mask); 4222 val.i = be32_to_cpu(p->val); 4223 4224 if (test_bit(RESOLVE_CONFLICTS, &connection->flags) && 4225 mutex_is_locked(&connection->cstate_mutex)) { 4226 conn_send_sr_reply(connection, SS_CONCURRENT_ST_CHG); 4227 return 0; 4228 } 4229 4230 mask = convert_state(mask); 4231 val = convert_state(val); 4232 4233 rv = conn_request_state(connection, mask, val, CS_VERBOSE | CS_LOCAL_ONLY | CS_IGN_OUTD_FAIL); 4234 conn_send_sr_reply(connection, rv); 4235 4236 return 0; 4237 } 4238 4239 static int receive_state(struct drbd_connection *connection, struct packet_info *pi) 4240 { 4241 struct drbd_peer_device *peer_device; 4242 struct drbd_device *device; 4243 struct p_state *p = pi->data; 4244 union drbd_state os, ns, peer_state; 4245 enum drbd_disk_state real_peer_disk; 4246 enum chg_state_flags cs_flags; 4247 int rv; 4248 4249 peer_device = conn_peer_device(connection, pi->vnr); 4250 if (!peer_device) 4251 return config_unknown_volume(connection, pi); 4252 device = peer_device->device; 4253 4254 peer_state.i = be32_to_cpu(p->state); 4255 4256 real_peer_disk = peer_state.disk; 4257 if (peer_state.disk == D_NEGOTIATING) { 4258 real_peer_disk = device->p_uuid[UI_FLAGS] & 4 ? 
D_INCONSISTENT : D_CONSISTENT; 4259 drbd_info(device, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk)); 4260 } 4261 4262 spin_lock_irq(&device->resource->req_lock); 4263 retry: 4264 os = ns = drbd_read_state(device); 4265 spin_unlock_irq(&device->resource->req_lock); 4266 4267 /* If some other part of the code (ack_receiver thread, timeout) 4268 * already decided to close the connection again, 4269 * we must not "re-establish" it here. */ 4270 if (os.conn <= C_TEAR_DOWN) 4271 return -ECONNRESET; 4272 4273 /* If this is the "end of sync" confirmation, usually the peer disk 4274 * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits 4275 * set) resync started in PausedSyncT, or if the timing of pause-/ 4276 * unpause-sync events has been "just right", the peer disk may 4277 * transition from D_CONSISTENT to D_UP_TO_DATE as well. 4278 */ 4279 if ((os.pdsk == D_INCONSISTENT || os.pdsk == D_CONSISTENT) && 4280 real_peer_disk == D_UP_TO_DATE && 4281 os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) { 4282 /* If we are (becoming) SyncSource, but peer is still in sync 4283 * preparation, ignore its uptodate-ness to avoid flapping, it 4284 * will change to inconsistent once the peer reaches active 4285 * syncing states. 4286 * It may have changed syncer-paused flags, however, so we 4287 * cannot ignore this completely. */ 4288 if (peer_state.conn > C_CONNECTED && 4289 peer_state.conn < C_SYNC_SOURCE) 4290 real_peer_disk = D_INCONSISTENT; 4291 4292 /* if peer_state changes to connected at the same time, 4293 * it explicitly notifies us that it finished resync. 4294 * Maybe we should finish it up, too? */ 4295 else if (os.conn >= C_SYNC_SOURCE && 4296 peer_state.conn == C_CONNECTED) { 4297 if (drbd_bm_total_weight(device) <= device->rs_failed) 4298 drbd_resync_finished(device); 4299 return 0; 4300 } 4301 } 4302 4303 /* explicit verify finished notification, stop sector reached. 
*/ 4304 if (os.conn == C_VERIFY_T && os.disk == D_UP_TO_DATE && 4305 peer_state.conn == C_CONNECTED && real_peer_disk == D_UP_TO_DATE) { 4306 ov_out_of_sync_print(device); 4307 drbd_resync_finished(device); 4308 return 0; 4309 } 4310 4311 /* peer says his disk is inconsistent, while we think it is uptodate, 4312 * and this happens while the peer still thinks we have a sync going on, 4313 * but we think we are already done with the sync. 4314 * We ignore this to avoid flapping pdsk. 4315 * This should not happen, if the peer is a recent version of drbd. */ 4316 if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT && 4317 os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE) 4318 real_peer_disk = D_UP_TO_DATE; 4319 4320 if (ns.conn == C_WF_REPORT_PARAMS) 4321 ns.conn = C_CONNECTED; 4322 4323 if (peer_state.conn == C_AHEAD) 4324 ns.conn = C_BEHIND; 4325 4326 if (device->p_uuid && peer_state.disk >= D_NEGOTIATING && 4327 get_ldev_if_state(device, D_NEGOTIATING)) { 4328 int cr; /* consider resync */ 4329 4330 /* if we established a new connection */ 4331 cr = (os.conn < C_CONNECTED); 4332 /* if we had an established connection 4333 * and one of the nodes newly attaches a disk */ 4334 cr |= (os.conn == C_CONNECTED && 4335 (peer_state.disk == D_NEGOTIATING || 4336 os.disk == D_NEGOTIATING)); 4337 /* if we have both been inconsistent, and the peer has been 4338 * forced to be UpToDate with --overwrite-data */ 4339 cr |= test_bit(CONSIDER_RESYNC, &device->flags); 4340 /* if we had been plain connected, and the admin requested to 4341 * start a sync by "invalidate" or "invalidate-remote" */ 4342 cr |= (os.conn == C_CONNECTED && 4343 (peer_state.conn >= C_STARTING_SYNC_S && 4344 peer_state.conn <= C_WF_BITMAP_T)); 4345 4346 if (cr) 4347 ns.conn = drbd_sync_handshake(peer_device, peer_state.role, real_peer_disk); 4348 4349 put_ldev(device); 4350 if (ns.conn == C_MASK) { 4351 ns.conn = C_CONNECTED; 4352 if (device->state.disk == D_NEGOTIATING) { 4353 
drbd_force_state(device, NS(disk, D_FAILED)); 4354 } else if (peer_state.disk == D_NEGOTIATING) { 4355 drbd_err(device, "Disk attach process on the peer node was aborted.\n"); 4356 peer_state.disk = D_DISKLESS; 4357 real_peer_disk = D_DISKLESS; 4358 } else { 4359 if (test_and_clear_bit(CONN_DRY_RUN, &peer_device->connection->flags)) 4360 return -EIO; 4361 D_ASSERT(device, os.conn == C_WF_REPORT_PARAMS); 4362 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD); 4363 return -EIO; 4364 } 4365 } 4366 } 4367 4368 spin_lock_irq(&device->resource->req_lock); 4369 if (os.i != drbd_read_state(device).i) 4370 goto retry; 4371 clear_bit(CONSIDER_RESYNC, &device->flags); 4372 ns.peer = peer_state.role; 4373 ns.pdsk = real_peer_disk; 4374 ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp); 4375 if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING) 4376 ns.disk = device->new_state_tmp.disk; 4377 cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD); 4378 if (ns.pdsk == D_CONSISTENT && drbd_suspended(device) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED && 4379 test_bit(NEW_CUR_UUID, &device->flags)) { 4380 /* Do not allow tl_restart(RESEND) for a rebooted peer. We can only allow this 4381 for temporal network outages! 
*/ 4382 spin_unlock_irq(&device->resource->req_lock); 4383 drbd_err(device, "Aborting Connect, can not thaw IO with an only Consistent peer\n"); 4384 tl_clear(peer_device->connection); 4385 drbd_uuid_new_current(device); 4386 clear_bit(NEW_CUR_UUID, &device->flags); 4387 conn_request_state(peer_device->connection, NS2(conn, C_PROTOCOL_ERROR, susp, 0), CS_HARD); 4388 return -EIO; 4389 } 4390 rv = _drbd_set_state(device, ns, cs_flags, NULL); 4391 ns = drbd_read_state(device); 4392 spin_unlock_irq(&device->resource->req_lock); 4393 4394 if (rv < SS_SUCCESS) { 4395 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD); 4396 return -EIO; 4397 } 4398 4399 if (os.conn > C_WF_REPORT_PARAMS) { 4400 if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED && 4401 peer_state.disk != D_NEGOTIATING ) { 4402 /* we want resync, peer has not yet decided to sync... */ 4403 /* Nowadays only used when forcing a node into primary role and 4404 setting its disk to UpToDate with that */ 4405 drbd_send_uuids(peer_device); 4406 drbd_send_current_state(peer_device); 4407 } 4408 } 4409 4410 clear_bit(DISCARD_MY_DATA, &device->flags); 4411 4412 drbd_md_sync(device); /* update connected indicator, la_size_sect, ... 
*/ 4413 4414 return 0; 4415 } 4416 4417 static int receive_sync_uuid(struct drbd_connection *connection, struct packet_info *pi) 4418 { 4419 struct drbd_peer_device *peer_device; 4420 struct drbd_device *device; 4421 struct p_rs_uuid *p = pi->data; 4422 4423 peer_device = conn_peer_device(connection, pi->vnr); 4424 if (!peer_device) 4425 return -EIO; 4426 device = peer_device->device; 4427 4428 wait_event(device->misc_wait, 4429 device->state.conn == C_WF_SYNC_UUID || 4430 device->state.conn == C_BEHIND || 4431 device->state.conn < C_CONNECTED || 4432 device->state.disk < D_NEGOTIATING); 4433 4434 /* D_ASSERT(device, device->state.conn == C_WF_SYNC_UUID ); */ 4435 4436 /* Here the _drbd_uuid_ functions are right, current should 4437 _not_ be rotated into the history */ 4438 if (get_ldev_if_state(device, D_NEGOTIATING)) { 4439 _drbd_uuid_set(device, UI_CURRENT, be64_to_cpu(p->uuid)); 4440 _drbd_uuid_set(device, UI_BITMAP, 0UL); 4441 4442 drbd_print_uuids(device, "updated sync uuid"); 4443 drbd_start_resync(device, C_SYNC_TARGET); 4444 4445 put_ldev(device); 4446 } else 4447 drbd_err(device, "Ignoring SyncUUID packet!\n"); 4448 4449 return 0; 4450 } 4451 4452 /** 4453 * receive_bitmap_plain 4454 * 4455 * Return 0 when done, 1 when another iteration is needed, and a negative error 4456 * code upon failure. 
4457 */ 4458 static int 4459 receive_bitmap_plain(struct drbd_peer_device *peer_device, unsigned int size, 4460 unsigned long *p, struct bm_xfer_ctx *c) 4461 { 4462 unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - 4463 drbd_header_size(peer_device->connection); 4464 unsigned int num_words = min_t(size_t, data_size / sizeof(*p), 4465 c->bm_words - c->word_offset); 4466 unsigned int want = num_words * sizeof(*p); 4467 int err; 4468 4469 if (want != size) { 4470 drbd_err(peer_device, "%s:want (%u) != size (%u)\n", __func__, want, size); 4471 return -EIO; 4472 } 4473 if (want == 0) 4474 return 0; 4475 err = drbd_recv_all(peer_device->connection, p, want); 4476 if (err) 4477 return err; 4478 4479 drbd_bm_merge_lel(peer_device->device, c->word_offset, num_words, p); 4480 4481 c->word_offset += num_words; 4482 c->bit_offset = c->word_offset * BITS_PER_LONG; 4483 if (c->bit_offset > c->bm_bits) 4484 c->bit_offset = c->bm_bits; 4485 4486 return 1; 4487 } 4488 4489 static enum drbd_bitmap_code dcbp_get_code(struct p_compressed_bm *p) 4490 { 4491 return (enum drbd_bitmap_code)(p->encoding & 0x0f); 4492 } 4493 4494 static int dcbp_get_start(struct p_compressed_bm *p) 4495 { 4496 return (p->encoding & 0x80) != 0; 4497 } 4498 4499 static int dcbp_get_pad_bits(struct p_compressed_bm *p) 4500 { 4501 return (p->encoding >> 4) & 0x7; 4502 } 4503 4504 /** 4505 * recv_bm_rle_bits 4506 * 4507 * Return 0 when done, 1 when another iteration is needed, and a negative error 4508 * code upon failure. 
4509 */ 4510 static int 4511 recv_bm_rle_bits(struct drbd_peer_device *peer_device, 4512 struct p_compressed_bm *p, 4513 struct bm_xfer_ctx *c, 4514 unsigned int len) 4515 { 4516 struct bitstream bs; 4517 u64 look_ahead; 4518 u64 rl; 4519 u64 tmp; 4520 unsigned long s = c->bit_offset; 4521 unsigned long e; 4522 int toggle = dcbp_get_start(p); 4523 int have; 4524 int bits; 4525 4526 bitstream_init(&bs, p->code, len, dcbp_get_pad_bits(p)); 4527 4528 bits = bitstream_get_bits(&bs, &look_ahead, 64); 4529 if (bits < 0) 4530 return -EIO; 4531 4532 for (have = bits; have > 0; s += rl, toggle = !toggle) { 4533 bits = vli_decode_bits(&rl, look_ahead); 4534 if (bits <= 0) 4535 return -EIO; 4536 4537 if (toggle) { 4538 e = s + rl -1; 4539 if (e >= c->bm_bits) { 4540 drbd_err(peer_device, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e); 4541 return -EIO; 4542 } 4543 _drbd_bm_set_bits(peer_device->device, s, e); 4544 } 4545 4546 if (have < bits) { 4547 drbd_err(peer_device, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n", 4548 have, bits, look_ahead, 4549 (unsigned int)(bs.cur.b - p->code), 4550 (unsigned int)bs.buf_len); 4551 return -EIO; 4552 } 4553 /* if we consumed all 64 bits, assign 0; >> 64 is "undefined"; */ 4554 if (likely(bits < 64)) 4555 look_ahead >>= bits; 4556 else 4557 look_ahead = 0; 4558 have -= bits; 4559 4560 bits = bitstream_get_bits(&bs, &tmp, 64 - have); 4561 if (bits < 0) 4562 return -EIO; 4563 look_ahead |= tmp << have; 4564 have += bits; 4565 } 4566 4567 c->bit_offset = s; 4568 bm_xfer_ctx_bit_to_word_offset(c); 4569 4570 return (s != c->bm_bits); 4571 } 4572 4573 /** 4574 * decode_bitmap_c 4575 * 4576 * Return 0 when done, 1 when another iteration is needed, and a negative error 4577 * code upon failure. 
4578 */ 4579 static int 4580 decode_bitmap_c(struct drbd_peer_device *peer_device, 4581 struct p_compressed_bm *p, 4582 struct bm_xfer_ctx *c, 4583 unsigned int len) 4584 { 4585 if (dcbp_get_code(p) == RLE_VLI_Bits) 4586 return recv_bm_rle_bits(peer_device, p, c, len - sizeof(*p)); 4587 4588 /* other variants had been implemented for evaluation, 4589 * but have been dropped as this one turned out to be "best" 4590 * during all our tests. */ 4591 4592 drbd_err(peer_device, "receive_bitmap_c: unknown encoding %u\n", p->encoding); 4593 conn_request_state(peer_device->connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD); 4594 return -EIO; 4595 } 4596 4597 void INFO_bm_xfer_stats(struct drbd_device *device, 4598 const char *direction, struct bm_xfer_ctx *c) 4599 { 4600 /* what would it take to transfer it "plaintext" */ 4601 unsigned int header_size = drbd_header_size(first_peer_device(device)->connection); 4602 unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - header_size; 4603 unsigned int plain = 4604 header_size * (DIV_ROUND_UP(c->bm_words, data_size) + 1) + 4605 c->bm_words * sizeof(unsigned long); 4606 unsigned int total = c->bytes[0] + c->bytes[1]; 4607 unsigned int r; 4608 4609 /* total can not be zero. but just in case: */ 4610 if (total == 0) 4611 return; 4612 4613 /* don't report if not compressed */ 4614 if (total >= plain) 4615 return; 4616 4617 /* total < plain. check for overflow, still */ 4618 r = (total > UINT_MAX/1000) ? 
(total / (plain/1000)) 4619 : (1000 * total / plain); 4620 4621 if (r > 1000) 4622 r = 1000; 4623 4624 r = 1000 - r; 4625 drbd_info(device, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), " 4626 "total %u; compression: %u.%u%%\n", 4627 direction, 4628 c->bytes[1], c->packets[1], 4629 c->bytes[0], c->packets[0], 4630 total, r/10, r % 10); 4631 } 4632 4633 /* Since we are processing the bitfield from lower addresses to higher, 4634 it does not matter if the process it in 32 bit chunks or 64 bit 4635 chunks as long as it is little endian. (Understand it as byte stream, 4636 beginning with the lowest byte...) If we would use big endian 4637 we would need to process it from the highest address to the lowest, 4638 in order to be agnostic to the 32 vs 64 bits issue. 4639 4640 returns 0 on failure, 1 if we successfully received it. */ 4641 static int receive_bitmap(struct drbd_connection *connection, struct packet_info *pi) 4642 { 4643 struct drbd_peer_device *peer_device; 4644 struct drbd_device *device; 4645 struct bm_xfer_ctx c; 4646 int err; 4647 4648 peer_device = conn_peer_device(connection, pi->vnr); 4649 if (!peer_device) 4650 return -EIO; 4651 device = peer_device->device; 4652 4653 drbd_bm_lock(device, "receive bitmap", BM_LOCKED_SET_ALLOWED); 4654 /* you are supposed to send additional out-of-sync information 4655 * if you actually set bits during this phase */ 4656 4657 c = (struct bm_xfer_ctx) { 4658 .bm_bits = drbd_bm_bits(device), 4659 .bm_words = drbd_bm_words(device), 4660 }; 4661 4662 for(;;) { 4663 if (pi->cmd == P_BITMAP) 4664 err = receive_bitmap_plain(peer_device, pi->size, pi->data, &c); 4665 else if (pi->cmd == P_COMPRESSED_BITMAP) { 4666 /* MAYBE: sanity check that we speak proto >= 90, 4667 * and the feature is enabled! 
*/ 4668 struct p_compressed_bm *p = pi->data; 4669 4670 if (pi->size > DRBD_SOCKET_BUFFER_SIZE - drbd_header_size(connection)) { 4671 drbd_err(device, "ReportCBitmap packet too large\n"); 4672 err = -EIO; 4673 goto out; 4674 } 4675 if (pi->size <= sizeof(*p)) { 4676 drbd_err(device, "ReportCBitmap packet too small (l:%u)\n", pi->size); 4677 err = -EIO; 4678 goto out; 4679 } 4680 err = drbd_recv_all(peer_device->connection, p, pi->size); 4681 if (err) 4682 goto out; 4683 err = decode_bitmap_c(peer_device, p, &c, pi->size); 4684 } else { 4685 drbd_warn(device, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", pi->cmd); 4686 err = -EIO; 4687 goto out; 4688 } 4689 4690 c.packets[pi->cmd == P_BITMAP]++; 4691 c.bytes[pi->cmd == P_BITMAP] += drbd_header_size(connection) + pi->size; 4692 4693 if (err <= 0) { 4694 if (err < 0) 4695 goto out; 4696 break; 4697 } 4698 err = drbd_recv_header(peer_device->connection, pi); 4699 if (err) 4700 goto out; 4701 } 4702 4703 INFO_bm_xfer_stats(device, "receive", &c); 4704 4705 if (device->state.conn == C_WF_BITMAP_T) { 4706 enum drbd_state_rv rv; 4707 4708 err = drbd_send_bitmap(device); 4709 if (err) 4710 goto out; 4711 /* Omit CS_ORDERED with this state transition to avoid deadlocks. 
*/ 4712 rv = _drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); 4713 D_ASSERT(device, rv == SS_SUCCESS); 4714 } else if (device->state.conn != C_WF_BITMAP_S) { 4715 /* admin may have requested C_DISCONNECTING, 4716 * other threads may have noticed network errors */ 4717 drbd_info(device, "unexpected cstate (%s) in receive_bitmap\n", 4718 drbd_conn_str(device->state.conn)); 4719 } 4720 err = 0; 4721 4722 out: 4723 drbd_bm_unlock(device); 4724 if (!err && device->state.conn == C_WF_BITMAP_S) 4725 drbd_start_resync(device, C_SYNC_SOURCE); 4726 return err; 4727 } 4728 4729 static int receive_skip(struct drbd_connection *connection, struct packet_info *pi) 4730 { 4731 drbd_warn(connection, "skipping unknown optional packet type %d, l: %d!\n", 4732 pi->cmd, pi->size); 4733 4734 return ignore_remaining_packet(connection, pi); 4735 } 4736 4737 static int receive_UnplugRemote(struct drbd_connection *connection, struct packet_info *pi) 4738 { 4739 /* Make sure we've acked all the TCP data associated 4740 * with the data requests being unplugged */ 4741 drbd_tcp_quickack(connection->data.socket); 4742 4743 return 0; 4744 } 4745 4746 static int receive_out_of_sync(struct drbd_connection *connection, struct packet_info *pi) 4747 { 4748 struct drbd_peer_device *peer_device; 4749 struct drbd_device *device; 4750 struct p_block_desc *p = pi->data; 4751 4752 peer_device = conn_peer_device(connection, pi->vnr); 4753 if (!peer_device) 4754 return -EIO; 4755 device = peer_device->device; 4756 4757 switch (device->state.conn) { 4758 case C_WF_SYNC_UUID: 4759 case C_WF_BITMAP_T: 4760 case C_BEHIND: 4761 break; 4762 default: 4763 drbd_err(device, "ASSERT FAILED cstate = %s, expected: WFSyncUUID|WFBitMapT|Behind\n", 4764 drbd_conn_str(device->state.conn)); 4765 } 4766 4767 drbd_set_out_of_sync(device, be64_to_cpu(p->sector), be32_to_cpu(p->blksize)); 4768 4769 return 0; 4770 } 4771 4772 static int receive_rs_deallocated(struct drbd_connection *connection, struct 
packet_info *pi) 4773 { 4774 struct drbd_peer_device *peer_device; 4775 struct p_block_desc *p = pi->data; 4776 struct drbd_device *device; 4777 sector_t sector; 4778 int size, err = 0; 4779 4780 peer_device = conn_peer_device(connection, pi->vnr); 4781 if (!peer_device) 4782 return -EIO; 4783 device = peer_device->device; 4784 4785 sector = be64_to_cpu(p->sector); 4786 size = be32_to_cpu(p->blksize); 4787 4788 dec_rs_pending(device); 4789 4790 if (get_ldev(device)) { 4791 struct drbd_peer_request *peer_req; 4792 const int op = REQ_OP_WRITE_ZEROES; 4793 4794 peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER, sector, 4795 size, 0, GFP_NOIO); 4796 if (!peer_req) { 4797 put_ldev(device); 4798 return -ENOMEM; 4799 } 4800 4801 peer_req->w.cb = e_end_resync_block; 4802 peer_req->submit_jif = jiffies; 4803 peer_req->flags |= EE_IS_TRIM; 4804 4805 spin_lock_irq(&device->resource->req_lock); 4806 list_add_tail(&peer_req->w.list, &device->sync_ee); 4807 spin_unlock_irq(&device->resource->req_lock); 4808 4809 atomic_add(pi->size >> 9, &device->rs_sect_ev); 4810 err = drbd_submit_peer_request(device, peer_req, op, 0, DRBD_FAULT_RS_WR); 4811 4812 if (err) { 4813 spin_lock_irq(&device->resource->req_lock); 4814 list_del(&peer_req->w.list); 4815 spin_unlock_irq(&device->resource->req_lock); 4816 4817 drbd_free_peer_req(device, peer_req); 4818 put_ldev(device); 4819 err = 0; 4820 goto fail; 4821 } 4822 4823 inc_unacked(device); 4824 4825 /* No put_ldev() here. 
Gets called in drbd_endio_write_sec_final(), 4826 as well as drbd_rs_complete_io() */ 4827 } else { 4828 fail: 4829 drbd_rs_complete_io(device, sector); 4830 drbd_send_ack_ex(peer_device, P_NEG_ACK, sector, size, ID_SYNCER); 4831 } 4832 4833 atomic_add(size >> 9, &device->rs_sect_in); 4834 4835 return err; 4836 } 4837 4838 struct data_cmd { 4839 int expect_payload; 4840 unsigned int pkt_size; 4841 int (*fn)(struct drbd_connection *, struct packet_info *); 4842 }; 4843 4844 static struct data_cmd drbd_cmd_handler[] = { 4845 [P_DATA] = { 1, sizeof(struct p_data), receive_Data }, 4846 [P_DATA_REPLY] = { 1, sizeof(struct p_data), receive_DataReply }, 4847 [P_RS_DATA_REPLY] = { 1, sizeof(struct p_data), receive_RSDataReply } , 4848 [P_BARRIER] = { 0, sizeof(struct p_barrier), receive_Barrier } , 4849 [P_BITMAP] = { 1, 0, receive_bitmap } , 4850 [P_COMPRESSED_BITMAP] = { 1, 0, receive_bitmap } , 4851 [P_UNPLUG_REMOTE] = { 0, 0, receive_UnplugRemote }, 4852 [P_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, 4853 [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, 4854 [P_SYNC_PARAM] = { 1, 0, receive_SyncParam }, 4855 [P_SYNC_PARAM89] = { 1, 0, receive_SyncParam }, 4856 [P_PROTOCOL] = { 1, sizeof(struct p_protocol), receive_protocol }, 4857 [P_UUIDS] = { 0, sizeof(struct p_uuids), receive_uuids }, 4858 [P_SIZES] = { 0, sizeof(struct p_sizes), receive_sizes }, 4859 [P_STATE] = { 0, sizeof(struct p_state), receive_state }, 4860 [P_STATE_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_state }, 4861 [P_SYNC_UUID] = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid }, 4862 [P_OV_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, 4863 [P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest }, 4864 [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest }, 4865 [P_RS_THIN_REQ] = { 0, sizeof(struct p_block_req), receive_DataRequest }, 4866 [P_DELAY_PROBE] = { 0, sizeof(struct 
p_delay_probe93), receive_skip }, 4867 [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync }, 4868 [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state }, 4869 [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol }, 4870 [P_TRIM] = { 0, sizeof(struct p_trim), receive_Data }, 4871 [P_RS_DEALLOCATED] = { 0, sizeof(struct p_block_desc), receive_rs_deallocated }, 4872 [P_WSAME] = { 1, sizeof(struct p_wsame), receive_Data }, 4873 }; 4874 4875 static void drbdd(struct drbd_connection *connection) 4876 { 4877 struct packet_info pi; 4878 size_t shs; /* sub header size */ 4879 int err; 4880 4881 while (get_t_state(&connection->receiver) == RUNNING) { 4882 struct data_cmd const *cmd; 4883 4884 drbd_thread_current_set_cpu(&connection->receiver); 4885 update_receiver_timing_details(connection, drbd_recv_header); 4886 if (drbd_recv_header(connection, &pi)) 4887 goto err_out; 4888 4889 cmd = &drbd_cmd_handler[pi.cmd]; 4890 if (unlikely(pi.cmd >= ARRAY_SIZE(drbd_cmd_handler) || !cmd->fn)) { 4891 drbd_err(connection, "Unexpected data packet %s (0x%04x)", 4892 cmdname(pi.cmd), pi.cmd); 4893 goto err_out; 4894 } 4895 4896 shs = cmd->pkt_size; 4897 if (pi.cmd == P_SIZES && connection->agreed_features & DRBD_FF_WSAME) 4898 shs += sizeof(struct o_qlim); 4899 if (pi.size > shs && !cmd->expect_payload) { 4900 drbd_err(connection, "No payload expected %s l:%d\n", 4901 cmdname(pi.cmd), pi.size); 4902 goto err_out; 4903 } 4904 if (pi.size < shs) { 4905 drbd_err(connection, "%s: unexpected packet size, expected:%d received:%d\n", 4906 cmdname(pi.cmd), (int)shs, pi.size); 4907 goto err_out; 4908 } 4909 4910 if (shs) { 4911 update_receiver_timing_details(connection, drbd_recv_all_warn); 4912 err = drbd_recv_all_warn(connection, pi.data, shs); 4913 if (err) 4914 goto err_out; 4915 pi.size -= shs; 4916 } 4917 4918 update_receiver_timing_details(connection, cmd->fn); 4919 err = cmd->fn(connection, &pi); 4920 if (err) { 4921 
drbd_err(connection, "error receiving %s, e: %d l: %d!\n", 4922 cmdname(pi.cmd), err, pi.size); 4923 goto err_out; 4924 } 4925 } 4926 return; 4927 4928 err_out: 4929 conn_request_state(connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD); 4930 } 4931 4932 static void conn_disconnect(struct drbd_connection *connection) 4933 { 4934 struct drbd_peer_device *peer_device; 4935 enum drbd_conns oc; 4936 int vnr; 4937 4938 if (connection->cstate == C_STANDALONE) 4939 return; 4940 4941 /* We are about to start the cleanup after connection loss. 4942 * Make sure drbd_make_request knows about that. 4943 * Usually we should be in some network failure state already, 4944 * but just in case we are not, we fix it up here. 4945 */ 4946 conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD); 4947 4948 /* ack_receiver does not clean up anything. it must not interfere, either */ 4949 drbd_thread_stop(&connection->ack_receiver); 4950 if (connection->ack_sender) { 4951 destroy_workqueue(connection->ack_sender); 4952 connection->ack_sender = NULL; 4953 } 4954 drbd_free_sock(connection); 4955 4956 rcu_read_lock(); 4957 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { 4958 struct drbd_device *device = peer_device->device; 4959 kref_get(&device->kref); 4960 rcu_read_unlock(); 4961 drbd_disconnected(peer_device); 4962 kref_put(&device->kref, drbd_destroy_device); 4963 rcu_read_lock(); 4964 } 4965 rcu_read_unlock(); 4966 4967 if (!list_empty(&connection->current_epoch->list)) 4968 drbd_err(connection, "ASSERTION FAILED: connection->current_epoch->list not empty\n"); 4969 /* ok, no more ee's on the fly, it is safe to reset the epoch_size */ 4970 atomic_set(&connection->current_epoch->epoch_size, 0); 4971 connection->send.seen_any_write_yet = false; 4972 4973 drbd_info(connection, "Connection closed\n"); 4974 4975 if (conn_highest_role(connection) == R_PRIMARY && conn_highest_pdsk(connection) >= D_UNKNOWN) 4976 conn_try_outdate_peer_async(connection); 4977 4978 
spin_lock_irq(&connection->resource->req_lock); 4979 oc = connection->cstate; 4980 if (oc >= C_UNCONNECTED) 4981 _conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE); 4982 4983 spin_unlock_irq(&connection->resource->req_lock); 4984 4985 if (oc == C_DISCONNECTING) 4986 conn_request_state(connection, NS(conn, C_STANDALONE), CS_VERBOSE | CS_HARD); 4987 } 4988 4989 static int drbd_disconnected(struct drbd_peer_device *peer_device) 4990 { 4991 struct drbd_device *device = peer_device->device; 4992 unsigned int i; 4993 4994 /* wait for current activity to cease. */ 4995 spin_lock_irq(&device->resource->req_lock); 4996 _drbd_wait_ee_list_empty(device, &device->active_ee); 4997 _drbd_wait_ee_list_empty(device, &device->sync_ee); 4998 _drbd_wait_ee_list_empty(device, &device->read_ee); 4999 spin_unlock_irq(&device->resource->req_lock); 5000 5001 /* We do not have data structures that would allow us to 5002 * get the rs_pending_cnt down to 0 again. 5003 * * On C_SYNC_TARGET we do not have any data structures describing 5004 * the pending RSDataRequest's we have sent. 5005 * * On C_SYNC_SOURCE there is no data structure that tracks 5006 * the P_RS_DATA_REPLY blocks that we sent to the SyncTarget. 5007 * And no, it is not the sum of the reference counts in the 5008 * resync_LRU. The resync_LRU tracks the whole operation including 5009 * the disk-IO, while the rs_pending_cnt only tracks the blocks 5010 * on the fly. */ 5011 drbd_rs_cancel_all(device); 5012 device->rs_total = 0; 5013 device->rs_failed = 0; 5014 atomic_set(&device->rs_pending_cnt, 0); 5015 wake_up(&device->misc_wait); 5016 5017 del_timer_sync(&device->resync_timer); 5018 resync_timer_fn((unsigned long)device); 5019 5020 /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier, 5021 * w_make_resync_request etc. 
which may still be on the worker queue 5022 * to be "canceled" */ 5023 drbd_flush_workqueue(&peer_device->connection->sender_work); 5024 5025 drbd_finish_peer_reqs(device); 5026 5027 /* This second workqueue flush is necessary, since drbd_finish_peer_reqs() 5028 might have issued a work again. The one before drbd_finish_peer_reqs() is 5029 necessary to reclain net_ee in drbd_finish_peer_reqs(). */ 5030 drbd_flush_workqueue(&peer_device->connection->sender_work); 5031 5032 /* need to do it again, drbd_finish_peer_reqs() may have populated it 5033 * again via drbd_try_clear_on_disk_bm(). */ 5034 drbd_rs_cancel_all(device); 5035 5036 kfree(device->p_uuid); 5037 device->p_uuid = NULL; 5038 5039 if (!drbd_suspended(device)) 5040 tl_clear(peer_device->connection); 5041 5042 drbd_md_sync(device); 5043 5044 if (get_ldev(device)) { 5045 drbd_bitmap_io(device, &drbd_bm_write_copy_pages, 5046 "write from disconnected", BM_LOCKED_CHANGE_ALLOWED); 5047 put_ldev(device); 5048 } 5049 5050 /* tcp_close and release of sendpage pages can be deferred. I don't 5051 * want to use SO_LINGER, because apparently it can be deferred for 5052 * more than 20 seconds (longest time I checked). 5053 * 5054 * Actually we don't care for exactly when the network stack does its 5055 * put_page(), but release our reference on these pages right here. 
5056 */ 5057 i = drbd_free_peer_reqs(device, &device->net_ee); 5058 if (i) 5059 drbd_info(device, "net_ee not empty, killed %u entries\n", i); 5060 i = atomic_read(&device->pp_in_use_by_net); 5061 if (i) 5062 drbd_info(device, "pp_in_use_by_net = %d, expected 0\n", i); 5063 i = atomic_read(&device->pp_in_use); 5064 if (i) 5065 drbd_info(device, "pp_in_use = %d, expected 0\n", i); 5066 5067 D_ASSERT(device, list_empty(&device->read_ee)); 5068 D_ASSERT(device, list_empty(&device->active_ee)); 5069 D_ASSERT(device, list_empty(&device->sync_ee)); 5070 D_ASSERT(device, list_empty(&device->done_ee)); 5071 5072 return 0; 5073 } 5074 5075 /* 5076 * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version 5077 * we can agree on is stored in agreed_pro_version. 5078 * 5079 * feature flags and the reserved array should be enough room for future 5080 * enhancements of the handshake protocol, and possible plugins... 5081 * 5082 * for now, they are expected to be zero, but ignored. 5083 */ 5084 static int drbd_send_features(struct drbd_connection *connection) 5085 { 5086 struct drbd_socket *sock; 5087 struct p_connection_features *p; 5088 5089 sock = &connection->data; 5090 p = conn_prepare_command(connection, sock); 5091 if (!p) 5092 return -EIO; 5093 memset(p, 0, sizeof(*p)); 5094 p->protocol_min = cpu_to_be32(PRO_VERSION_MIN); 5095 p->protocol_max = cpu_to_be32(PRO_VERSION_MAX); 5096 p->feature_flags = cpu_to_be32(PRO_FEATURES); 5097 return conn_send_command(connection, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0); 5098 } 5099 5100 /* 5101 * return values: 5102 * 1 yes, we have a valid connection 5103 * 0 oops, did not work out, please try again 5104 * -1 peer talks different language, 5105 * no point in trying again, please go standalone. 5106 */ 5107 static int drbd_do_features(struct drbd_connection *connection) 5108 { 5109 /* ASSERT current == connection->receiver ... 
*/ 5110 struct p_connection_features *p; 5111 const int expect = sizeof(struct p_connection_features); 5112 struct packet_info pi; 5113 int err; 5114 5115 err = drbd_send_features(connection); 5116 if (err) 5117 return 0; 5118 5119 err = drbd_recv_header(connection, &pi); 5120 if (err) 5121 return 0; 5122 5123 if (pi.cmd != P_CONNECTION_FEATURES) { 5124 drbd_err(connection, "expected ConnectionFeatures packet, received: %s (0x%04x)\n", 5125 cmdname(pi.cmd), pi.cmd); 5126 return -1; 5127 } 5128 5129 if (pi.size != expect) { 5130 drbd_err(connection, "expected ConnectionFeatures length: %u, received: %u\n", 5131 expect, pi.size); 5132 return -1; 5133 } 5134 5135 p = pi.data; 5136 err = drbd_recv_all_warn(connection, p, expect); 5137 if (err) 5138 return 0; 5139 5140 p->protocol_min = be32_to_cpu(p->protocol_min); 5141 p->protocol_max = be32_to_cpu(p->protocol_max); 5142 if (p->protocol_max == 0) 5143 p->protocol_max = p->protocol_min; 5144 5145 if (PRO_VERSION_MAX < p->protocol_min || 5146 PRO_VERSION_MIN > p->protocol_max) 5147 goto incompat; 5148 5149 connection->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max); 5150 connection->agreed_features = PRO_FEATURES & be32_to_cpu(p->feature_flags); 5151 5152 drbd_info(connection, "Handshake successful: " 5153 "Agreed network protocol version %d\n", connection->agreed_pro_version); 5154 5155 drbd_info(connection, "Feature flags enabled on protocol level: 0x%x%s%s%s.\n", 5156 connection->agreed_features, 5157 connection->agreed_features & DRBD_FF_TRIM ? " TRIM" : "", 5158 connection->agreed_features & DRBD_FF_THIN_RESYNC ? " THIN_RESYNC" : "", 5159 connection->agreed_features & DRBD_FF_WSAME ? " WRITE_SAME" : 5160 connection->agreed_features ? 
"" : " none"); 5161 5162 return 1; 5163 5164 incompat: 5165 drbd_err(connection, "incompatible DRBD dialects: " 5166 "I support %d-%d, peer supports %d-%d\n", 5167 PRO_VERSION_MIN, PRO_VERSION_MAX, 5168 p->protocol_min, p->protocol_max); 5169 return -1; 5170 } 5171 5172 #if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE) 5173 static int drbd_do_auth(struct drbd_connection *connection) 5174 { 5175 drbd_err(connection, "This kernel was build without CONFIG_CRYPTO_HMAC.\n"); 5176 drbd_err(connection, "You need to disable 'cram-hmac-alg' in drbd.conf.\n"); 5177 return -1; 5178 } 5179 #else 5180 #define CHALLENGE_LEN 64 5181 5182 /* Return value: 5183 1 - auth succeeded, 5184 0 - failed, try again (network error), 5185 -1 - auth failed, don't try again. 5186 */ 5187 5188 static int drbd_do_auth(struct drbd_connection *connection) 5189 { 5190 struct drbd_socket *sock; 5191 char my_challenge[CHALLENGE_LEN]; /* 64 Bytes... */ 5192 char *response = NULL; 5193 char *right_response = NULL; 5194 char *peers_ch = NULL; 5195 unsigned int key_len; 5196 char secret[SHARED_SECRET_MAX]; /* 64 byte */ 5197 unsigned int resp_size; 5198 SHASH_DESC_ON_STACK(desc, connection->cram_hmac_tfm); 5199 struct packet_info pi; 5200 struct net_conf *nc; 5201 int err, rv; 5202 5203 /* FIXME: Put the challenge/response into the preallocated socket buffer. 
*/ 5204 5205 rcu_read_lock(); 5206 nc = rcu_dereference(connection->net_conf); 5207 key_len = strlen(nc->shared_secret); 5208 memcpy(secret, nc->shared_secret, key_len); 5209 rcu_read_unlock(); 5210 5211 desc->tfm = connection->cram_hmac_tfm; 5212 desc->flags = 0; 5213 5214 rv = crypto_shash_setkey(connection->cram_hmac_tfm, (u8 *)secret, key_len); 5215 if (rv) { 5216 drbd_err(connection, "crypto_shash_setkey() failed with %d\n", rv); 5217 rv = -1; 5218 goto fail; 5219 } 5220 5221 get_random_bytes(my_challenge, CHALLENGE_LEN); 5222 5223 sock = &connection->data; 5224 if (!conn_prepare_command(connection, sock)) { 5225 rv = 0; 5226 goto fail; 5227 } 5228 rv = !conn_send_command(connection, sock, P_AUTH_CHALLENGE, 0, 5229 my_challenge, CHALLENGE_LEN); 5230 if (!rv) 5231 goto fail; 5232 5233 err = drbd_recv_header(connection, &pi); 5234 if (err) { 5235 rv = 0; 5236 goto fail; 5237 } 5238 5239 if (pi.cmd != P_AUTH_CHALLENGE) { 5240 drbd_err(connection, "expected AuthChallenge packet, received: %s (0x%04x)\n", 5241 cmdname(pi.cmd), pi.cmd); 5242 rv = 0; 5243 goto fail; 5244 } 5245 5246 if (pi.size > CHALLENGE_LEN * 2) { 5247 drbd_err(connection, "expected AuthChallenge payload too big.\n"); 5248 rv = -1; 5249 goto fail; 5250 } 5251 5252 if (pi.size < CHALLENGE_LEN) { 5253 drbd_err(connection, "AuthChallenge payload too small.\n"); 5254 rv = -1; 5255 goto fail; 5256 } 5257 5258 peers_ch = kmalloc(pi.size, GFP_NOIO); 5259 if (peers_ch == NULL) { 5260 drbd_err(connection, "kmalloc of peers_ch failed\n"); 5261 rv = -1; 5262 goto fail; 5263 } 5264 5265 err = drbd_recv_all_warn(connection, peers_ch, pi.size); 5266 if (err) { 5267 rv = 0; 5268 goto fail; 5269 } 5270 5271 if (!memcmp(my_challenge, peers_ch, CHALLENGE_LEN)) { 5272 drbd_err(connection, "Peer presented the same challenge!\n"); 5273 rv = -1; 5274 goto fail; 5275 } 5276 5277 resp_size = crypto_shash_digestsize(connection->cram_hmac_tfm); 5278 response = kmalloc(resp_size, GFP_NOIO); 5279 if (response == NULL) { 5280 
drbd_err(connection, "kmalloc of response failed\n"); 5281 rv = -1; 5282 goto fail; 5283 } 5284 5285 rv = crypto_shash_digest(desc, peers_ch, pi.size, response); 5286 if (rv) { 5287 drbd_err(connection, "crypto_hash_digest() failed with %d\n", rv); 5288 rv = -1; 5289 goto fail; 5290 } 5291 5292 if (!conn_prepare_command(connection, sock)) { 5293 rv = 0; 5294 goto fail; 5295 } 5296 rv = !conn_send_command(connection, sock, P_AUTH_RESPONSE, 0, 5297 response, resp_size); 5298 if (!rv) 5299 goto fail; 5300 5301 err = drbd_recv_header(connection, &pi); 5302 if (err) { 5303 rv = 0; 5304 goto fail; 5305 } 5306 5307 if (pi.cmd != P_AUTH_RESPONSE) { 5308 drbd_err(connection, "expected AuthResponse packet, received: %s (0x%04x)\n", 5309 cmdname(pi.cmd), pi.cmd); 5310 rv = 0; 5311 goto fail; 5312 } 5313 5314 if (pi.size != resp_size) { 5315 drbd_err(connection, "expected AuthResponse payload of wrong size\n"); 5316 rv = 0; 5317 goto fail; 5318 } 5319 5320 err = drbd_recv_all_warn(connection, response , resp_size); 5321 if (err) { 5322 rv = 0; 5323 goto fail; 5324 } 5325 5326 right_response = kmalloc(resp_size, GFP_NOIO); 5327 if (right_response == NULL) { 5328 drbd_err(connection, "kmalloc of right_response failed\n"); 5329 rv = -1; 5330 goto fail; 5331 } 5332 5333 rv = crypto_shash_digest(desc, my_challenge, CHALLENGE_LEN, 5334 right_response); 5335 if (rv) { 5336 drbd_err(connection, "crypto_hash_digest() failed with %d\n", rv); 5337 rv = -1; 5338 goto fail; 5339 } 5340 5341 rv = !memcmp(response, right_response, resp_size); 5342 5343 if (rv) 5344 drbd_info(connection, "Peer authenticated using %d bytes HMAC\n", 5345 resp_size); 5346 else 5347 rv = -1; 5348 5349 fail: 5350 kfree(peers_ch); 5351 kfree(response); 5352 kfree(right_response); 5353 shash_desc_zero(desc); 5354 5355 return rv; 5356 } 5357 #endif 5358 5359 int drbd_receiver(struct drbd_thread *thi) 5360 { 5361 struct drbd_connection *connection = thi->connection; 5362 int h; 5363 5364 drbd_info(connection, 
"receiver (re)started\n"); 5365 5366 do { 5367 h = conn_connect(connection); 5368 if (h == 0) { 5369 conn_disconnect(connection); 5370 schedule_timeout_interruptible(HZ); 5371 } 5372 if (h == -1) { 5373 drbd_warn(connection, "Discarding network configuration.\n"); 5374 conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); 5375 } 5376 } while (h == 0); 5377 5378 if (h > 0) 5379 drbdd(connection); 5380 5381 conn_disconnect(connection); 5382 5383 drbd_info(connection, "receiver terminated\n"); 5384 return 0; 5385 } 5386 5387 /* ********* acknowledge sender ******** */ 5388 5389 static int got_conn_RqSReply(struct drbd_connection *connection, struct packet_info *pi) 5390 { 5391 struct p_req_state_reply *p = pi->data; 5392 int retcode = be32_to_cpu(p->retcode); 5393 5394 if (retcode >= SS_SUCCESS) { 5395 set_bit(CONN_WD_ST_CHG_OKAY, &connection->flags); 5396 } else { 5397 set_bit(CONN_WD_ST_CHG_FAIL, &connection->flags); 5398 drbd_err(connection, "Requested state change failed by peer: %s (%d)\n", 5399 drbd_set_st_err_str(retcode), retcode); 5400 } 5401 wake_up(&connection->ping_wait); 5402 5403 return 0; 5404 } 5405 5406 static int got_RqSReply(struct drbd_connection *connection, struct packet_info *pi) 5407 { 5408 struct drbd_peer_device *peer_device; 5409 struct drbd_device *device; 5410 struct p_req_state_reply *p = pi->data; 5411 int retcode = be32_to_cpu(p->retcode); 5412 5413 peer_device = conn_peer_device(connection, pi->vnr); 5414 if (!peer_device) 5415 return -EIO; 5416 device = peer_device->device; 5417 5418 if (test_bit(CONN_WD_ST_CHG_REQ, &connection->flags)) { 5419 D_ASSERT(device, connection->agreed_pro_version < 100); 5420 return got_conn_RqSReply(connection, pi); 5421 } 5422 5423 if (retcode >= SS_SUCCESS) { 5424 set_bit(CL_ST_CHG_SUCCESS, &device->flags); 5425 } else { 5426 set_bit(CL_ST_CHG_FAIL, &device->flags); 5427 drbd_err(device, "Requested state change failed by peer: %s (%d)\n", 5428 drbd_set_st_err_str(retcode), retcode); 5429 } 
5430 wake_up(&device->state_wait); 5431 5432 return 0; 5433 } 5434 5435 static int got_Ping(struct drbd_connection *connection, struct packet_info *pi) 5436 { 5437 return drbd_send_ping_ack(connection); 5438 5439 } 5440 5441 static int got_PingAck(struct drbd_connection *connection, struct packet_info *pi) 5442 { 5443 /* restore idle timeout */ 5444 connection->meta.socket->sk->sk_rcvtimeo = connection->net_conf->ping_int*HZ; 5445 if (!test_and_set_bit(GOT_PING_ACK, &connection->flags)) 5446 wake_up(&connection->ping_wait); 5447 5448 return 0; 5449 } 5450 5451 static int got_IsInSync(struct drbd_connection *connection, struct packet_info *pi) 5452 { 5453 struct drbd_peer_device *peer_device; 5454 struct drbd_device *device; 5455 struct p_block_ack *p = pi->data; 5456 sector_t sector = be64_to_cpu(p->sector); 5457 int blksize = be32_to_cpu(p->blksize); 5458 5459 peer_device = conn_peer_device(connection, pi->vnr); 5460 if (!peer_device) 5461 return -EIO; 5462 device = peer_device->device; 5463 5464 D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89); 5465 5466 update_peer_seq(peer_device, be32_to_cpu(p->seq_num)); 5467 5468 if (get_ldev(device)) { 5469 drbd_rs_complete_io(device, sector); 5470 drbd_set_in_sync(device, sector, blksize); 5471 /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */ 5472 device->rs_same_csum += (blksize >> BM_BLOCK_SHIFT); 5473 put_ldev(device); 5474 } 5475 dec_rs_pending(device); 5476 atomic_add(blksize >> 9, &device->rs_sect_in); 5477 5478 return 0; 5479 } 5480 5481 static int 5482 validate_req_change_req_state(struct drbd_device *device, u64 id, sector_t sector, 5483 struct rb_root *root, const char *func, 5484 enum drbd_req_event what, bool missing_ok) 5485 { 5486 struct drbd_request *req; 5487 struct bio_and_error m; 5488 5489 spin_lock_irq(&device->resource->req_lock); 5490 req = find_request(device, root, id, sector, missing_ok, func); 5491 if (unlikely(!req)) { 5492 
spin_unlock_irq(&device->resource->req_lock); 5493 return -EIO; 5494 } 5495 __req_mod(req, what, &m); 5496 spin_unlock_irq(&device->resource->req_lock); 5497 5498 if (m.bio) 5499 complete_master_bio(device, &m); 5500 return 0; 5501 } 5502 5503 static int got_BlockAck(struct drbd_connection *connection, struct packet_info *pi) 5504 { 5505 struct drbd_peer_device *peer_device; 5506 struct drbd_device *device; 5507 struct p_block_ack *p = pi->data; 5508 sector_t sector = be64_to_cpu(p->sector); 5509 int blksize = be32_to_cpu(p->blksize); 5510 enum drbd_req_event what; 5511 5512 peer_device = conn_peer_device(connection, pi->vnr); 5513 if (!peer_device) 5514 return -EIO; 5515 device = peer_device->device; 5516 5517 update_peer_seq(peer_device, be32_to_cpu(p->seq_num)); 5518 5519 if (p->block_id == ID_SYNCER) { 5520 drbd_set_in_sync(device, sector, blksize); 5521 dec_rs_pending(device); 5522 return 0; 5523 } 5524 switch (pi->cmd) { 5525 case P_RS_WRITE_ACK: 5526 what = WRITE_ACKED_BY_PEER_AND_SIS; 5527 break; 5528 case P_WRITE_ACK: 5529 what = WRITE_ACKED_BY_PEER; 5530 break; 5531 case P_RECV_ACK: 5532 what = RECV_ACKED_BY_PEER; 5533 break; 5534 case P_SUPERSEDED: 5535 what = CONFLICT_RESOLVED; 5536 break; 5537 case P_RETRY_WRITE: 5538 what = POSTPONE_WRITE; 5539 break; 5540 default: 5541 BUG(); 5542 } 5543 5544 return validate_req_change_req_state(device, p->block_id, sector, 5545 &device->write_requests, __func__, 5546 what, false); 5547 } 5548 5549 static int got_NegAck(struct drbd_connection *connection, struct packet_info *pi) 5550 { 5551 struct drbd_peer_device *peer_device; 5552 struct drbd_device *device; 5553 struct p_block_ack *p = pi->data; 5554 sector_t sector = be64_to_cpu(p->sector); 5555 int size = be32_to_cpu(p->blksize); 5556 int err; 5557 5558 peer_device = conn_peer_device(connection, pi->vnr); 5559 if (!peer_device) 5560 return -EIO; 5561 device = peer_device->device; 5562 5563 update_peer_seq(peer_device, be32_to_cpu(p->seq_num)); 5564 5565 if 
(p->block_id == ID_SYNCER) { 5566 dec_rs_pending(device); 5567 drbd_rs_failed_io(device, sector, size); 5568 return 0; 5569 } 5570 5571 err = validate_req_change_req_state(device, p->block_id, sector, 5572 &device->write_requests, __func__, 5573 NEG_ACKED, true); 5574 if (err) { 5575 /* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs. 5576 The master bio might already be completed, therefore the 5577 request is no longer in the collision hash. */ 5578 /* In Protocol B we might already have got a P_RECV_ACK 5579 but then get a P_NEG_ACK afterwards. */ 5580 drbd_set_out_of_sync(device, sector, size); 5581 } 5582 return 0; 5583 } 5584 5585 static int got_NegDReply(struct drbd_connection *connection, struct packet_info *pi) 5586 { 5587 struct drbd_peer_device *peer_device; 5588 struct drbd_device *device; 5589 struct p_block_ack *p = pi->data; 5590 sector_t sector = be64_to_cpu(p->sector); 5591 5592 peer_device = conn_peer_device(connection, pi->vnr); 5593 if (!peer_device) 5594 return -EIO; 5595 device = peer_device->device; 5596 5597 update_peer_seq(peer_device, be32_to_cpu(p->seq_num)); 5598 5599 drbd_err(device, "Got NegDReply; Sector %llus, len %u.\n", 5600 (unsigned long long)sector, be32_to_cpu(p->blksize)); 5601 5602 return validate_req_change_req_state(device, p->block_id, sector, 5603 &device->read_requests, __func__, 5604 NEG_ACKED, false); 5605 } 5606 5607 static int got_NegRSDReply(struct drbd_connection *connection, struct packet_info *pi) 5608 { 5609 struct drbd_peer_device *peer_device; 5610 struct drbd_device *device; 5611 sector_t sector; 5612 int size; 5613 struct p_block_ack *p = pi->data; 5614 5615 peer_device = conn_peer_device(connection, pi->vnr); 5616 if (!peer_device) 5617 return -EIO; 5618 device = peer_device->device; 5619 5620 sector = be64_to_cpu(p->sector); 5621 size = be32_to_cpu(p->blksize); 5622 5623 update_peer_seq(peer_device, be32_to_cpu(p->seq_num)); 5624 5625 dec_rs_pending(device); 5626 5627 if (get_ldev_if_state(device, 
D_FAILED)) { 5628 drbd_rs_complete_io(device, sector); 5629 switch (pi->cmd) { 5630 case P_NEG_RS_DREPLY: 5631 drbd_rs_failed_io(device, sector, size); 5632 case P_RS_CANCEL: 5633 break; 5634 default: 5635 BUG(); 5636 } 5637 put_ldev(device); 5638 } 5639 5640 return 0; 5641 } 5642 5643 static int got_BarrierAck(struct drbd_connection *connection, struct packet_info *pi) 5644 { 5645 struct p_barrier_ack *p = pi->data; 5646 struct drbd_peer_device *peer_device; 5647 int vnr; 5648 5649 tl_release(connection, p->barrier, be32_to_cpu(p->set_size)); 5650 5651 rcu_read_lock(); 5652 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { 5653 struct drbd_device *device = peer_device->device; 5654 5655 if (device->state.conn == C_AHEAD && 5656 atomic_read(&device->ap_in_flight) == 0 && 5657 !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &device->flags)) { 5658 device->start_resync_timer.expires = jiffies + HZ; 5659 add_timer(&device->start_resync_timer); 5660 } 5661 } 5662 rcu_read_unlock(); 5663 5664 return 0; 5665 } 5666 5667 static int got_OVResult(struct drbd_connection *connection, struct packet_info *pi) 5668 { 5669 struct drbd_peer_device *peer_device; 5670 struct drbd_device *device; 5671 struct p_block_ack *p = pi->data; 5672 struct drbd_device_work *dw; 5673 sector_t sector; 5674 int size; 5675 5676 peer_device = conn_peer_device(connection, pi->vnr); 5677 if (!peer_device) 5678 return -EIO; 5679 device = peer_device->device; 5680 5681 sector = be64_to_cpu(p->sector); 5682 size = be32_to_cpu(p->blksize); 5683 5684 update_peer_seq(peer_device, be32_to_cpu(p->seq_num)); 5685 5686 if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC) 5687 drbd_ov_out_of_sync_found(device, sector, size); 5688 else 5689 ov_out_of_sync_print(device); 5690 5691 if (!get_ldev(device)) 5692 return 0; 5693 5694 drbd_rs_complete_io(device, sector); 5695 dec_rs_pending(device); 5696 5697 --device->ov_left; 5698 5699 /* let's advance progress step marks only for every other megabyte */ 5700 if 
((device->ov_left & 0x200) == 0x200) 5701 drbd_advance_rs_marks(device, device->ov_left); 5702 5703 if (device->ov_left == 0) { 5704 dw = kmalloc(sizeof(*dw), GFP_NOIO); 5705 if (dw) { 5706 dw->w.cb = w_ov_finished; 5707 dw->device = device; 5708 drbd_queue_work(&peer_device->connection->sender_work, &dw->w); 5709 } else { 5710 drbd_err(device, "kmalloc(dw) failed."); 5711 ov_out_of_sync_print(device); 5712 drbd_resync_finished(device); 5713 } 5714 } 5715 put_ldev(device); 5716 return 0; 5717 } 5718 5719 static int got_skip(struct drbd_connection *connection, struct packet_info *pi) 5720 { 5721 return 0; 5722 } 5723 5724 struct meta_sock_cmd { 5725 size_t pkt_size; 5726 int (*fn)(struct drbd_connection *connection, struct packet_info *); 5727 }; 5728 5729 static void set_rcvtimeo(struct drbd_connection *connection, bool ping_timeout) 5730 { 5731 long t; 5732 struct net_conf *nc; 5733 5734 rcu_read_lock(); 5735 nc = rcu_dereference(connection->net_conf); 5736 t = ping_timeout ? nc->ping_timeo : nc->ping_int; 5737 rcu_read_unlock(); 5738 5739 t *= HZ; 5740 if (ping_timeout) 5741 t /= 10; 5742 5743 connection->meta.socket->sk->sk_rcvtimeo = t; 5744 } 5745 5746 static void set_ping_timeout(struct drbd_connection *connection) 5747 { 5748 set_rcvtimeo(connection, 1); 5749 } 5750 5751 static void set_idle_timeout(struct drbd_connection *connection) 5752 { 5753 set_rcvtimeo(connection, 0); 5754 } 5755 5756 static struct meta_sock_cmd ack_receiver_tbl[] = { 5757 [P_PING] = { 0, got_Ping }, 5758 [P_PING_ACK] = { 0, got_PingAck }, 5759 [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, 5760 [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, 5761 [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, 5762 [P_SUPERSEDED] = { sizeof(struct p_block_ack), got_BlockAck }, 5763 [P_NEG_ACK] = { sizeof(struct p_block_ack), got_NegAck }, 5764 [P_NEG_DREPLY] = { sizeof(struct p_block_ack), got_NegDReply }, 5765 [P_NEG_RS_DREPLY] = { sizeof(struct 
p_block_ack), got_NegRSDReply }, 5766 [P_OV_RESULT] = { sizeof(struct p_block_ack), got_OVResult }, 5767 [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck }, 5768 [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply }, 5769 [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync }, 5770 [P_DELAY_PROBE] = { sizeof(struct p_delay_probe93), got_skip }, 5771 [P_RS_CANCEL] = { sizeof(struct p_block_ack), got_NegRSDReply }, 5772 [P_CONN_ST_CHG_REPLY]={ sizeof(struct p_req_state_reply), got_conn_RqSReply }, 5773 [P_RETRY_WRITE] = { sizeof(struct p_block_ack), got_BlockAck }, 5774 }; 5775 5776 int drbd_ack_receiver(struct drbd_thread *thi) 5777 { 5778 struct drbd_connection *connection = thi->connection; 5779 struct meta_sock_cmd *cmd = NULL; 5780 struct packet_info pi; 5781 unsigned long pre_recv_jif; 5782 int rv; 5783 void *buf = connection->meta.rbuf; 5784 int received = 0; 5785 unsigned int header_size = drbd_header_size(connection); 5786 int expect = header_size; 5787 bool ping_timeout_active = false; 5788 struct sched_param param = { .sched_priority = 2 }; 5789 5790 rv = sched_setscheduler(current, SCHED_RR, ¶m); 5791 if (rv < 0) 5792 drbd_err(connection, "drbd_ack_receiver: ERROR set priority, ret=%d\n", rv); 5793 5794 while (get_t_state(thi) == RUNNING) { 5795 drbd_thread_current_set_cpu(thi); 5796 5797 conn_reclaim_net_peer_reqs(connection); 5798 5799 if (test_and_clear_bit(SEND_PING, &connection->flags)) { 5800 if (drbd_send_ping(connection)) { 5801 drbd_err(connection, "drbd_send_ping has failed\n"); 5802 goto reconnect; 5803 } 5804 set_ping_timeout(connection); 5805 ping_timeout_active = true; 5806 } 5807 5808 pre_recv_jif = jiffies; 5809 rv = drbd_recv_short(connection->meta.socket, buf, expect-received, 0); 5810 5811 /* Note: 5812 * -EINTR (on meta) we got a signal 5813 * -EAGAIN (on meta) rcvtimeo expired 5814 * -ECONNRESET other side closed the connection 5815 * -ERESTARTSYS (on data) we got a signal 5816 * rv < 0 
other than above: unexpected error! 5817 * rv == expected: full header or command 5818 * rv < expected: "woken" by signal during receive 5819 * rv == 0 : "connection shut down by peer" 5820 */ 5821 if (likely(rv > 0)) { 5822 received += rv; 5823 buf += rv; 5824 } else if (rv == 0) { 5825 if (test_bit(DISCONNECT_SENT, &connection->flags)) { 5826 long t; 5827 rcu_read_lock(); 5828 t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10; 5829 rcu_read_unlock(); 5830 5831 t = wait_event_timeout(connection->ping_wait, 5832 connection->cstate < C_WF_REPORT_PARAMS, 5833 t); 5834 if (t) 5835 break; 5836 } 5837 drbd_err(connection, "meta connection shut down by peer.\n"); 5838 goto reconnect; 5839 } else if (rv == -EAGAIN) { 5840 /* If the data socket received something meanwhile, 5841 * that is good enough: peer is still alive. */ 5842 if (time_after(connection->last_received, pre_recv_jif)) 5843 continue; 5844 if (ping_timeout_active) { 5845 drbd_err(connection, "PingAck did not arrive in time.\n"); 5846 goto reconnect; 5847 } 5848 set_bit(SEND_PING, &connection->flags); 5849 continue; 5850 } else if (rv == -EINTR) { 5851 /* maybe drbd_thread_stop(): the while condition will notice. 
5852 * maybe woken for send_ping: we'll send a ping above, 5853 * and change the rcvtimeo */ 5854 flush_signals(current); 5855 continue; 5856 } else { 5857 drbd_err(connection, "sock_recvmsg returned %d\n", rv); 5858 goto reconnect; 5859 } 5860 5861 if (received == expect && cmd == NULL) { 5862 if (decode_header(connection, connection->meta.rbuf, &pi)) 5863 goto reconnect; 5864 cmd = &ack_receiver_tbl[pi.cmd]; 5865 if (pi.cmd >= ARRAY_SIZE(ack_receiver_tbl) || !cmd->fn) { 5866 drbd_err(connection, "Unexpected meta packet %s (0x%04x)\n", 5867 cmdname(pi.cmd), pi.cmd); 5868 goto disconnect; 5869 } 5870 expect = header_size + cmd->pkt_size; 5871 if (pi.size != expect - header_size) { 5872 drbd_err(connection, "Wrong packet size on meta (c: %d, l: %d)\n", 5873 pi.cmd, pi.size); 5874 goto reconnect; 5875 } 5876 } 5877 if (received == expect) { 5878 bool err; 5879 5880 err = cmd->fn(connection, &pi); 5881 if (err) { 5882 drbd_err(connection, "%pf failed\n", cmd->fn); 5883 goto reconnect; 5884 } 5885 5886 connection->last_received = jiffies; 5887 5888 if (cmd == &ack_receiver_tbl[P_PING_ACK]) { 5889 set_idle_timeout(connection); 5890 ping_timeout_active = false; 5891 } 5892 5893 buf = connection->meta.rbuf; 5894 received = 0; 5895 expect = header_size; 5896 cmd = NULL; 5897 } 5898 } 5899 5900 if (0) { 5901 reconnect: 5902 conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD); 5903 conn_md_sync(connection); 5904 } 5905 if (0) { 5906 disconnect: 5907 conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); 5908 } 5909 5910 drbd_info(connection, "ack_receiver terminated\n"); 5911 5912 return 0; 5913 } 5914 5915 void drbd_send_acks_wf(struct work_struct *ws) 5916 { 5917 struct drbd_peer_device *peer_device = 5918 container_of(ws, struct drbd_peer_device, send_acks_work); 5919 struct drbd_connection *connection = peer_device->connection; 5920 struct drbd_device *device = peer_device->device; 5921 struct net_conf *nc; 5922 int tcp_cork, err; 5923 5924 
rcu_read_lock(); 5925 nc = rcu_dereference(connection->net_conf); 5926 tcp_cork = nc->tcp_cork; 5927 rcu_read_unlock(); 5928 5929 if (tcp_cork) 5930 drbd_tcp_cork(connection->meta.socket); 5931 5932 err = drbd_finish_peer_reqs(device); 5933 kref_put(&device->kref, drbd_destroy_device); 5934 /* get is in drbd_endio_write_sec_final(). That is necessary to keep the 5935 struct work_struct send_acks_work alive, which is in the peer_device object */ 5936 5937 if (err) { 5938 conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD); 5939 return; 5940 } 5941 5942 if (tcp_cork) 5943 drbd_tcp_uncork(connection->meta.socket); 5944 5945 return; 5946 } 5947