xref: /openbmc/linux/drivers/block/drbd/drbd_worker.c (revision 2dd6532e)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3    drbd_worker.c
4 
5    This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
6 
7    Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
8    Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
9    Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
10 
11 
12 */
13 
14 #include <linux/module.h>
15 #include <linux/drbd.h>
16 #include <linux/sched/signal.h>
17 #include <linux/wait.h>
18 #include <linux/mm.h>
19 #include <linux/memcontrol.h>
20 #include <linux/mm_inline.h>
21 #include <linux/slab.h>
22 #include <linux/random.h>
23 #include <linux/string.h>
24 #include <linux/scatterlist.h>
25 #include <linux/part_stat.h>
26 
27 #include "drbd_int.h"
28 #include "drbd_protocol.h"
29 #include "drbd_req.h"
30 
/* Forward declarations: both functions are defined further down in this file
 * but are already called from w_resync_timer() before their definitions. */
static int make_ov_request(struct drbd_device *, int);
static int make_resync_request(struct drbd_device *, int);
33 
34 /* endio handlers:
35  *   drbd_md_endio (defined here)
36  *   drbd_request_endio (defined here)
37  *   drbd_peer_request_endio (defined here)
38  *   drbd_bm_endio (defined in drbd_bitmap.c)
39  *
40  * For all these callbacks, note the following:
41  * The callbacks will be called in irq context by the IDE drivers,
42  * and in Softirqs/Tasklets/BH context by the SCSI drivers.
43  * Try to get the locking right :)
44  *
45  */
46 
47 /* used for synchronous meta data and bitmap IO
48  * submitted by drbd_md_sync_page_io()
49  */
50 void drbd_md_endio(struct bio *bio)
51 {
52 	struct drbd_device *device;
53 
54 	device = bio->bi_private;
55 	device->md_io.error = blk_status_to_errno(bio->bi_status);
56 
57 	/* special case: drbd_md_read() during drbd_adm_attach() */
58 	if (device->ldev)
59 		put_ldev(device);
60 	bio_put(bio);
61 
62 	/* We grabbed an extra reference in _drbd_md_sync_page_io() to be able
63 	 * to timeout on the lower level device, and eventually detach from it.
64 	 * If this io completion runs after that timeout expired, this
65 	 * drbd_md_put_buffer() may allow us to finally try and re-attach.
66 	 * During normal operation, this only puts that extra reference
67 	 * down to 1 again.
68 	 * Make sure we first drop the reference, and only then signal
69 	 * completion, or we may (in drbd_al_read_log()) cycle so fast into the
70 	 * next drbd_md_sync_page_io(), that we trigger the
71 	 * ASSERT(atomic_read(&device->md_io_in_use) == 1) there.
72 	 */
73 	drbd_md_put_buffer(device);
74 	device->md_io.done = 1;
75 	wake_up(&device->misc_wait);
76 }
77 
78 /* reads on behalf of the partner,
79  * "submitted" by the receiver
80  */
81 static void drbd_endio_read_sec_final(struct drbd_peer_request *peer_req) __releases(local)
82 {
83 	unsigned long flags = 0;
84 	struct drbd_peer_device *peer_device = peer_req->peer_device;
85 	struct drbd_device *device = peer_device->device;
86 
87 	spin_lock_irqsave(&device->resource->req_lock, flags);
88 	device->read_cnt += peer_req->i.size >> 9;
89 	list_del(&peer_req->w.list);
90 	if (list_empty(&device->read_ee))
91 		wake_up(&device->ee_wait);
92 	if (test_bit(__EE_WAS_ERROR, &peer_req->flags))
93 		__drbd_chk_io_error(device, DRBD_READ_ERROR);
94 	spin_unlock_irqrestore(&device->resource->req_lock, flags);
95 
96 	drbd_queue_work(&peer_device->connection->sender_work, &peer_req->w);
97 	put_ldev(device);
98 }
99 
100 /* writes on behalf of the partner, or resync writes,
101  * "submitted" by the receiver, final stage.  */
102 void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local)
103 {
104 	unsigned long flags = 0;
105 	struct drbd_peer_device *peer_device = peer_req->peer_device;
106 	struct drbd_device *device = peer_device->device;
107 	struct drbd_connection *connection = peer_device->connection;
108 	struct drbd_interval i;
109 	int do_wake;
110 	u64 block_id;
111 	int do_al_complete_io;
112 
113 	/* after we moved peer_req to done_ee,
114 	 * we may no longer access it,
115 	 * it may be freed/reused already!
116 	 * (as soon as we release the req_lock) */
117 	i = peer_req->i;
118 	do_al_complete_io = peer_req->flags & EE_CALL_AL_COMPLETE_IO;
119 	block_id = peer_req->block_id;
120 	peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
121 
122 	if (peer_req->flags & EE_WAS_ERROR) {
123 		/* In protocol != C, we usually do not send write acks.
124 		 * In case of a write error, send the neg ack anyways. */
125 		if (!__test_and_set_bit(__EE_SEND_WRITE_ACK, &peer_req->flags))
126 			inc_unacked(device);
127 		drbd_set_out_of_sync(device, peer_req->i.sector, peer_req->i.size);
128 	}
129 
130 	spin_lock_irqsave(&device->resource->req_lock, flags);
131 	device->writ_cnt += peer_req->i.size >> 9;
132 	list_move_tail(&peer_req->w.list, &device->done_ee);
133 
134 	/*
135 	 * Do not remove from the write_requests tree here: we did not send the
136 	 * Ack yet and did not wake possibly waiting conflicting requests.
137 	 * Removed from the tree from "drbd_process_done_ee" within the
138 	 * appropriate dw.cb (e_end_block/e_end_resync_block) or from
139 	 * _drbd_clear_done_ee.
140 	 */
141 
142 	do_wake = list_empty(block_id == ID_SYNCER ? &device->sync_ee : &device->active_ee);
143 
144 	/* FIXME do we want to detach for failed REQ_OP_DISCARD?
145 	 * ((peer_req->flags & (EE_WAS_ERROR|EE_TRIM)) == EE_WAS_ERROR) */
146 	if (peer_req->flags & EE_WAS_ERROR)
147 		__drbd_chk_io_error(device, DRBD_WRITE_ERROR);
148 
149 	if (connection->cstate >= C_WF_REPORT_PARAMS) {
150 		kref_get(&device->kref); /* put is in drbd_send_acks_wf() */
151 		if (!queue_work(connection->ack_sender, &peer_device->send_acks_work))
152 			kref_put(&device->kref, drbd_destroy_device);
153 	}
154 	spin_unlock_irqrestore(&device->resource->req_lock, flags);
155 
156 	if (block_id == ID_SYNCER)
157 		drbd_rs_complete_io(device, i.sector);
158 
159 	if (do_wake)
160 		wake_up(&device->ee_wait);
161 
162 	if (do_al_complete_io)
163 		drbd_al_complete_io(device, &i);
164 
165 	put_ldev(device);
166 }
167 
168 /* writes on behalf of the partner, or resync writes,
169  * "submitted" by the receiver.
170  */
171 void drbd_peer_request_endio(struct bio *bio)
172 {
173 	struct drbd_peer_request *peer_req = bio->bi_private;
174 	struct drbd_device *device = peer_req->peer_device->device;
175 	bool is_write = bio_data_dir(bio) == WRITE;
176 	bool is_discard = bio_op(bio) == REQ_OP_WRITE_ZEROES ||
177 			  bio_op(bio) == REQ_OP_DISCARD;
178 
179 	if (bio->bi_status && __ratelimit(&drbd_ratelimit_state))
180 		drbd_warn(device, "%s: error=%d s=%llus\n",
181 				is_write ? (is_discard ? "discard" : "write")
182 					: "read", bio->bi_status,
183 				(unsigned long long)peer_req->i.sector);
184 
185 	if (bio->bi_status)
186 		set_bit(__EE_WAS_ERROR, &peer_req->flags);
187 
188 	bio_put(bio); /* no need for the bio anymore */
189 	if (atomic_dec_and_test(&peer_req->pending_bios)) {
190 		if (is_write)
191 			drbd_endio_write_sec_final(peer_req);
192 		else
193 			drbd_endio_read_sec_final(peer_req);
194 	}
195 }
196 
/*
 * Bring the machine down with panic(), naming the affected device by minor,
 * resource name and volume number.
 *
 * Called from drbd_request_endio() when a local request that carries
 * RQ_LOCAL_ABORTED completes without an error status after all.
 */
static void
drbd_panic_after_delayed_completion_of_aborted_request(struct drbd_device *device)
{
	panic("drbd%u %s/%u potential random memory corruption caused by delayed completion of aborted local request\n",
		device->minor, device->resource->name, device->vnr);
}
203 
204 /* read, readA or write requests on R_PRIMARY coming from drbd_make_request
205  */
206 void drbd_request_endio(struct bio *bio)
207 {
208 	unsigned long flags;
209 	struct drbd_request *req = bio->bi_private;
210 	struct drbd_device *device = req->device;
211 	struct bio_and_error m;
212 	enum drbd_req_event what;
213 
214 	/* If this request was aborted locally before,
215 	 * but now was completed "successfully",
216 	 * chances are that this caused arbitrary data corruption.
217 	 *
218 	 * "aborting" requests, or force-detaching the disk, is intended for
219 	 * completely blocked/hung local backing devices which do no longer
220 	 * complete requests at all, not even do error completions.  In this
221 	 * situation, usually a hard-reset and failover is the only way out.
222 	 *
223 	 * By "aborting", basically faking a local error-completion,
224 	 * we allow for a more graceful swichover by cleanly migrating services.
225 	 * Still the affected node has to be rebooted "soon".
226 	 *
227 	 * By completing these requests, we allow the upper layers to re-use
228 	 * the associated data pages.
229 	 *
230 	 * If later the local backing device "recovers", and now DMAs some data
231 	 * from disk into the original request pages, in the best case it will
232 	 * just put random data into unused pages; but typically it will corrupt
233 	 * meanwhile completely unrelated data, causing all sorts of damage.
234 	 *
235 	 * Which means delayed successful completion,
236 	 * especially for READ requests,
237 	 * is a reason to panic().
238 	 *
239 	 * We assume that a delayed *error* completion is OK,
240 	 * though we still will complain noisily about it.
241 	 */
242 	if (unlikely(req->rq_state & RQ_LOCAL_ABORTED)) {
243 		if (__ratelimit(&drbd_ratelimit_state))
244 			drbd_emerg(device, "delayed completion of aborted local request; disk-timeout may be too aggressive\n");
245 
246 		if (!bio->bi_status)
247 			drbd_panic_after_delayed_completion_of_aborted_request(device);
248 	}
249 
250 	/* to avoid recursion in __req_mod */
251 	if (unlikely(bio->bi_status)) {
252 		switch (bio_op(bio)) {
253 		case REQ_OP_WRITE_ZEROES:
254 		case REQ_OP_DISCARD:
255 			if (bio->bi_status == BLK_STS_NOTSUPP)
256 				what = DISCARD_COMPLETED_NOTSUPP;
257 			else
258 				what = DISCARD_COMPLETED_WITH_ERROR;
259 			break;
260 		case REQ_OP_READ:
261 			if (bio->bi_opf & REQ_RAHEAD)
262 				what = READ_AHEAD_COMPLETED_WITH_ERROR;
263 			else
264 				what = READ_COMPLETED_WITH_ERROR;
265 			break;
266 		default:
267 			what = WRITE_COMPLETED_WITH_ERROR;
268 			break;
269 		}
270 	} else {
271 		what = COMPLETED_OK;
272 	}
273 
274 	req->private_bio = ERR_PTR(blk_status_to_errno(bio->bi_status));
275 	bio_put(bio);
276 
277 	/* not req_mod(), we need irqsave here! */
278 	spin_lock_irqsave(&device->resource->req_lock, flags);
279 	__req_mod(req, what, &m);
280 	spin_unlock_irqrestore(&device->resource->req_lock, flags);
281 	put_ldev(device);
282 
283 	if (m.bio)
284 		complete_master_bio(device, &m);
285 }
286 
287 void drbd_csum_ee(struct crypto_shash *tfm, struct drbd_peer_request *peer_req, void *digest)
288 {
289 	SHASH_DESC_ON_STACK(desc, tfm);
290 	struct page *page = peer_req->pages;
291 	struct page *tmp;
292 	unsigned len;
293 	void *src;
294 
295 	desc->tfm = tfm;
296 
297 	crypto_shash_init(desc);
298 
299 	src = kmap_atomic(page);
300 	while ((tmp = page_chain_next(page))) {
301 		/* all but the last page will be fully used */
302 		crypto_shash_update(desc, src, PAGE_SIZE);
303 		kunmap_atomic(src);
304 		page = tmp;
305 		src = kmap_atomic(page);
306 	}
307 	/* and now the last, possibly only partially used page */
308 	len = peer_req->i.size & (PAGE_SIZE - 1);
309 	crypto_shash_update(desc, src, len ?: PAGE_SIZE);
310 	kunmap_atomic(src);
311 
312 	crypto_shash_final(desc, digest);
313 	shash_desc_zero(desc);
314 }
315 
316 void drbd_csum_bio(struct crypto_shash *tfm, struct bio *bio, void *digest)
317 {
318 	SHASH_DESC_ON_STACK(desc, tfm);
319 	struct bio_vec bvec;
320 	struct bvec_iter iter;
321 
322 	desc->tfm = tfm;
323 
324 	crypto_shash_init(desc);
325 
326 	bio_for_each_segment(bvec, bio, iter) {
327 		u8 *src;
328 
329 		src = bvec_kmap_local(&bvec);
330 		crypto_shash_update(desc, src, bvec.bv_len);
331 		kunmap_local(src);
332 	}
333 	crypto_shash_final(desc, digest);
334 	shash_desc_zero(desc);
335 }
336 
337 /* MAYBE merge common code with w_e_end_ov_req */
338 static int w_e_send_csum(struct drbd_work *w, int cancel)
339 {
340 	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
341 	struct drbd_peer_device *peer_device = peer_req->peer_device;
342 	struct drbd_device *device = peer_device->device;
343 	int digest_size;
344 	void *digest;
345 	int err = 0;
346 
347 	if (unlikely(cancel))
348 		goto out;
349 
350 	if (unlikely((peer_req->flags & EE_WAS_ERROR) != 0))
351 		goto out;
352 
353 	digest_size = crypto_shash_digestsize(peer_device->connection->csums_tfm);
354 	digest = kmalloc(digest_size, GFP_NOIO);
355 	if (digest) {
356 		sector_t sector = peer_req->i.sector;
357 		unsigned int size = peer_req->i.size;
358 		drbd_csum_ee(peer_device->connection->csums_tfm, peer_req, digest);
359 		/* Free peer_req and pages before send.
360 		 * In case we block on congestion, we could otherwise run into
361 		 * some distributed deadlock, if the other side blocks on
362 		 * congestion as well, because our receiver blocks in
363 		 * drbd_alloc_pages due to pp_in_use > max_buffers. */
364 		drbd_free_peer_req(device, peer_req);
365 		peer_req = NULL;
366 		inc_rs_pending(device);
367 		err = drbd_send_drequest_csum(peer_device, sector, size,
368 					      digest, digest_size,
369 					      P_CSUM_RS_REQUEST);
370 		kfree(digest);
371 	} else {
372 		drbd_err(device, "kmalloc() of digest failed.\n");
373 		err = -ENOMEM;
374 	}
375 
376 out:
377 	if (peer_req)
378 		drbd_free_peer_req(device, peer_req);
379 
380 	if (unlikely(err))
381 		drbd_err(device, "drbd_send_drequest(..., csum) failed\n");
382 	return err;
383 }
384 
385 #define GFP_TRY	(__GFP_HIGHMEM | __GFP_NOWARN)
386 
387 static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector, int size)
388 {
389 	struct drbd_device *device = peer_device->device;
390 	struct drbd_peer_request *peer_req;
391 
392 	if (!get_ldev(device))
393 		return -EIO;
394 
395 	/* GFP_TRY, because if there is no memory available right now, this may
396 	 * be rescheduled for later. It is "only" background resync, after all. */
397 	peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector,
398 				       size, size, GFP_TRY);
399 	if (!peer_req)
400 		goto defer;
401 
402 	peer_req->w.cb = w_e_send_csum;
403 	spin_lock_irq(&device->resource->req_lock);
404 	list_add_tail(&peer_req->w.list, &device->read_ee);
405 	spin_unlock_irq(&device->resource->req_lock);
406 
407 	atomic_add(size >> 9, &device->rs_sect_ev);
408 	if (drbd_submit_peer_request(device, peer_req, REQ_OP_READ, 0,
409 				     DRBD_FAULT_RS_RD) == 0)
410 		return 0;
411 
412 	/* If it failed because of ENOMEM, retry should help.  If it failed
413 	 * because bio_add_page failed (probably broken lower level driver),
414 	 * retry may or may not help.
415 	 * If it does not, you may need to force disconnect. */
416 	spin_lock_irq(&device->resource->req_lock);
417 	list_del(&peer_req->w.list);
418 	spin_unlock_irq(&device->resource->req_lock);
419 
420 	drbd_free_peer_req(device, peer_req);
421 defer:
422 	put_ldev(device);
423 	return -EAGAIN;
424 }
425 
426 int w_resync_timer(struct drbd_work *w, int cancel)
427 {
428 	struct drbd_device *device =
429 		container_of(w, struct drbd_device, resync_work);
430 
431 	switch (device->state.conn) {
432 	case C_VERIFY_S:
433 		make_ov_request(device, cancel);
434 		break;
435 	case C_SYNC_TARGET:
436 		make_resync_request(device, cancel);
437 		break;
438 	}
439 
440 	return 0;
441 }
442 
443 void resync_timer_fn(struct timer_list *t)
444 {
445 	struct drbd_device *device = from_timer(device, t, resync_timer);
446 
447 	drbd_queue_work_if_unqueued(
448 		&first_peer_device(device)->connection->sender_work,
449 		&device->resync_work);
450 }
451 
452 static void fifo_set(struct fifo_buffer *fb, int value)
453 {
454 	int i;
455 
456 	for (i = 0; i < fb->size; i++)
457 		fb->values[i] = value;
458 }
459 
460 static int fifo_push(struct fifo_buffer *fb, int value)
461 {
462 	int ov;
463 
464 	ov = fb->values[fb->head_index];
465 	fb->values[fb->head_index++] = value;
466 
467 	if (fb->head_index >= fb->size)
468 		fb->head_index = 0;
469 
470 	return ov;
471 }
472 
473 static void fifo_add_val(struct fifo_buffer *fb, int value)
474 {
475 	int i;
476 
477 	for (i = 0; i < fb->size; i++)
478 		fb->values[i] += value;
479 }
480 
481 struct fifo_buffer *fifo_alloc(unsigned int fifo_size)
482 {
483 	struct fifo_buffer *fb;
484 
485 	fb = kzalloc(struct_size(fb, values, fifo_size), GFP_NOIO);
486 	if (!fb)
487 		return NULL;
488 
489 	fb->head_index = 0;
490 	fb->size = fifo_size;
491 	fb->total = 0;
492 
493 	return fb;
494 }
495 
496 static int drbd_rs_controller(struct drbd_device *device, unsigned int sect_in)
497 {
498 	struct disk_conf *dc;
499 	unsigned int want;     /* The number of sectors we want in-flight */
500 	int req_sect; /* Number of sectors to request in this turn */
501 	int correction; /* Number of sectors more we need in-flight */
502 	int cps; /* correction per invocation of drbd_rs_controller() */
503 	int steps; /* Number of time steps to plan ahead */
504 	int curr_corr;
505 	int max_sect;
506 	struct fifo_buffer *plan;
507 
508 	dc = rcu_dereference(device->ldev->disk_conf);
509 	plan = rcu_dereference(device->rs_plan_s);
510 
511 	steps = plan->size; /* (dc->c_plan_ahead * 10 * SLEEP_TIME) / HZ; */
512 
513 	if (device->rs_in_flight + sect_in == 0) { /* At start of resync */
514 		want = ((dc->resync_rate * 2 * SLEEP_TIME) / HZ) * steps;
515 	} else { /* normal path */
516 		want = dc->c_fill_target ? dc->c_fill_target :
517 			sect_in * dc->c_delay_target * HZ / (SLEEP_TIME * 10);
518 	}
519 
520 	correction = want - device->rs_in_flight - plan->total;
521 
522 	/* Plan ahead */
523 	cps = correction / steps;
524 	fifo_add_val(plan, cps);
525 	plan->total += cps * steps;
526 
527 	/* What we do in this step */
528 	curr_corr = fifo_push(plan, 0);
529 	plan->total -= curr_corr;
530 
531 	req_sect = sect_in + curr_corr;
532 	if (req_sect < 0)
533 		req_sect = 0;
534 
535 	max_sect = (dc->c_max_rate * 2 * SLEEP_TIME) / HZ;
536 	if (req_sect > max_sect)
537 		req_sect = max_sect;
538 
539 	/*
540 	drbd_warn(device, "si=%u if=%d wa=%u co=%d st=%d cps=%d pl=%d cc=%d rs=%d\n",
541 		 sect_in, device->rs_in_flight, want, correction,
542 		 steps, cps, device->rs_planed, curr_corr, req_sect);
543 	*/
544 
545 	return req_sect;
546 }
547 
548 static int drbd_rs_number_requests(struct drbd_device *device)
549 {
550 	unsigned int sect_in;  /* Number of sectors that came in since the last turn */
551 	int number, mxb;
552 
553 	sect_in = atomic_xchg(&device->rs_sect_in, 0);
554 	device->rs_in_flight -= sect_in;
555 
556 	rcu_read_lock();
557 	mxb = drbd_get_max_buffers(device) / 2;
558 	if (rcu_dereference(device->rs_plan_s)->size) {
559 		number = drbd_rs_controller(device, sect_in) >> (BM_BLOCK_SHIFT - 9);
560 		device->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
561 	} else {
562 		device->c_sync_rate = rcu_dereference(device->ldev->disk_conf)->resync_rate;
563 		number = SLEEP_TIME * device->c_sync_rate  / ((BM_BLOCK_SIZE / 1024) * HZ);
564 	}
565 	rcu_read_unlock();
566 
567 	/* Don't have more than "max-buffers"/2 in-flight.
568 	 * Otherwise we may cause the remote site to stall on drbd_alloc_pages(),
569 	 * potentially causing a distributed deadlock on congestion during
570 	 * online-verify or (checksum-based) resync, if max-buffers,
571 	 * socket buffer sizes and resync rate settings are mis-configured. */
572 
573 	/* note that "number" is in units of "BM_BLOCK_SIZE" (which is 4k),
574 	 * mxb (as used here, and in drbd_alloc_pages on the peer) is
575 	 * "number of pages" (typically also 4k),
576 	 * but "rs_in_flight" is in "sectors" (512 Byte). */
577 	if (mxb - device->rs_in_flight/8 < number)
578 		number = mxb - device->rs_in_flight/8;
579 
580 	return number;
581 }
582 
/*
 * Generate the next batch of resync requests; called from w_resync_timer()
 * while the connection state is C_SYNC_TARGET.
 *
 * @device: device being resynced
 * @cancel: if non-zero, do nothing
 *
 * Walks the bitmap starting at device->bm_resync_fo, merges adjacent dirty
 * bits into larger aligned requests where possible, and for each resulting
 * range either submits a local read for checksumming (device->use_csums) or
 * sends a data request to the peer.  On the "requeue" path the number of
 * blocks handled in this turn is added to device->rs_in_flight and the
 * resync timer is re-armed for SLEEP_TIME jiffies from now.
 *
 * Returns 0 in most cases (including cancel, empty resync and a missing
 * local disk), -EIO if read_for_csum() reports a disk failure, or the error
 * returned by drbd_send_drequest().  The only caller visible in this file
 * ignores the return value.
 */
static int make_resync_request(struct drbd_device *const device, int cancel)
{
	struct drbd_peer_device *const peer_device = first_peer_device(device);
	/* NOTE(review): connection may be NULL according to this initializer,
	 * yet it is dereferenced unconditionally below; presumably a device
	 * in resync always has a peer device -- confirm. */
	struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
	unsigned long bit;
	sector_t sector;
	const sector_t capacity = get_capacity(device->vdisk);
	int max_bio_size;
	int number, rollback_i, size;
	int align, requeue = 0;
	int i = 0;
	int discard_granularity = 0;

	if (unlikely(cancel))
		return 0;

	if (device->rs_total == 0) {
		/* empty resync? */
		drbd_resync_finished(device);
		return 0;
	}

	if (!get_ldev(device)) {
		/* Since we only need to access device->rsync a
		   get_ldev_if_state(device,D_FAILED) would be sufficient, but
		   to continue resync with a broken disk makes no sense at
		   all */
		drbd_err(device, "Disk broke down during resync!\n");
		return 0;
	}

	/* discard_granularity stays 0 unless the thin resync feature was agreed on */
	if (connection->agreed_features & DRBD_FF_THIN_RESYNC) {
		rcu_read_lock();
		discard_granularity = rcu_dereference(device->ldev->disk_conf)->rs_discard_granularity;
		rcu_read_unlock();
	}

	max_bio_size = queue_max_hw_sectors(device->rq_queue) << 9;
	number = drbd_rs_number_requests(device);
	if (number <= 0)
		goto requeue;

	for (i = 0; i < number; i++) {
		/* Stop generating RS requests when half of the send buffer is filled,
		 * but notify TCP that we'd like to have more space. */
		mutex_lock(&connection->data.mutex);
		if (connection->data.socket) {
			struct sock *sk = connection->data.socket->sk;
			int queued = sk->sk_wmem_queued;
			int sndbuf = sk->sk_sndbuf;
			if (queued > sndbuf / 2) {
				requeue = 1;
				if (sk->sk_socket)
					set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
			}
		} else
			requeue = 1;
		mutex_unlock(&connection->data.mutex);
		if (requeue)
			goto requeue;

next_sector:
		size = BM_BLOCK_SIZE;
		bit  = drbd_bm_find_next(device, device->bm_resync_fo);

		if (bit == DRBD_END_OF_BITMAP) {
			/* nothing left to request; the timer is not re-armed here */
			device->bm_resync_fo = drbd_bm_bits(device);
			put_ldev(device);
			return 0;
		}

		sector = BM_BIT_TO_SECT(bit);

		if (drbd_try_rs_begin_io(device, sector)) {
			/* could not start on this one now: retry the same bit next turn */
			device->bm_resync_fo = bit;
			goto requeue;
		}
		device->bm_resync_fo = bit + 1;

		/* bit is no longer set: release it again and look for the next one */
		if (unlikely(drbd_bm_test_bit(device, bit) == 0)) {
			drbd_rs_complete_io(device, sector);
			goto next_sector;
		}

#if DRBD_MAX_BIO_SIZE > BM_BLOCK_SIZE
		/* try to find some adjacent bits.
		 * we stop if we have already the maximum req size.
		 *
		 * Additionally always align bigger requests, in order to
		 * be prepared for all stripe sizes of software RAIDs.
		 */
		align = 1;
		/* remembered so that the -EAGAIN path below can undo the merge accounting */
		rollback_i = i;
		while (i < number) {
			if (size + BM_BLOCK_SIZE > max_bio_size)
				break;

			/* Be always aligned */
			if (sector & ((1<<(align+3))-1))
				break;

			if (discard_granularity && size == discard_granularity)
				break;

			/* do not cross extent boundaries */
			if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
				break;
			/* now, is it actually dirty, after all?
			 * caution, drbd_bm_test_bit is tri-state for some
			 * obscure reason; ( b == 0 ) would get the out-of-band
			 * only accidentally right because of the "oddly sized"
			 * adjustment below */
			if (drbd_bm_test_bit(device, bit+1) != 1)
				break;
			bit++;
			size += BM_BLOCK_SIZE;
			if ((BM_BLOCK_SIZE << align) <= size)
				align++;
			i++;
		}
		/* if we merged some,
		 * reset the offset to start the next drbd_bm_find_next from */
		if (size > BM_BLOCK_SIZE)
			device->bm_resync_fo = bit + 1;
#endif

		/* adjust very last sectors, in case we are oddly sized */
		if (sector + (size>>9) > capacity)
			size = (capacity-sector)<<9;

		if (device->use_csums) {
			switch (read_for_csum(peer_device, sector, size)) {
			case -EIO: /* Disk failure */
				put_ldev(device);
				return -EIO;
			case -EAGAIN: /* allocation failed, or ldev busy */
				drbd_rs_complete_io(device, sector);
				device->bm_resync_fo = BM_SECT_TO_BIT(sector);
				i = rollback_i;
				goto requeue;
			case 0:
				/* everything ok */
				break;
			default:
				BUG();
			}
		} else {
			int err;

			inc_rs_pending(device);
			err = drbd_send_drequest(peer_device,
						 size == discard_granularity ? P_RS_THIN_REQ : P_RS_DATA_REQUEST,
						 sector, size, ID_SYNCER);
			if (err) {
				drbd_err(device, "drbd_send_drequest() failed, aborting...\n");
				dec_rs_pending(device);
				put_ldev(device);
				return err;
			}
		}
	}

	if (device->bm_resync_fo >= drbd_bm_bits(device)) {
		/* last syncer _request_ was sent,
		 * but the P_RS_DATA_REPLY not yet received.  sync will end (and
		 * next sync group will resume), as soon as we receive the last
		 * resync data block, and the last bit is cleared.
		 * until then resync "work" is "inactive" ...
		 */
		put_ldev(device);
		return 0;
	}

 requeue:
	/* i counts bitmap blocks; rs_in_flight is kept in sectors */
	device->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
	mod_timer(&device->resync_timer, jiffies + SLEEP_TIME);
	put_ldev(device);
	return 0;
}
762 
763 static int make_ov_request(struct drbd_device *device, int cancel)
764 {
765 	int number, i, size;
766 	sector_t sector;
767 	const sector_t capacity = get_capacity(device->vdisk);
768 	bool stop_sector_reached = false;
769 
770 	if (unlikely(cancel))
771 		return 1;
772 
773 	number = drbd_rs_number_requests(device);
774 
775 	sector = device->ov_position;
776 	for (i = 0; i < number; i++) {
777 		if (sector >= capacity)
778 			return 1;
779 
780 		/* We check for "finished" only in the reply path:
781 		 * w_e_end_ov_reply().
782 		 * We need to send at least one request out. */
783 		stop_sector_reached = i > 0
784 			&& verify_can_do_stop_sector(device)
785 			&& sector >= device->ov_stop_sector;
786 		if (stop_sector_reached)
787 			break;
788 
789 		size = BM_BLOCK_SIZE;
790 
791 		if (drbd_try_rs_begin_io(device, sector)) {
792 			device->ov_position = sector;
793 			goto requeue;
794 		}
795 
796 		if (sector + (size>>9) > capacity)
797 			size = (capacity-sector)<<9;
798 
799 		inc_rs_pending(device);
800 		if (drbd_send_ov_request(first_peer_device(device), sector, size)) {
801 			dec_rs_pending(device);
802 			return 0;
803 		}
804 		sector += BM_SECT_PER_BIT;
805 	}
806 	device->ov_position = sector;
807 
808  requeue:
809 	device->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
810 	if (i == 0 || !stop_sector_reached)
811 		mod_timer(&device->resync_timer, jiffies + SLEEP_TIME);
812 	return 1;
813 }
814 
815 int w_ov_finished(struct drbd_work *w, int cancel)
816 {
817 	struct drbd_device_work *dw =
818 		container_of(w, struct drbd_device_work, w);
819 	struct drbd_device *device = dw->device;
820 	kfree(dw);
821 	ov_out_of_sync_print(device);
822 	drbd_resync_finished(device);
823 
824 	return 0;
825 }
826 
827 static int w_resync_finished(struct drbd_work *w, int cancel)
828 {
829 	struct drbd_device_work *dw =
830 		container_of(w, struct drbd_device_work, w);
831 	struct drbd_device *device = dw->device;
832 	kfree(dw);
833 
834 	drbd_resync_finished(device);
835 
836 	return 0;
837 }
838 
839 static void ping_peer(struct drbd_device *device)
840 {
841 	struct drbd_connection *connection = first_peer_device(device)->connection;
842 
843 	clear_bit(GOT_PING_ACK, &connection->flags);
844 	request_ping(connection);
845 	wait_event(connection->ping_wait,
846 		   test_bit(GOT_PING_ACK, &connection->flags) || device->state.conn < C_CONNECTED);
847 }
848 
/*
 * Finish a resync or online verify run on @device.
 *
 * Empties the resync LRU; if that is not possible yet, sleeps for up to
 * HZ/10 jiffies and queues w_resync_finished() to retry later.  Otherwise
 * logs statistics, pings the peer, and, under the resource's req_lock,
 * computes and sets the new state (conn = C_CONNECTED, disk/pdsk depending
 * on device->rs_failed and the previous role in the resync) and updates
 * UUIDs after a successful resync.  Finally resets the resync counters,
 * syncs meta data and runs the selected helper command, if any.
 *
 * Always returns 1.
 */
int drbd_resync_finished(struct drbd_device *device)
{
	struct drbd_connection *connection = first_peer_device(device)->connection;
	unsigned long db, dt, dbdt;
	unsigned long n_oos;
	union drbd_state os, ns;
	struct drbd_device_work *dw;
	char *khelper_cmd = NULL;
	int verify_done = 0;

	/* Remove all elements from the resync LRU. Since future actions
	 * might set bits in the (main) bitmap, then the entries in the
	 * resync LRU would be wrong. */
	if (drbd_rs_del_all(device)) {
		/* In case this is not possible now, most probably because
		 * there are P_RS_DATA_REPLY Packets lingering on the worker's
		 * queue (or even the read operations for those packets
		 * is not finished by now).   Retry in 100ms. */

		schedule_timeout_interruptible(HZ / 10);
		dw = kmalloc(sizeof(struct drbd_device_work), GFP_ATOMIC);
		if (dw) {
			/* freed again by w_resync_finished() */
			dw->w.cb = w_resync_finished;
			dw->device = device;
			drbd_queue_work(&connection->sender_work, &dw->w);
			return 1;
		}
		/* no memory for the retry work item: carry on regardless */
		drbd_err(device, "Warn failed to drbd_rs_del_all() and to kmalloc(dw).\n");
	}

	/* elapsed time in seconds, not counting the time spent paused */
	dt = (jiffies - device->rs_start - device->rs_paused) / HZ;
	/* NOTE(review): dt is unsigned long, so "<= 0" can only mean "== 0";
	 * this avoids a division by zero below. */
	if (dt <= 0)
		dt = 1;

	db = device->rs_total;
	/* adjust for verify start and stop sectors, respective reached position */
	if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
		db -= device->ov_left;

	dbdt = Bit2KB(db/dt);
	/* from here on rs_paused is in seconds (it was divided by HZ) */
	device->rs_paused /= HZ;

	if (!get_ldev(device))
		goto out;

	ping_peer(device);

	spin_lock_irq(&device->resource->req_lock);
	os = drbd_read_state(device);

	verify_done = (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T);

	/* This protects us against multiple calls (that can happen in the presence
	   of application IO), and against connectivity loss just before we arrive here. */
	if (os.conn <= C_CONNECTED)
		goto out_unlock;

	ns = os;
	ns.conn = C_CONNECTED;

	drbd_info(device, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
	     verify_done ? "Online verify" : "Resync",
	     dt + device->rs_paused, device->rs_paused, dbdt);

	n_oos = drbd_bm_total_weight(device);

	if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) {
		if (n_oos) {
			drbd_alert(device, "Online verify found %lu %dk block out of sync!\n",
			      n_oos, Bit2KB(1));
			khelper_cmd = "out-of-sync";
		}
	} else {
		D_ASSERT(device, (n_oos - device->rs_failed) == 0);

		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
			khelper_cmd = "after-resync-target";

		if (device->use_csums && device->rs_total) {
			const unsigned long s = device->rs_same_csum;
			const unsigned long t = device->rs_total;
			/* percentage of blocks with equal checksums; for large
			 * totals divide first instead of multiplying s by 100.
			 * NOTE(review): ratio is an int but is printed with %u below. */
			const int ratio =
				(t == 0)     ? 0 :
			(t < 100000) ? ((s*100)/t) : (s/(t/100));
			drbd_info(device, "%u %% had equal checksums, eliminated: %luK; "
			     "transferred %luK total %luK\n",
			     ratio,
			     Bit2KB(device->rs_same_csum),
			     Bit2KB(device->rs_total - device->rs_same_csum),
			     Bit2KB(device->rs_total));
		}
	}

	if (device->rs_failed) {
		drbd_info(device, "            %lu failed blocks\n", device->rs_failed);

		/* with failed blocks, the side that was sync target stays inconsistent */
		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
			ns.disk = D_INCONSISTENT;
			ns.pdsk = D_UP_TO_DATE;
		} else {
			ns.disk = D_UP_TO_DATE;
			ns.pdsk = D_INCONSISTENT;
		}
	} else {
		ns.disk = D_UP_TO_DATE;
		ns.pdsk = D_UP_TO_DATE;

		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
			if (device->p_uuid) {
				/* take over the peer's UUIDs; our previous current
				 * UUID is stored in the bitmap slot on the way */
				int i;
				for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++)
					_drbd_uuid_set(device, i, device->p_uuid[i]);
				drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_CURRENT]);
				_drbd_uuid_set(device, UI_CURRENT, device->p_uuid[UI_CURRENT]);
			} else {
				drbd_err(device, "device->p_uuid is NULL! BUG\n");
			}
		}

		if (!(os.conn == C_VERIFY_S || os.conn == C_VERIFY_T)) {
			/* for verify runs, we don't update uuids here,
			 * so there would be nothing to report. */
			drbd_uuid_set_bm(device, 0UL);
			drbd_print_uuids(device, "updated UUIDs");
			if (device->p_uuid) {
				/* Now the two UUID sets are equal, update what we
				 * know of the peer. */
				int i;
				for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++)
					device->p_uuid[i] = device->ldev->md.uuid[i];
			}
		}
	}

	_drbd_set_state(device, ns, CS_VERBOSE, NULL);
out_unlock:
	spin_unlock_irq(&device->resource->req_lock);

	/* If we have been sync source, and have an effective fencing-policy,
	 * once *all* volumes are back in sync, call "unfence". */
	if (os.conn == C_SYNC_SOURCE) {
		enum drbd_disk_state disk_state = D_MASK;
		enum drbd_disk_state pdsk_state = D_MASK;
		enum drbd_fencing_p fp = FP_DONT_CARE;

		rcu_read_lock();
		fp = rcu_dereference(device->ldev->disk_conf)->fencing;
		if (fp != FP_DONT_CARE) {
			struct drbd_peer_device *peer_device;
			int vnr;
			/* find the minimum disk and pdsk state over all volumes
			 * of this connection */
			idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
				/* NOTE(review): this deliberately(?) shadows the
				 * function parameter "device" inside the loop. */
				struct drbd_device *device = peer_device->device;
				disk_state = min_t(enum drbd_disk_state, disk_state, device->state.disk);
				pdsk_state = min_t(enum drbd_disk_state, pdsk_state, device->state.pdsk);
			}
		}
		rcu_read_unlock();
		/* with FP_DONT_CARE both stay at D_MASK, so no helper is called */
		if (disk_state == D_UP_TO_DATE && pdsk_state == D_UP_TO_DATE)
			conn_khelper(connection, "unfence-peer");
	}

	put_ldev(device);
out:
	device->rs_total  = 0;
	device->rs_failed = 0;
	device->rs_paused = 0;

	/* reset start sector, if we reached end of device */
	if (verify_done && device->ov_left == 0)
		device->ov_start_sector = 0;

	drbd_md_sync(device);

	if (khelper_cmd)
		drbd_khelper(device, khelper_cmd);

	return 1;
}
1027 
1028 /* helper */
1029 static void move_to_net_ee_or_free(struct drbd_device *device, struct drbd_peer_request *peer_req)
1030 {
1031 	if (drbd_peer_req_has_active_page(peer_req)) {
1032 		/* This might happen if sendpage() has not finished */
1033 		int i = PFN_UP(peer_req->i.size);
1034 		atomic_add(i, &device->pp_in_use_by_net);
1035 		atomic_sub(i, &device->pp_in_use);
1036 		spin_lock_irq(&device->resource->req_lock);
1037 		list_add_tail(&peer_req->w.list, &device->net_ee);
1038 		spin_unlock_irq(&device->resource->req_lock);
1039 		wake_up(&drbd_pp_wait);
1040 	} else
1041 		drbd_free_peer_req(device, peer_req);
1042 }
1043 
1044 /**
1045  * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
1046  * @w:		work object.
1047  * @cancel:	The connection will be closed anyways
1048  */
1049 int w_e_end_data_req(struct drbd_work *w, int cancel)
1050 {
1051 	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
1052 	struct drbd_peer_device *peer_device = peer_req->peer_device;
1053 	struct drbd_device *device = peer_device->device;
1054 	int err;
1055 
1056 	if (unlikely(cancel)) {
1057 		drbd_free_peer_req(device, peer_req);
1058 		dec_unacked(device);
1059 		return 0;
1060 	}
1061 
1062 	if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
1063 		err = drbd_send_block(peer_device, P_DATA_REPLY, peer_req);
1064 	} else {
1065 		if (__ratelimit(&drbd_ratelimit_state))
1066 			drbd_err(device, "Sending NegDReply. sector=%llus.\n",
1067 			    (unsigned long long)peer_req->i.sector);
1068 
1069 		err = drbd_send_ack(peer_device, P_NEG_DREPLY, peer_req);
1070 	}
1071 
1072 	dec_unacked(device);
1073 
1074 	move_to_net_ee_or_free(device, peer_req);
1075 
1076 	if (unlikely(err))
1077 		drbd_err(device, "drbd_send_block() failed\n");
1078 	return err;
1079 }
1080 
1081 static bool all_zero(struct drbd_peer_request *peer_req)
1082 {
1083 	struct page *page = peer_req->pages;
1084 	unsigned int len = peer_req->i.size;
1085 
1086 	page_chain_for_each(page) {
1087 		unsigned int l = min_t(unsigned int, len, PAGE_SIZE);
1088 		unsigned int i, words = l / sizeof(long);
1089 		unsigned long *d;
1090 
1091 		d = kmap_atomic(page);
1092 		for (i = 0; i < words; i++) {
1093 			if (d[i]) {
1094 				kunmap_atomic(d);
1095 				return false;
1096 			}
1097 		}
1098 		kunmap_atomic(d);
1099 		len -= l;
1100 	}
1101 
1102 	return true;
1103 }
1104 
1105 /**
1106  * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST
1107  * @w:		work object.
1108  * @cancel:	The connection will be closed anyways
1109  */
1110 int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
1111 {
1112 	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
1113 	struct drbd_peer_device *peer_device = peer_req->peer_device;
1114 	struct drbd_device *device = peer_device->device;
1115 	int err;
1116 
1117 	if (unlikely(cancel)) {
1118 		drbd_free_peer_req(device, peer_req);
1119 		dec_unacked(device);
1120 		return 0;
1121 	}
1122 
1123 	if (get_ldev_if_state(device, D_FAILED)) {
1124 		drbd_rs_complete_io(device, peer_req->i.sector);
1125 		put_ldev(device);
1126 	}
1127 
1128 	if (device->state.conn == C_AHEAD) {
1129 		err = drbd_send_ack(peer_device, P_RS_CANCEL, peer_req);
1130 	} else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
1131 		if (likely(device->state.pdsk >= D_INCONSISTENT)) {
1132 			inc_rs_pending(device);
1133 			if (peer_req->flags & EE_RS_THIN_REQ && all_zero(peer_req))
1134 				err = drbd_send_rs_deallocated(peer_device, peer_req);
1135 			else
1136 				err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req);
1137 		} else {
1138 			if (__ratelimit(&drbd_ratelimit_state))
1139 				drbd_err(device, "Not sending RSDataReply, "
1140 				    "partner DISKLESS!\n");
1141 			err = 0;
1142 		}
1143 	} else {
1144 		if (__ratelimit(&drbd_ratelimit_state))
1145 			drbd_err(device, "Sending NegRSDReply. sector %llus.\n",
1146 			    (unsigned long long)peer_req->i.sector);
1147 
1148 		err = drbd_send_ack(peer_device, P_NEG_RS_DREPLY, peer_req);
1149 
1150 		/* update resync data with failure */
1151 		drbd_rs_failed_io(device, peer_req->i.sector, peer_req->i.size);
1152 	}
1153 
1154 	dec_unacked(device);
1155 
1156 	move_to_net_ee_or_free(device, peer_req);
1157 
1158 	if (unlikely(err))
1159 		drbd_err(device, "drbd_send_block() failed\n");
1160 	return err;
1161 }
1162 
1163 int w_e_end_csum_rs_req(struct drbd_work *w, int cancel)
1164 {
1165 	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
1166 	struct drbd_peer_device *peer_device = peer_req->peer_device;
1167 	struct drbd_device *device = peer_device->device;
1168 	struct digest_info *di;
1169 	int digest_size;
1170 	void *digest = NULL;
1171 	int err, eq = 0;
1172 
1173 	if (unlikely(cancel)) {
1174 		drbd_free_peer_req(device, peer_req);
1175 		dec_unacked(device);
1176 		return 0;
1177 	}
1178 
1179 	if (get_ldev(device)) {
1180 		drbd_rs_complete_io(device, peer_req->i.sector);
1181 		put_ldev(device);
1182 	}
1183 
1184 	di = peer_req->digest;
1185 
1186 	if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
1187 		/* quick hack to try to avoid a race against reconfiguration.
1188 		 * a real fix would be much more involved,
1189 		 * introducing more locking mechanisms */
1190 		if (peer_device->connection->csums_tfm) {
1191 			digest_size = crypto_shash_digestsize(peer_device->connection->csums_tfm);
1192 			D_ASSERT(device, digest_size == di->digest_size);
1193 			digest = kmalloc(digest_size, GFP_NOIO);
1194 		}
1195 		if (digest) {
1196 			drbd_csum_ee(peer_device->connection->csums_tfm, peer_req, digest);
1197 			eq = !memcmp(digest, di->digest, digest_size);
1198 			kfree(digest);
1199 		}
1200 
1201 		if (eq) {
1202 			drbd_set_in_sync(device, peer_req->i.sector, peer_req->i.size);
1203 			/* rs_same_csums unit is BM_BLOCK_SIZE */
1204 			device->rs_same_csum += peer_req->i.size >> BM_BLOCK_SHIFT;
1205 			err = drbd_send_ack(peer_device, P_RS_IS_IN_SYNC, peer_req);
1206 		} else {
1207 			inc_rs_pending(device);
1208 			peer_req->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */
1209 			peer_req->flags &= ~EE_HAS_DIGEST; /* This peer request no longer has a digest pointer */
1210 			kfree(di);
1211 			err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req);
1212 		}
1213 	} else {
1214 		err = drbd_send_ack(peer_device, P_NEG_RS_DREPLY, peer_req);
1215 		if (__ratelimit(&drbd_ratelimit_state))
1216 			drbd_err(device, "Sending NegDReply. I guess it gets messy.\n");
1217 	}
1218 
1219 	dec_unacked(device);
1220 	move_to_net_ee_or_free(device, peer_req);
1221 
1222 	if (unlikely(err))
1223 		drbd_err(device, "drbd_send_block/ack() failed\n");
1224 	return err;
1225 }
1226 
1227 int w_e_end_ov_req(struct drbd_work *w, int cancel)
1228 {
1229 	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
1230 	struct drbd_peer_device *peer_device = peer_req->peer_device;
1231 	struct drbd_device *device = peer_device->device;
1232 	sector_t sector = peer_req->i.sector;
1233 	unsigned int size = peer_req->i.size;
1234 	int digest_size;
1235 	void *digest;
1236 	int err = 0;
1237 
1238 	if (unlikely(cancel))
1239 		goto out;
1240 
1241 	digest_size = crypto_shash_digestsize(peer_device->connection->verify_tfm);
1242 	digest = kmalloc(digest_size, GFP_NOIO);
1243 	if (!digest) {
1244 		err = 1;	/* terminate the connection in case the allocation failed */
1245 		goto out;
1246 	}
1247 
1248 	if (likely(!(peer_req->flags & EE_WAS_ERROR)))
1249 		drbd_csum_ee(peer_device->connection->verify_tfm, peer_req, digest);
1250 	else
1251 		memset(digest, 0, digest_size);
1252 
1253 	/* Free e and pages before send.
1254 	 * In case we block on congestion, we could otherwise run into
1255 	 * some distributed deadlock, if the other side blocks on
1256 	 * congestion as well, because our receiver blocks in
1257 	 * drbd_alloc_pages due to pp_in_use > max_buffers. */
1258 	drbd_free_peer_req(device, peer_req);
1259 	peer_req = NULL;
1260 	inc_rs_pending(device);
1261 	err = drbd_send_drequest_csum(peer_device, sector, size, digest, digest_size, P_OV_REPLY);
1262 	if (err)
1263 		dec_rs_pending(device);
1264 	kfree(digest);
1265 
1266 out:
1267 	if (peer_req)
1268 		drbd_free_peer_req(device, peer_req);
1269 	dec_unacked(device);
1270 	return err;
1271 }
1272 
1273 void drbd_ov_out_of_sync_found(struct drbd_device *device, sector_t sector, int size)
1274 {
1275 	if (device->ov_last_oos_start + device->ov_last_oos_size == sector) {
1276 		device->ov_last_oos_size += size>>9;
1277 	} else {
1278 		device->ov_last_oos_start = sector;
1279 		device->ov_last_oos_size = size>>9;
1280 	}
1281 	drbd_set_out_of_sync(device, sector, size);
1282 }
1283 
1284 int w_e_end_ov_reply(struct drbd_work *w, int cancel)
1285 {
1286 	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
1287 	struct drbd_peer_device *peer_device = peer_req->peer_device;
1288 	struct drbd_device *device = peer_device->device;
1289 	struct digest_info *di;
1290 	void *digest;
1291 	sector_t sector = peer_req->i.sector;
1292 	unsigned int size = peer_req->i.size;
1293 	int digest_size;
1294 	int err, eq = 0;
1295 	bool stop_sector_reached = false;
1296 
1297 	if (unlikely(cancel)) {
1298 		drbd_free_peer_req(device, peer_req);
1299 		dec_unacked(device);
1300 		return 0;
1301 	}
1302 
1303 	/* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
1304 	 * the resync lru has been cleaned up already */
1305 	if (get_ldev(device)) {
1306 		drbd_rs_complete_io(device, peer_req->i.sector);
1307 		put_ldev(device);
1308 	}
1309 
1310 	di = peer_req->digest;
1311 
1312 	if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
1313 		digest_size = crypto_shash_digestsize(peer_device->connection->verify_tfm);
1314 		digest = kmalloc(digest_size, GFP_NOIO);
1315 		if (digest) {
1316 			drbd_csum_ee(peer_device->connection->verify_tfm, peer_req, digest);
1317 
1318 			D_ASSERT(device, digest_size == di->digest_size);
1319 			eq = !memcmp(digest, di->digest, digest_size);
1320 			kfree(digest);
1321 		}
1322 	}
1323 
1324 	/* Free peer_req and pages before send.
1325 	 * In case we block on congestion, we could otherwise run into
1326 	 * some distributed deadlock, if the other side blocks on
1327 	 * congestion as well, because our receiver blocks in
1328 	 * drbd_alloc_pages due to pp_in_use > max_buffers. */
1329 	drbd_free_peer_req(device, peer_req);
1330 	if (!eq)
1331 		drbd_ov_out_of_sync_found(device, sector, size);
1332 	else
1333 		ov_out_of_sync_print(device);
1334 
1335 	err = drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size,
1336 			       eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);
1337 
1338 	dec_unacked(device);
1339 
1340 	--device->ov_left;
1341 
1342 	/* let's advance progress step marks only for every other megabyte */
1343 	if ((device->ov_left & 0x200) == 0x200)
1344 		drbd_advance_rs_marks(device, device->ov_left);
1345 
1346 	stop_sector_reached = verify_can_do_stop_sector(device) &&
1347 		(sector + (size>>9)) >= device->ov_stop_sector;
1348 
1349 	if (device->ov_left == 0 || stop_sector_reached) {
1350 		ov_out_of_sync_print(device);
1351 		drbd_resync_finished(device);
1352 	}
1353 
1354 	return err;
1355 }
1356 
1357 /* FIXME
1358  * We need to track the number of pending barrier acks,
1359  * and to be able to wait for them.
1360  * See also comment in drbd_adm_attach before drbd_suspend_io.
1361  */
1362 static int drbd_send_barrier(struct drbd_connection *connection)
1363 {
1364 	struct p_barrier *p;
1365 	struct drbd_socket *sock;
1366 
1367 	sock = &connection->data;
1368 	p = conn_prepare_command(connection, sock);
1369 	if (!p)
1370 		return -EIO;
1371 	p->barrier = connection->send.current_epoch_nr;
1372 	p->pad = 0;
1373 	connection->send.current_epoch_writes = 0;
1374 	connection->send.last_sent_barrier_jif = jiffies;
1375 
1376 	return conn_send_command(connection, sock, P_BARRIER, sizeof(*p), NULL, 0);
1377 }
1378 
1379 static int pd_send_unplug_remote(struct drbd_peer_device *pd)
1380 {
1381 	struct drbd_socket *sock = &pd->connection->data;
1382 	if (!drbd_prepare_command(pd, sock))
1383 		return -EIO;
1384 	return drbd_send_command(pd, sock, P_UNPLUG_REMOTE, 0, NULL, 0);
1385 }
1386 
1387 int w_send_write_hint(struct drbd_work *w, int cancel)
1388 {
1389 	struct drbd_device *device =
1390 		container_of(w, struct drbd_device, unplug_work);
1391 
1392 	if (cancel)
1393 		return 0;
1394 	return pd_send_unplug_remote(first_peer_device(device));
1395 }
1396 
1397 static void re_init_if_first_write(struct drbd_connection *connection, unsigned int epoch)
1398 {
1399 	if (!connection->send.seen_any_write_yet) {
1400 		connection->send.seen_any_write_yet = true;
1401 		connection->send.current_epoch_nr = epoch;
1402 		connection->send.current_epoch_writes = 0;
1403 		connection->send.last_sent_barrier_jif = jiffies;
1404 	}
1405 }
1406 
1407 static void maybe_send_barrier(struct drbd_connection *connection, unsigned int epoch)
1408 {
1409 	/* re-init if first write on this connection */
1410 	if (!connection->send.seen_any_write_yet)
1411 		return;
1412 	if (connection->send.current_epoch_nr != epoch) {
1413 		if (connection->send.current_epoch_writes)
1414 			drbd_send_barrier(connection);
1415 		connection->send.current_epoch_nr = epoch;
1416 	}
1417 }
1418 
1419 int w_send_out_of_sync(struct drbd_work *w, int cancel)
1420 {
1421 	struct drbd_request *req = container_of(w, struct drbd_request, w);
1422 	struct drbd_device *device = req->device;
1423 	struct drbd_peer_device *const peer_device = first_peer_device(device);
1424 	struct drbd_connection *const connection = peer_device->connection;
1425 	int err;
1426 
1427 	if (unlikely(cancel)) {
1428 		req_mod(req, SEND_CANCELED);
1429 		return 0;
1430 	}
1431 	req->pre_send_jif = jiffies;
1432 
1433 	/* this time, no connection->send.current_epoch_writes++;
1434 	 * If it was sent, it was the closing barrier for the last
1435 	 * replicated epoch, before we went into AHEAD mode.
1436 	 * No more barriers will be sent, until we leave AHEAD mode again. */
1437 	maybe_send_barrier(connection, req->epoch);
1438 
1439 	err = drbd_send_out_of_sync(peer_device, req);
1440 	req_mod(req, OOS_HANDED_TO_NETWORK);
1441 
1442 	return err;
1443 }
1444 
1445 /**
1446  * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request
1447  * @w:		work object.
1448  * @cancel:	The connection will be closed anyways
1449  */
1450 int w_send_dblock(struct drbd_work *w, int cancel)
1451 {
1452 	struct drbd_request *req = container_of(w, struct drbd_request, w);
1453 	struct drbd_device *device = req->device;
1454 	struct drbd_peer_device *const peer_device = first_peer_device(device);
1455 	struct drbd_connection *connection = peer_device->connection;
1456 	bool do_send_unplug = req->rq_state & RQ_UNPLUG;
1457 	int err;
1458 
1459 	if (unlikely(cancel)) {
1460 		req_mod(req, SEND_CANCELED);
1461 		return 0;
1462 	}
1463 	req->pre_send_jif = jiffies;
1464 
1465 	re_init_if_first_write(connection, req->epoch);
1466 	maybe_send_barrier(connection, req->epoch);
1467 	connection->send.current_epoch_writes++;
1468 
1469 	err = drbd_send_dblock(peer_device, req);
1470 	req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
1471 
1472 	if (do_send_unplug && !err)
1473 		pd_send_unplug_remote(peer_device);
1474 
1475 	return err;
1476 }
1477 
1478 /**
1479  * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet
1480  * @w:		work object.
1481  * @cancel:	The connection will be closed anyways
1482  */
1483 int w_send_read_req(struct drbd_work *w, int cancel)
1484 {
1485 	struct drbd_request *req = container_of(w, struct drbd_request, w);
1486 	struct drbd_device *device = req->device;
1487 	struct drbd_peer_device *const peer_device = first_peer_device(device);
1488 	struct drbd_connection *connection = peer_device->connection;
1489 	bool do_send_unplug = req->rq_state & RQ_UNPLUG;
1490 	int err;
1491 
1492 	if (unlikely(cancel)) {
1493 		req_mod(req, SEND_CANCELED);
1494 		return 0;
1495 	}
1496 	req->pre_send_jif = jiffies;
1497 
1498 	/* Even read requests may close a write epoch,
1499 	 * if there was any yet. */
1500 	maybe_send_barrier(connection, req->epoch);
1501 
1502 	err = drbd_send_drequest(peer_device, P_DATA_REQUEST, req->i.sector, req->i.size,
1503 				 (unsigned long)req);
1504 
1505 	req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
1506 
1507 	if (do_send_unplug && !err)
1508 		pd_send_unplug_remote(peer_device);
1509 
1510 	return err;
1511 }
1512 
1513 int w_restart_disk_io(struct drbd_work *w, int cancel)
1514 {
1515 	struct drbd_request *req = container_of(w, struct drbd_request, w);
1516 	struct drbd_device *device = req->device;
1517 
1518 	if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
1519 		drbd_al_begin_io(device, &req->i);
1520 
1521 	req->private_bio = bio_alloc_clone(device->ldev->backing_bdev,
1522 					   req->master_bio, GFP_NOIO,
1523 					  &drbd_io_bio_set);
1524 	req->private_bio->bi_private = req;
1525 	req->private_bio->bi_end_io = drbd_request_endio;
1526 	submit_bio_noacct(req->private_bio);
1527 
1528 	return 0;
1529 }
1530 
1531 static int _drbd_may_sync_now(struct drbd_device *device)
1532 {
1533 	struct drbd_device *odev = device;
1534 	int resync_after;
1535 
1536 	while (1) {
1537 		if (!odev->ldev || odev->state.disk == D_DISKLESS)
1538 			return 1;
1539 		rcu_read_lock();
1540 		resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
1541 		rcu_read_unlock();
1542 		if (resync_after == -1)
1543 			return 1;
1544 		odev = minor_to_device(resync_after);
1545 		if (!odev)
1546 			return 1;
1547 		if ((odev->state.conn >= C_SYNC_SOURCE &&
1548 		     odev->state.conn <= C_PAUSED_SYNC_T) ||
1549 		    odev->state.aftr_isp || odev->state.peer_isp ||
1550 		    odev->state.user_isp)
1551 			return 0;
1552 	}
1553 }
1554 
1555 /**
1556  * drbd_pause_after() - Pause resync on all devices that may not resync now
1557  * @device:	DRBD device.
1558  *
1559  * Called from process context only (admin command and after_state_ch).
1560  */
1561 static bool drbd_pause_after(struct drbd_device *device)
1562 {
1563 	bool changed = false;
1564 	struct drbd_device *odev;
1565 	int i;
1566 
1567 	rcu_read_lock();
1568 	idr_for_each_entry(&drbd_devices, odev, i) {
1569 		if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
1570 			continue;
1571 		if (!_drbd_may_sync_now(odev) &&
1572 		    _drbd_set_state(_NS(odev, aftr_isp, 1),
1573 				    CS_HARD, NULL) != SS_NOTHING_TO_DO)
1574 			changed = true;
1575 	}
1576 	rcu_read_unlock();
1577 
1578 	return changed;
1579 }
1580 
1581 /**
1582  * drbd_resume_next() - Resume resync on all devices that may resync now
1583  * @device:	DRBD device.
1584  *
1585  * Called from process context only (admin command and worker).
1586  */
1587 static bool drbd_resume_next(struct drbd_device *device)
1588 {
1589 	bool changed = false;
1590 	struct drbd_device *odev;
1591 	int i;
1592 
1593 	rcu_read_lock();
1594 	idr_for_each_entry(&drbd_devices, odev, i) {
1595 		if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
1596 			continue;
1597 		if (odev->state.aftr_isp) {
1598 			if (_drbd_may_sync_now(odev) &&
1599 			    _drbd_set_state(_NS(odev, aftr_isp, 0),
1600 					    CS_HARD, NULL) != SS_NOTHING_TO_DO)
1601 				changed = true;
1602 		}
1603 	}
1604 	rcu_read_unlock();
1605 	return changed;
1606 }
1607 
/*
 * Locked wrapper: run drbd_resume_next() with all resources locked.
 */
void resume_next_sg(struct drbd_device *device)
{
	lock_all_resources();

	drbd_resume_next(device);

	unlock_all_resources();
}
1614 
/*
 * Locked wrapper: run drbd_pause_after() with all resources locked.
 */
void suspend_other_sg(struct drbd_device *device)
{
	lock_all_resources();

	drbd_pause_after(device);

	unlock_all_resources();
}
1621 
1622 /* caller must lock_all_resources() */
1623 enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor)
1624 {
1625 	struct drbd_device *odev;
1626 	int resync_after;
1627 
1628 	if (o_minor == -1)
1629 		return NO_ERROR;
1630 	if (o_minor < -1 || o_minor > MINORMASK)
1631 		return ERR_RESYNC_AFTER;
1632 
1633 	/* check for loops */
1634 	odev = minor_to_device(o_minor);
1635 	while (1) {
1636 		if (odev == device)
1637 			return ERR_RESYNC_AFTER_CYCLE;
1638 
1639 		/* You are free to depend on diskless, non-existing,
1640 		 * or not yet/no longer existing minors.
1641 		 * We only reject dependency loops.
1642 		 * We cannot follow the dependency chain beyond a detached or
1643 		 * missing minor.
1644 		 */
1645 		if (!odev || !odev->ldev || odev->state.disk == D_DISKLESS)
1646 			return NO_ERROR;
1647 
1648 		rcu_read_lock();
1649 		resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
1650 		rcu_read_unlock();
1651 		/* dependency chain ends here, no cycles. */
1652 		if (resync_after == -1)
1653 			return NO_ERROR;
1654 
1655 		/* follow the dependency chain */
1656 		odev = minor_to_device(resync_after);
1657 	}
1658 }
1659 
1660 /* caller must lock_all_resources() */
1661 void drbd_resync_after_changed(struct drbd_device *device)
1662 {
1663 	int changed;
1664 
1665 	do {
1666 		changed  = drbd_pause_after(device);
1667 		changed |= drbd_resume_next(device);
1668 	} while (changed);
1669 }
1670 
1671 void drbd_rs_controller_reset(struct drbd_device *device)
1672 {
1673 	struct gendisk *disk = device->ldev->backing_bdev->bd_disk;
1674 	struct fifo_buffer *plan;
1675 
1676 	atomic_set(&device->rs_sect_in, 0);
1677 	atomic_set(&device->rs_sect_ev, 0);
1678 	device->rs_in_flight = 0;
1679 	device->rs_last_events =
1680 		(int)part_stat_read_accum(disk->part0, sectors);
1681 
1682 	/* Updating the RCU protected object in place is necessary since
1683 	   this function gets called from atomic context.
1684 	   It is valid since all other updates also lead to an completely
1685 	   empty fifo */
1686 	rcu_read_lock();
1687 	plan = rcu_dereference(device->rs_plan_s);
1688 	plan->total = 0;
1689 	fifo_set(plan, 0);
1690 	rcu_read_unlock();
1691 }
1692 
1693 void start_resync_timer_fn(struct timer_list *t)
1694 {
1695 	struct drbd_device *device = from_timer(device, t, start_resync_timer);
1696 	drbd_device_post_work(device, RS_START);
1697 }
1698 
1699 static void do_start_resync(struct drbd_device *device)
1700 {
1701 	if (atomic_read(&device->unacked_cnt) || atomic_read(&device->rs_pending_cnt)) {
1702 		drbd_warn(device, "postponing start_resync ...\n");
1703 		device->start_resync_timer.expires = jiffies + HZ/10;
1704 		add_timer(&device->start_resync_timer);
1705 		return;
1706 	}
1707 
1708 	drbd_start_resync(device, C_SYNC_SOURCE);
1709 	clear_bit(AHEAD_TO_SYNC_SOURCE, &device->flags);
1710 }
1711 
1712 static bool use_checksum_based_resync(struct drbd_connection *connection, struct drbd_device *device)
1713 {
1714 	bool csums_after_crash_only;
1715 	rcu_read_lock();
1716 	csums_after_crash_only = rcu_dereference(connection->net_conf)->csums_after_crash_only;
1717 	rcu_read_unlock();
1718 	return connection->agreed_pro_version >= 89 &&		/* supported? */
1719 		connection->csums_tfm &&			/* configured? */
1720 		(csums_after_crash_only == false		/* use for each resync? */
1721 		 || test_bit(CRASHED_PRIMARY, &device->flags));	/* or only after Primary crash? */
1722 }
1723 
1724 /**
1725  * drbd_start_resync() - Start the resync process
1726  * @device:	DRBD device.
1727  * @side:	Either C_SYNC_SOURCE or C_SYNC_TARGET
1728  *
1729  * This function might bring you directly into one of the
1730  * C_PAUSED_SYNC_* states.
1731  */
1732 void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
1733 {
1734 	struct drbd_peer_device *peer_device = first_peer_device(device);
1735 	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
1736 	union drbd_state ns;
1737 	int r;
1738 
1739 	if (device->state.conn >= C_SYNC_SOURCE && device->state.conn < C_AHEAD) {
1740 		drbd_err(device, "Resync already running!\n");
1741 		return;
1742 	}
1743 
1744 	if (!connection) {
1745 		drbd_err(device, "No connection to peer, aborting!\n");
1746 		return;
1747 	}
1748 
1749 	if (!test_bit(B_RS_H_DONE, &device->flags)) {
1750 		if (side == C_SYNC_TARGET) {
1751 			/* Since application IO was locked out during C_WF_BITMAP_T and
1752 			   C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
1753 			   we check that we might make the data inconsistent. */
1754 			r = drbd_khelper(device, "before-resync-target");
1755 			r = (r >> 8) & 0xff;
1756 			if (r > 0) {
1757 				drbd_info(device, "before-resync-target handler returned %d, "
1758 					 "dropping connection.\n", r);
1759 				conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
1760 				return;
1761 			}
1762 		} else /* C_SYNC_SOURCE */ {
1763 			r = drbd_khelper(device, "before-resync-source");
1764 			r = (r >> 8) & 0xff;
1765 			if (r > 0) {
1766 				if (r == 3) {
1767 					drbd_info(device, "before-resync-source handler returned %d, "
1768 						 "ignoring. Old userland tools?", r);
1769 				} else {
1770 					drbd_info(device, "before-resync-source handler returned %d, "
1771 						 "dropping connection.\n", r);
1772 					conn_request_state(connection,
1773 							   NS(conn, C_DISCONNECTING), CS_HARD);
1774 					return;
1775 				}
1776 			}
1777 		}
1778 	}
1779 
1780 	if (current == connection->worker.task) {
1781 		/* The worker should not sleep waiting for state_mutex,
1782 		   that can take long */
1783 		if (!mutex_trylock(device->state_mutex)) {
1784 			set_bit(B_RS_H_DONE, &device->flags);
1785 			device->start_resync_timer.expires = jiffies + HZ/5;
1786 			add_timer(&device->start_resync_timer);
1787 			return;
1788 		}
1789 	} else {
1790 		mutex_lock(device->state_mutex);
1791 	}
1792 
1793 	lock_all_resources();
1794 	clear_bit(B_RS_H_DONE, &device->flags);
1795 	/* Did some connection breakage or IO error race with us? */
1796 	if (device->state.conn < C_CONNECTED
1797 	|| !get_ldev_if_state(device, D_NEGOTIATING)) {
1798 		unlock_all_resources();
1799 		goto out;
1800 	}
1801 
1802 	ns = drbd_read_state(device);
1803 
1804 	ns.aftr_isp = !_drbd_may_sync_now(device);
1805 
1806 	ns.conn = side;
1807 
1808 	if (side == C_SYNC_TARGET)
1809 		ns.disk = D_INCONSISTENT;
1810 	else /* side == C_SYNC_SOURCE */
1811 		ns.pdsk = D_INCONSISTENT;
1812 
1813 	r = _drbd_set_state(device, ns, CS_VERBOSE, NULL);
1814 	ns = drbd_read_state(device);
1815 
1816 	if (ns.conn < C_CONNECTED)
1817 		r = SS_UNKNOWN_ERROR;
1818 
1819 	if (r == SS_SUCCESS) {
1820 		unsigned long tw = drbd_bm_total_weight(device);
1821 		unsigned long now = jiffies;
1822 		int i;
1823 
1824 		device->rs_failed    = 0;
1825 		device->rs_paused    = 0;
1826 		device->rs_same_csum = 0;
1827 		device->rs_last_sect_ev = 0;
1828 		device->rs_total     = tw;
1829 		device->rs_start     = now;
1830 		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
1831 			device->rs_mark_left[i] = tw;
1832 			device->rs_mark_time[i] = now;
1833 		}
1834 		drbd_pause_after(device);
1835 		/* Forget potentially stale cached per resync extent bit-counts.
1836 		 * Open coded drbd_rs_cancel_all(device), we already have IRQs
1837 		 * disabled, and know the disk state is ok. */
1838 		spin_lock(&device->al_lock);
1839 		lc_reset(device->resync);
1840 		device->resync_locked = 0;
1841 		device->resync_wenr = LC_FREE;
1842 		spin_unlock(&device->al_lock);
1843 	}
1844 	unlock_all_resources();
1845 
1846 	if (r == SS_SUCCESS) {
1847 		wake_up(&device->al_wait); /* for lc_reset() above */
1848 		/* reset rs_last_bcast when a resync or verify is started,
1849 		 * to deal with potential jiffies wrap. */
1850 		device->rs_last_bcast = jiffies - HZ;
1851 
1852 		drbd_info(device, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
1853 		     drbd_conn_str(ns.conn),
1854 		     (unsigned long) device->rs_total << (BM_BLOCK_SHIFT-10),
1855 		     (unsigned long) device->rs_total);
1856 		if (side == C_SYNC_TARGET) {
1857 			device->bm_resync_fo = 0;
1858 			device->use_csums = use_checksum_based_resync(connection, device);
1859 		} else {
1860 			device->use_csums = false;
1861 		}
1862 
1863 		/* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid
1864 		 * with w_send_oos, or the sync target will get confused as to
1865 		 * how much bits to resync.  We cannot do that always, because for an
1866 		 * empty resync and protocol < 95, we need to do it here, as we call
1867 		 * drbd_resync_finished from here in that case.
1868 		 * We drbd_gen_and_send_sync_uuid here for protocol < 96,
1869 		 * and from after_state_ch otherwise. */
1870 		if (side == C_SYNC_SOURCE && connection->agreed_pro_version < 96)
1871 			drbd_gen_and_send_sync_uuid(peer_device);
1872 
1873 		if (connection->agreed_pro_version < 95 && device->rs_total == 0) {
1874 			/* This still has a race (about when exactly the peers
1875 			 * detect connection loss) that can lead to a full sync
1876 			 * on next handshake. In 8.3.9 we fixed this with explicit
1877 			 * resync-finished notifications, but the fix
1878 			 * introduces a protocol change.  Sleeping for some
1879 			 * time longer than the ping interval + timeout on the
1880 			 * SyncSource, to give the SyncTarget the chance to
1881 			 * detect connection loss, then waiting for a ping
1882 			 * response (implicit in drbd_resync_finished) reduces
1883 			 * the race considerably, but does not solve it. */
1884 			if (side == C_SYNC_SOURCE) {
1885 				struct net_conf *nc;
1886 				int timeo;
1887 
1888 				rcu_read_lock();
1889 				nc = rcu_dereference(connection->net_conf);
1890 				timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9;
1891 				rcu_read_unlock();
1892 				schedule_timeout_interruptible(timeo);
1893 			}
1894 			drbd_resync_finished(device);
1895 		}
1896 
1897 		drbd_rs_controller_reset(device);
1898 		/* ns.conn may already be != device->state.conn,
1899 		 * we may have been paused in between, or become paused until
1900 		 * the timer triggers.
1901 		 * No matter, that is handled in resync_timer_fn() */
1902 		if (ns.conn == C_SYNC_TARGET)
1903 			mod_timer(&device->resync_timer, jiffies);
1904 
1905 		drbd_md_sync(device);
1906 	}
1907 	put_ldev(device);
1908 out:
1909 	mutex_unlock(device->state_mutex);
1910 }
1911 
1912 static void update_on_disk_bitmap(struct drbd_device *device, bool resync_done)
1913 {
1914 	struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, };
1915 	device->rs_last_bcast = jiffies;
1916 
1917 	if (!get_ldev(device))
1918 		return;
1919 
1920 	drbd_bm_write_lazy(device, 0);
1921 	if (resync_done && is_sync_state(device->state.conn))
1922 		drbd_resync_finished(device);
1923 
1924 	drbd_bcast_event(device, &sib);
1925 	/* update timestamp, in case it took a while to write out stuff */
1926 	device->rs_last_bcast = jiffies;
1927 	put_ldev(device);
1928 }
1929 
1930 static void drbd_ldev_destroy(struct drbd_device *device)
1931 {
1932 	lc_destroy(device->resync);
1933 	device->resync = NULL;
1934 	lc_destroy(device->act_log);
1935 	device->act_log = NULL;
1936 
1937 	__acquire(local);
1938 	drbd_backing_dev_free(device, device->ldev);
1939 	device->ldev = NULL;
1940 	__release(local);
1941 
1942 	clear_bit(GOING_DISKLESS, &device->flags);
1943 	wake_up(&device->misc_wait);
1944 }
1945 
1946 static void go_diskless(struct drbd_device *device)
1947 {
1948 	D_ASSERT(device, device->state.disk == D_FAILED);
1949 	/* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
1950 	 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
1951 	 * the protected members anymore, though, so once put_ldev reaches zero
1952 	 * again, it will be safe to free them. */
1953 
1954 	/* Try to write changed bitmap pages, read errors may have just
1955 	 * set some bits outside the area covered by the activity log.
1956 	 *
1957 	 * If we have an IO error during the bitmap writeout,
1958 	 * we will want a full sync next time, just in case.
1959 	 * (Do we want a specific meta data flag for this?)
1960 	 *
1961 	 * If that does not make it to stable storage either,
1962 	 * we cannot do anything about that anymore.
1963 	 *
1964 	 * We still need to check if both bitmap and ldev are present, we may
1965 	 * end up here after a failed attach, before ldev was even assigned.
1966 	 */
1967 	if (device->bitmap && device->ldev) {
1968 		/* An interrupted resync or similar is allowed to recounts bits
1969 		 * while we detach.
1970 		 * Any modifications would not be expected anymore, though.
1971 		 */
1972 		if (drbd_bitmap_io_from_worker(device, drbd_bm_write,
1973 					"detach", BM_LOCKED_TEST_ALLOWED)) {
1974 			if (test_bit(WAS_READ_ERROR, &device->flags)) {
1975 				drbd_md_set_flag(device, MDF_FULL_SYNC);
1976 				drbd_md_sync(device);
1977 			}
1978 		}
1979 	}
1980 
1981 	drbd_force_state(device, NS(disk, D_DISKLESS));
1982 }
1983 
/*
 * Handler for the MD_SYNC device work bit (see do_device_work()).
 *
 * Logs a warning that the md_sync_timer expired and calls drbd_md_sync()
 * from the worker.  Always returns 0.
 */
static int do_md_sync(struct drbd_device *device)
{
	drbd_warn(device, "md_sync_timer expired! Worker calls drbd_md_sync().\n");

	drbd_md_sync(device);

	return 0;
}
1990 
1991 /* only called from drbd_worker thread, no locking */
1992 void __update_timing_details(
1993 		struct drbd_thread_timing_details *tdp,
1994 		unsigned int *cb_nr,
1995 		void *cb,
1996 		const char *fn, const unsigned int line)
1997 {
1998 	unsigned int i = *cb_nr % DRBD_THREAD_DETAILS_HIST;
1999 	struct drbd_thread_timing_details *td = tdp + i;
2000 
2001 	td->start_jif = jiffies;
2002 	td->cb_addr = cb;
2003 	td->caller_fn = fn;
2004 	td->line = line;
2005 	td->cb_nr = *cb_nr;
2006 
2007 	i = (i+1) % DRBD_THREAD_DETAILS_HIST;
2008 	td = tdp + i;
2009 	memset(td, 0, sizeof(*td));
2010 
2011 	++(*cb_nr);
2012 }
2013 
2014 static void do_device_work(struct drbd_device *device, const unsigned long todo)
2015 {
2016 	if (test_bit(MD_SYNC, &todo))
2017 		do_md_sync(device);
2018 	if (test_bit(RS_DONE, &todo) ||
2019 	    test_bit(RS_PROGRESS, &todo))
2020 		update_on_disk_bitmap(device, test_bit(RS_DONE, &todo));
2021 	if (test_bit(GO_DISKLESS, &todo))
2022 		go_diskless(device);
2023 	if (test_bit(DESTROY_DISK, &todo))
2024 		drbd_ldev_destroy(device);
2025 	if (test_bit(RS_START, &todo))
2026 		do_start_resync(device);
2027 }
2028 
/*
 * Device flag bits that denote unqueued work: get_work_bits() clears and
 * returns them, do_device_work() dispatches on them.
 */
#define DRBD_DEVICE_WORK_MASK		\
	((1UL << GO_DISKLESS) |		\
	 (1UL << DESTROY_DISK) |	\
	 (1UL << MD_SYNC) |		\
	 (1UL << RS_START) |		\
	 (1UL << RS_PROGRESS) |		\
	 (1UL << RS_DONE))
2037 
2038 static unsigned long get_work_bits(unsigned long *flags)
2039 {
2040 	unsigned long old, new;
2041 	do {
2042 		old = *flags;
2043 		new = old & ~DRBD_DEVICE_WORK_MASK;
2044 	} while (cmpxchg(flags, old, new) != old);
2045 	return old & DRBD_DEVICE_WORK_MASK;
2046 }
2047 
2048 static void do_unqueued_work(struct drbd_connection *connection)
2049 {
2050 	struct drbd_peer_device *peer_device;
2051 	int vnr;
2052 
2053 	rcu_read_lock();
2054 	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
2055 		struct drbd_device *device = peer_device->device;
2056 		unsigned long todo = get_work_bits(&device->flags);
2057 		if (!todo)
2058 			continue;
2059 
2060 		kref_get(&device->kref);
2061 		rcu_read_unlock();
2062 		do_device_work(device, todo);
2063 		kref_put(&device->kref, drbd_destroy_device);
2064 		rcu_read_lock();
2065 	}
2066 	rcu_read_unlock();
2067 }
2068 
2069 static bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list)
2070 {
2071 	spin_lock_irq(&queue->q_lock);
2072 	list_splice_tail_init(&queue->q, work_list);
2073 	spin_unlock_irq(&queue->q_lock);
2074 	return !list_empty(work_list);
2075 }
2076 
2077 static void wait_for_work(struct drbd_connection *connection, struct list_head *work_list)
2078 {
2079 	DEFINE_WAIT(wait);
2080 	struct net_conf *nc;
2081 	int uncork, cork;
2082 
2083 	dequeue_work_batch(&connection->sender_work, work_list);
2084 	if (!list_empty(work_list))
2085 		return;
2086 
2087 	/* Still nothing to do?
2088 	 * Maybe we still need to close the current epoch,
2089 	 * even if no new requests are queued yet.
2090 	 *
2091 	 * Also, poke TCP, just in case.
2092 	 * Then wait for new work (or signal). */
2093 	rcu_read_lock();
2094 	nc = rcu_dereference(connection->net_conf);
2095 	uncork = nc ? nc->tcp_cork : 0;
2096 	rcu_read_unlock();
2097 	if (uncork) {
2098 		mutex_lock(&connection->data.mutex);
2099 		if (connection->data.socket)
2100 			tcp_sock_set_cork(connection->data.socket->sk, false);
2101 		mutex_unlock(&connection->data.mutex);
2102 	}
2103 
2104 	for (;;) {
2105 		int send_barrier;
2106 		prepare_to_wait(&connection->sender_work.q_wait, &wait, TASK_INTERRUPTIBLE);
2107 		spin_lock_irq(&connection->resource->req_lock);
2108 		spin_lock(&connection->sender_work.q_lock);	/* FIXME get rid of this one? */
2109 		if (!list_empty(&connection->sender_work.q))
2110 			list_splice_tail_init(&connection->sender_work.q, work_list);
2111 		spin_unlock(&connection->sender_work.q_lock);	/* FIXME get rid of this one? */
2112 		if (!list_empty(work_list) || signal_pending(current)) {
2113 			spin_unlock_irq(&connection->resource->req_lock);
2114 			break;
2115 		}
2116 
2117 		/* We found nothing new to do, no to-be-communicated request,
2118 		 * no other work item.  We may still need to close the last
2119 		 * epoch.  Next incoming request epoch will be connection ->
2120 		 * current transfer log epoch number.  If that is different
2121 		 * from the epoch of the last request we communicated, it is
2122 		 * safe to send the epoch separating barrier now.
2123 		 */
2124 		send_barrier =
2125 			atomic_read(&connection->current_tle_nr) !=
2126 			connection->send.current_epoch_nr;
2127 		spin_unlock_irq(&connection->resource->req_lock);
2128 
2129 		if (send_barrier)
2130 			maybe_send_barrier(connection,
2131 					connection->send.current_epoch_nr + 1);
2132 
2133 		if (test_bit(DEVICE_WORK_PENDING, &connection->flags))
2134 			break;
2135 
2136 		/* drbd_send() may have called flush_signals() */
2137 		if (get_t_state(&connection->worker) != RUNNING)
2138 			break;
2139 
2140 		schedule();
2141 		/* may be woken up for other things but new work, too,
2142 		 * e.g. if the current epoch got closed.
2143 		 * In which case we send the barrier above. */
2144 	}
2145 	finish_wait(&connection->sender_work.q_wait, &wait);
2146 
2147 	/* someone may have changed the config while we have been waiting above. */
2148 	rcu_read_lock();
2149 	nc = rcu_dereference(connection->net_conf);
2150 	cork = nc ? nc->tcp_cork : 0;
2151 	rcu_read_unlock();
2152 	mutex_lock(&connection->data.mutex);
2153 	if (connection->data.socket) {
2154 		if (cork)
2155 			tcp_sock_set_cork(connection->data.socket->sk, true);
2156 		else if (!uncork)
2157 			tcp_sock_set_cork(connection->data.socket->sk, false);
2158 	}
2159 	mutex_unlock(&connection->data.mutex);
2160 }
2161 
2162 int drbd_worker(struct drbd_thread *thi)
2163 {
2164 	struct drbd_connection *connection = thi->connection;
2165 	struct drbd_work *w = NULL;
2166 	struct drbd_peer_device *peer_device;
2167 	LIST_HEAD(work_list);
2168 	int vnr;
2169 
2170 	while (get_t_state(thi) == RUNNING) {
2171 		drbd_thread_current_set_cpu(thi);
2172 
2173 		if (list_empty(&work_list)) {
2174 			update_worker_timing_details(connection, wait_for_work);
2175 			wait_for_work(connection, &work_list);
2176 		}
2177 
2178 		if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags)) {
2179 			update_worker_timing_details(connection, do_unqueued_work);
2180 			do_unqueued_work(connection);
2181 		}
2182 
2183 		if (signal_pending(current)) {
2184 			flush_signals(current);
2185 			if (get_t_state(thi) == RUNNING) {
2186 				drbd_warn(connection, "Worker got an unexpected signal\n");
2187 				continue;
2188 			}
2189 			break;
2190 		}
2191 
2192 		if (get_t_state(thi) != RUNNING)
2193 			break;
2194 
2195 		if (!list_empty(&work_list)) {
2196 			w = list_first_entry(&work_list, struct drbd_work, list);
2197 			list_del_init(&w->list);
2198 			update_worker_timing_details(connection, w->cb);
2199 			if (w->cb(w, connection->cstate < C_WF_REPORT_PARAMS) == 0)
2200 				continue;
2201 			if (connection->cstate >= C_WF_REPORT_PARAMS)
2202 				conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
2203 		}
2204 	}
2205 
2206 	do {
2207 		if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags)) {
2208 			update_worker_timing_details(connection, do_unqueued_work);
2209 			do_unqueued_work(connection);
2210 		}
2211 		if (!list_empty(&work_list)) {
2212 			w = list_first_entry(&work_list, struct drbd_work, list);
2213 			list_del_init(&w->list);
2214 			update_worker_timing_details(connection, w->cb);
2215 			w->cb(w, 1);
2216 		} else
2217 			dequeue_work_batch(&connection->sender_work, &work_list);
2218 	} while (!list_empty(&work_list) || test_bit(DEVICE_WORK_PENDING, &connection->flags));
2219 
2220 	rcu_read_lock();
2221 	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
2222 		struct drbd_device *device = peer_device->device;
2223 		D_ASSERT(device, device->state.disk == D_DISKLESS && device->state.conn == C_STANDALONE);
2224 		kref_get(&device->kref);
2225 		rcu_read_unlock();
2226 		drbd_device_cleanup(device);
2227 		kref_put(&device->kref, drbd_destroy_device);
2228 		rcu_read_lock();
2229 	}
2230 	rcu_read_unlock();
2231 
2232 	return 0;
2233 }
2234