1 /*
2    drbd_worker.c
3 
4    This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5 
6    Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7    Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8    Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9 
10    drbd is free software; you can redistribute it and/or modify
11    it under the terms of the GNU General Public License as published by
12    the Free Software Foundation; either version 2, or (at your option)
13    any later version.
14 
15    drbd is distributed in the hope that it will be useful,
16    but WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18    GNU General Public License for more details.
19 
20    You should have received a copy of the GNU General Public License
21    along with drbd; see the file COPYING.  If not, write to
22    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23 
24 */
25 
26 #include <linux/module.h>
27 #include <linux/drbd.h>
28 #include <linux/sched.h>
29 #include <linux/wait.h>
30 #include <linux/mm.h>
31 #include <linux/memcontrol.h>
32 #include <linux/mm_inline.h>
33 #include <linux/slab.h>
34 #include <linux/random.h>
35 #include <linux/string.h>
36 #include <linux/scatterlist.h>
37 
38 #include "drbd_int.h"
39 #include "drbd_protocol.h"
40 #include "drbd_req.h"
41 
42 static int make_ov_request(struct drbd_device *, int);
43 static int make_resync_request(struct drbd_device *, int);
44 
45 /* endio handlers:
46  *   drbd_md_endio (defined here)
47  *   drbd_request_endio (defined here)
48  *   drbd_peer_request_endio (defined here)
49  *   drbd_bm_endio (defined in drbd_bitmap.c)
50  *
51  * For all these callbacks, note the following:
52  * The callbacks will be called in irq context by the IDE drivers,
53  * and in Softirqs/Tasklets/BH context by the SCSI drivers.
54  * Try to get the locking right :)
55  *
56  */
57 
58 
59 /* About the global_state_lock
60    Each state transition on a device holds a read lock. In case we have
61    to evaluate the resync after dependencies, we grab a write lock, because
62    we need stable states on all devices for that.  */
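/* For example: drbd_start_resync(), resume_next_sg() and suspend_other_sg()
   below take the write lock while (re)evaluating resync-after dependencies;
   individual device state transitions only need the read lock. */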
63 rwlock_t global_state_lock;
64 
65 /* used for synchronous meta data and bitmap IO
66  * submitted by drbd_md_sync_page_io()
67  */
68 void drbd_md_endio(struct bio *bio, int error)
69 {
70 	struct drbd_device *device;
71 
72 	device = bio->bi_private;
73 	device->md_io.error = error;
74 
75 	/* We grabbed an extra reference in _drbd_md_sync_page_io() to be able
76 	 * to timeout on the lower level device, and eventually detach from it.
77 	 * If this io completion runs after that timeout expired, this
78 	 * drbd_md_put_buffer() may allow us to finally try and re-attach.
79 	 * During normal operation, this only puts that extra reference
80 	 * down to 1 again.
81 	 * Make sure we first drop the reference, and only then signal
82 	 * completion, or we may (in drbd_al_read_log()) cycle so fast into the
83 	 * next drbd_md_sync_page_io(), that we trigger the
84 	 * ASSERT(atomic_read(&device->md_io_in_use) == 1) there.
85 	 */
86 	drbd_md_put_buffer(device);
87 	device->md_io.done = 1;
88 	wake_up(&device->misc_wait);
89 	bio_put(bio);
90 	if (device->ldev) /* special case: drbd_md_read() during drbd_adm_attach() */
91 		put_ldev(device);
92 }
93 
94 /* reads on behalf of the partner,
95  * "submitted" by the receiver
96  */
97 static void drbd_endio_read_sec_final(struct drbd_peer_request *peer_req) __releases(local)
98 {
99 	unsigned long flags = 0;
100 	struct drbd_peer_device *peer_device = peer_req->peer_device;
101 	struct drbd_device *device = peer_device->device;
102 
103 	spin_lock_irqsave(&device->resource->req_lock, flags);
104 	device->read_cnt += peer_req->i.size >> 9;
105 	list_del(&peer_req->w.list);
106 	if (list_empty(&device->read_ee))
107 		wake_up(&device->ee_wait);
108 	if (test_bit(__EE_WAS_ERROR, &peer_req->flags))
109 		__drbd_chk_io_error(device, DRBD_READ_ERROR);
110 	spin_unlock_irqrestore(&device->resource->req_lock, flags);
111 
112 	drbd_queue_work(&peer_device->connection->sender_work, &peer_req->w);
113 	put_ldev(device);
114 }
115 
116 /* writes on behalf of the partner, or resync writes,
117  * "submitted" by the receiver, final stage.  */
118 void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local)
119 {
120 	unsigned long flags = 0;
121 	struct drbd_peer_device *peer_device = peer_req->peer_device;
122 	struct drbd_device *device = peer_device->device;
123 	struct drbd_interval i;
124 	int do_wake;
125 	u64 block_id;
126 	int do_al_complete_io;
127 
128 	/* after we moved peer_req to done_ee,
129 	 * we may no longer access it,
130 	 * it may be freed/reused already!
131 	 * (as soon as we release the req_lock) */
132 	i = peer_req->i;
133 	do_al_complete_io = peer_req->flags & EE_CALL_AL_COMPLETE_IO;
134 	block_id = peer_req->block_id;
135 	peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
136 
137 	spin_lock_irqsave(&device->resource->req_lock, flags);
138 	device->writ_cnt += peer_req->i.size >> 9;
139 	list_move_tail(&peer_req->w.list, &device->done_ee);
140 
141 	/*
142 	 * Do not remove from the write_requests tree here: we did not send the
143 	 * Ack yet and did not wake possibly waiting conflicting requests.
144 	 * It is removed from the tree in "drbd_process_done_ee", within the
145 	 * appropriate dw.cb (e_end_block/e_end_resync_block), or in
146 	 * _drbd_clear_done_ee.
147 	 */
148 
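	/* Resync writes (block_id == ID_SYNCER) were accounted on sync_ee,
	 * application writes from the peer on active_ee; wake ee_wait only
	 * if the list this request came from is empty now. */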
149 	do_wake = list_empty(block_id == ID_SYNCER ? &device->sync_ee : &device->active_ee);
150 
151 	/* FIXME do we want to detach for failed REQ_DISCARD?
152 	 * ((peer_req->flags & (EE_WAS_ERROR|EE_IS_TRIM)) == EE_WAS_ERROR) */
153 	if (peer_req->flags & EE_WAS_ERROR)
154 		__drbd_chk_io_error(device, DRBD_WRITE_ERROR);
155 	spin_unlock_irqrestore(&device->resource->req_lock, flags);
156 
157 	if (block_id == ID_SYNCER)
158 		drbd_rs_complete_io(device, i.sector);
159 
160 	if (do_wake)
161 		wake_up(&device->ee_wait);
162 
163 	if (do_al_complete_io)
164 		drbd_al_complete_io(device, &i);
165 
166 	wake_asender(peer_device->connection);
167 	put_ldev(device);
168 }
169 
170 /* writes on behalf of the partner, or resync writes,
171  * "submitted" by the receiver.
172  */
173 void drbd_peer_request_endio(struct bio *bio, int error)
174 {
175 	struct drbd_peer_request *peer_req = bio->bi_private;
176 	struct drbd_device *device = peer_req->peer_device->device;
177 	int uptodate = bio_flagged(bio, BIO_UPTODATE);
178 	int is_write = bio_data_dir(bio) == WRITE;
179 	int is_discard = !!(bio->bi_rw & REQ_DISCARD);
180 
181 	if (error && __ratelimit(&drbd_ratelimit_state))
182 		drbd_warn(device, "%s: error=%d s=%llus\n",
183 				is_write ? (is_discard ? "discard" : "write")
184 					: "read", error,
185 				(unsigned long long)peer_req->i.sector);
186 	if (!error && !uptodate) {
187 		if (__ratelimit(&drbd_ratelimit_state))
188 			drbd_warn(device, "%s: setting error to -EIO s=%llus\n",
189 					is_write ? "write" : "read",
190 					(unsigned long long)peer_req->i.sector);
191 		/* strange behavior of some lower level drivers...
192 		 * fail the request by clearing the uptodate flag,
193 		 * but do not return any error?! */
194 		error = -EIO;
195 	}
196 
197 	if (error)
198 		set_bit(__EE_WAS_ERROR, &peer_req->flags);
199 
200 	bio_put(bio); /* no need for the bio anymore */
201 	if (atomic_dec_and_test(&peer_req->pending_bios)) {
202 		if (is_write)
203 			drbd_endio_write_sec_final(peer_req);
204 		else
205 			drbd_endio_read_sec_final(peer_req);
206 	}
207 }
208 
209 /* read, readA or write requests on R_PRIMARY coming from drbd_make_request
210  */
211 void drbd_request_endio(struct bio *bio, int error)
212 {
213 	unsigned long flags;
214 	struct drbd_request *req = bio->bi_private;
215 	struct drbd_device *device = req->device;
216 	struct bio_and_error m;
217 	enum drbd_req_event what;
218 	int uptodate = bio_flagged(bio, BIO_UPTODATE);
219 
220 	if (!error && !uptodate) {
221 		drbd_warn(device, "p %s: setting error to -EIO\n",
222 			 bio_data_dir(bio) == WRITE ? "write" : "read");
223 		/* strange behavior of some lower level drivers...
224 		 * fail the request by clearing the uptodate flag,
225 		 * but do not return any error?! */
226 		error = -EIO;
227 	}
228 
229 
230 	/* If this request was aborted locally before,
231 	 * but now was completed "successfully",
232 	 * chances are that this caused arbitrary data corruption.
233 	 *
234 	 * "aborting" requests, or force-detaching the disk, is intended for
235 	 * completely blocked/hung local backing devices which do no longer
236 	 * complete requests at all, not even do error completions.  In this
237 	 * situation, usually a hard-reset and failover is the only way out.
238 	 *
239 	 * By "aborting", basically faking a local error-completion,
240 	 * we allow for a more graceful switchover by cleanly migrating services.
241 	 * Still the affected node has to be rebooted "soon".
242 	 *
243 	 * By completing these requests, we allow the upper layers to re-use
244 	 * the associated data pages.
245 	 *
246 	 * If later the local backing device "recovers", and now DMAs some data
247 	 * from disk into the original request pages, in the best case it will
248 	 * just put random data into unused pages; but typically it will corrupt
249 	 * meanwhile completely unrelated data, causing all sorts of damage.
250 	 *
251 	 * Which means delayed successful completion,
252 	 * especially for READ requests,
253 	 * is a reason to panic().
254 	 *
255 	 * We assume that a delayed *error* completion is OK,
256 	 * though we still will complain noisily about it.
257 	 */
258 	if (unlikely(req->rq_state & RQ_LOCAL_ABORTED)) {
259 		if (__ratelimit(&drbd_ratelimit_state))
260 			drbd_emerg(device, "delayed completion of aborted local request; disk-timeout may be too aggressive\n");
261 
262 		if (!error)
263 			panic("possible random memory corruption caused by delayed completion of aborted local request\n");
264 	}
265 
266 	/* to avoid recursion in __req_mod */
267 	if (unlikely(error)) {
268 		if (bio->bi_rw & REQ_DISCARD)
269 			what = (error == -EOPNOTSUPP)
270 				? DISCARD_COMPLETED_NOTSUPP
271 				: DISCARD_COMPLETED_WITH_ERROR;
272 		else
273 			what = (bio_data_dir(bio) == WRITE)
274 			? WRITE_COMPLETED_WITH_ERROR
275 			: (bio_rw(bio) == READ)
276 			  ? READ_COMPLETED_WITH_ERROR
277 			  : READ_AHEAD_COMPLETED_WITH_ERROR;
278 	} else
279 		what = COMPLETED_OK;
280 
281 	bio_put(req->private_bio);
282 	req->private_bio = ERR_PTR(error);
283 
284 	/* not req_mod(), we need irqsave here! */
285 	spin_lock_irqsave(&device->resource->req_lock, flags);
286 	__req_mod(req, what, &m);
287 	spin_unlock_irqrestore(&device->resource->req_lock, flags);
288 	put_ldev(device);
289 
290 	if (m.bio)
291 		complete_master_bio(device, &m);
292 }
293 
294 void drbd_csum_ee(struct crypto_hash *tfm, struct drbd_peer_request *peer_req, void *digest)
295 {
296 	struct hash_desc desc;
297 	struct scatterlist sg;
298 	struct page *page = peer_req->pages;
299 	struct page *tmp;
300 	unsigned len;
301 
302 	desc.tfm = tfm;
303 	desc.flags = 0;
304 
305 	sg_init_table(&sg, 1);
306 	crypto_hash_init(&desc);
307 
308 	while ((tmp = page_chain_next(page))) {
309 		/* all but the last page will be fully used */
310 		sg_set_page(&sg, page, PAGE_SIZE, 0);
311 		crypto_hash_update(&desc, &sg, sg.length);
312 		page = tmp;
313 	}
314 	/* and now the last, possibly only partially used page */
315 	len = peer_req->i.size & (PAGE_SIZE - 1);
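	/* Worked example (assuming PAGE_SIZE == 4096): i.size == 9216 gives
	 * len == 1024, so only the first 1024 bytes of the last page get
	 * hashed; if i.size is a multiple of PAGE_SIZE, len is 0 and the
	 * "len ?: PAGE_SIZE" below hashes the whole last page. */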
316 	sg_set_page(&sg, page, len ?: PAGE_SIZE, 0);
317 	crypto_hash_update(&desc, &sg, sg.length);
318 	crypto_hash_final(&desc, digest);
319 }
320 
321 void drbd_csum_bio(struct crypto_hash *tfm, struct bio *bio, void *digest)
322 {
323 	struct hash_desc desc;
324 	struct scatterlist sg;
325 	struct bio_vec bvec;
326 	struct bvec_iter iter;
327 
328 	desc.tfm = tfm;
329 	desc.flags = 0;
330 
331 	sg_init_table(&sg, 1);
332 	crypto_hash_init(&desc);
333 
334 	bio_for_each_segment(bvec, bio, iter) {
335 		sg_set_page(&sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset);
336 		crypto_hash_update(&desc, &sg, sg.length);
337 	}
338 	crypto_hash_final(&desc, digest);
339 }
340 
341 /* MAYBE merge common code with w_e_end_ov_req */
342 static int w_e_send_csum(struct drbd_work *w, int cancel)
343 {
344 	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
345 	struct drbd_peer_device *peer_device = peer_req->peer_device;
346 	struct drbd_device *device = peer_device->device;
347 	int digest_size;
348 	void *digest;
349 	int err = 0;
350 
351 	if (unlikely(cancel))
352 		goto out;
353 
354 	if (unlikely((peer_req->flags & EE_WAS_ERROR) != 0))
355 		goto out;
356 
357 	digest_size = crypto_hash_digestsize(peer_device->connection->csums_tfm);
358 	digest = kmalloc(digest_size, GFP_NOIO);
359 	if (digest) {
360 		sector_t sector = peer_req->i.sector;
361 		unsigned int size = peer_req->i.size;
362 		drbd_csum_ee(peer_device->connection->csums_tfm, peer_req, digest);
363 		/* Free peer_req and pages before send.
364 		 * In case we block on congestion, we could otherwise run into
365 		 * some distributed deadlock, if the other side blocks on
366 		 * congestion as well, because our receiver blocks in
367 		 * drbd_alloc_pages due to pp_in_use > max_buffers. */
368 		drbd_free_peer_req(device, peer_req);
369 		peer_req = NULL;
370 		inc_rs_pending(device);
371 		err = drbd_send_drequest_csum(peer_device, sector, size,
372 					      digest, digest_size,
373 					      P_CSUM_RS_REQUEST);
374 		kfree(digest);
375 	} else {
376 		drbd_err(device, "kmalloc() of digest failed.\n");
377 		err = -ENOMEM;
378 	}
379 
380 out:
381 	if (peer_req)
382 		drbd_free_peer_req(device, peer_req);
383 
384 	if (unlikely(err))
385 		drbd_err(device, "drbd_send_drequest(..., csum) failed\n");
386 	return err;
387 }
388 
389 #define GFP_TRY	(__GFP_HIGHMEM | __GFP_NOWARN)
390 
391 static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector, int size)
392 {
393 	struct drbd_device *device = peer_device->device;
394 	struct drbd_peer_request *peer_req;
395 
396 	if (!get_ldev(device))
397 		return -EIO;
398 
399 	/* GFP_TRY, because if there is no memory available right now, this may
400 	 * be rescheduled for later. It is "only" background resync, after all. */
401 	peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector,
402 				       size, true /* has real payload */, GFP_TRY);
403 	if (!peer_req)
404 		goto defer;
405 
406 	peer_req->w.cb = w_e_send_csum;
407 	spin_lock_irq(&device->resource->req_lock);
408 	list_add_tail(&peer_req->w.list, &device->read_ee);
409 	spin_unlock_irq(&device->resource->req_lock);
410 
411 	atomic_add(size >> 9, &device->rs_sect_ev);
412 	if (drbd_submit_peer_request(device, peer_req, READ, DRBD_FAULT_RS_RD) == 0)
413 		return 0;
414 
415 	/* If it failed because of ENOMEM, retry should help.  If it failed
416 	 * because bio_add_page failed (probably broken lower level driver),
417 	 * retry may or may not help.
418 	 * If it does not, you may need to force disconnect. */
419 	spin_lock_irq(&device->resource->req_lock);
420 	list_del(&peer_req->w.list);
421 	spin_unlock_irq(&device->resource->req_lock);
422 
423 	drbd_free_peer_req(device, peer_req);
424 defer:
425 	put_ldev(device);
426 	return -EAGAIN;
427 }
428 
429 int w_resync_timer(struct drbd_work *w, int cancel)
430 {
431 	struct drbd_device *device =
432 		container_of(w, struct drbd_device, resync_work);
433 
434 	switch (device->state.conn) {
435 	case C_VERIFY_S:
436 		make_ov_request(device, cancel);
437 		break;
438 	case C_SYNC_TARGET:
439 		make_resync_request(device, cancel);
440 		break;
441 	}
442 
443 	return 0;
444 }
445 
446 void resync_timer_fn(unsigned long data)
447 {
448 	struct drbd_device *device = (struct drbd_device *) data;
449 
450 	drbd_queue_work_if_unqueued(
451 		&first_peer_device(device)->connection->sender_work,
452 		&device->resync_work);
453 }
454 
455 static void fifo_set(struct fifo_buffer *fb, int value)
456 {
457 	int i;
458 
459 	for (i = 0; i < fb->size; i++)
460 		fb->values[i] = value;
461 }
462 
463 static int fifo_push(struct fifo_buffer *fb, int value)
464 {
465 	int ov;
466 
467 	ov = fb->values[fb->head_index];
468 	fb->values[fb->head_index++] = value;
469 
470 	if (fb->head_index >= fb->size)
471 		fb->head_index = 0;
472 
473 	return ov;
474 }
475 
476 static void fifo_add_val(struct fifo_buffer *fb, int value)
477 {
478 	int i;
479 
480 	for (i = 0; i < fb->size; i++)
481 		fb->values[i] += value;
482 }
483 
484 struct fifo_buffer *fifo_alloc(int fifo_size)
485 {
486 	struct fifo_buffer *fb;
487 
488 	fb = kzalloc(sizeof(struct fifo_buffer) + sizeof(int) * fifo_size, GFP_NOIO);
489 	if (!fb)
490 		return NULL;
491 
492 	fb->head_index = 0;
493 	fb->size = fifo_size;
494 	fb->total = 0;
495 
496 	return fb;
497 }
498 
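/* Resync speed controller: once per SLEEP_TIME interval it decides how many
 * sectors to request so that roughly "want" sectors stay in flight: either
 * the configured c-fill-target, or an amount derived from c-delay-target and
 * the sectors that came back during the last interval.  The difference
 * between "want" and what is in flight or already planned is spread evenly
 * over the c-plan-ahead steps of the plan fifo.  Illustrative numbers only:
 * with steps == 20 and a correction of 200 sectors, cps == 10 is added to
 * every planned step; this interval's request becomes sect_in plus the value
 * popped from the fifo, forced non-negative and capped via c-max-rate. */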
499 static int drbd_rs_controller(struct drbd_device *device, unsigned int sect_in)
500 {
501 	struct disk_conf *dc;
502 	unsigned int want;     /* The number of sectors we want in-flight */
503 	int req_sect; /* Number of sectors to request in this turn */
504 	int correction; /* Number of sectors more we need in-flight */
505 	int cps; /* correction per invocation of drbd_rs_controller() */
506 	int steps; /* Number of time steps to plan ahead */
507 	int curr_corr;
508 	int max_sect;
509 	struct fifo_buffer *plan;
510 
511 	dc = rcu_dereference(device->ldev->disk_conf);
512 	plan = rcu_dereference(device->rs_plan_s);
513 
514 	steps = plan->size; /* (dc->c_plan_ahead * 10 * SLEEP_TIME) / HZ; */
515 
516 	if (device->rs_in_flight + sect_in == 0) { /* At start of resync */
517 		want = ((dc->resync_rate * 2 * SLEEP_TIME) / HZ) * steps;
518 	} else { /* normal path */
519 		want = dc->c_fill_target ? dc->c_fill_target :
520 			sect_in * dc->c_delay_target * HZ / (SLEEP_TIME * 10);
521 	}
522 
523 	correction = want - device->rs_in_flight - plan->total;
524 
525 	/* Plan ahead */
526 	cps = correction / steps;
527 	fifo_add_val(plan, cps);
528 	plan->total += cps * steps;
529 
530 	/* What we do in this step */
531 	curr_corr = fifo_push(plan, 0);
532 	plan->total -= curr_corr;
533 
534 	req_sect = sect_in + curr_corr;
535 	if (req_sect < 0)
536 		req_sect = 0;
537 
538 	max_sect = (dc->c_max_rate * 2 * SLEEP_TIME) / HZ;
539 	if (req_sect > max_sect)
540 		req_sect = max_sect;
541 
542 	/*
543 	drbd_warn(device, "si=%u if=%d wa=%u co=%d st=%d cps=%d pl=%d cc=%d rs=%d\n",
544 		 sect_in, device->rs_in_flight, want, correction,
545 		 steps, cps, device->rs_planed, curr_corr, req_sect);
546 	*/
547 
548 	return req_sect;
549 }
550 
551 static int drbd_rs_number_requests(struct drbd_device *device)
552 {
553 	unsigned int sect_in;  /* Number of sectors that came in since the last turn */
554 	int number, mxb;
555 
556 	sect_in = atomic_xchg(&device->rs_sect_in, 0);
557 	device->rs_in_flight -= sect_in;
558 
559 	rcu_read_lock();
560 	mxb = drbd_get_max_buffers(device) / 2;
561 	if (rcu_dereference(device->rs_plan_s)->size) {
562 		number = drbd_rs_controller(device, sect_in) >> (BM_BLOCK_SHIFT - 9);
563 		device->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
564 	} else {
565 		device->c_sync_rate = rcu_dereference(device->ldev->disk_conf)->resync_rate;
566 		number = SLEEP_TIME * device->c_sync_rate  / ((BM_BLOCK_SIZE / 1024) * HZ);
567 	}
568 	rcu_read_unlock();
569 
570 	/* Don't have more than "max-buffers"/2 in-flight.
571 	 * Otherwise we may cause the remote site to stall on drbd_alloc_pages(),
572 	 * potentially causing a distributed deadlock on congestion during
573 	 * online-verify or (checksum-based) resync, if max-buffers,
574 	 * socket buffer sizes and resync rate settings are mis-configured. */
575 
576 	/* note that "number" is in units of "BM_BLOCK_SIZE" (which is 4k),
577 	 * mxb (as used here, and in drbd_alloc_pages on the peer) is
578 	 * "number of pages" (typically also 4k),
579 	 * but "rs_in_flight" is in "sectors" (512 Byte). */
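	/* Illustrative: rs_in_flight == 2048 sectors (1 MiB) corresponds to
	 * 2048/8 == 256 4k blocks already in flight, which is what is
	 * compared against mxb here. */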
580 	if (mxb - device->rs_in_flight/8 < number)
581 		number = mxb - device->rs_in_flight/8;
582 
583 	return number;
584 }
585 
586 static int make_resync_request(struct drbd_device *const device, int cancel)
587 {
588 	struct drbd_peer_device *const peer_device = first_peer_device(device);
589 	struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
590 	unsigned long bit;
591 	sector_t sector;
592 	const sector_t capacity = drbd_get_capacity(device->this_bdev);
593 	int max_bio_size;
594 	int number, rollback_i, size;
595 	int align, requeue = 0;
596 	int i = 0;
597 
598 	if (unlikely(cancel))
599 		return 0;
600 
601 	if (device->rs_total == 0) {
602 		/* empty resync? */
603 		drbd_resync_finished(device);
604 		return 0;
605 	}
606 
607 	if (!get_ldev(device)) {
608 		/* Since we only need to access device->resync a
609 		   get_ldev_if_state(device,D_FAILED) would be sufficient, but
610 		   continuing the resync with a broken disk makes no sense at
611 		   all */
612 		drbd_err(device, "Disk broke down during resync!\n");
613 		return 0;
614 	}
615 
616 	max_bio_size = queue_max_hw_sectors(device->rq_queue) << 9;
617 	number = drbd_rs_number_requests(device);
618 	if (number <= 0)
619 		goto requeue;
620 
621 	for (i = 0; i < number; i++) {
622 		/* Stop generating RS requests when half of the send buffer is filled,
623 		 * but notify TCP that we'd like to have more space. */
624 		mutex_lock(&connection->data.mutex);
625 		if (connection->data.socket) {
626 			struct sock *sk = connection->data.socket->sk;
627 			int queued = sk->sk_wmem_queued;
628 			int sndbuf = sk->sk_sndbuf;
629 			if (queued > sndbuf / 2) {
630 				requeue = 1;
631 				if (sk->sk_socket)
632 					set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
633 			}
634 		} else
635 			requeue = 1;
636 		mutex_unlock(&connection->data.mutex);
637 		if (requeue)
638 			goto requeue;
639 
640 next_sector:
641 		size = BM_BLOCK_SIZE;
642 		bit  = drbd_bm_find_next(device, device->bm_resync_fo);
643 
644 		if (bit == DRBD_END_OF_BITMAP) {
645 			device->bm_resync_fo = drbd_bm_bits(device);
646 			put_ldev(device);
647 			return 0;
648 		}
649 
650 		sector = BM_BIT_TO_SECT(bit);
651 
652 		if (drbd_try_rs_begin_io(device, sector)) {
653 			device->bm_resync_fo = bit;
654 			goto requeue;
655 		}
656 		device->bm_resync_fo = bit + 1;
657 
658 		if (unlikely(drbd_bm_test_bit(device, bit) == 0)) {
659 			drbd_rs_complete_io(device, sector);
660 			goto next_sector;
661 		}
662 
663 #if DRBD_MAX_BIO_SIZE > BM_BLOCK_SIZE
664 		/* try to find some adjacent bits.
665 		 * we stop if we have already the maximum req size.
666 		 *
667 		 * Additionally always align bigger requests, in order to
668 		 * be prepared for all stripe sizes of software RAIDs.
669 		 */
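		/* Illustrative: with align == 1 the check below only keeps
		 * growing the request if the start sector is a multiple of
		 * 16 sectors (8 KiB); whenever the accumulated size reaches
		 * BM_BLOCK_SIZE << align, align is bumped and the required
		 * alignment doubles as well. */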
670 		align = 1;
671 		rollback_i = i;
672 		while (i < number) {
673 			if (size + BM_BLOCK_SIZE > max_bio_size)
674 				break;
675 
676 			/* Be always aligned */
677 			if (sector & ((1<<(align+3))-1))
678 				break;
679 
680 			/* do not cross extent boundaries */
681 			if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
682 				break;
683 			/* now, is it actually dirty, after all?
684 			 * caution, drbd_bm_test_bit is tri-state for some
685 			 * obscure reason; ( b == 0 ) would get the out-of-band
686 			 * only accidentally right because of the "oddly sized"
687 			 * adjustment below */
688 			if (drbd_bm_test_bit(device, bit+1) != 1)
689 				break;
690 			bit++;
691 			size += BM_BLOCK_SIZE;
692 			if ((BM_BLOCK_SIZE << align) <= size)
693 				align++;
694 			i++;
695 		}
696 		/* if we merged some,
697 		 * reset the offset to start the next drbd_bm_find_next from */
698 		if (size > BM_BLOCK_SIZE)
699 			device->bm_resync_fo = bit + 1;
700 #endif
701 
702 		/* adjust very last sectors, in case we are oddly sized */
703 		if (sector + (size>>9) > capacity)
704 			size = (capacity-sector)<<9;
705 
706 		if (device->use_csums) {
707 			switch (read_for_csum(peer_device, sector, size)) {
708 			case -EIO: /* Disk failure */
709 				put_ldev(device);
710 				return -EIO;
711 			case -EAGAIN: /* allocation failed, or ldev busy */
712 				drbd_rs_complete_io(device, sector);
713 				device->bm_resync_fo = BM_SECT_TO_BIT(sector);
714 				i = rollback_i;
715 				goto requeue;
716 			case 0:
717 				/* everything ok */
718 				break;
719 			default:
720 				BUG();
721 			}
722 		} else {
723 			int err;
724 
725 			inc_rs_pending(device);
726 			err = drbd_send_drequest(peer_device, P_RS_DATA_REQUEST,
727 						 sector, size, ID_SYNCER);
728 			if (err) {
729 				drbd_err(device, "drbd_send_drequest() failed, aborting...\n");
730 				dec_rs_pending(device);
731 				put_ldev(device);
732 				return err;
733 			}
734 		}
735 	}
736 
737 	if (device->bm_resync_fo >= drbd_bm_bits(device)) {
738 		/* last syncer _request_ was sent,
739 		 * but the P_RS_DATA_REPLY not yet received.  sync will end (and
740 		 * next sync group will resume), as soon as we receive the last
741 		 * resync data block, and the last bit is cleared.
742 		 * until then resync "work" is "inactive" ...
743 		 */
744 		put_ldev(device);
745 		return 0;
746 	}
747 
748  requeue:
749 	device->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
750 	mod_timer(&device->resync_timer, jiffies + SLEEP_TIME);
751 	put_ldev(device);
752 	return 0;
753 }
754 
755 static int make_ov_request(struct drbd_device *device, int cancel)
756 {
757 	int number, i, size;
758 	sector_t sector;
759 	const sector_t capacity = drbd_get_capacity(device->this_bdev);
760 	bool stop_sector_reached = false;
761 
762 	if (unlikely(cancel))
763 		return 1;
764 
765 	number = drbd_rs_number_requests(device);
766 
767 	sector = device->ov_position;
768 	for (i = 0; i < number; i++) {
769 		if (sector >= capacity)
770 			return 1;
771 
772 		/* We check for "finished" only in the reply path:
773 		 * w_e_end_ov_reply().
774 		 * We need to send at least one request out. */
775 		stop_sector_reached = i > 0
776 			&& verify_can_do_stop_sector(device)
777 			&& sector >= device->ov_stop_sector;
778 		if (stop_sector_reached)
779 			break;
780 
781 		size = BM_BLOCK_SIZE;
782 
783 		if (drbd_try_rs_begin_io(device, sector)) {
784 			device->ov_position = sector;
785 			goto requeue;
786 		}
787 
788 		if (sector + (size>>9) > capacity)
789 			size = (capacity-sector)<<9;
790 
791 		inc_rs_pending(device);
792 		if (drbd_send_ov_request(first_peer_device(device), sector, size)) {
793 			dec_rs_pending(device);
794 			return 0;
795 		}
796 		sector += BM_SECT_PER_BIT;
797 	}
798 	device->ov_position = sector;
799 
800  requeue:
801 	device->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
802 	if (i == 0 || !stop_sector_reached)
803 		mod_timer(&device->resync_timer, jiffies + SLEEP_TIME);
804 	return 1;
805 }
806 
807 int w_ov_finished(struct drbd_work *w, int cancel)
808 {
809 	struct drbd_device_work *dw =
810 		container_of(w, struct drbd_device_work, w);
811 	struct drbd_device *device = dw->device;
812 	kfree(dw);
813 	ov_out_of_sync_print(device);
814 	drbd_resync_finished(device);
815 
816 	return 0;
817 }
818 
819 static int w_resync_finished(struct drbd_work *w, int cancel)
820 {
821 	struct drbd_device_work *dw =
822 		container_of(w, struct drbd_device_work, w);
823 	struct drbd_device *device = dw->device;
824 	kfree(dw);
825 
826 	drbd_resync_finished(device);
827 
828 	return 0;
829 }
830 
831 static void ping_peer(struct drbd_device *device)
832 {
833 	struct drbd_connection *connection = first_peer_device(device)->connection;
834 
835 	clear_bit(GOT_PING_ACK, &connection->flags);
836 	request_ping(connection);
837 	wait_event(connection->ping_wait,
838 		   test_bit(GOT_PING_ACK, &connection->flags) || device->state.conn < C_CONNECTED);
839 }
840 
841 int drbd_resync_finished(struct drbd_device *device)
842 {
843 	unsigned long db, dt, dbdt;
844 	unsigned long n_oos;
845 	union drbd_state os, ns;
846 	struct drbd_device_work *dw;
847 	char *khelper_cmd = NULL;
848 	int verify_done = 0;
849 
850 	/* Remove all elements from the resync LRU. Since future actions
851 	 * might set bits in the (main) bitmap, then the entries in the
852 	 * resync LRU would be wrong. */
853 	if (drbd_rs_del_all(device)) {
854 		/* In case this is not possible now, most probably because
855 		 * there are P_RS_DATA_REPLY packets lingering on the worker's
856 		 * queue (or even the read operations for those packets
857 		 * are not finished by now).  Retry in 100ms. */
858 
859 		schedule_timeout_interruptible(HZ / 10);
860 		dw = kmalloc(sizeof(struct drbd_device_work), GFP_ATOMIC);
861 		if (dw) {
862 			dw->w.cb = w_resync_finished;
863 			dw->device = device;
864 			drbd_queue_work(&first_peer_device(device)->connection->sender_work,
865 					&dw->w);
866 			return 1;
867 		}
868 		drbd_err(device, "Warn failed to drbd_rs_del_all() and to kmalloc(dw).\n");
869 	}
870 
871 	dt = (jiffies - device->rs_start - device->rs_paused) / HZ;
872 	if (dt <= 0)
873 		dt = 1;
874 
875 	db = device->rs_total;
876 	/* adjust for verify start and stop sectors, and the position actually reached */
877 	if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
878 		db -= device->ov_left;
879 
880 	dbdt = Bit2KB(db/dt);
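	/* db is in bitmap bits (4k each) and dt in seconds, so Bit2KB(db/dt)
	 * is the average throughput in KiB/s; illustrative: 25600 bits in
	 * 10 seconds -> 2560 * 4 == 10240 KiB/s. */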
881 	device->rs_paused /= HZ;
882 
883 	if (!get_ldev(device))
884 		goto out;
885 
886 	ping_peer(device);
887 
888 	spin_lock_irq(&device->resource->req_lock);
889 	os = drbd_read_state(device);
890 
891 	verify_done = (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T);
892 
893 	/* This protects us against multiple calls (that can happen in the presence
894 	   of application IO), and against connectivity loss just before we arrive here. */
895 	if (os.conn <= C_CONNECTED)
896 		goto out_unlock;
897 
898 	ns = os;
899 	ns.conn = C_CONNECTED;
900 
901 	drbd_info(device, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
902 	     verify_done ? "Online verify" : "Resync",
903 	     dt + device->rs_paused, device->rs_paused, dbdt);
904 
905 	n_oos = drbd_bm_total_weight(device);
906 
907 	if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) {
908 		if (n_oos) {
909 			drbd_alert(device, "Online verify found %lu %dk block out of sync!\n",
910 			      n_oos, Bit2KB(1));
911 			khelper_cmd = "out-of-sync";
912 		}
913 	} else {
914 		D_ASSERT(device, (n_oos - device->rs_failed) == 0);
915 
916 		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
917 			khelper_cmd = "after-resync-target";
918 
919 		if (device->use_csums && device->rs_total) {
920 			const unsigned long s = device->rs_same_csum;
921 			const unsigned long t = device->rs_total;
922 			const int ratio =
923 				(t == 0)     ? 0 :
924 			(t < 100000) ? ((s*100)/t) : (s/(t/100));
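			/* Both branches compute the same percentage (up to
			 * rounding); for large t the division happens first,
			 * presumably so s*100 cannot overflow on 32bit.
			 * Illustrative: s == 400000, t == 1000000
			 * -> 400000 / 10000 == 40 (%). */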
925 			drbd_info(device, "%u %% had equal checksums, eliminated: %luK; "
926 			     "transferred %luK total %luK\n",
927 			     ratio,
928 			     Bit2KB(device->rs_same_csum),
929 			     Bit2KB(device->rs_total - device->rs_same_csum),
930 			     Bit2KB(device->rs_total));
931 		}
932 	}
933 
934 	if (device->rs_failed) {
935 		drbd_info(device, "            %lu failed blocks\n", device->rs_failed);
936 
937 		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
938 			ns.disk = D_INCONSISTENT;
939 			ns.pdsk = D_UP_TO_DATE;
940 		} else {
941 			ns.disk = D_UP_TO_DATE;
942 			ns.pdsk = D_INCONSISTENT;
943 		}
944 	} else {
945 		ns.disk = D_UP_TO_DATE;
946 		ns.pdsk = D_UP_TO_DATE;
947 
948 		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
949 			if (device->p_uuid) {
950 				int i;
951 				for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++)
952 					_drbd_uuid_set(device, i, device->p_uuid[i]);
953 				drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_CURRENT]);
954 				_drbd_uuid_set(device, UI_CURRENT, device->p_uuid[UI_CURRENT]);
955 			} else {
956 				drbd_err(device, "device->p_uuid is NULL! BUG\n");
957 			}
958 		}
959 
960 		if (!(os.conn == C_VERIFY_S || os.conn == C_VERIFY_T)) {
961 			/* for verify runs, we don't update uuids here,
962 			 * so there would be nothing to report. */
963 			drbd_uuid_set_bm(device, 0UL);
964 			drbd_print_uuids(device, "updated UUIDs");
965 			if (device->p_uuid) {
966 				/* Now the two UUID sets are equal, update what we
967 				 * know of the peer. */
968 				int i;
969 				for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++)
970 					device->p_uuid[i] = device->ldev->md.uuid[i];
971 			}
972 		}
973 	}
974 
975 	_drbd_set_state(device, ns, CS_VERBOSE, NULL);
976 out_unlock:
977 	spin_unlock_irq(&device->resource->req_lock);
978 	put_ldev(device);
979 out:
980 	device->rs_total  = 0;
981 	device->rs_failed = 0;
982 	device->rs_paused = 0;
983 
984 	/* reset start sector, if we reached end of device */
985 	if (verify_done && device->ov_left == 0)
986 		device->ov_start_sector = 0;
987 
988 	drbd_md_sync(device);
989 
990 	if (khelper_cmd)
991 		drbd_khelper(device, khelper_cmd);
992 
993 	return 1;
994 }
995 
996 /* helper */
997 static void move_to_net_ee_or_free(struct drbd_device *device, struct drbd_peer_request *peer_req)
998 {
999 	if (drbd_peer_req_has_active_page(peer_req)) {
1000 		/* This might happen if sendpage() has not finished */
1001 		int i = (peer_req->i.size + PAGE_SIZE -1) >> PAGE_SHIFT;
1002 		atomic_add(i, &device->pp_in_use_by_net);
1003 		atomic_sub(i, &device->pp_in_use);
1004 		spin_lock_irq(&device->resource->req_lock);
1005 		list_add_tail(&peer_req->w.list, &device->net_ee);
1006 		spin_unlock_irq(&device->resource->req_lock);
1007 		wake_up(&drbd_pp_wait);
1008 	} else
1009 		drbd_free_peer_req(device, peer_req);
1010 }
1011 
1012 /**
1013  * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
1014  * @device:	DRBD device.
1015  * @w:		work object.
1016  * @cancel:	The connection will be closed anyways
1017  */
1018 int w_e_end_data_req(struct drbd_work *w, int cancel)
1019 {
1020 	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
1021 	struct drbd_peer_device *peer_device = peer_req->peer_device;
1022 	struct drbd_device *device = peer_device->device;
1023 	int err;
1024 
1025 	if (unlikely(cancel)) {
1026 		drbd_free_peer_req(device, peer_req);
1027 		dec_unacked(device);
1028 		return 0;
1029 	}
1030 
1031 	if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
1032 		err = drbd_send_block(peer_device, P_DATA_REPLY, peer_req);
1033 	} else {
1034 		if (__ratelimit(&drbd_ratelimit_state))
1035 			drbd_err(device, "Sending NegDReply. sector=%llus.\n",
1036 			    (unsigned long long)peer_req->i.sector);
1037 
1038 		err = drbd_send_ack(peer_device, P_NEG_DREPLY, peer_req);
1039 	}
1040 
1041 	dec_unacked(device);
1042 
1043 	move_to_net_ee_or_free(device, peer_req);
1044 
1045 	if (unlikely(err))
1046 		drbd_err(device, "drbd_send_block() failed\n");
1047 	return err;
1048 }
1049 
1050 /**
1051  * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST
1052  * @w:		work object.
1053  * @cancel:	The connection will be closed anyways
1054  */
1055 int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
1056 {
1057 	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
1058 	struct drbd_peer_device *peer_device = peer_req->peer_device;
1059 	struct drbd_device *device = peer_device->device;
1060 	int err;
1061 
1062 	if (unlikely(cancel)) {
1063 		drbd_free_peer_req(device, peer_req);
1064 		dec_unacked(device);
1065 		return 0;
1066 	}
1067 
1068 	if (get_ldev_if_state(device, D_FAILED)) {
1069 		drbd_rs_complete_io(device, peer_req->i.sector);
1070 		put_ldev(device);
1071 	}
1072 
1073 	if (device->state.conn == C_AHEAD) {
1074 		err = drbd_send_ack(peer_device, P_RS_CANCEL, peer_req);
1075 	} else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
1076 		if (likely(device->state.pdsk >= D_INCONSISTENT)) {
1077 			inc_rs_pending(device);
1078 			err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req);
1079 		} else {
1080 			if (__ratelimit(&drbd_ratelimit_state))
1081 				drbd_err(device, "Not sending RSDataReply, "
1082 				    "partner DISKLESS!\n");
1083 			err = 0;
1084 		}
1085 	} else {
1086 		if (__ratelimit(&drbd_ratelimit_state))
1087 			drbd_err(device, "Sending NegRSDReply. sector %llus.\n",
1088 			    (unsigned long long)peer_req->i.sector);
1089 
1090 		err = drbd_send_ack(peer_device, P_NEG_RS_DREPLY, peer_req);
1091 
1092 		/* update resync data with failure */
1093 		drbd_rs_failed_io(device, peer_req->i.sector, peer_req->i.size);
1094 	}
1095 
1096 	dec_unacked(device);
1097 
1098 	move_to_net_ee_or_free(device, peer_req);
1099 
1100 	if (unlikely(err))
1101 		drbd_err(device, "drbd_send_block() failed\n");
1102 	return err;
1103 }
1104 
1105 int w_e_end_csum_rs_req(struct drbd_work *w, int cancel)
1106 {
1107 	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
1108 	struct drbd_peer_device *peer_device = peer_req->peer_device;
1109 	struct drbd_device *device = peer_device->device;
1110 	struct digest_info *di;
1111 	int digest_size;
1112 	void *digest = NULL;
1113 	int err, eq = 0;
1114 
1115 	if (unlikely(cancel)) {
1116 		drbd_free_peer_req(device, peer_req);
1117 		dec_unacked(device);
1118 		return 0;
1119 	}
1120 
1121 	if (get_ldev(device)) {
1122 		drbd_rs_complete_io(device, peer_req->i.sector);
1123 		put_ldev(device);
1124 	}
1125 
1126 	di = peer_req->digest;
1127 
1128 	if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
1129 		/* quick hack to try to avoid a race against reconfiguration.
1130 		 * a real fix would be much more involved,
1131 		 * introducing more locking mechanisms */
1132 		if (peer_device->connection->csums_tfm) {
1133 			digest_size = crypto_hash_digestsize(peer_device->connection->csums_tfm);
1134 			D_ASSERT(device, digest_size == di->digest_size);
1135 			digest = kmalloc(digest_size, GFP_NOIO);
1136 		}
1137 		if (digest) {
1138 			drbd_csum_ee(peer_device->connection->csums_tfm, peer_req, digest);
1139 			eq = !memcmp(digest, di->digest, digest_size);
1140 			kfree(digest);
1141 		}
1142 
1143 		if (eq) {
1144 			drbd_set_in_sync(device, peer_req->i.sector, peer_req->i.size);
1145 			/* rs_same_csums unit is BM_BLOCK_SIZE */
1146 			device->rs_same_csum += peer_req->i.size >> BM_BLOCK_SHIFT;
1147 			err = drbd_send_ack(peer_device, P_RS_IS_IN_SYNC, peer_req);
1148 		} else {
1149 			inc_rs_pending(device);
1150 			peer_req->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */
1151 			peer_req->flags &= ~EE_HAS_DIGEST; /* This peer request no longer has a digest pointer */
1152 			kfree(di);
1153 			err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req);
1154 		}
1155 	} else {
1156 		err = drbd_send_ack(peer_device, P_NEG_RS_DREPLY, peer_req);
1157 		if (__ratelimit(&drbd_ratelimit_state))
1158 			drbd_err(device, "Sending NegDReply. I guess it gets messy.\n");
1159 	}
1160 
1161 	dec_unacked(device);
1162 	move_to_net_ee_or_free(device, peer_req);
1163 
1164 	if (unlikely(err))
1165 		drbd_err(device, "drbd_send_block/ack() failed\n");
1166 	return err;
1167 }
1168 
1169 int w_e_end_ov_req(struct drbd_work *w, int cancel)
1170 {
1171 	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
1172 	struct drbd_peer_device *peer_device = peer_req->peer_device;
1173 	struct drbd_device *device = peer_device->device;
1174 	sector_t sector = peer_req->i.sector;
1175 	unsigned int size = peer_req->i.size;
1176 	int digest_size;
1177 	void *digest;
1178 	int err = 0;
1179 
1180 	if (unlikely(cancel))
1181 		goto out;
1182 
1183 	digest_size = crypto_hash_digestsize(peer_device->connection->verify_tfm);
1184 	digest = kmalloc(digest_size, GFP_NOIO);
1185 	if (!digest) {
1186 		err = 1;	/* terminate the connection in case the allocation failed */
1187 		goto out;
1188 	}
1189 
1190 	if (likely(!(peer_req->flags & EE_WAS_ERROR)))
1191 		drbd_csum_ee(peer_device->connection->verify_tfm, peer_req, digest);
1192 	else
1193 		memset(digest, 0, digest_size);
1194 
1195 	/* Free e and pages before send.
1196 	 * In case we block on congestion, we could otherwise run into
1197 	 * some distributed deadlock, if the other side blocks on
1198 	 * congestion as well, because our receiver blocks in
1199 	 * drbd_alloc_pages due to pp_in_use > max_buffers. */
1200 	drbd_free_peer_req(device, peer_req);
1201 	peer_req = NULL;
1202 	inc_rs_pending(device);
1203 	err = drbd_send_drequest_csum(peer_device, sector, size, digest, digest_size, P_OV_REPLY);
1204 	if (err)
1205 		dec_rs_pending(device);
1206 	kfree(digest);
1207 
1208 out:
1209 	if (peer_req)
1210 		drbd_free_peer_req(device, peer_req);
1211 	dec_unacked(device);
1212 	return err;
1213 }
1214 
1215 void drbd_ov_out_of_sync_found(struct drbd_device *device, sector_t sector, int size)
1216 {
1217 	if (device->ov_last_oos_start + device->ov_last_oos_size == sector) {
1218 		device->ov_last_oos_size += size>>9;
1219 	} else {
1220 		device->ov_last_oos_start = sector;
1221 		device->ov_last_oos_size = size>>9;
1222 	}
1223 	drbd_set_out_of_sync(device, sector, size);
1224 }
1225 
1226 int w_e_end_ov_reply(struct drbd_work *w, int cancel)
1227 {
1228 	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
1229 	struct drbd_peer_device *peer_device = peer_req->peer_device;
1230 	struct drbd_device *device = peer_device->device;
1231 	struct digest_info *di;
1232 	void *digest;
1233 	sector_t sector = peer_req->i.sector;
1234 	unsigned int size = peer_req->i.size;
1235 	int digest_size;
1236 	int err, eq = 0;
1237 	bool stop_sector_reached = false;
1238 
1239 	if (unlikely(cancel)) {
1240 		drbd_free_peer_req(device, peer_req);
1241 		dec_unacked(device);
1242 		return 0;
1243 	}
1244 
1245 	/* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
1246 	 * the resync lru has been cleaned up already */
1247 	if (get_ldev(device)) {
1248 		drbd_rs_complete_io(device, peer_req->i.sector);
1249 		put_ldev(device);
1250 	}
1251 
1252 	di = peer_req->digest;
1253 
1254 	if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
1255 		digest_size = crypto_hash_digestsize(peer_device->connection->verify_tfm);
1256 		digest = kmalloc(digest_size, GFP_NOIO);
1257 		if (digest) {
1258 			drbd_csum_ee(peer_device->connection->verify_tfm, peer_req, digest);
1259 
1260 			D_ASSERT(device, digest_size == di->digest_size);
1261 			eq = !memcmp(digest, di->digest, digest_size);
1262 			kfree(digest);
1263 		}
1264 	}
1265 
1266 	/* Free peer_req and pages before send.
1267 	 * In case we block on congestion, we could otherwise run into
1268 	 * some distributed deadlock, if the other side blocks on
1269 	 * congestion as well, because our receiver blocks in
1270 	 * drbd_alloc_pages due to pp_in_use > max_buffers. */
1271 	drbd_free_peer_req(device, peer_req);
1272 	if (!eq)
1273 		drbd_ov_out_of_sync_found(device, sector, size);
1274 	else
1275 		ov_out_of_sync_print(device);
1276 
1277 	err = drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size,
1278 			       eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);
1279 
1280 	dec_unacked(device);
1281 
1282 	--device->ov_left;
1283 
1284 	/* let's advance progress step marks only for every other megabyte */
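	/* ov_left counts BM_BLOCK_SIZE (4k) blocks, so bit 9 (0x200) is set
	 * during every other 2 MiB stretch of verified data; only then do we
	 * advance the rs marks. */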
1285 	if ((device->ov_left & 0x200) == 0x200)
1286 		drbd_advance_rs_marks(device, device->ov_left);
1287 
1288 	stop_sector_reached = verify_can_do_stop_sector(device) &&
1289 		(sector + (size>>9)) >= device->ov_stop_sector;
1290 
1291 	if (device->ov_left == 0 || stop_sector_reached) {
1292 		ov_out_of_sync_print(device);
1293 		drbd_resync_finished(device);
1294 	}
1295 
1296 	return err;
1297 }
1298 
1299 /* FIXME
1300  * We need to track the number of pending barrier acks,
1301  * and to be able to wait for them.
1302  * See also comment in drbd_adm_attach before drbd_suspend_io.
1303  */
1304 static int drbd_send_barrier(struct drbd_connection *connection)
1305 {
1306 	struct p_barrier *p;
1307 	struct drbd_socket *sock;
1308 
1309 	sock = &connection->data;
1310 	p = conn_prepare_command(connection, sock);
1311 	if (!p)
1312 		return -EIO;
1313 	p->barrier = connection->send.current_epoch_nr;
1314 	p->pad = 0;
1315 	connection->send.current_epoch_writes = 0;
1316 
1317 	return conn_send_command(connection, sock, P_BARRIER, sizeof(*p), NULL, 0);
1318 }
1319 
1320 int w_send_write_hint(struct drbd_work *w, int cancel)
1321 {
1322 	struct drbd_device *device =
1323 		container_of(w, struct drbd_device, unplug_work);
1324 	struct drbd_socket *sock;
1325 
1326 	if (cancel)
1327 		return 0;
1328 	sock = &first_peer_device(device)->connection->data;
1329 	if (!drbd_prepare_command(first_peer_device(device), sock))
1330 		return -EIO;
1331 	return drbd_send_command(first_peer_device(device), sock, P_UNPLUG_REMOTE, 0, NULL, 0);
1332 }
1333 
1334 static void re_init_if_first_write(struct drbd_connection *connection, unsigned int epoch)
1335 {
1336 	if (!connection->send.seen_any_write_yet) {
1337 		connection->send.seen_any_write_yet = true;
1338 		connection->send.current_epoch_nr = epoch;
1339 		connection->send.current_epoch_writes = 0;
1340 	}
1341 }
1342 
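/* Close the previous epoch with a P_BARRIER when the epoch number changes,
 * but only if that epoch actually contained writes; epochs without any
 * writes need no explicit barrier. */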
1343 static void maybe_send_barrier(struct drbd_connection *connection, unsigned int epoch)
1344 {
1345 	/* nothing to close yet before the first write on this connection */
1346 	if (!connection->send.seen_any_write_yet)
1347 		return;
1348 	if (connection->send.current_epoch_nr != epoch) {
1349 		if (connection->send.current_epoch_writes)
1350 			drbd_send_barrier(connection);
1351 		connection->send.current_epoch_nr = epoch;
1352 	}
1353 }
1354 
1355 int w_send_out_of_sync(struct drbd_work *w, int cancel)
1356 {
1357 	struct drbd_request *req = container_of(w, struct drbd_request, w);
1358 	struct drbd_device *device = req->device;
1359 	struct drbd_peer_device *const peer_device = first_peer_device(device);
1360 	struct drbd_connection *const connection = peer_device->connection;
1361 	int err;
1362 
1363 	if (unlikely(cancel)) {
1364 		req_mod(req, SEND_CANCELED);
1365 		return 0;
1366 	}
1367 	req->pre_send_jif = jiffies;
1368 
1369 	/* this time, no connection->send.current_epoch_writes++;
1370 	 * If it was sent, it was the closing barrier for the last
1371 	 * replicated epoch, before we went into AHEAD mode.
1372 	 * No more barriers will be sent, until we leave AHEAD mode again. */
1373 	maybe_send_barrier(connection, req->epoch);
1374 
1375 	err = drbd_send_out_of_sync(peer_device, req);
1376 	req_mod(req, OOS_HANDED_TO_NETWORK);
1377 
1378 	return err;
1379 }
1380 
1381 /**
1382  * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request
1383  * @w:		work object.
1384  * @cancel:	The connection will be closed anyways
1385  */
1386 int w_send_dblock(struct drbd_work *w, int cancel)
1387 {
1388 	struct drbd_request *req = container_of(w, struct drbd_request, w);
1389 	struct drbd_device *device = req->device;
1390 	struct drbd_peer_device *const peer_device = first_peer_device(device);
1391 	struct drbd_connection *connection = peer_device->connection;
1392 	int err;
1393 
1394 	if (unlikely(cancel)) {
1395 		req_mod(req, SEND_CANCELED);
1396 		return 0;
1397 	}
1398 	req->pre_send_jif = jiffies;
1399 
1400 	re_init_if_first_write(connection, req->epoch);
1401 	maybe_send_barrier(connection, req->epoch);
1402 	connection->send.current_epoch_writes++;
1403 
1404 	err = drbd_send_dblock(peer_device, req);
1405 	req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
1406 
1407 	return err;
1408 }
1409 
1410 /**
1411  * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet
1412  * @w:		work object.
1413  * @cancel:	The connection will be closed anyways
1414  */
1415 int w_send_read_req(struct drbd_work *w, int cancel)
1416 {
1417 	struct drbd_request *req = container_of(w, struct drbd_request, w);
1418 	struct drbd_device *device = req->device;
1419 	struct drbd_peer_device *const peer_device = first_peer_device(device);
1420 	struct drbd_connection *connection = peer_device->connection;
1421 	int err;
1422 
1423 	if (unlikely(cancel)) {
1424 		req_mod(req, SEND_CANCELED);
1425 		return 0;
1426 	}
1427 	req->pre_send_jif = jiffies;
1428 
1429 	/* Even read requests may close a write epoch,
1430 	 * if there was any yet. */
1431 	maybe_send_barrier(connection, req->epoch);
1432 
1433 	err = drbd_send_drequest(peer_device, P_DATA_REQUEST, req->i.sector, req->i.size,
1434 				 (unsigned long)req);
1435 
1436 	req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
1437 
1438 	return err;
1439 }
1440 
1441 int w_restart_disk_io(struct drbd_work *w, int cancel)
1442 {
1443 	struct drbd_request *req = container_of(w, struct drbd_request, w);
1444 	struct drbd_device *device = req->device;
1445 
1446 	if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
1447 		drbd_al_begin_io(device, &req->i);
1448 
1449 	drbd_req_make_private_bio(req, req->master_bio);
1450 	req->private_bio->bi_bdev = device->ldev->backing_bdev;
1451 	generic_make_request(req->private_bio);
1452 
1453 	return 0;
1454 }
1455 
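/* Walk the resync-after dependency chain of @device.  Returns 1 if this
 * device may resync now, 0 if a device it (transitively) depends on is
 * currently syncing or paused.  Illustrative only: with resync-after
 * configured as minor 2 -> minor 1 -> minor 0, minor 2 has to wait while
 * either minor 1 or minor 0 is between C_SYNC_SOURCE and C_PAUSED_SYNC_T,
 * or has one of the isp flags set. */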
1456 static int _drbd_may_sync_now(struct drbd_device *device)
1457 {
1458 	struct drbd_device *odev = device;
1459 	int resync_after;
1460 
1461 	while (1) {
1462 		if (!odev->ldev || odev->state.disk == D_DISKLESS)
1463 			return 1;
1464 		rcu_read_lock();
1465 		resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
1466 		rcu_read_unlock();
1467 		if (resync_after == -1)
1468 			return 1;
1469 		odev = minor_to_device(resync_after);
1470 		if (!odev)
1471 			return 1;
1472 		if ((odev->state.conn >= C_SYNC_SOURCE &&
1473 		     odev->state.conn <= C_PAUSED_SYNC_T) ||
1474 		    odev->state.aftr_isp || odev->state.peer_isp ||
1475 		    odev->state.user_isp)
1476 			return 0;
1477 	}
1478 }
1479 
1480 /**
1481  * _drbd_pause_after() - Pause resync on all devices that may not resync now
1482  * @device:	DRBD device.
1483  *
1484  * Called from process context only (admin command and after_state_ch).
1485  */
1486 static int _drbd_pause_after(struct drbd_device *device)
1487 {
1488 	struct drbd_device *odev;
1489 	int i, rv = 0;
1490 
1491 	rcu_read_lock();
1492 	idr_for_each_entry(&drbd_devices, odev, i) {
1493 		if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
1494 			continue;
1495 		if (!_drbd_may_sync_now(odev))
1496 			rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL)
1497 			       != SS_NOTHING_TO_DO);
1498 	}
1499 	rcu_read_unlock();
1500 
1501 	return rv;
1502 }
1503 
1504 /**
1505  * _drbd_resume_next() - Resume resync on all devices that may resync now
1506  * @device:	DRBD device.
1507  *
1508  * Called from process context only (admin command and worker).
1509  */
1510 static int _drbd_resume_next(struct drbd_device *device)
1511 {
1512 	struct drbd_device *odev;
1513 	int i, rv = 0;
1514 
1515 	rcu_read_lock();
1516 	idr_for_each_entry(&drbd_devices, odev, i) {
1517 		if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
1518 			continue;
1519 		if (odev->state.aftr_isp) {
1520 			if (_drbd_may_sync_now(odev))
1521 				rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0),
1522 							CS_HARD, NULL)
1523 				       != SS_NOTHING_TO_DO) ;
1524 		}
1525 	}
1526 	rcu_read_unlock();
1527 	return rv;
1528 }
1529 
1530 void resume_next_sg(struct drbd_device *device)
1531 {
1532 	write_lock_irq(&global_state_lock);
1533 	_drbd_resume_next(device);
1534 	write_unlock_irq(&global_state_lock);
1535 }
1536 
1537 void suspend_other_sg(struct drbd_device *device)
1538 {
1539 	write_lock_irq(&global_state_lock);
1540 	_drbd_pause_after(device);
1541 	write_unlock_irq(&global_state_lock);
1542 }
1543 
1544 /* caller must hold global_state_lock */
1545 enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor)
1546 {
1547 	struct drbd_device *odev;
1548 	int resync_after;
1549 
1550 	if (o_minor == -1)
1551 		return NO_ERROR;
1552 	if (o_minor < -1 || o_minor > MINORMASK)
1553 		return ERR_RESYNC_AFTER;
1554 
1555 	/* check for loops */
1556 	odev = minor_to_device(o_minor);
1557 	while (1) {
1558 		if (odev == device)
1559 			return ERR_RESYNC_AFTER_CYCLE;
1560 
1561 		/* You are free to depend on diskless, non-existing,
1562 		 * or not yet/no longer existing minors.
1563 		 * We only reject dependency loops.
1564 		 * We cannot follow the dependency chain beyond a detached or
1565 		 * missing minor.
1566 		 */
1567 		if (!odev || !odev->ldev || odev->state.disk == D_DISKLESS)
1568 			return NO_ERROR;
1569 
1570 		rcu_read_lock();
1571 		resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
1572 		rcu_read_unlock();
1573 		/* dependency chain ends here, no cycles. */
1574 		if (resync_after == -1)
1575 			return NO_ERROR;
1576 
1577 		/* follow the dependency chain */
1578 		odev = minor_to_device(resync_after);
1579 	}
1580 }
1581 
1582 /* caller must hold global_state_lock */
1583 void drbd_resync_after_changed(struct drbd_device *device)
1584 {
1585 	int changes;
1586 
1587 	do {
1588 		changes  = _drbd_pause_after(device);
1589 		changes |= _drbd_resume_next(device);
1590 	} while (changes);
1591 }
1592 
1593 void drbd_rs_controller_reset(struct drbd_device *device)
1594 {
1595 	struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk;
1596 	struct fifo_buffer *plan;
1597 
1598 	atomic_set(&device->rs_sect_in, 0);
1599 	atomic_set(&device->rs_sect_ev, 0);
1600 	device->rs_in_flight = 0;
1601 	device->rs_last_events =
1602 		(int)part_stat_read(&disk->part0, sectors[0]) +
1603 		(int)part_stat_read(&disk->part0, sectors[1]);
1604 
1605 	/* Updating the RCU protected object in place is necessary since
1606 	   this function gets called from atomic context.
1607 	   It is valid since all other updates also lead to a completely
1608 	   empty fifo */
1609 	rcu_read_lock();
1610 	plan = rcu_dereference(device->rs_plan_s);
1611 	plan->total = 0;
1612 	fifo_set(plan, 0);
1613 	rcu_read_unlock();
1614 }
1615 
1616 void start_resync_timer_fn(unsigned long data)
1617 {
1618 	struct drbd_device *device = (struct drbd_device *) data;
1619 	drbd_device_post_work(device, RS_START);
1620 }
1621 
1622 static void do_start_resync(struct drbd_device *device)
1623 {
1624 	if (atomic_read(&device->unacked_cnt) || atomic_read(&device->rs_pending_cnt)) {
1625 		drbd_warn(device, "postponing start_resync ...\n");
1626 		device->start_resync_timer.expires = jiffies + HZ/10;
1627 		add_timer(&device->start_resync_timer);
1628 		return;
1629 	}
1630 
1631 	drbd_start_resync(device, C_SYNC_SOURCE);
1632 	clear_bit(AHEAD_TO_SYNC_SOURCE, &device->flags);
1633 }
1634 
1635 static bool use_checksum_based_resync(struct drbd_connection *connection, struct drbd_device *device)
1636 {
1637 	bool csums_after_crash_only;
1638 	rcu_read_lock();
1639 	csums_after_crash_only = rcu_dereference(connection->net_conf)->csums_after_crash_only;
1640 	rcu_read_unlock();
1641 	return connection->agreed_pro_version >= 89 &&		/* supported? */
1642 		connection->csums_tfm &&			/* configured? */
1643 		(csums_after_crash_only == 0			/* use for each resync? */
1644 		 || test_bit(CRASHED_PRIMARY, &device->flags));	/* or only after Primary crash? */
1645 }
1646 
1647 /**
1648  * drbd_start_resync() - Start the resync process
1649  * @device:	DRBD device.
1650  * @side:	Either C_SYNC_SOURCE or C_SYNC_TARGET
1651  *
1652  * This function might bring you directly into one of the
1653  * C_PAUSED_SYNC_* states.
1654  */
1655 void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
1656 {
1657 	struct drbd_peer_device *peer_device = first_peer_device(device);
1658 	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
1659 	union drbd_state ns;
1660 	int r;
1661 
1662 	if (device->state.conn >= C_SYNC_SOURCE && device->state.conn < C_AHEAD) {
1663 		drbd_err(device, "Resync already running!\n");
1664 		return;
1665 	}
1666 
1667 	if (!test_bit(B_RS_H_DONE, &device->flags)) {
1668 		if (side == C_SYNC_TARGET) {
1669 			/* Since application IO was locked out during C_WF_BITMAP_T and
1670 			   C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
1671 			   we run the before-resync-target handler, since we are about to make the data inconsistent. */
1672 			r = drbd_khelper(device, "before-resync-target");
1673 			r = (r >> 8) & 0xff;
1674 			if (r > 0) {
1675 				drbd_info(device, "before-resync-target handler returned %d, "
1676 					 "dropping connection.\n", r);
1677 				conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
1678 				return;
1679 			}
1680 		} else /* C_SYNC_SOURCE */ {
1681 			r = drbd_khelper(device, "before-resync-source");
1682 			r = (r >> 8) & 0xff;
1683 			if (r > 0) {
1684 				if (r == 3) {
1685 					drbd_info(device, "before-resync-source handler returned %d, "
1686 						 "ignoring. Old userland tools?\n", r);
1687 				} else {
1688 					drbd_info(device, "before-resync-source handler returned %d, "
1689 						 "dropping connection.\n", r);
1690 					conn_request_state(connection,
1691 							   NS(conn, C_DISCONNECTING), CS_HARD);
1692 					return;
1693 				}
1694 			}
1695 		}
1696 	}
1697 
1698 	if (current == connection->worker.task) {
1699 		/* The worker should not sleep waiting for state_mutex,
1700 		   that can take a long time. */
1701 		if (!mutex_trylock(device->state_mutex)) {
1702 			set_bit(B_RS_H_DONE, &device->flags);
1703 			device->start_resync_timer.expires = jiffies + HZ/5;
1704 			add_timer(&device->start_resync_timer);
1705 			return;
1706 		}
1707 	} else {
1708 		mutex_lock(device->state_mutex);
1709 	}
1710 	clear_bit(B_RS_H_DONE, &device->flags);
1711 
1712 	/* req_lock: serialize with drbd_send_and_submit() and others
1713 	 * global_state_lock: for stable sync-after dependencies */
1714 	spin_lock_irq(&device->resource->req_lock);
1715 	write_lock(&global_state_lock);
1716 	/* Did some connection breakage or IO error race with us? */
1717 	if (device->state.conn < C_CONNECTED
1718 	|| !get_ldev_if_state(device, D_NEGOTIATING)) {
1719 		write_unlock(&global_state_lock);
1720 		spin_unlock_irq(&device->resource->req_lock);
1721 		mutex_unlock(device->state_mutex);
1722 		return;
1723 	}
1724 
1725 	ns = drbd_read_state(device);
1726 
1727 	ns.aftr_isp = !_drbd_may_sync_now(device);
1728 
1729 	ns.conn = side;
1730 
1731 	if (side == C_SYNC_TARGET)
1732 		ns.disk = D_INCONSISTENT;
1733 	else /* side == C_SYNC_SOURCE */
1734 		ns.pdsk = D_INCONSISTENT;
1735 
1736 	r = __drbd_set_state(device, ns, CS_VERBOSE, NULL);
1737 	ns = drbd_read_state(device);
1738 
1739 	if (ns.conn < C_CONNECTED)
1740 		r = SS_UNKNOWN_ERROR;
1741 
1742 	if (r == SS_SUCCESS) {
1743 		unsigned long tw = drbd_bm_total_weight(device);
1744 		unsigned long now = jiffies;
1745 		int i;
1746 
1747 		device->rs_failed    = 0;
1748 		device->rs_paused    = 0;
1749 		device->rs_same_csum = 0;
1750 		device->rs_last_sect_ev = 0;
1751 		device->rs_total     = tw;
1752 		device->rs_start     = now;
1753 		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
1754 			device->rs_mark_left[i] = tw;
1755 			device->rs_mark_time[i] = now;
1756 		}
1757 		_drbd_pause_after(device);
1758 		/* Forget potentially stale cached per-resync-extent bit-counts.
1759 		 * This open-codes drbd_rs_cancel_all(device); we already have IRQs
1760 		 * disabled, and know the disk state is ok. */
1761 		spin_lock(&device->al_lock);
1762 		lc_reset(device->resync);
1763 		device->resync_locked = 0;
1764 		device->resync_wenr = LC_FREE;
1765 		spin_unlock(&device->al_lock);
1766 	}
1767 	write_unlock(&global_state_lock);
1768 	spin_unlock_irq(&device->resource->req_lock);
1769 
1770 	if (r == SS_SUCCESS) {
1771 		wake_up(&device->al_wait); /* for lc_reset() above */
1772 		/* reset rs_last_bcast when a resync or verify is started,
1773 		 * to deal with potential jiffies wrap. */
1774 		device->rs_last_bcast = jiffies - HZ;
1775 
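		/* Each bitmap bit covers one BM_BLOCK_SIZE chunk (4 KiB),
		 * hence the shift by (BM_BLOCK_SHIFT-10) to convert set bits
		 * to KiB below: e.g. 1024 bits set -> 4096 KB. */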
1776 		drbd_info(device, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
1777 		     drbd_conn_str(ns.conn),
1778 		     (unsigned long) device->rs_total << (BM_BLOCK_SHIFT-10),
1779 		     (unsigned long) device->rs_total);
1780 		if (side == C_SYNC_TARGET) {
1781 			device->bm_resync_fo = 0;
1782 			device->use_csums = use_checksum_based_resync(connection, device);
1783 		} else {
1784 			device->use_csums = 0;
1785 		}
1786 
1787 		/* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid
1788 		 * with w_send_oos, or the sync target will get confused as to
1789 		 * how many bits to resync.  We cannot always do that, because for an
1790 		 * empty resync and protocol < 95, we need to do it here, as we call
1791 		 * drbd_resync_finished from here in that case.
1792 		 * We drbd_gen_and_send_sync_uuid here for protocol < 96,
1793 		 * and from after_state_ch otherwise. */
1794 		if (side == C_SYNC_SOURCE && connection->agreed_pro_version < 96)
1795 			drbd_gen_and_send_sync_uuid(peer_device);
1796 
1797 		if (connection->agreed_pro_version < 95 && device->rs_total == 0) {
1798 			/* This still has a race (about when exactly the peers
1799 			 * detect connection loss) that can lead to a full sync
1800 			 * on next handshake. In 8.3.9 we fixed this with explicit
1801 			 * resync-finished notifications, but the fix
1802 			 * introduces a protocol change.  Sleeping for some
1803 			 * time longer than the ping interval + timeout on the
1804 			 * SyncSource, to give the SyncTarget the chance to
1805 			 * detect connection loss, then waiting for a ping
1806 			 * response (implicit in drbd_resync_finished) reduces
1807 			 * the race considerably, but does not solve it. */
1808 			if (side == C_SYNC_SOURCE) {
1809 				struct net_conf *nc;
1810 				int timeo;
1811 
1812 				rcu_read_lock();
1813 				nc = rcu_dereference(connection->net_conf);
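				/* ping_int is in seconds, ping_timeo in tenths of a second;
				 * dividing by 9 instead of 10 presumably errs on the long
				 * side, matching the "longer than ping interval + timeout"
				 * intent described above. */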
1814 				timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9;
1815 				rcu_read_unlock();
1816 				schedule_timeout_interruptible(timeo);
1817 			}
1818 			drbd_resync_finished(device);
1819 		}
1820 
1821 		drbd_rs_controller_reset(device);
1822 		/* ns.conn may already be != device->state.conn,
1823 		 * we may have been paused in between, or become paused until
1824 		 * the timer triggers.
1825 		 * No matter, that is handled in resync_timer_fn() */
1826 		if (ns.conn == C_SYNC_TARGET)
1827 			mod_timer(&device->resync_timer, jiffies);
1828 
1829 		drbd_md_sync(device);
1830 	}
1831 	put_ldev(device);
1832 	mutex_unlock(device->state_mutex);
1833 }
1834 
1835 static void update_on_disk_bitmap(struct drbd_device *device, bool resync_done)
1836 {
1837 	struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, };
1838 	device->rs_last_bcast = jiffies;
1839 
1840 	if (!get_ldev(device))
1841 		return;
1842 
1843 	drbd_bm_write_lazy(device, 0);
1844 	if (resync_done && is_sync_state(device->state.conn))
1845 		drbd_resync_finished(device);
1846 
1847 	drbd_bcast_event(device, &sib);
1848 	/* update timestamp, in case it took a while to write out stuff */
1849 	device->rs_last_bcast = jiffies;
1850 	put_ldev(device);
1851 }
1852 
1853 static void drbd_ldev_destroy(struct drbd_device *device)
1854 {
1855 	lc_destroy(device->resync);
1856 	device->resync = NULL;
1857 	lc_destroy(device->act_log);
1858 	device->act_log = NULL;
1859 
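	/* __acquire()/__release() are sparse-only annotations with no runtime
	 * effect; they presumably keep sparse's "local" (backing device
	 * reference) context tracking balanced across drbd_free_ldev(). */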
1860 	__acquire(local);
1861 	drbd_free_ldev(device->ldev);
1862 	device->ldev = NULL;
1863 	__release(local);
1864 
1865 	clear_bit(GOING_DISKLESS, &device->flags);
1866 	wake_up(&device->misc_wait);
1867 }
1868 
1869 static void go_diskless(struct drbd_device *device)
1870 {
1871 	D_ASSERT(device, device->state.disk == D_FAILED);
1872 	/* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
1873 	 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
1874 	 * the protected members anymore, though, so once put_ldev reaches zero
1875 	 * again, it will be safe to free them. */
1876 
1877 	/* Try to write changed bitmap pages, read errors may have just
1878 	 * set some bits outside the area covered by the activity log.
1879 	 *
1880 	 * If we have an IO error during the bitmap writeout,
1881 	 * we will want a full sync next time, just in case.
1882 	 * (Do we want a specific meta data flag for this?)
1883 	 *
1884 	 * If that does not make it to stable storage either,
1885 	 * we cannot do anything about that anymore.
1886 	 *
1887 	 * We still need to check if both bitmap and ldev are present, we may
1888 	 * end up here after a failed attach, before ldev was even assigned.
1889 	 */
1890 	if (device->bitmap && device->ldev) {
1891 		/* An interrupted resync or similar is allowed to recount bits
1892 		 * while we detach.
1893 		 * Any modifications would not be expected anymore, though.
1894 		 */
1895 		if (drbd_bitmap_io_from_worker(device, drbd_bm_write,
1896 					"detach", BM_LOCKED_TEST_ALLOWED)) {
1897 			if (test_bit(WAS_READ_ERROR, &device->flags)) {
1898 				drbd_md_set_flag(device, MDF_FULL_SYNC);
1899 				drbd_md_sync(device);
1900 			}
1901 		}
1902 	}
1903 
1904 	drbd_force_state(device, NS(disk, D_DISKLESS));
1905 }
1906 
1907 static int do_md_sync(struct drbd_device *device)
1908 {
1909 	drbd_warn(device, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
1910 	drbd_md_sync(device);
1911 	return 0;
1912 }
1913 
1914 /* only called from drbd_worker thread, no locking */
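/* Each call records which work callback is about to run and when, into a
 * small ring of DRBD_THREAD_DETAILS_HIST entries; the slot following the
 * newest entry is zeroed so a reader (e.g. via debugfs) can tell where the
 * ring currently wraps. */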
1915 void __update_timing_details(
1916 		struct drbd_thread_timing_details *tdp,
1917 		unsigned int *cb_nr,
1918 		void *cb,
1919 		const char *fn, const unsigned int line)
1920 {
1921 	unsigned int i = *cb_nr % DRBD_THREAD_DETAILS_HIST;
1922 	struct drbd_thread_timing_details *td = tdp + i;
1923 
1924 	td->start_jif = jiffies;
1925 	td->cb_addr = cb;
1926 	td->caller_fn = fn;
1927 	td->line = line;
1928 	td->cb_nr = *cb_nr;
1929 
1930 	i = (i+1) % DRBD_THREAD_DETAILS_HIST;
1931 	td = tdp + i;
1932 	memset(td, 0, sizeof(*td));
1933 
1934 	++(*cb_nr);
1935 }
1936 
1937 static void do_device_work(struct drbd_device *device, const unsigned long todo)
1938 {
1939 	if (test_bit(MD_SYNC, &todo))
1940 		do_md_sync(device);
1941 	if (test_bit(RS_DONE, &todo) ||
1942 	    test_bit(RS_PROGRESS, &todo))
1943 		update_on_disk_bitmap(device, test_bit(RS_DONE, &todo));
1944 	if (test_bit(GO_DISKLESS, &todo))
1945 		go_diskless(device);
1946 	if (test_bit(DESTROY_DISK, &todo))
1947 		drbd_ldev_destroy(device);
1948 	if (test_bit(RS_START, &todo))
1949 		do_start_resync(device);
1950 }
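
/*
 * Editorial sketch: the producing side, drbd_device_post_work() (see
 * drbd_int.h for the authoritative definition), is expected to be roughly
 * the mirror image of this consumer: set the per-device work bit, mark the
 * connection, and wake the worker, e.g.
 *
 *	set_bit(work_bit, &device->flags);
 *	set_bit(DEVICE_WORK_PENDING, &connection->flags);
 *	wake_up(&connection->sender_work.q_wait);
 */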
1951 
1952 #define DRBD_DEVICE_WORK_MASK	\
1953 	((1UL << GO_DISKLESS)	\
1954 	|(1UL << DESTROY_DISK)	\
1955 	|(1UL << MD_SYNC)	\
1956 	|(1UL << RS_START)	\
1957 	|(1UL << RS_PROGRESS)	\
1958 	|(1UL << RS_DONE)	\
1959 	)
1960 
1961 static unsigned long get_work_bits(unsigned long *flags)
1962 {
1963 	unsigned long old, new;
1964 	do {
1965 		old = *flags;
1966 		new = old & ~DRBD_DEVICE_WORK_MASK;
1967 	} while (cmpxchg(flags, old, new) != old);
1968 	return old & DRBD_DEVICE_WORK_MASK;
1969 }
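
/* Example: if device->flags has RS_START, RS_DONE and CRASHED_PRIMARY set,
 * get_work_bits(&device->flags) atomically clears RS_START and RS_DONE
 * (both in DRBD_DEVICE_WORK_MASK), returns exactly those two bits, and
 * leaves unrelated bits such as CRASHED_PRIMARY untouched.  The cmpxchg()
 * loop claims all pending work bits in one atomic step, so a bit set
 * concurrently is either claimed now or seen by the next call, never lost. */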
1970 
1971 static void do_unqueued_work(struct drbd_connection *connection)
1972 {
1973 	struct drbd_peer_device *peer_device;
1974 	int vnr;
1975 
1976 	rcu_read_lock();
1977 	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
1978 		struct drbd_device *device = peer_device->device;
1979 		unsigned long todo = get_work_bits(&device->flags);
1980 		if (!todo)
1981 			continue;
1982 
1983 		kref_get(&device->kref);
1984 		rcu_read_unlock();
1985 		do_device_work(device, todo);
1986 		kref_put(&device->kref, drbd_destroy_device);
1987 		rcu_read_lock();
1988 	}
1989 	rcu_read_unlock();
1990 }
1991 
1992 static bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list)
1993 {
1994 	spin_lock_irq(&queue->q_lock);
1995 	list_splice_tail_init(&queue->q, work_list);
1996 	spin_unlock_irq(&queue->q_lock);
1997 	return !list_empty(work_list);
1998 }
1999 
2000 static void wait_for_work(struct drbd_connection *connection, struct list_head *work_list)
2001 {
2002 	DEFINE_WAIT(wait);
2003 	struct net_conf *nc;
2004 	int uncork, cork;
2005 
2006 	dequeue_work_batch(&connection->sender_work, work_list);
2007 	if (!list_empty(work_list))
2008 		return;
2009 
2010 	/* Still nothing to do?
2011 	 * Maybe we still need to close the current epoch,
2012 	 * even if no new requests are queued yet.
2013 	 *
2014 	 * Also, poke TCP, just in case.
2015 	 * Then wait for new work (or signal). */
2016 	rcu_read_lock();
2017 	nc = rcu_dereference(connection->net_conf);
2018 	uncork = nc ? nc->tcp_cork : 0;
2019 	rcu_read_unlock();
2020 	if (uncork) {
2021 		mutex_lock(&connection->data.mutex);
2022 		if (connection->data.socket)
2023 			drbd_tcp_uncork(connection->data.socket);
2024 		mutex_unlock(&connection->data.mutex);
2025 	}
2026 
2027 	for (;;) {
2028 		int send_barrier;
2029 		prepare_to_wait(&connection->sender_work.q_wait, &wait, TASK_INTERRUPTIBLE);
2030 		spin_lock_irq(&connection->resource->req_lock);
2031 		spin_lock(&connection->sender_work.q_lock);	/* FIXME get rid of this one? */
2032 		if (!list_empty(&connection->sender_work.q))
2033 			list_splice_tail_init(&connection->sender_work.q, work_list);
2034 		spin_unlock(&connection->sender_work.q_lock);	/* FIXME get rid of this one? */
2035 		if (!list_empty(work_list) || signal_pending(current)) {
2036 			spin_unlock_irq(&connection->resource->req_lock);
2037 			break;
2038 		}
2039 
2040 		/* We found nothing new to do, no to-be-communicated request,
2041 		 * no other work item.  We may still need to close the last
2042 		 * epoch.  The next incoming request's epoch will be the connection's
2043 		 * current transfer log epoch number.  If that is different
2044 		 * from the epoch of the last request we communicated, it is
2045 		 * safe to send the epoch separating barrier now.
2046 		 */
2047 		send_barrier =
2048 			atomic_read(&connection->current_tle_nr) !=
2049 			connection->send.current_epoch_nr;
2050 		spin_unlock_irq(&connection->resource->req_lock);
2051 
2052 		if (send_barrier)
2053 			maybe_send_barrier(connection,
2054 					connection->send.current_epoch_nr + 1);
2055 
2056 		if (test_bit(DEVICE_WORK_PENDING, &connection->flags))
2057 			break;
2058 
2059 		/* drbd_send() may have called flush_signals() */
2060 		if (get_t_state(&connection->worker) != RUNNING)
2061 			break;
2062 
2063 		schedule();
2064 		/* may be woken up for things other than new work, too,
2065 		 * e.g. if the current epoch got closed.
2066 		 * In which case we send the barrier above. */
2067 	}
2068 	finish_wait(&connection->sender_work.q_wait, &wait);
2069 
2070 	/* someone may have changed the config while we have been waiting above. */
2071 	rcu_read_lock();
2072 	nc = rcu_dereference(connection->net_conf);
2073 	cork = nc ? nc->tcp_cork : 0;
2074 	rcu_read_unlock();
2075 	mutex_lock(&connection->data.mutex);
2076 	if (connection->data.socket) {
2077 		if (cork)
2078 			drbd_tcp_cork(connection->data.socket);
2079 		else if (!uncork)
2080 			drbd_tcp_uncork(connection->data.socket);
2081 	}
2082 	mutex_unlock(&connection->data.mutex);
2083 }
2084 
2085 int drbd_worker(struct drbd_thread *thi)
2086 {
2087 	struct drbd_connection *connection = thi->connection;
2088 	struct drbd_work *w = NULL;
2089 	struct drbd_peer_device *peer_device;
2090 	LIST_HEAD(work_list);
2091 	int vnr;
2092 
2093 	while (get_t_state(thi) == RUNNING) {
2094 		drbd_thread_current_set_cpu(thi);
2095 
2096 		if (list_empty(&work_list)) {
2097 			update_worker_timing_details(connection, wait_for_work);
2098 			wait_for_work(connection, &work_list);
2099 		}
2100 
2101 		if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags)) {
2102 			update_worker_timing_details(connection, do_unqueued_work);
2103 			do_unqueued_work(connection);
2104 		}
2105 
2106 		if (signal_pending(current)) {
2107 			flush_signals(current);
2108 			if (get_t_state(thi) == RUNNING) {
2109 				drbd_warn(connection, "Worker got an unexpected signal\n");
2110 				continue;
2111 			}
2112 			break;
2113 		}
2114 
2115 		if (get_t_state(thi) != RUNNING)
2116 			break;
2117 
2118 		if (!list_empty(&work_list)) {
2119 			w = list_first_entry(&work_list, struct drbd_work, list);
2120 			list_del_init(&w->list);
2121 			update_worker_timing_details(connection, w->cb);
2122 			if (w->cb(w, connection->cstate < C_WF_REPORT_PARAMS) == 0)
2123 				continue;
2124 			if (connection->cstate >= C_WF_REPORT_PARAMS)
2125 				conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
2126 		}
2127 	}
2128 
2129 	do {
2130 		if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags)) {
2131 			update_worker_timing_details(connection, do_unqueued_work);
2132 			do_unqueued_work(connection);
2133 		}
2134 		if (!list_empty(&work_list)) {
2135 			w = list_first_entry(&work_list, struct drbd_work, list);
2136 			list_del_init(&w->list);
2137 			update_worker_timing_details(connection, w->cb);
2138 			w->cb(w, 1);
2139 		} else
2140 			dequeue_work_batch(&connection->sender_work, &work_list);
2141 	} while (!list_empty(&work_list) || test_bit(DEVICE_WORK_PENDING, &connection->flags));
2142 
2143 	rcu_read_lock();
2144 	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
2145 		struct drbd_device *device = peer_device->device;
2146 		D_ASSERT(device, device->state.disk == D_DISKLESS && device->state.conn == C_STANDALONE);
2147 		kref_get(&device->kref);
2148 		rcu_read_unlock();
2149 		drbd_device_cleanup(device);
2150 		kref_put(&device->kref, drbd_destroy_device);
2151 		rcu_read_lock();
2152 	}
2153 	rcu_read_unlock();
2154 
2155 	return 0;
2156 }
2157