block/drbd/drbd_int.h

1 /* SPDX-License-Identifier: GPL-2.0-only */
7   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
8   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
9   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
29 #include <linux/backing-dev.h>
63 #define ID_SYNCER (-1ULL)
125 	/* statistics; index: (h->command == P_BITMAP) */
143 	c->word_offset = c->bit_offset >> 6;  in bm_xfer_ctx_bit_to_word_offset()
145 	c->word_offset = c->bit_offset >> 5;  in bm_xfer_ctx_bit_to_word_offset()
146 	c->word_offset &= ~(1UL);  in bm_xfer_ctx_bit_to_word_offset()
178 	 *	--lge */  in get_t_state()
181 	return thi->t_state;  in get_t_state()
235 	/* Minimal set of time stamps to determine if we wait for activity log
236 	 * transactions, local disk or peer.  32 bit "jiffies" are good enough,
243 	/* local disk */
259 	 *      how long did we wait for activity log transactions
340 	((peer_req)->opf & REQ_OP_MASK)
346  * non-atomic modification to ee->flags is ok.
354 	/* explicit zero-out requested, or
416 	MD_DIRTY,		/* current uuids and flags not yet on disk */
417 	USE_DEGR_WFC_T,		/* degr-wfc-timeout instead of wfc-timeout. */
430 	WAS_IO_ERROR,		/* Local disk failed, returned IO error */
431 	WAS_READ_ERROR,		/* Local disk READ failed (set additionally to the above) */
432 	FORCE_DETACH,		/* Force-detach from local disk, aborting any pending local IO */
437 	AL_SUSPENDED,		/* Activity logging is currently suspended. */
438 	AHEAD_TO_SYNC_SOURCE,   /* Ahead -> SyncSource queued */
443 	FLUSH_PENDING,		/* if set, device->flush_jif is when we submitted that flush
447 	GOING_DISKLESS,		/* Disk is being detached, because of io-error, or admin request. */
472 	 * and still allow all non-bulk operations */
480 	 * requires sending of "out-of-sync" information, though. */
513 	s32 al_offset;	/* signed relative sector offset to activity log */
516 	/* cached value of bdev->disk_conf->meta_dev_idx (see below) */
529 	struct disk_conf *disk_conf; /* RCU, for updates: resource->conf_update */
595 	struct mutex conf_update;	/* mutex for read-copy-update of net_conf and disk_conf */
650 …pto_shash *integrity_tfm;  /* checksums we compute, updates protected by connection->data->mutex */
674 	 * protected by resource->req_lock */
699 		 * with req->epoch == current_epoch_nr.
710 	has_net_conf = rcu_dereference(connection->net_conf);  in has_net_conf()
723 	__update_timing_details(c->w_timing_details, &c->w_cb_nr, cb, __func__ , __LINE__ )
725 	__update_timing_details(c->r_timing_details, &c->r_cb_nr, cb, __func__ , __LINE__ )
731 	/* protected by ..->resource->req_lock */
766 	/* things that are stored as / read from meta data on disk */
772 	sector_t p_size;     /* partner's disk size */
784 	/* Used after attach while negotiating new disk state. */
797 	atomic_t ap_actlog_cnt;  /* Requests waiting for activity log */
827 	/* block not up-to-date at mark [unit BM_BLOCK_SIZE] */
842 	/* size of out-of-sync range in sectors. */
859 	struct list_head active_ee; /* IO in progress (P_DATA gets written to disk) */
860 	struct list_head sync_ee;   /* IO in progress (P_RS_DATA_REPLY gets written to disk) */
863 	struct list_head net_ee;    /* zero-copy network send in progress */
873 	struct lru_cache *act_log;	/* activity log */
884 …struct mutex *state_mutex; /* either own_state_mutex or first_peer_device(device)->connection->cst…
892 …struct fifo_buffer *rs_plan_s; /* correction values of resync planner (RCU, connection->conn_update…
899 	 * are deferred to this single-threaded work queue */
905 	struct list_head list; /* on device->pending_bitmap_io */;
923 #define VOLUME_UNSPECIFIED		(-1U)
947 	return list_first_entry_or_null(&device->peer_devices, struct drbd_peer_device, peer_devices);  in first_peer_device()
953 	return idr_find(&connection->peer_devices, volume_number);  in conn_peer_device()
966 	list_for_each_entry(connection, &resource->connections, connections)
969 	list_for_each_entry_rcu(connection, &resource->connections, connections)
972 	list_for_each_entry_safe(connection, tmp, &resource->connections, connections)
975 	list_for_each_entry(peer_device, &device->peer_devices, peer_devices)
978 	list_for_each_entry_rcu(peer_device, &device->peer_devices, peer_devices)
981 	list_for_each_entry_safe(peer_device, tmp, &device->peer_devices, peer_devices)
985 	return device->minor;  in device_to_minor()
1092  *   |----------- md_size_sect ------------------|
1093  *   [ 4k superblock ][ activity log ][  Bitmap  ]
1096  *  ==> bitmap sectors = md_size_sect - bm_offset
1102  *            |----------- md_size_sect ------------------|
1103  * [data.....][  Bitmap  ][ activity log ][ 4k superblock ][padding*]
1105  *            | bm_offset = al_offset - Y |
1106  *  ==> bitmap sectors = Y = al_offset - bm_offset
1111  *  The activity log consists of 4k transaction blocks,
1112  *  which are written in a ring-buffer, or striped ring-buffer like fashion,
1124 /* One activity log extent represents 4M of storage */
1129  * variables at create-md time (or even re-configurable at runtime?).
1143 #define AL_CONTEXT_PER_TRANSACTION	919	// (4096 - 36 - 6*64)/4
1173  * Example: 1GB disk @ 4096 byte blocks ==> we need 32 KB bitmap.
1195 #define BM_SECT_TO_BIT(x)   ((x)>>(BM_BLOCK_SHIFT-9))
1196 #define BM_BIT_TO_SECT(x)   ((sector_t)(x)<<(BM_BLOCK_SHIFT-9))
1200 #define Bit2KB(bits) ((bits)<<(BM_BLOCK_SHIFT-10))
1204 #define BM_SECT_TO_EXT(x)   ((x)>>(BM_EXT_SHIFT-9))
1205 #define BM_BIT_TO_EXT(x)    ((x) >> (BM_EXT_SHIFT - BM_BLOCK_SHIFT))
1208 #define BM_EXT_TO_SECT(x)   ((sector_t)(x) << (BM_EXT_SHIFT-9))
1212 #define BM_BITS_PER_EXT     (1UL << (BM_EXT_SHIFT - BM_BLOCK_SHIFT))
1214 #define BM_BLOCKS_PER_BM_EXT_MASK  (BM_BITS_PER_EXT - 1)
1218 #define AL_EXT_PER_BM_SECT  (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT))
1220 /* the extent in "PER_EXTENT" below is an activity log extent
1225  * bit	 0	  bit 37   bit 38	     bit (512*8)-1
1227  * sect. 0	 `296	  `304			   ^(512*8*8)-1
1235 /* we have a certain meta data variant that has a fixed on-disk size of 128
1236  * MiB, of which 4k are our "superblock", and 32k are the fixed size activity
1241 	  ((MD_128MB_SECT - MD_32kB_SECT - MD_4kB_SECT) * (1LL<<(BM_EXT_SHIFT-9)))
1257  * Since we may live in a mixed-platform cluster,
1271  * activity log transaction to be discarded in one go. We may need to rework
1331 extern struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
1353 /* We also need a standard (emergency-reserve backed) page pool
1354  * for meta data IO (activity log, bitmap).
1404 	DS_ERROR_SHRINK = -3,
1405 	DS_ERROR_SPACE_MD = -2,
1406 	DS_ERROR = -1,
1450 	struct drbd_device *device = peer_device->device;  in ov_out_of_sync_print()
1452 	if (device->ov_last_oos_size) {  in ov_out_of_sync_print()
1454 		     (unsigned long long)device->ov_last_oos_start,  in ov_out_of_sync_print()
1455 		     (unsigned long)device->ov_last_oos_size);  in ov_out_of_sync_print()
1457 	device->ov_last_oos_size = 0;  in ov_out_of_sync_print()
1519 	if (!bio->bi_bdev) {  in drbd_submit_bio_noacct()
1520 		drbd_err(device, "drbd_submit_bio_noacct: bio->bi_bdev == NULL\n");  in drbd_submit_bio_noacct()
1521 		bio->bi_status = BLK_STS_IOERR;  in drbd_submit_bio_noacct()
1625 	struct page *page = peer_req->pages;  in drbd_peer_req_has_active_page()
1635 	struct drbd_resource *resource = device->resource;  in drbd_read_state()
1638 	rv.i = device->state.i;  in drbd_read_state()
1639 	rv.susp = resource->susp;  in drbd_read_state()
1640 	rv.susp_nod = resource->susp_nod;  in drbd_read_state()
1641 	rv.susp_fen = resource->susp_fen;  in drbd_read_state()
1661 	ep = rcu_dereference(device->ldev->disk_conf)->on_io_error;  in __drbd_chk_io_error_()
1668 			if (device->state.disk > D_INCONSISTENT)  in __drbd_chk_io_error_()
1669 				_drbd_set_state(_NS(device, disk, D_INCONSISTENT), CS_HARD, NULL);  in __drbd_chk_io_error_()
1678 		 * by the activity log.  in __drbd_chk_io_error_()
1681 		 * blocks, which triggers block re-allocation in lower layers.  in __drbd_chk_io_error_()
1686 		 * Force-detach is not really an IO error, but rather a  in __drbd_chk_io_error_()
1695 		set_bit(WAS_IO_ERROR, &device->flags);  in __drbd_chk_io_error_()
1697 			set_bit(WAS_READ_ERROR, &device->flags);  in __drbd_chk_io_error_()
1699 			set_bit(FORCE_DETACH, &device->flags);  in __drbd_chk_io_error_()
1700 		if (device->state.disk > D_FAILED) {  in __drbd_chk_io_error_()
1701 			_drbd_set_state(_NS(device, disk, D_FAILED), CS_HARD, NULL);  in __drbd_chk_io_error_()
1715  * See also drbd_main.c:after_state_ch() if (os.disk > D_FAILED && ns.disk == D_FAILED)
1723 		spin_lock_irqsave(&device->resource->req_lock, flags);  in drbd_chk_io_error_()
1725 		spin_unlock_irqrestore(&device->resource->req_lock, flags);  in drbd_chk_io_error_()
1731  * drbd_md_first_sector() - Returns the first sector number of the meta data area
1739 	switch (bdev->md.meta_dev_idx) {  in drbd_md_first_sector()
1742 		return bdev->md.md_offset + bdev->md.bm_offset;  in drbd_md_first_sector()
1745 		return bdev->md.md_offset;  in drbd_md_first_sector()
1750  * drbd_md_last_sector() - Return the last sector number of the meta data area
1755 	switch (bdev->md.meta_dev_idx) {  in drbd_md_last_sector()
1758 		return bdev->md.md_offset + MD_4kB_SECT -1;  in drbd_md_last_sector()
1761 		return bdev->md.md_offset + bdev->md.md_size_sect -1;  in drbd_md_last_sector()
1772  * drbd_get_max_capacity() - Returns the capacity we announce to our peer
1783 	switch (bdev->md.meta_dev_idx) {  in drbd_get_max_capacity()
1786 		s = drbd_get_capacity(bdev->backing_bdev)  in drbd_get_max_capacity()
1793 				drbd_get_capacity(bdev->backing_bdev));  in drbd_get_max_capacity()
1796 			BM_EXT_TO_SECT(bdev->md.md_size_sect  in drbd_get_max_capacity()
1797 				     - bdev->md.bm_offset));  in drbd_get_max_capacity()
1801 				drbd_get_capacity(bdev->backing_bdev));  in drbd_get_max_capacity()
1807  * drbd_md_ss() - Return the sector number of our meta data super block
1812 	const int meta_dev_idx = bdev->md.meta_dev_idx;  in drbd_md_ss()
1821 		return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) - 8;  in drbd_md_ss()
1824 	return MD_128MB_SECT * bdev->md.meta_dev_idx;  in drbd_md_ss()
1831 	spin_lock_irqsave(&q->q_lock, flags);  in drbd_queue_work()
1832 	list_add_tail(&w->list, &q->q);  in drbd_queue_work()
1833 	spin_unlock_irqrestore(&q->q_lock, flags);  in drbd_queue_work()
1834 	wake_up(&q->q_wait);  in drbd_queue_work()
1841 	spin_lock_irqsave(&q->q_lock, flags);  in drbd_queue_work_if_unqueued()
1842 	if (list_empty_careful(&w->list))  in drbd_queue_work_if_unqueued()
1843 		list_add_tail(&w->list, &q->q);  in drbd_queue_work_if_unqueued()
1844 	spin_unlock_irqrestore(&q->q_lock, flags);  in drbd_queue_work_if_unqueued()
1845 	wake_up(&q->q_wait);  in drbd_queue_work_if_unqueued()
1851 	if (!test_and_set_bit(work_bit, &device->flags)) {  in drbd_device_post_work()
1853 			first_peer_device(device)->connection;  in drbd_device_post_work()
1854 		struct drbd_work_queue *q = &connection->sender_work;  in drbd_device_post_work()
1855 		if (!test_and_set_bit(DEVICE_WORK_PENDING, &connection->flags))  in drbd_device_post_work()
1856 			wake_up(&q->q_wait);  in drbd_device_post_work()
1863  * so it can change its sk_rcvtimeo from idle- to ping-timeout,
1868 	struct task_struct *task = connection->ack_receiver.task;  in wake_ack_receiver()
1869 	if (task && get_t_state(&connection->ack_receiver) == RUNNING)  in wake_ack_receiver()
1875 	set_bit(SEND_PING, &connection->flags);  in request_ping()
1916  *    (drbd_make_request_common; recovery path on read io-error)
1932 	atomic_inc(&device->ap_pending_cnt);  in inc_ap_pending()
1938 	int ap_pending_cnt = atomic_dec_return(&device->ap_pending_cnt);  in __dec_ap_pending()
1941 		wake_up(&device->misc_wait);  in __dec_ap_pending()
1945 /* counts how many resync-related answers we still expect from the peer
1953 	atomic_inc(&peer_device->device->rs_pending_cnt);  in inc_rs_pending()
1960 	return atomic_dec_return(&peer_device->device->rs_pending_cnt);  in __dec_rs_pending()
1974 	atomic_inc(&device->unacked_cnt);  in inc_unacked()
1980 	return atomic_dec_return(&device->unacked_cnt);  in __dec_unacked()
1986 	return atomic_sub_return(n, &device->unacked_cnt);  in __sub_unacked()
2008  * get_ldev() - Increase the ref count on device->ldev. Returns 0 if there is no ldev
2012  * You have to call put_ldev() when finished working with device->ldev.
2021 	enum drbd_disk_state disk_state = device->state.disk;  in put_ldev()
2026 	int i = atomic_dec_return(&device->local_cnt);  in put_ldev()
2039 			if (!test_and_set_bit(GOING_DISKLESS, &device->flags))  in put_ldev()
2041 		wake_up(&device->misc_wait);  in put_ldev()
2051 	if (device->state.disk == D_DISKLESS)  in _get_ldev_if_state()
2054 	atomic_inc(&device->local_cnt);  in _get_ldev_if_state()
2055 	io_allowed = (device->state.disk >= mins);  in _get_ldev_if_state()
2064 /* this throttles on-the-fly application requests
2066  * maybe re-implement using semaphores? */
2073 	nc = rcu_dereference(first_peer_device(device)->connection->net_conf);  in drbd_get_max_buffers()
2074 	mxb = nc ? nc->max_buffers : 1000000;  /* arbitrary limit on open requests */  in drbd_get_max_buffers()
2082 	union drbd_dev_state s = device->state;  in drbd_state_is_stable()
2116 		if (first_peer_device(device)->connection->agreed_pro_version < 96)  in drbd_state_is_stable()
2128 	switch ((enum drbd_disk_state)s.disk) {  in drbd_state_is_stable()
2135 		/* disk state is stable as well. */  in drbd_state_is_stable()
2152 	struct drbd_resource *resource = device->resource;  in drbd_suspended()
2154 	return resource->susp || resource->susp_fen || resource->susp_nod;  in drbd_suspended()
2163 	if (atomic_read(&device->suspend_cnt))  in may_inc_ap_bio()
2170 	/* no new io accepted when attaching or detaching the disk */  in may_inc_ap_bio()
2176 	if (atomic_read(&device->ap_bio_cnt) > mxb)  in may_inc_ap_bio()
2178 	if (test_bit(BITMAP_IO, &device->flags))  in may_inc_ap_bio()
2187 	spin_lock_irq(&device->resource->req_lock);  in inc_ap_bio_cond()
2190 		atomic_inc(&device->ap_bio_cnt);  in inc_ap_bio_cond()
2191 	spin_unlock_irq(&device->resource->req_lock);  in inc_ap_bio_cond()
2206 	wait_event(device->misc_wait, inc_ap_bio_cond(device));  in inc_ap_bio()
2212 	int ap_bio = atomic_dec_return(&device->ap_bio_cnt);  in dec_ap_bio()
2216 	if (ap_bio == 0 && test_bit(BITMAP_IO, &device->flags)) {  in dec_ap_bio()
2217 		if (!test_and_set_bit(BITMAP_IO_QUEUED, &device->flags))  in dec_ap_bio()
2218 			drbd_queue_work(&first_peer_device(device)->  in dec_ap_bio()
2219 				connection->sender_work,  in dec_ap_bio()
2220 				&device->bm_io_work.w);  in dec_ap_bio()
2227 		wake_up(&device->misc_wait);  in dec_ap_bio()
2232 	return first_peer_device(device)->connection->agreed_pro_version >= 97 &&  in verify_can_do_stop_sector()
2233 		first_peer_device(device)->connection->agreed_pro_version != 100;  in verify_can_do_stop_sector()
2238 	int changed = device->ed_uuid != val;  in drbd_set_ed_uuid()
2239 	device->ed_uuid = val;  in drbd_set_ed_uuid()
2255 	return list_first_entry_or_null(&resource->connections,  in first_connection()