/*
   md_k.h : kernel internal structure of the Linux MD driver
          Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

#ifndef _MD_MD_H
#define _MD_MD_H

#include <linux/blkdev.h>
#include <linux/kobject.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/timer.h>
#include <linux/wait.h>
#include <linux/workqueue.h>

#define MaxSector (~(sector_t)0)

typedef struct mddev_s mddev_t;
typedef struct mdk_rdev_s mdk_rdev_t;

/*
 * MD's 'extended' device
 */
struct mdk_rdev_s
{
	struct list_head same_set;	/* RAID devices within the same set */

	sector_t sectors;		/* Device size (in 512-byte sectors) */
	mddev_t *mddev;			/* RAID array if running */
	int last_events;		/* IO event timestamp */

	/*
	 * If meta_bdev is non-NULL, it means that a separate device is
	 * being used to store the metadata (superblock/bitmap) which
	 * would otherwise be contained on the same device as the data (bdev).
	 */
	struct block_device *meta_bdev;
	struct block_device *bdev;	/* block device handle */

	struct page	*sb_page;
	int		sb_loaded;
	__u64		sb_events;
	sector_t	data_offset;	/* start of data in array */
	sector_t	sb_start;	/* offset of the super block (in 512-byte sectors) */
	int		sb_size;	/* bytes in the superblock */
	int		preferred_minor;	/* autorun support */

	struct kobject	kobj;

	/* A device can be in one of three states based on two flags:
	 * Not working:    faulty==1 in_sync==0
	 * Fully working:  faulty==0 in_sync==1
	 * Working, but not
	 * in sync with array
	 *                 faulty==0 in_sync==0
	 *
	 * It can never have faulty==1, in_sync==1
	 * This reduces the burden of testing multiple flags in many cases
	 */

	unsigned long	flags;
#define	Faulty		1		/* device is known to have a fault */
#define	In_sync		2		/* device is in_sync with rest of array */
#define	WriteMostly	4		/* Avoid reading if at all possible */
#define	AutoDetected	7		/* added by auto-detect */
#define	Blocked		8		/* An error occurred on an externally
					 * managed array, don't allow writes
					 * until it is cleared */
	wait_queue_head_t blocked_wait;

	int desc_nr;			/* descriptor index in the superblock */
	int raid_disk;			/* role of device in array */
	int new_raid_disk;		/* role that the device will have in
					 * the array after a level-change completes.
					 */
	int saved_raid_disk;		/* role that device used to have in the
					 * array and could again if we did a partial
					 * resync from the bitmap
					 */
	sector_t recovery_offset;	/* If this device has been partially
					 * recovered, this is where we were
					 * up to.
					 */

	atomic_t nr_pending;		/* number of pending requests.
					 * only maintained for arrays that
					 * support hot removal
					 */
	atomic_t read_errors;		/* number of consecutive read errors that
					 * we have tried to ignore.
					 */
	struct timespec last_read_error;	/* monotonic time since our
						 * last read error
						 */
	atomic_t corrected_errors;	/* number of corrected read errors,
					 * for reporting to userspace and storing
					 * in superblock.
					 */
	struct work_struct del_work;	/* used for delayed sysfs removal */

	struct sysfs_dirent *sysfs_state;	/* handle for 'state'
						 * sysfs entry */
};
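/*
 * Illustrative sketch, not part of the driver API: the three-state
 * model documented above falls out of the two flag bits.  The helper
 * name below is hypothetical.
 *
 *	static inline int rdev_out_of_sync(mdk_rdev_t *rdev)
 *	{
 *		// working (not faulty) but not yet in sync with the array
 *		return !test_bit(Faulty, &rdev->flags) &&
 *		       !test_bit(In_sync, &rdev->flags);
 *	}
 */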
struct mddev_s
{
	void				*private;
	struct mdk_personality		*pers;
	dev_t				unit;
	int				md_minor;
	struct list_head		disks;
	unsigned long			flags;
#define MD_CHANGE_DEVS	0	/* Some device status has changed */
#define MD_CHANGE_CLEAN 1	/* transition to or from 'clean' */
#define MD_CHANGE_PENDING 2	/* switch from 'clean' to 'active' in progress */

	int				suspended;
	atomic_t			active_io;
	int				ro;
	int				sysfs_active;	/* set when sysfs deletes
							 * are happening, so run/
							 * takeover/stop are not safe
							 */
	int				ready;		/* See when safe to pass
							 * IO requests down */
	struct gendisk			*gendisk;

	struct kobject			kobj;
	int				hold_active;
#define	UNTIL_IOCTL	1
#define	UNTIL_STOP	2

	/* Superblock information */
	int				major_version,
					minor_version,
					patch_version;
	int				persistent;
	int				external;	/* metadata is
							 * managed externally */
	char				metadata_type[17]; /* externally set */
	int				chunk_sectors;
	time_t				ctime, utime;
	int				level, layout;
	char				clevel[16];
	int				raid_disks;
	int				max_disks;
	sector_t			dev_sectors;	/* used size of
							 * component devices */
	sector_t			array_sectors;	/* exported array size */
	int				external_size;	/* size managed
							 * externally */
	__u64				events;
	/* If the last 'event' was simply a clean->dirty transition, and
	 * we didn't write it to the spares, then it is safe and simple
	 * to just decrement the event count on a dirty->clean transition.
	 * So we record that possibility here.
	 */
	int				can_decrease_events;

	char				uuid[16];

	/* If the array is being reshaped, we need to record the
	 * new shape and an indication of where we are up to.
	 * This is written to the superblock.
	 * If reshape_position is MaxSector, then no reshape is happening (yet).
	 */
	sector_t			reshape_position;
	int				delta_disks, new_level, new_layout;
	int				new_chunk_sectors;

	atomic_t			plug_cnt;	/* If device is expecting
							 * more bios soon.
							 */
	struct mdk_thread_s		*thread;	/* management thread */
	struct mdk_thread_s		*sync_thread;	/* doing resync or reconstruct */
	sector_t			curr_resync;	/* last block scheduled */
	/* As resync requests can complete out of order, we cannot easily track
	 * how much resync has been completed. So we occasionally pause until
	 * everything completes, then set curr_resync_completed to curr_resync.
	 * As such it may be well behind the real resync mark, but it is a value
	 * we are certain of.
	 */
	sector_t			curr_resync_completed;
	unsigned long			resync_mark;	/* a recent timestamp */
	sector_t			resync_mark_cnt;/* blocks written at resync_mark */
	sector_t			curr_mark_cnt;	/* blocks scheduled now */

	sector_t			resync_max_sectors; /* may be set by personality */

	sector_t			resync_mismatches; /* count of sectors where
							    * parity/replica mismatch found
							    */

	/* allow user-space to request suspension of IO to regions of the array */
	sector_t			suspend_lo;
	sector_t			suspend_hi;
	/* if zero, use the system-wide default */
	int				sync_speed_min;
	int				sync_speed_max;

	/* resync even though the same disks are shared among md-devices */
	int				parallel_resync;

	int				ok_start_degraded;
	/* recovery/resync flags
	 * NEEDED:   we might need to start a resync/recover
	 * RUNNING:  a thread is running, or about to be started
	 * SYNC:     actually doing a resync, not a recovery
	 * RECOVER:  doing recovery, or need to try it.
	 * INTR:     resync needs to be aborted for some reason
	 * DONE:     thread is done and is waiting to be reaped
	 * REQUEST:  user-space has requested a sync (used with SYNC)
	 * CHECK:    user-space request for check-only, no repair
	 * RESHAPE:  A reshape is happening
	 *
	 * If neither SYNC nor RESHAPE is set, then it is a recovery.
	 */
#define	MD_RECOVERY_RUNNING	0
#define	MD_RECOVERY_SYNC	1
#define	MD_RECOVERY_RECOVER	2
#define	MD_RECOVERY_INTR	3
#define	MD_RECOVERY_DONE	4
#define	MD_RECOVERY_NEEDED	5
#define	MD_RECOVERY_REQUESTED	6
#define	MD_RECOVERY_CHECK	7
#define	MD_RECOVERY_RESHAPE	8
#define	MD_RECOVERY_FROZEN	9

	unsigned long			recovery;
	int				recovery_disabled; /* if we detect that recovery
							    * will always fail, set this
							    * so we don't loop trying */
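	/*
	 * Illustrative sketch, not in-tree code: one way the recovery
	 * word above might be decoded.  The helper name is hypothetical.
	 *
	 *	static inline const char *sync_kind(mddev_t *mddev)
	 *	{
	 *		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
	 *			return "reshape";
	 *		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
	 *			// a user request with CHECK set is check-only, no repair
	 *			return test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
	 *				? "check" : "resync";
	 *		return "recover";	// neither SYNC nor RESHAPE set
	 *	}
	 */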
	int				in_sync;	/* known to not need resync */
	/* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so
	 * that we are never stopping an array while it is open.
	 * 'reconfig_mutex' protects all other reconfiguration.
	 * These locks are separate due to conflicting interactions
	 * with bdev->bd_mutex.
	 * Lock ordering is:
	 *  reconfig_mutex -> bd_mutex : e.g. do_md_run -> revalidate_disk
	 *  bd_mutex -> open_mutex:  e.g. __blkdev_get -> md_open
	 */
	struct mutex			open_mutex;
	struct mutex			reconfig_mutex;
	atomic_t			active;		/* general refcount */
	atomic_t			openers;	/* number of active opens */

	int				changed;	/* True if we might need to
							 * reread partition info */
	int				degraded;	/* whether md should consider
							 * adding a spare
							 */

	atomic_t			recovery_active; /* blocks scheduled, but not written */
	wait_queue_head_t		recovery_wait;
	sector_t			recovery_cp;
	sector_t			resync_min;	/* user requested sync
							 * starts here */
	sector_t			resync_max;	/* resync should pause
							 * when it gets here */

	struct sysfs_dirent		*sysfs_state;	/* handle for 'array_state'
							 * file in sysfs.
							 */
	struct sysfs_dirent		*sysfs_action;	/* handle for 'sync_action' */

	struct work_struct		del_work;	/* used for delayed sysfs removal */

	spinlock_t			write_lock;
	wait_queue_head_t		sb_wait;	/* for waiting on superblock updates */
	atomic_t			pending_writes;	/* number of active superblock writes */

	unsigned int			safemode;	/* if set, update "clean" superblock
							 * when no writes pending.
							 */
	unsigned int			safemode_delay;
	struct timer_list		safemode_timer;
	atomic_t			writes_pending;
	struct request_queue		*queue;		/* for plugging ... */

	struct bitmap			*bitmap;	/* the bitmap for the device */
	struct {
		struct file		*file;		/* the bitmap file */
		loff_t			offset;		/* offset from superblock of
							 * start of bitmap. May be
							 * negative, but not '0'
							 * For external metadata, offset
							 * from start of device.
							 */
		loff_t			default_offset;	/* this is the offset to use when
							 * hot-adding a bitmap.  It should
							 * eventually be settable by sysfs.
							 */
		/* When md is serving under dm, it might use a
		 * dirty_log to store the bits.
		 */
		struct dm_dirty_log	*log;

		struct mutex		mutex;
		unsigned long		chunksize;
		unsigned long		daemon_sleep;	/* how many jiffies between updates? */
		unsigned long		max_write_behind; /* write-behind mode */
		int			external;
	} bitmap_info;

	atomic_t			max_corr_read_errors; /* max read retries */
	struct list_head		all_mddevs;

	struct attribute_group		*to_remove;

	struct bio_set			*bio_set;

	/* Generic flush handling.
	 * The last to finish preflush schedules a worker to submit
	 * the rest of the request (without the REQ_FLUSH flag).
	 */
	struct bio			*flush_bio;
	atomic_t			flush_pending;
	struct work_struct		flush_work;
	struct work_struct		event_work;	/* used by dm to report failure event */
};
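/*
 * Illustrative sketch of the flush completion pattern described in the
 * comment above (not the in-tree implementation): the completion of
 * each per-device preflush would do something like
 *
 *	if (atomic_dec_and_test(&mddev->flush_pending))
 *		// last preflush finished; submit the payload from a worker
 *		schedule_work(&mddev->flush_work);
 */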

static inline void rdev_dec_pending(mdk_rdev_t *rdev, mddev_t *mddev)
{
	int faulty = test_bit(Faulty, &rdev->flags);
	if (atomic_dec_and_test(&rdev->nr_pending) && faulty)
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
}

static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
{
	atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io);
}

struct mdk_personality
{
	char *name;
	int level;
	struct list_head list;
	struct module *owner;
	int (*make_request)(mddev_t *mddev, struct bio *bio);
	int (*run)(mddev_t *mddev);
	int (*stop)(mddev_t *mddev);
	void (*status)(struct seq_file *seq, mddev_t *mddev);
	/* error_handler must set ->faulty and clear ->in_sync
	 * if appropriate, and should abort recovery if needed
	 */
	void (*error_handler)(mddev_t *mddev, mdk_rdev_t *rdev);
	int (*hot_add_disk) (mddev_t *mddev, mdk_rdev_t *rdev);
	int (*hot_remove_disk) (mddev_t *mddev, int number);
	int (*spare_active) (mddev_t *mddev);
	sector_t (*sync_request)(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster);
	int (*resize) (mddev_t *mddev, sector_t sectors);
	sector_t (*size) (mddev_t *mddev, sector_t sectors, int raid_disks);
	int (*check_reshape) (mddev_t *mddev);
	int (*start_reshape) (mddev_t *mddev);
	void (*finish_reshape) (mddev_t *mddev);
	/* quiesce moves between quiescence states
	 * 0 - fully active
	 * 1 - no new requests allowed
	 * others - reserved
	 */
	void (*quiesce) (mddev_t *mddev, int state);
	/* takeover is used to transition an array from one
	 * personality to another.  The new personality must be able
	 * to handle the data in the current layout.
	 * e.g. 2drive raid1 -> 2drive raid5
	 *      ndrive raid5 -> degraded n+1drive raid6 with special layout
	 * If the takeover succeeds, a new 'private' structure is returned.
	 * This needs to be installed and then ->run used to activate the
	 * array.
	 */
	void *(*takeover) (mddev_t *mddev);
};
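/*
 * Illustrative sketch, assuming a hypothetical "demo" personality (the
 * demo_* names are not in-tree): a personality is typically a static
 * table registered at module init and unregistered at exit, using the
 * register_md_personality()/unregister_md_personality() declarations
 * later in this header.
 *
 *	static struct mdk_personality demo_personality = {
 *		.name		= "demo",
 *		.level		= -1000,	// hypothetical level number
 *		.owner		= THIS_MODULE,
 *		.make_request	= demo_make_request,
 *		.run		= demo_run,
 *		.stop		= demo_stop,
 *		.status		= demo_status,
 *	};
 *
 *	static int __init demo_init(void)
 *	{
 *		return register_md_personality(&demo_personality);
 *	}
 *
 *	static void __exit demo_exit(void)
 *	{
 *		unregister_md_personality(&demo_personality);
 *	}
 */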

struct md_sysfs_entry {
	struct attribute attr;
	ssize_t (*show)(mddev_t *, char *);
	ssize_t (*store)(mddev_t *, const char *, size_t);
};
extern struct attribute_group md_bitmap_group;

static inline struct sysfs_dirent *sysfs_get_dirent_safe(struct sysfs_dirent *sd, char *name)
{
	if (sd)
		return sysfs_get_dirent(sd, NULL, name);
	return sd;
}
static inline void sysfs_notify_dirent_safe(struct sysfs_dirent *sd)
{
	if (sd)
		sysfs_notify_dirent(sd);
}

static inline char *mdname(mddev_t *mddev)
{
	return mddev->gendisk ? mddev->gendisk->disk_name : "mdX";
}

/*
 * Iterates through some rdev ringlist.  It's safe to remove the
 * current 'rdev'; don't touch 'tmp' though.
 */
#define rdev_for_each_list(rdev, tmp, head)				\
	list_for_each_entry_safe(rdev, tmp, head, same_set)

/*
 * Iterates through the 'same array disks' ringlist
 */
#define rdev_for_each(rdev, tmp, mddev)					\
	list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set)

#define rdev_for_each_rcu(rdev, mddev)					\
	list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set)
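/*
 * Example usage of the iterators above (illustrative only): walk the
 * member devices of an array.  The _safe variant ('tmp') makes it
 * legal to unlink the current 'rdev' inside the loop body.
 *
 *	mdk_rdev_t *rdev, *tmp;
 *
 *	rdev_for_each(rdev, tmp, mddev) {
 *		if (test_bit(Faulty, &rdev->flags))
 *			continue;	// skip failed devices
 *		// ... operate on each working device ...
 *	}
 */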
typedef struct mdk_thread_s {
	void			(*run) (mddev_t *mddev);
	mddev_t			*mddev;
	wait_queue_head_t	wqueue;
	unsigned long		flags;
	struct task_struct	*tsk;
	unsigned long		timeout;
} mdk_thread_t;

#define THREAD_WAKEUP  0

#define __wait_event_lock_irq(wq, condition, lock, cmd)			\
do {									\
	wait_queue_t __wait;						\
	init_waitqueue_entry(&__wait, current);				\
									\
	add_wait_queue(&wq, &__wait);					\
	for (;;) {							\
		set_current_state(TASK_UNINTERRUPTIBLE);		\
		if (condition)						\
			break;						\
		spin_unlock_irq(&lock);					\
		cmd;							\
		schedule();						\
		spin_lock_irq(&lock);					\
	}								\
	current->state = TASK_RUNNING;					\
	remove_wait_queue(&wq, &__wait);				\
} while (0)

#define wait_event_lock_irq(wq, condition, lock, cmd)			\
do {									\
	if (condition)							\
		break;							\
	__wait_event_lock_irq(wq, condition, lock, cmd);		\
} while (0)

static inline void safe_put_page(struct page *p)
{
	if (p) put_page(p);
}

extern int register_md_personality(struct mdk_personality *p);
extern int unregister_md_personality(struct mdk_personality *p);
extern mdk_thread_t *md_register_thread(void (*run) (mddev_t *mddev),
				mddev_t *mddev, const char *name);
extern void md_unregister_thread(mdk_thread_t *thread);
extern void md_wakeup_thread(mdk_thread_t *thread);
extern void md_check_recovery(mddev_t *mddev);
extern void md_write_start(mddev_t *mddev, struct bio *bi);
extern void md_write_end(mddev_t *mddev);
extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);

extern int mddev_congested(mddev_t *mddev, int bits);
extern void md_flush_request(mddev_t *mddev, struct bio *bio);
extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
			   sector_t sector, int size, struct page *page);
extern void md_super_wait(mddev_t *mddev);
extern int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
			struct page *page, int rw, bool metadata_op);
extern void md_do_sync(mddev_t *mddev);
extern void md_new_event(mddev_t *mddev);
extern int md_allow_write(mddev_t *mddev);
extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors);
extern int md_check_no_bitmap(mddev_t *mddev);
extern int md_integrity_register(mddev_t *mddev);
extern void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
extern void restore_bitmap_write_access(struct file *file);

extern void mddev_init(mddev_t *mddev);
extern int md_run(mddev_t *mddev);
extern void md_stop(mddev_t *mddev);
extern void md_stop_writes(mddev_t *mddev);
extern void md_rdev_init(mdk_rdev_t *rdev);

extern void mddev_suspend(mddev_t *mddev);
extern void mddev_resume(mddev_t *mddev);
extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
				   mddev_t *mddev);
extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
				   mddev_t *mddev);
extern int mddev_check_plugged(mddev_t *mddev);
#endif /* _MD_MD_H */