/*
   md_k.h : kernel internal structure of the Linux MD driver
            Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

#ifndef _MD_MD_H
#define _MD_MD_H

#include <linux/blkdev.h>
#include <linux/kobject.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/timer.h>
#include <linux/wait.h>
#include <linux/workqueue.h>

#define MaxSector (~(sector_t)0)

typedef struct mddev_s mddev_t;
typedef struct mdk_rdev_s mdk_rdev_t;

/* generic plugging support - like that provided with request_queue,
 * but does not require a request_queue
 */
struct plug_handle {
        void (*unplug_fn)(struct plug_handle *);
        struct timer_list unplug_timer;
        struct work_struct unplug_work;
        unsigned long unplug_flag;
};
#define PLUGGED_FLAG 1
void plugger_init(struct plug_handle *plug,
                  void (*unplug_fn)(struct plug_handle *));
void plugger_set_plug(struct plug_handle *plug);
int plugger_remove_plug(struct plug_handle *plug);
static inline void plugger_flush(struct plug_handle *plug)
{
        del_timer_sync(&plug->unplug_timer);
        cancel_work_sync(&plug->unplug_work);
}

/*
 * MD's 'extended' device
 */
struct mdk_rdev_s
{
        struct list_head same_set;      /* RAID devices within the same set */

        sector_t sectors;               /* Device size (in 512-byte sectors) */
        mddev_t *mddev;                 /* RAID array if running */
        int last_events;                /* IO event timestamp */

        /*
         * If meta_bdev is non-NULL, it means that a separate device is
         * being used to store the metadata (superblock/bitmap) which
         * would otherwise be contained on the same device as the data (bdev).
         */
        struct block_device *meta_bdev;
        struct block_device *bdev;      /* block device handle */

        struct page *sb_page;
        int sb_loaded;
        __u64 sb_events;
        sector_t data_offset;           /* start of data in array */
        sector_t sb_start;              /* offset of the super block (in 512-byte sectors) */
        int sb_size;                    /* bytes in the superblock */
        int preferred_minor;            /* autorun support */

        struct kobject kobj;

        /* A device can be in one of three states based on two flags:
         * Not working:    faulty==1 in_sync==0
         * Fully working:  faulty==0 in_sync==1
         * Working, but not
         * in sync with array
         *                 faulty==0 in_sync==0
         *
         * It can never have faulty==1, in_sync==1
         * This reduces the burden of testing multiple flags in many cases
         */

        unsigned long flags;
#define Faulty          1               /* device is known to have a fault */
#define In_sync         2               /* device is in_sync with rest of array */
#define WriteMostly     4               /* Avoid reading if at all possible */
#define AutoDetected    7               /* added by auto-detect */
#define Blocked         8               /* An error occurred on an externally
                                         * managed array, don't allow writes
                                         * until it is cleared */
        wait_queue_head_t blocked_wait;

        int desc_nr;                    /* descriptor index in the superblock */
        int raid_disk;                  /* role of device in array */
        int new_raid_disk;              /* role that the device will have in
                                         * the array after a level-change completes.
                                         */
        int saved_raid_disk;            /* role that device used to have in the
                                         * array and could again if we did a partial
                                         * resync from the bitmap
                                         */
        sector_t recovery_offset;       /* If this device has been partially
                                         * recovered, this is where we were
                                         * up to.
                                         */

        atomic_t nr_pending;            /* number of pending requests.
                                         * only maintained for arrays that
                                         * support hot removal
                                         */
        atomic_t read_errors;           /* number of consecutive read errors that
                                         * we have tried to ignore.
                                         */
        struct timespec last_read_error;        /* monotonic time since our
                                                 * last read error
                                                 */
        atomic_t corrected_errors;      /* number of corrected read errors,
                                         * for reporting to userspace and storing
                                         * in superblock.
                                         */
        struct work_struct del_work;    /* used for delayed sysfs removal */

        struct sysfs_dirent *sysfs_state;       /* handle for 'state'
                                                 * sysfs entry */
};
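
/*
 * Illustrative helper, not part of the original md interface: the three
 * device states described above are normally distinguished by testing the
 * Faulty and In_sync bits in ->flags.  A minimal sketch:
 */
static inline int rdev_is_fully_working_example(mdk_rdev_t *rdev)
{
        /* hypothetical example only: true when the device is fully working */
        return !test_bit(Faulty, &rdev->flags) &&
                test_bit(In_sync, &rdev->flags);
}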

struct mddev_s
{
        void *private;
        struct mdk_personality *pers;
        dev_t unit;
        int md_minor;
        struct list_head disks;
        unsigned long flags;
#define MD_CHANGE_DEVS    0     /* Some device status has changed */
#define MD_CHANGE_CLEAN   1     /* transition to or from 'clean' */
#define MD_CHANGE_PENDING 2     /* switch from 'clean' to 'active' in progress */

        int suspended;
        atomic_t active_io;
        int ro;
        int sysfs_active;               /* set when sysfs deletes
                                         * are happening, so run/
                                         * takeover/stop are not safe
                                         */
        int ready;                      /* See when safe to pass
                                         * IO requests down */
        struct gendisk *gendisk;

        struct kobject kobj;
        int hold_active;
#define UNTIL_IOCTL 1
#define UNTIL_STOP  2

        /* Superblock information */
        int major_version,
            minor_version,
            patch_version;
        int persistent;
        int external;                   /* metadata is
                                         * managed externally */
        char metadata_type[17];         /* externally set */
        int chunk_sectors;
        time_t ctime, utime;
        int level, layout;
        char clevel[16];
        int raid_disks;
        int max_disks;
        sector_t dev_sectors;           /* used size of
                                         * component devices */
        sector_t array_sectors;         /* exported array size */
        int external_size;              /* size managed
                                         * externally */
        __u64 events;
        /* If the last 'event' was simply a clean->dirty transition, and
         * we didn't write it to the spares, then it is safe and simple
         * to just decrement the event count on a dirty->clean transition.
         * So we record that possibility here.
         */
        int can_decrease_events;

        char uuid[16];

        /* If the array is being reshaped, we need to record the
         * new shape and an indication of where we are up to.
         * This is written to the superblock.
         * If reshape_position is MaxSector, then no reshape is happening (yet).
         */
        sector_t reshape_position;
        int delta_disks, new_level, new_layout;
        int new_chunk_sectors;

        struct mdk_thread_s *thread;    /* management thread */
        struct mdk_thread_s *sync_thread;       /* doing resync or reconstruct */
        sector_t curr_resync;           /* last block scheduled */
        /* As resync requests can complete out of order, we cannot easily track
         * how much resync has been completed. So we occasionally pause until
         * everything completes, then set curr_resync_completed to curr_resync.
         * As such it may be well behind the real resync mark, but it is a value
         * we are certain of.
         */
        sector_t curr_resync_completed;
        unsigned long resync_mark;      /* a recent timestamp */
        sector_t resync_mark_cnt;       /* blocks written at resync_mark */
        sector_t curr_mark_cnt;         /* blocks scheduled now */

        sector_t resync_max_sectors;    /* may be set by personality */

        sector_t resync_mismatches;     /* count of sectors where
                                         * parity/replica mismatch found
                                         */

        /* allow user-space to request suspension of IO to regions of the array */
        sector_t suspend_lo;
        sector_t suspend_hi;
        /* if zero, use the system-wide default */
        int sync_speed_min;
        int sync_speed_max;

        /* resync even though the same disks are shared among md-devices */
        int parallel_resync;

        int ok_start_degraded;
        /* recovery/resync flags
         * NEEDED:   we might need to start a resync/recover
         * RUNNING:  a thread is running, or about to be started
         * SYNC:     actually doing a resync, not a recovery
         * RECOVER:  doing recovery, or need to try it.
         * INTR:     resync needs to be aborted for some reason
         * DONE:     thread is done and is waiting to be reaped
         * REQUEST:  user-space has requested a sync (used with SYNC)
         * CHECK:    user-space request for check-only, no repair
         * RESHAPE:  A reshape is happening
         *
         * If neither SYNC nor RESHAPE is set, then it is a recovery.
         */
#define MD_RECOVERY_RUNNING     0
#define MD_RECOVERY_SYNC        1
#define MD_RECOVERY_RECOVER     2
#define MD_RECOVERY_INTR        3
#define MD_RECOVERY_DONE        4
#define MD_RECOVERY_NEEDED      5
#define MD_RECOVERY_REQUESTED   6
#define MD_RECOVERY_CHECK       7
#define MD_RECOVERY_RESHAPE     8
#define MD_RECOVERY_FROZEN      9

        unsigned long recovery;
        int recovery_disabled;          /* if we detect that recovery
                                         * will always fail, set this
                                         * so we don't loop trying */

        int in_sync;                    /* known to not need resync */
        /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so
         * that we are never stopping an array while it is open.
         * 'reconfig_mutex' protects all other reconfiguration.
         * These locks are separate due to conflicting interactions
         * with bdev->bd_mutex.
         * Lock ordering is:
         *  reconfig_mutex -> bd_mutex : e.g. do_md_run -> revalidate_disk
         *  bd_mutex -> open_mutex:  e.g. __blkdev_get -> md_open
         */
        struct mutex open_mutex;
        struct mutex reconfig_mutex;
        atomic_t active;                /* general refcount */
        atomic_t openers;               /* number of active opens */

        int degraded;                   /* whether md should consider
                                         * adding a spare
                                         */

        atomic_t recovery_active;       /* blocks scheduled, but not written */
        wait_queue_head_t recovery_wait;
        sector_t recovery_cp;
        sector_t resync_min;            /* user requested sync
                                         * starts here */
        sector_t resync_max;            /* resync should pause
                                         * when it gets here */

        struct sysfs_dirent *sysfs_state;       /* handle for 'array_state'
                                                 * file in sysfs.
                                                 */
        struct sysfs_dirent *sysfs_action;      /* handle for 'sync_action' */

        struct work_struct del_work;    /* used for delayed sysfs removal */

        spinlock_t write_lock;
        wait_queue_head_t sb_wait;      /* for waiting on superblock updates */
        atomic_t pending_writes;        /* number of active superblock writes */

        unsigned int safemode;          /* if set, update "clean" superblock
                                         * when no writes pending.
                                         */
        unsigned int safemode_delay;
        struct timer_list safemode_timer;
        atomic_t writes_pending;
        struct request_queue *queue;    /* for plugging ... */

        struct bitmap *bitmap;          /* the bitmap for the device */
        struct {
                struct file *file;      /* the bitmap file */
                loff_t offset;          /* offset from superblock of
                                         * start of bitmap. May be
                                         * negative, but not '0'
                                         * For external metadata, offset
                                         * from start of device.
                                         */
                loff_t default_offset;  /* this is the offset to use when
                                         * hot-adding a bitmap.  It should
                                         * eventually be settable by sysfs.
                                         */
                /* When md is serving under dm, it might use a
                 * dirty_log to store the bits.
                 */
                struct dm_dirty_log *log;

                struct mutex mutex;
                unsigned long chunksize;
                unsigned long daemon_sleep;     /* how many jiffies between updates? */
                unsigned long max_write_behind; /* write-behind mode */
                int external;
        } bitmap_info;

        atomic_t max_corr_read_errors;  /* max read retries */
        struct list_head all_mddevs;

        struct attribute_group *to_remove;
        struct plug_handle *plug;       /* if used by personality */

        struct bio_set *bio_set;

        /* Generic flush handling.
         * The last to finish preflush schedules a worker to submit
         * the rest of the request (without the REQ_FLUSH flag).
         */
        struct bio *flush_bio;
        atomic_t flush_pending;
        struct work_struct flush_work;
        struct work_struct event_work;  /* used by dm to report failure event */
};
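
/*
 * Illustrative note, not part of the original header: code that notices a
 * device failure or other change typically asks the management thread to
 * re-evaluate the array by setting one of the MD_RECOVERY_* bits above and
 * waking the thread, along the lines of (sketch only, assuming 'mddev' is
 * a valid mddev_t pointer):
 *
 *      set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 *      md_wakeup_thread(mddev->thread);
 */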

static inline void rdev_dec_pending(mdk_rdev_t *rdev, mddev_t *mddev)
{
        int faulty = test_bit(Faulty, &rdev->flags);
        if (atomic_dec_and_test(&rdev->nr_pending) && faulty)
                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
}

static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
{
        atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io);
}

struct mdk_personality
{
        char *name;
        int level;
        struct list_head list;
        struct module *owner;
        int (*make_request)(mddev_t *mddev, struct bio *bio);
        int (*run)(mddev_t *mddev);
        int (*stop)(mddev_t *mddev);
        void (*status)(struct seq_file *seq, mddev_t *mddev);
        /* error_handler must set ->faulty and clear ->in_sync
         * if appropriate, and should abort recovery if needed
         */
        void (*error_handler)(mddev_t *mddev, mdk_rdev_t *rdev);
        int (*hot_add_disk) (mddev_t *mddev, mdk_rdev_t *rdev);
        int (*hot_remove_disk) (mddev_t *mddev, int number);
        int (*spare_active) (mddev_t *mddev);
        sector_t (*sync_request)(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster);
        int (*resize) (mddev_t *mddev, sector_t sectors);
        sector_t (*size) (mddev_t *mddev, sector_t sectors, int raid_disks);
        int (*check_reshape) (mddev_t *mddev);
        int (*start_reshape) (mddev_t *mddev);
        void (*finish_reshape) (mddev_t *mddev);
        /* quiesce moves between quiescence states
         * 0 - fully active
         * 1 - no new requests allowed
         * others - reserved
         */
        void (*quiesce) (mddev_t *mddev, int state);
        /* takeover is used to transition an array from one
         * personality to another.  The new personality must be able
         * to handle the data in the current layout.
         * e.g. 2drive raid1 -> 2drive raid5
         *      ndrive raid5 -> degraded n+1drive raid6 with special layout
         * If the takeover succeeds, a new 'private' structure is returned.
         * This needs to be installed and then ->run used to activate the
         * array.
         */
        void *(*takeover) (mddev_t *mddev);
};
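
/*
 * Illustrative sketch, not part of the original header: a RAID personality
 * module normally fills in a struct mdk_personality and registers it from
 * its module init/exit hooks, roughly like this (all 'example_*' names are
 * hypothetical):
 *
 *      static struct mdk_personality example_personality = {
 *              .name         = "example",
 *              .level        = -1,
 *              .owner        = THIS_MODULE,
 *              .make_request = example_make_request,
 *              .run          = example_run,
 *              .stop         = example_stop,
 *              .status       = example_status,
 *      };
 *
 *      register_md_personality(&example_personality);    from module init
 *      unregister_md_personality(&example_personality);  from module exit
 */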

struct md_sysfs_entry {
        struct attribute attr;
        ssize_t (*show)(mddev_t *, char *);
        ssize_t (*store)(mddev_t *, const char *, size_t);
};
extern struct attribute_group md_bitmap_group;

static inline struct sysfs_dirent *sysfs_get_dirent_safe(struct sysfs_dirent *sd, char *name)
{
        if (sd)
                return sysfs_get_dirent(sd, NULL, name);
        return sd;
}
static inline void sysfs_notify_dirent_safe(struct sysfs_dirent *sd)
{
        if (sd)
                sysfs_notify_dirent(sd);
}

static inline char *mdname(mddev_t *mddev)
{
        return mddev->gendisk ? mddev->gendisk->disk_name : "mdX";
}

/*
 * iterates through some rdev ringlist. It's safe to remove the
 * current 'rdev'. Don't touch 'tmp' though.
 */
#define rdev_for_each_list(rdev, tmp, head)                             \
        list_for_each_entry_safe(rdev, tmp, head, same_set)

/*
 * iterates through the 'same array disks' ringlist
 */
#define rdev_for_each(rdev, tmp, mddev)                                 \
        list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set)

#define rdev_for_each_rcu(rdev, mddev)                                  \
        list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set)
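
/*
 * Illustrative helper, not part of the original md interface, showing how
 * the iterators above are typically used.  The _safe variant tolerates the
 * current 'rdev' being removed from the list inside the loop body.
 */
static inline int mddev_count_in_sync_example(mddev_t *mddev)
{
        mdk_rdev_t *rdev, *tmp;
        int cnt = 0;

        /* hypothetical example only: count members currently in sync */
        rdev_for_each(rdev, tmp, mddev)
                if (test_bit(In_sync, &rdev->flags))
                        cnt++;
        return cnt;
}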

typedef struct mdk_thread_s {
        void (*run) (mddev_t *mddev);
        mddev_t *mddev;
        wait_queue_head_t wqueue;
        unsigned long flags;
        struct task_struct *tsk;
        unsigned long timeout;
} mdk_thread_t;

#define THREAD_WAKEUP  0

#define __wait_event_lock_irq(wq, condition, lock, cmd)                 \
do {                                                                    \
        wait_queue_t __wait;                                            \
        init_waitqueue_entry(&__wait, current);                         \
                                                                        \
        add_wait_queue(&wq, &__wait);                                   \
        for (;;) {                                                      \
                set_current_state(TASK_UNINTERRUPTIBLE);                \
                if (condition)                                          \
                        break;                                          \
                spin_unlock_irq(&lock);                                 \
                cmd;                                                    \
                schedule();                                             \
                spin_lock_irq(&lock);                                   \
        }                                                               \
        current->state = TASK_RUNNING;                                  \
        remove_wait_queue(&wq, &__wait);                                \
} while (0)

#define wait_event_lock_irq(wq, condition, lock, cmd)                   \
do {                                                                    \
        if (condition)                                                  \
                break;                                                  \
        __wait_event_lock_irq(wq, condition, lock, cmd);                \
} while (0)
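
/*
 * Illustrative usage sketch, not from the original header: the caller must
 * already hold 'lock' with interrupts disabled; the macro drops it around
 * schedule() and re-takes it before re-testing 'condition'.  'cmd' runs
 * each time just before sleeping, while the lock is released.  A
 * hypothetical caller might look like:
 *
 *      spin_lock_irq(&mddev->write_lock);
 *      wait_event_lock_irq(mddev->sb_wait,
 *                          atomic_read(&mddev->pending_writes) == 0,
 *                          mddev->write_lock,
 *                          md_wakeup_thread(mddev->thread));
 *      spin_unlock_irq(&mddev->write_lock);
 */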

static inline void safe_put_page(struct page *p)
{
        if (p) put_page(p);
}

extern int register_md_personality(struct mdk_personality *p);
extern int unregister_md_personality(struct mdk_personality *p);
extern mdk_thread_t *md_register_thread(void (*run) (mddev_t *mddev),
                                        mddev_t *mddev, const char *name);
extern void md_unregister_thread(mdk_thread_t *thread);
extern void md_wakeup_thread(mdk_thread_t *thread);
extern void md_check_recovery(mddev_t *mddev);
extern void md_write_start(mddev_t *mddev, struct bio *bi);
extern void md_write_end(mddev_t *mddev);
extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);

extern int mddev_congested(mddev_t *mddev, int bits);
extern void md_flush_request(mddev_t *mddev, struct bio *bio);
extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
                           sector_t sector, int size, struct page *page);
extern void md_super_wait(mddev_t *mddev);
extern int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
                        struct page *page, int rw, bool metadata_op);
extern void md_do_sync(mddev_t *mddev);
extern void md_new_event(mddev_t *mddev);
extern int md_allow_write(mddev_t *mddev);
extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors);
extern int md_check_no_bitmap(mddev_t *mddev);
extern int md_integrity_register(mddev_t *mddev);
extern void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
extern void restore_bitmap_write_access(struct file *file);
extern void md_unplug(mddev_t *mddev);

extern void mddev_init(mddev_t *mddev);
extern int md_run(mddev_t *mddev);
extern void md_stop(mddev_t *mddev);
extern void md_stop_writes(mddev_t *mddev);
extern void md_rdev_init(mdk_rdev_t *rdev);

extern void mddev_suspend(mddev_t *mddev);
extern void mddev_resume(mddev_t *mddev);
extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
                                   mddev_t *mddev);
extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
                                   mddev_t *mddev);
#endif /* _MD_MD_H */