/*
   md_k.h : kernel internal structure of the Linux MD driver
          Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

#ifndef _MD_MD_H
#define _MD_MD_H

#include <linux/blkdev.h>
#include <linux/kobject.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/timer.h>
#include <linux/wait.h>
#include <linux/workqueue.h>

#define MaxSector (~(sector_t)0)

typedef struct mddev_s mddev_t;
typedef struct mdk_rdev_s mdk_rdev_t;

/* generic plugging support - like that provided with request_queue,
 * but does not require a request_queue
 */
struct plug_handle {
	void			(*unplug_fn)(struct plug_handle *);
	struct timer_list	unplug_timer;
	struct work_struct	unplug_work;
	unsigned long		unplug_flag;
};
#define	PLUGGED_FLAG 1
void plugger_init(struct plug_handle *plug,
		  void (*unplug_fn)(struct plug_handle *));
void plugger_set_plug(struct plug_handle *plug);
int plugger_remove_plug(struct plug_handle *plug);
static inline void plugger_flush(struct plug_handle *plug)
{
	del_timer_sync(&plug->unplug_timer);
	cancel_work_sync(&plug->unplug_work);
}
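
/*
 * Illustrative sketch only (not part of this interface): a personality
 * that defers work while plugged might wire the helpers up roughly like
 * this.  'my_conf' and 'my_unplug' are hypothetical names; the real
 * users are the personalities themselves.
 *
 *	static void my_unplug(struct plug_handle *plug)
 *	{
 *		struct my_conf *conf = container_of(plug, struct my_conf, plug);
 *		// kick any requests that were held back while plugged
 *	}
 *
 *	plugger_init(&conf->plug, my_unplug);	// once, at setup time
 *	plugger_set_plug(&conf->plug);		// when deferring work
 *	...
 *	plugger_flush(&conf->plug);		// before tearing conf down
 */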

/*
 * MD's 'extended' device
 */
struct mdk_rdev_s
{
	struct list_head same_set;	/* RAID devices within the same set */

	sector_t sectors;		/* Device size (in 512-byte sectors) */
	mddev_t *mddev;			/* RAID array if running */
	int last_events;		/* IO event timestamp */

	/*
	 * If meta_bdev is non-NULL, it means that a separate device is
	 * being used to store the metadata (superblock/bitmap) which
	 * would otherwise be contained on the same device as the data (bdev).
	 */
	struct block_device *meta_bdev;
	struct block_device *bdev;	/* block device handle */

	struct page	*sb_page;
	int		sb_loaded;
	__u64		sb_events;
	sector_t	data_offset;	/* start of data in array */
	sector_t	sb_start;	/* offset of the super block (in 512-byte sectors) */
	int		sb_size;	/* bytes in the superblock */
	int		preferred_minor; /* autorun support */

	struct kobject	kobj;

	/* A device can be in one of three states based on two flags:
	 * Not working:    faulty==1 in_sync==0
	 * Fully working:  faulty==0 in_sync==1
	 * Working, but not
	 * in sync with array
	 *                 faulty==0 in_sync==0
	 *
	 * It can never have faulty==1, in_sync==1
	 * This reduces the burden of testing multiple flags in many cases
	 */

	unsigned long	flags;
#define	Faulty		1		/* device is known to have a fault */
#define	In_sync		2		/* device is in_sync with rest of array */
#define	WriteMostly	4		/* Avoid reading if at all possible */
#define	AutoDetected	7		/* added by auto-detect */
#define Blocked		8		/* An error occurred on an externally
					 * managed array, don't allow writes
					 * until it is cleared */
	wait_queue_head_t blocked_wait;

	int desc_nr;			/* descriptor index in the superblock */
	int raid_disk;			/* role of device in array */
	int new_raid_disk;		/* role that the device will have in
					 * the array after a level-change completes.
					 */
	int saved_raid_disk;		/* role that device used to have in the
					 * array and could again if we did a partial
					 * resync from the bitmap
					 */
	sector_t	recovery_offset;/* If this device has been partially
					 * recovered, this is where we were
					 * up to.
					 */

	atomic_t	nr_pending;	/* number of pending requests.
					 * only maintained for arrays that
					 * support hot removal
					 */
	atomic_t	read_errors;	/* number of consecutive read errors that
					 * we have tried to ignore.
					 */
	struct timespec last_read_error;	/* monotonic time since our
						 * last read error
						 */
	atomic_t	corrected_errors; /* number of corrected read errors,
					   * for reporting to userspace and storing
					   * in superblock.
					   */
	struct work_struct del_work;	/* used for delayed sysfs removal */

	struct sysfs_dirent *sysfs_state; /* handle for 'state'
					   * sysfs entry */
};
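
/*
 * Illustrative sketch (assumption, not a helper provided by this header):
 * because faulty==1 and in_sync==1 can never both be set, a member's
 * state can be classified with two test_bit() calls, e.g.
 *
 *	if (test_bit(Faulty, &rdev->flags))
 *		;	// not working
 *	else if (test_bit(In_sync, &rdev->flags))
 *		;	// fully working
 *	else
 *		;	// working, but not in sync with the array
 */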

struct mddev_s
{
	void				*private;
	struct mdk_personality		*pers;
	dev_t				unit;
	int				md_minor;
	struct list_head		disks;
	unsigned long			flags;
#define MD_CHANGE_DEVS	0	/* Some device status has changed */
#define MD_CHANGE_CLEAN 1	/* transition to or from 'clean' */
#define MD_CHANGE_PENDING 2	/* switch from 'clean' to 'active' in progress */

	int				suspended;
	atomic_t			active_io;
	int				ro;
	int				sysfs_active; /* set when sysfs deletes
						       * are happening, so run/
						       * takeover/stop are not safe
						       */
	int				ready; /* See when safe to pass
						* IO requests down */
	struct gendisk			*gendisk;

	struct kobject			kobj;
	int				hold_active;
#define	UNTIL_IOCTL	1
#define	UNTIL_STOP	2

	/* Superblock information */
	int				major_version,
					minor_version,
					patch_version;
	int				persistent;
	int				external;	/* metadata is
							 * managed externally */
	char				metadata_type[17]; /* externally set */
	int				chunk_sectors;
	time_t				ctime, utime;
	int				level, layout;
	char				clevel[16];
	int				raid_disks;
	int				max_disks;
	sector_t			dev_sectors;	/* used size of
							 * component devices */
	sector_t			array_sectors;	/* exported array size */
	int				external_size;	/* size managed
							 * externally */
	__u64				events;
	/* If the last 'event' was simply a clean->dirty transition, and
	 * we didn't write it to the spares, then it is safe and simple
	 * to just decrement the event count on a dirty->clean transition.
	 * So we record that possibility here.
	 */
	int				can_decrease_events;

	char				uuid[16];

	/* If the array is being reshaped, we need to record the
	 * new shape and an indication of where we are up to.
	 * This is written to the superblock.
	 * If reshape_position is MaxSector, then no reshape is happening (yet).
	 */
	sector_t			reshape_position;
	int				delta_disks, new_level, new_layout;
	int				new_chunk_sectors;

	struct mdk_thread_s		*thread;	/* management thread */
	struct mdk_thread_s		*sync_thread;	/* doing resync or reconstruct */
	sector_t			curr_resync;	/* last block scheduled */
	/* As resync requests can complete out of order, we cannot easily track
	 * how much resync has been completed. So we occasionally pause until
	 * everything completes, then set curr_resync_completed to curr_resync.
	 * As such it may be well behind the real resync mark, but it is a value
	 * we are certain of.
	 */
	sector_t			curr_resync_completed;
	unsigned long			resync_mark;	/* a recent timestamp */
	sector_t			resync_mark_cnt;/* blocks written at resync_mark */
	sector_t			curr_mark_cnt;	/* blocks scheduled now */

	sector_t			resync_max_sectors; /* may be set by personality */

	sector_t			resync_mismatches; /* count of sectors where
							    * parity/replica mismatch found
							    */

	/* allow user-space to request suspension of IO to regions of the array */
	sector_t			suspend_lo;
	sector_t			suspend_hi;
	/* if zero, use the system-wide default */
	int				sync_speed_min;
	int				sync_speed_max;

	/* resync even though the same disks are shared among md-devices */
	int				parallel_resync;

	int				ok_start_degraded;
	/* recovery/resync flags
	 * NEEDED:   we might need to start a resync/recover
	 * RUNNING:  a thread is running, or about to be started
	 * SYNC:     actually doing a resync, not a recovery
	 * RECOVER:  doing recovery, or need to try it.
	 * INTR:     resync needs to be aborted for some reason
	 * DONE:     thread is done and is waiting to be reaped
	 * REQUEST:  user-space has requested a sync (used with SYNC)
	 * CHECK:    user-space request for check-only, no repair
	 * RESHAPE:  A reshape is happening
	 *
	 * If neither SYNC nor RESHAPE is set, then it is a recovery.
	 */
#define	MD_RECOVERY_RUNNING	0
#define	MD_RECOVERY_SYNC	1
#define	MD_RECOVERY_RECOVER	2
#define	MD_RECOVERY_INTR	3
#define	MD_RECOVERY_DONE	4
#define	MD_RECOVERY_NEEDED	5
#define	MD_RECOVERY_REQUESTED	6
#define	MD_RECOVERY_CHECK	7
#define MD_RECOVERY_RESHAPE	8
#define	MD_RECOVERY_FROZEN	9

	unsigned long			recovery;
	int				recovery_disabled; /* if we detect that recovery
							    * will always fail, set this
							    * so we don't loop trying */

	int				in_sync;	/* known to not need a resync */
	/* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so
	 * that we are never stopping an array while it is open.
	 * 'reconfig_mutex' protects all other reconfiguration.
	 * These locks are separate due to conflicting interactions
	 * with bdev->bd_mutex.
	 * Lock ordering is:
	 *  reconfig_mutex -> bd_mutex : e.g. do_md_run -> revalidate_disk
	 *  bd_mutex -> open_mutex:  e.g. __blkdev_get -> md_open
	 */
	struct mutex			open_mutex;
	struct mutex			reconfig_mutex;
	atomic_t			active;		/* general refcount */
	atomic_t			openers;	/* number of active opens */

	int				changed;	/* True if we might need to
							 * reread partition info */
	int				degraded;	/* whether md should consider
							 * adding a spare
							 */

	atomic_t			recovery_active; /* blocks scheduled, but not written */
	wait_queue_head_t		recovery_wait;
	sector_t			recovery_cp;
	sector_t			resync_min;	/* user requested sync
							 * starts here */
	sector_t			resync_max;	/* resync should pause
							 * when it gets here */

	struct sysfs_dirent		*sysfs_state;	/* handle for 'array_state'
							 * file in sysfs.
							 */
	struct sysfs_dirent		*sysfs_action;	/* handle for 'sync_action' */

	struct work_struct del_work;	/* used for delayed sysfs removal */

	spinlock_t			write_lock;
	wait_queue_head_t		sb_wait;	/* for waiting on superblock updates */
	atomic_t			pending_writes;	/* number of active superblock writes */

	unsigned int			safemode;	/* if set, update "clean" superblock
							 * when no writes pending.
							 */
	unsigned int			safemode_delay;
	struct timer_list		safemode_timer;
	atomic_t			writes_pending;
	struct request_queue		*queue;	/* for plugging ... */

	struct bitmap			*bitmap; /* the bitmap for the device */
	struct {
		struct file		*file; /* the bitmap file */
		loff_t			offset; /* offset from superblock of
						 * start of bitmap. May be
						 * negative, but not '0'
						 * For external metadata, offset
						 * from start of device.
						 */
		loff_t			default_offset; /* this is the offset to use when
							 * hot-adding a bitmap.  It should
							 * eventually be settable by sysfs.
							 */
		/* When md is serving under dm, it might use a
		 * dirty_log to store the bits.
		 */
		struct dm_dirty_log *log;

		struct mutex		mutex;
		unsigned long		chunksize;
		unsigned long		daemon_sleep; /* how many jiffies between updates? */
		unsigned long		max_write_behind; /* write-behind mode */
		int			external;
	} bitmap_info;

	atomic_t			max_corr_read_errors; /* max read retries */
	struct list_head		all_mddevs;

	struct attribute_group		*to_remove;
	struct plug_handle		*plug; /* if used by personality */

	struct bio_set			*bio_set;

	/* Generic flush handling.
	 * The last to finish preflush schedules a worker to submit
	 * the rest of the request (without the REQ_FLUSH flag).
	 */
	struct bio *flush_bio;
	atomic_t flush_pending;
	struct work_struct flush_work;
	struct work_struct event_work;	/* used by dm to report failure event */
};


static inline void rdev_dec_pending(mdk_rdev_t *rdev, mddev_t *mddev)
{
	int faulty = test_bit(Faulty, &rdev->flags);
	if (atomic_dec_and_test(&rdev->nr_pending) && faulty)
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
}

static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
{
	atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io);
}
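
/*
 * Illustrative sketch (an assumption drawn from how the personalities use
 * these helpers, not an interface defined here): an rdev looked up under
 * rcu_read_lock() is pinned via nr_pending before I/O is issued, and
 * rdev_dec_pending() drops that reference in the completion path.
 * 'conf->mirrors[disk].rdev' is a hypothetical per-personality layout.
 *
 *	rcu_read_lock();
 *	rdev = rcu_dereference(conf->mirrors[disk].rdev);
 *	atomic_inc(&rdev->nr_pending);
 *	rcu_read_unlock();
 *	... submit the bio to rdev->bdev ...
 *	rdev_dec_pending(rdev, mddev);		// when the bio completes
 */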

struct mdk_personality
{
	char *name;
	int level;
	struct list_head list;
	struct module *owner;
	int (*make_request)(mddev_t *mddev, struct bio *bio);
	int (*run)(mddev_t *mddev);
	int (*stop)(mddev_t *mddev);
	void (*status)(struct seq_file *seq, mddev_t *mddev);
	/* error_handler must set ->faulty and clear ->in_sync
	 * if appropriate, and should abort recovery if needed
	 */
	void (*error_handler)(mddev_t *mddev, mdk_rdev_t *rdev);
	int (*hot_add_disk) (mddev_t *mddev, mdk_rdev_t *rdev);
	int (*hot_remove_disk) (mddev_t *mddev, int number);
	int (*spare_active) (mddev_t *mddev);
	sector_t (*sync_request)(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster);
	int (*resize) (mddev_t *mddev, sector_t sectors);
	sector_t (*size) (mddev_t *mddev, sector_t sectors, int raid_disks);
	int (*check_reshape) (mddev_t *mddev);
	int (*start_reshape) (mddev_t *mddev);
	void (*finish_reshape) (mddev_t *mddev);
	/* quiesce moves between quiescence states
	 * 0 - fully active
	 * 1 - no new requests allowed
	 * others - reserved
	 */
	void (*quiesce) (mddev_t *mddev, int state);
	/* takeover is used to transition an array from one
	 * personality to another.  The new personality must be able
	 * to handle the data in the current layout.
	 * e.g. 2drive raid1 -> 2drive raid5
	 *      ndrive raid5 -> degraded n+1drive raid6 with special layout
	 * If the takeover succeeds, a new 'private' structure is returned.
	 * This needs to be installed and then ->run used to activate the
	 * array.
	 */
	void *(*takeover) (mddev_t *mddev);
};
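
/*
 * Illustrative sketch (assumption, not defined by this header): a minimal
 * personality fills in its hooks and registers itself from module init
 * using register_md_personality()/unregister_md_personality() declared
 * below.  All 'example_*' names are hypothetical.
 *
 *	static struct mdk_personality example_personality = {
 *		.name		= "example",
 *		.level		= -1,		// RAID level this personality implements
 *		.owner		= THIS_MODULE,
 *		.make_request	= example_make_request,
 *		.run		= example_run,
 *		.stop		= example_stop,
 *		.status		= example_status,
 *	};
 *
 *	static int __init example_init(void)
 *	{
 *		return register_md_personality(&example_personality);
 *	}
 *	static void __exit example_exit(void)
 *	{
 *		unregister_md_personality(&example_personality);
 *	}
 *	module_init(example_init);
 *	module_exit(example_exit);
 */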


struct md_sysfs_entry {
	struct attribute attr;
	ssize_t (*show)(mddev_t *, char *);
	ssize_t (*store)(mddev_t *, const char *, size_t);
};
extern struct attribute_group md_bitmap_group;

static inline struct sysfs_dirent *sysfs_get_dirent_safe(struct sysfs_dirent *sd, char *name)
{
	if (sd)
		return sysfs_get_dirent(sd, NULL, name);
	return sd;
}
static inline void sysfs_notify_dirent_safe(struct sysfs_dirent *sd)
{
	if (sd)
		sysfs_notify_dirent(sd);
}

static inline char *mdname(mddev_t *mddev)
{
	return mddev->gendisk ? mddev->gendisk->disk_name : "mdX";
}

/*
 * iterates through some rdev ringlist. It's safe to remove the
 * current 'rdev'. Don't touch 'tmp' though.
 */
#define rdev_for_each_list(rdev, tmp, head)				\
	list_for_each_entry_safe(rdev, tmp, head, same_set)

/*
 * iterates through the 'same array disks' ringlist
 */
#define rdev_for_each(rdev, tmp, mddev)					\
	list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set)

#define rdev_for_each_rcu(rdev, mddev)					\
	list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set)
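
/*
 * Illustrative sketch (assumption): walking an array's member devices.
 * rdev_for_each() takes a scratch 'tmp' cursor so the current entry may
 * safely be removed; rdev_for_each_rcu() is for readers that hold
 * rcu_read_lock() instead of the relevant mutex.
 *
 *	mdk_rdev_t *rdev, *tmp;
 *
 *	rdev_for_each(rdev, tmp, mddev) {
 *		if (test_bit(Faulty, &rdev->flags))
 *			continue;	// skip failed members
 *		// ... inspect or update rdev ...
 *	}
 */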

typedef struct mdk_thread_s {
	void			(*run) (mddev_t *mddev);
	mddev_t			*mddev;
	wait_queue_head_t	wqueue;
	unsigned long		flags;
	struct task_struct	*tsk;
	unsigned long		timeout;
} mdk_thread_t;

#define THREAD_WAKEUP  0

#define __wait_event_lock_irq(wq, condition, lock, cmd)			\
do {									\
	wait_queue_t __wait;						\
	init_waitqueue_entry(&__wait, current);				\
									\
	add_wait_queue(&wq, &__wait);					\
	for (;;) {							\
		set_current_state(TASK_UNINTERRUPTIBLE);		\
		if (condition)						\
			break;						\
		spin_unlock_irq(&lock);					\
		cmd;							\
		schedule();						\
		spin_lock_irq(&lock);					\
	}								\
	current->state = TASK_RUNNING;					\
	remove_wait_queue(&wq, &__wait);				\
} while (0)

#define wait_event_lock_irq(wq, condition, lock, cmd)			\
do {									\
	if (condition)							\
		break;							\
	__wait_event_lock_irq(wq, condition, lock, cmd);		\
} while (0)

static inline void safe_put_page(struct page *p)
{
	if (p) put_page(p);
}

extern int register_md_personality(struct mdk_personality *p);
extern int unregister_md_personality(struct mdk_personality *p);
extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev),
				mddev_t *mddev, const char *name);
extern void md_unregister_thread(mdk_thread_t *thread);
extern void md_wakeup_thread(mdk_thread_t *thread);
extern void md_check_recovery(mddev_t *mddev);
extern void md_write_start(mddev_t *mddev, struct bio *bi);
extern void md_write_end(mddev_t *mddev);
extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);

extern int mddev_congested(mddev_t *mddev, int bits);
extern void md_flush_request(mddev_t *mddev, struct bio *bio);
extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
			   sector_t sector, int size, struct page *page);
extern void md_super_wait(mddev_t *mddev);
extern int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
			struct page *page, int rw, bool metadata_op);
extern void md_do_sync(mddev_t *mddev);
extern void md_new_event(mddev_t *mddev);
extern int md_allow_write(mddev_t *mddev);
extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors);
extern int md_check_no_bitmap(mddev_t *mddev);
extern int md_integrity_register(mddev_t *mddev);
extern void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
extern void restore_bitmap_write_access(struct file *file);
extern void md_unplug(mddev_t *mddev);

extern void mddev_init(mddev_t *mddev);
extern int md_run(mddev_t *mddev);
extern void md_stop(mddev_t *mddev);
extern void md_stop_writes(mddev_t *mddev);
extern void md_rdev_init(mdk_rdev_t *rdev);

extern void mddev_suspend(mddev_t *mddev);
extern void mddev_resume(mddev_t *mddev);
extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
				   mddev_t *mddev);
extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
				   mddev_t *mddev);
#endif /* _MD_MD_H */