xref: /openbmc/linux/drivers/md/md.c (revision d37cf9b63113f13d742713881ce691fc615d8b3b)
1  // SPDX-License-Identifier: GPL-2.0-or-later
2  /*
3     md.c : Multiple Devices driver for Linux
4       Copyright (C) 1998, 1999, 2000 Ingo Molnar
5  
6       completely rewritten, based on the MD driver code from Marc Zyngier
7  
8     Changes:
9  
10     - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
11     - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
12     - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
13     - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
14     - kmod support by: Cyrus Durgin
15     - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
16     - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
17  
18     - lots of fixes and improvements to the RAID1/RAID5 and generic
19       RAID code (such as request based resynchronization):
20  
21       Neil Brown <neilb@cse.unsw.edu.au>.
22  
23     - persistent bitmap code
24       Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
25  
26  
27     Errors, Warnings, etc.
28     Please use:
29       pr_crit() for error conditions that risk data loss
30       pr_err() for error conditions that are unexpected, like an IO error
31           or internal inconsistency
32     pr_warn() for error conditions that could have been predicted, like
33           adding a device to an array when it has incompatible metadata
34     pr_info() for interesting, very rare events, like an array starting
35           or stopping, or resync starting or stopping
36       pr_debug() for everything else.
37  
38  */
39  
40  #include <linux/sched/mm.h>
41  #include <linux/sched/signal.h>
42  #include <linux/kthread.h>
43  #include <linux/blkdev.h>
44  #include <linux/blk-integrity.h>
45  #include <linux/badblocks.h>
46  #include <linux/sysctl.h>
47  #include <linux/seq_file.h>
48  #include <linux/fs.h>
49  #include <linux/poll.h>
50  #include <linux/ctype.h>
51  #include <linux/string.h>
52  #include <linux/hdreg.h>
53  #include <linux/proc_fs.h>
54  #include <linux/random.h>
55  #include <linux/major.h>
56  #include <linux/module.h>
57  #include <linux/reboot.h>
58  #include <linux/file.h>
59  #include <linux/compat.h>
60  #include <linux/delay.h>
61  #include <linux/raid/md_p.h>
62  #include <linux/raid/md_u.h>
63  #include <linux/raid/detect.h>
64  #include <linux/slab.h>
65  #include <linux/percpu-refcount.h>
66  #include <linux/part_stat.h>
67  
68  #include <trace/events/block.h>
69  #include "md.h"
70  #include "md-bitmap.h"
71  #include "md-cluster.h"
72  
73  /* pers_list is a list of registered personalities protected by pers_lock. */
74  static LIST_HEAD(pers_list);
75  static DEFINE_SPINLOCK(pers_lock);
76  
77  static const struct kobj_type md_ktype;
78  
79  struct md_cluster_operations *md_cluster_ops;
80  EXPORT_SYMBOL(md_cluster_ops);
81  static struct module *md_cluster_mod;
82  
83  static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
84  static struct workqueue_struct *md_wq;
85  static struct workqueue_struct *md_misc_wq;
86  struct workqueue_struct *md_bitmap_wq;
87  
88  static int remove_and_add_spares(struct mddev *mddev,
89  				 struct md_rdev *this);
90  static void mddev_detach(struct mddev *mddev);
91  static void export_rdev(struct md_rdev *rdev, struct mddev *mddev);
92  static void md_wakeup_thread_directly(struct md_thread __rcu *thread);
93  
94  /*
95   * Default number of read corrections we'll attempt on an rdev
96   * before ejecting it from the array. We divide the read error
97   * count by 2 for every hour elapsed between read errors.
98   */
99  #define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
100  /* Default safemode delay: 200 msec */
101  #define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1)
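/*
 * Illustrative arithmetic: with HZ == 1000 the expression above evaluates to
 * (200 * 1000) / 1000 + 1 = 201 jiffies; with HZ == 250 it is 50 + 1 = 51
 * jiffies.  Either way, just over 200 msec.
 */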
102  /*
103   * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
104   * is 1000 KB/sec, so the extra system load does not show up that much.
105   * Increase it if you want to have more _guaranteed_ speed. Note that
106   * the RAID driver will use the maximum available bandwidth if the IO
107   * subsystem is idle. There is also an 'absolute maximum' reconstruction
108   * speed limit - in case reconstruction slows down your system despite
109   * idle IO detection.
110   *
111   * You can change it via /proc/sys/dev/raid/speed_limit_min and _max,
112   * or /sys/block/mdX/md/sync_speed_{min,max}.
113   */
114  
115  static int sysctl_speed_limit_min = 1000;
116  static int sysctl_speed_limit_max = 200000;
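/*
 * For illustration: "echo 50000 > /proc/sys/dev/raid/speed_limit_min" would
 * raise the global floor to roughly 50 MB/sec (values are in KB/sec).  A
 * non-zero per-array sync_speed_min/sync_speed_max overrides these globals,
 * as speed_min()/speed_max() below show.
 */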
117  static inline int speed_min(struct mddev *mddev)
118  {
119  	return mddev->sync_speed_min ?
120  		mddev->sync_speed_min : sysctl_speed_limit_min;
121  }
122  
123  static inline int speed_max(struct mddev *mddev)
124  {
125  	return mddev->sync_speed_max ?
126  		mddev->sync_speed_max : sysctl_speed_limit_max;
127  }
128  
129  static void rdev_uninit_serial(struct md_rdev *rdev)
130  {
131  	if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
132  		return;
133  
134  	kvfree(rdev->serial);
135  	rdev->serial = NULL;
136  }
137  
138  static void rdevs_uninit_serial(struct mddev *mddev)
139  {
140  	struct md_rdev *rdev;
141  
142  	rdev_for_each(rdev, mddev)
143  		rdev_uninit_serial(rdev);
144  }
145  
146  static int rdev_init_serial(struct md_rdev *rdev)
147  {
148  	/* serial_nums equals BARRIER_BUCKETS_NR */
149  	int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t))));
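	/*
	 * Illustrative arithmetic: with a typical PAGE_SHIFT of 12 and a
	 * 4-byte atomic_t this is 1 << (12 - 2) = 1024 buckets.
	 */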
150  	struct serial_in_rdev *serial = NULL;
151  
152  	if (test_bit(CollisionCheck, &rdev->flags))
153  		return 0;
154  
155  	serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums,
156  			  GFP_KERNEL);
157  	if (!serial)
158  		return -ENOMEM;
159  
160  	for (i = 0; i < serial_nums; i++) {
161  		struct serial_in_rdev *serial_tmp = &serial[i];
162  
163  		spin_lock_init(&serial_tmp->serial_lock);
164  		serial_tmp->serial_rb = RB_ROOT_CACHED;
165  		init_waitqueue_head(&serial_tmp->serial_io_wait);
166  	}
167  
168  	rdev->serial = serial;
169  	set_bit(CollisionCheck, &rdev->flags);
170  
171  	return 0;
172  }
173  
174  static int rdevs_init_serial(struct mddev *mddev)
175  {
176  	struct md_rdev *rdev;
177  	int ret = 0;
178  
179  	rdev_for_each(rdev, mddev) {
180  		ret = rdev_init_serial(rdev);
181  		if (ret)
182  			break;
183  	}
184  
185  	/* Free all resources if the pool does not exist */
186  	if (ret && !mddev->serial_info_pool)
187  		rdevs_uninit_serial(mddev);
188  
189  	return ret;
190  }
191  
192  /*
193   * rdev needs to enable serialization if it meets both conditions:
194   * 1. it is a multi-queue device flagged with WriteMostly.
195   * 2. write-behind mode is enabled.
196   */
197  static int rdev_need_serial(struct md_rdev *rdev)
198  {
199  	return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 &&
200  		rdev->bdev->bd_disk->queue->nr_hw_queues != 1 &&
201  		test_bit(WriteMostly, &rdev->flags));
202  }
203  
204  /*
205   * Init resource for rdev(s), then create serial_info_pool if:
206   * 1. rdev is the first device which returns true from rdev_need_serial.
207   * 2. rdev is NULL, meaning we want to enable serialization for all rdevs.
208   */
209  void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
210  			      bool is_suspend)
211  {
212  	int ret = 0;
213  
214  	if (rdev && !rdev_need_serial(rdev) &&
215  	    !test_bit(CollisionCheck, &rdev->flags))
216  		return;
217  
218  	if (!is_suspend)
219  		mddev_suspend(mddev);
220  
221  	if (!rdev)
222  		ret = rdevs_init_serial(mddev);
223  	else
224  		ret = rdev_init_serial(rdev);
225  	if (ret)
226  		goto abort;
227  
228  	if (mddev->serial_info_pool == NULL) {
229  		/*
230  		 * already in memalloc noio context by
231  		 * mddev_suspend()
232  		 */
233  		mddev->serial_info_pool =
234  			mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
235  						sizeof(struct serial_info));
236  		if (!mddev->serial_info_pool) {
237  			rdevs_uninit_serial(mddev);
238  			pr_err("can't alloc memory pool for serialization\n");
239  		}
240  	}
241  
242  abort:
243  	if (!is_suspend)
244  		mddev_resume(mddev);
245  }
246  
247  /*
248   * Free resources from rdev(s), and destroy serial_info_pool under these conditions:
249   * 1. rdev is the last device flagged with CollisionCheck.
250   * 2. when bitmap is destroyed while policy is not enabled.
251   * 3. for disable policy, the pool is destroyed only when no rdev needs it.
252   */
253  void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
254  			       bool is_suspend)
255  {
256  	if (rdev && !test_bit(CollisionCheck, &rdev->flags))
257  		return;
258  
259  	if (mddev->serial_info_pool) {
260  		struct md_rdev *temp;
261  		int num = 0; /* used to track if other rdevs need the pool */
262  
263  		if (!is_suspend)
264  			mddev_suspend(mddev);
265  		rdev_for_each(temp, mddev) {
266  			if (!rdev) {
267  				if (!mddev->serialize_policy ||
268  				    !rdev_need_serial(temp))
269  					rdev_uninit_serial(temp);
270  				else
271  					num++;
272  			} else if (temp != rdev &&
273  				   test_bit(CollisionCheck, &temp->flags))
274  				num++;
275  		}
276  
277  		if (rdev)
278  			rdev_uninit_serial(rdev);
279  
280  		if (num)
281  			pr_info("The mempool could be used by other devices\n");
282  		else {
283  			mempool_destroy(mddev->serial_info_pool);
284  			mddev->serial_info_pool = NULL;
285  		}
286  		if (!is_suspend)
287  			mddev_resume(mddev);
288  	}
289  }
290  
291  static struct ctl_table_header *raid_table_header;
292  
293  static struct ctl_table raid_table[] = {
294  	{
295  		.procname	= "speed_limit_min",
296  		.data		= &sysctl_speed_limit_min,
297  		.maxlen		= sizeof(int),
298  		.mode		= S_IRUGO|S_IWUSR,
299  		.proc_handler	= proc_dointvec,
300  	},
301  	{
302  		.procname	= "speed_limit_max",
303  		.data		= &sysctl_speed_limit_max,
304  		.maxlen		= sizeof(int),
305  		.mode		= S_IRUGO|S_IWUSR,
306  		.proc_handler	= proc_dointvec,
307  	},
308  	{ }
309  };
310  
311  static int start_readonly;
312  
313  /*
314   * The original mechanism for creating an md device is to create
315   * a device node in /dev and to open it.  This causes races with device-close.
316   * The preferred method is to write to the "new_array" module parameter.
317   * This can avoid races.
318   * Setting create_on_open to false disables the original mechanism
319   * so all the races disappear.
320   */
321  static bool create_on_open = true;
322  
323  /*
324   * We have a system wide 'event count' that is incremented
325   * on any 'interesting' event, and readers of /proc/mdstat
326   * can use 'poll' or 'select' to find out when the event
327   * count increases.
328   *
329   * Events are:
330   *  start array, stop array, error, add device, remove device,
331   *  start build, activate spare
332   */
333  static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
334  static atomic_t md_event_count;
335  void md_new_event(void)
336  {
337  	atomic_inc(&md_event_count);
338  	wake_up(&md_event_waiters);
339  }
340  EXPORT_SYMBOL_GPL(md_new_event);
341  
342  /*
343   * Enables iteration over all existing md arrays.
344   * all_mddevs_lock protects this list.
345   */
346  static LIST_HEAD(all_mddevs);
347  static DEFINE_SPINLOCK(all_mddevs_lock);
348  
349  /* Rather than calling directly into the personality make_request function,
350   * IO requests come here first so that we can check if the device is
351   * being suspended pending a reconfiguration.
352   * We hold a refcount over the call to ->make_request.  By the time that
353   * call has finished, the bio has been linked into some internal structure
354   * and so is visible to ->quiesce(), so we don't need the refcount any more.
355   */
356  static bool is_suspended(struct mddev *mddev, struct bio *bio)
357  {
358  	if (is_md_suspended(mddev))
359  		return true;
360  	if (bio_data_dir(bio) != WRITE)
361  		return false;
362  	if (mddev->suspend_lo >= mddev->suspend_hi)
363  		return false;
364  	if (bio->bi_iter.bi_sector >= mddev->suspend_hi)
365  		return false;
366  	if (bio_end_sector(bio) < mddev->suspend_lo)
367  		return false;
368  	return true;
369  }
370  
371  void md_handle_request(struct mddev *mddev, struct bio *bio)
372  {
373  check_suspended:
374  	if (is_suspended(mddev, bio)) {
375  		DEFINE_WAIT(__wait);
376  		/* Bail out if REQ_NOWAIT is set for the bio */
377  		if (bio->bi_opf & REQ_NOWAIT) {
378  			bio_wouldblock_error(bio);
379  			return;
380  		}
381  		for (;;) {
382  			prepare_to_wait(&mddev->sb_wait, &__wait,
383  					TASK_UNINTERRUPTIBLE);
384  			if (!is_suspended(mddev, bio))
385  				break;
386  			schedule();
387  		}
388  		finish_wait(&mddev->sb_wait, &__wait);
389  	}
390  	if (!percpu_ref_tryget_live(&mddev->active_io))
391  		goto check_suspended;
392  
393  	if (!mddev->pers->make_request(mddev, bio)) {
394  		percpu_ref_put(&mddev->active_io);
395  		goto check_suspended;
396  	}
397  
398  	percpu_ref_put(&mddev->active_io);
399  }
400  EXPORT_SYMBOL(md_handle_request);
401  
402  static void md_submit_bio(struct bio *bio)
403  {
404  	const int rw = bio_data_dir(bio);
405  	struct mddev *mddev = bio->bi_bdev->bd_disk->private_data;
406  
407  	if (mddev == NULL || mddev->pers == NULL) {
408  		bio_io_error(bio);
409  		return;
410  	}
411  
412  	if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
413  		bio_io_error(bio);
414  		return;
415  	}
416  
417  	bio = bio_split_to_limits(bio);
418  	if (!bio)
419  		return;
420  
421  	if (mddev->ro == MD_RDONLY && unlikely(rw == WRITE)) {
422  		if (bio_sectors(bio) != 0)
423  			bio->bi_status = BLK_STS_IOERR;
424  		bio_endio(bio);
425  		return;
426  	}
427  
428  	/* bio could be mergeable after passing to the underlying layer */
429  	bio->bi_opf &= ~REQ_NOMERGE;
430  
431  	md_handle_request(mddev, bio);
432  }
433  
434  /* mddev_suspend makes sure no new requests are submitted
435   * to the device, and that any requests that have been submitted
436   * are completely handled.
437   * Once mddev_detach() is called and completes, the module will be
438   * completely unused.
439   */
440  void mddev_suspend(struct mddev *mddev)
441  {
442  	struct md_thread *thread = rcu_dereference_protected(mddev->thread,
443  			lockdep_is_held(&mddev->reconfig_mutex));
444  
445  	WARN_ON_ONCE(thread && current == thread->tsk);
446  	if (mddev->suspended++)
447  		return;
448  	wake_up(&mddev->sb_wait);
449  	set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags);
450  	percpu_ref_kill(&mddev->active_io);
451  
452  	if (mddev->pers && mddev->pers->prepare_suspend)
453  		mddev->pers->prepare_suspend(mddev);
454  
455  	wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io));
456  	clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags);
457  	wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags));
458  
459  	/* restrict memory reclaim I/O while the raid array is suspended */
460  	mddev->noio_flag = memalloc_noio_save();
461  }
462  EXPORT_SYMBOL_GPL(mddev_suspend);
463  
464  void mddev_resume(struct mddev *mddev)
465  {
466  	lockdep_assert_held(&mddev->reconfig_mutex);
467  	if (--mddev->suspended)
468  		return;
469  
470  	/* entered the memalloc scope from mddev_suspend() */
471  	memalloc_noio_restore(mddev->noio_flag);
472  
473  	percpu_ref_resurrect(&mddev->active_io);
474  	wake_up(&mddev->sb_wait);
475  
476  	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
477  	md_wakeup_thread(mddev->thread);
478  	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
479  }
480  EXPORT_SYMBOL_GPL(mddev_resume);
481  
482  /*
483   * Generic flush handling for md
484   */
485  
486  static void md_end_flush(struct bio *bio)
487  {
488  	struct md_rdev *rdev = bio->bi_private;
489  	struct mddev *mddev = rdev->mddev;
490  
491  	bio_put(bio);
492  
493  	rdev_dec_pending(rdev, mddev);
494  
495  	if (atomic_dec_and_test(&mddev->flush_pending))
496  		/* The pre-request flush has finished */
497  		queue_work(md_wq, &mddev->flush_work);
498  }
499  
500  static void md_submit_flush_data(struct work_struct *ws);
501  
502  static void submit_flushes(struct work_struct *ws)
503  {
504  	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
505  	struct md_rdev *rdev;
506  
507  	mddev->start_flush = ktime_get_boottime();
508  	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
509  	atomic_set(&mddev->flush_pending, 1);
510  	rcu_read_lock();
511  	rdev_for_each_rcu(rdev, mddev)
512  		if (rdev->raid_disk >= 0 &&
513  		    !test_bit(Faulty, &rdev->flags)) {
514  			struct bio *bi;
515  
516  			atomic_inc(&rdev->nr_pending);
517  			rcu_read_unlock();
518  			bi = bio_alloc_bioset(rdev->bdev, 0,
519  					      REQ_OP_WRITE | REQ_PREFLUSH,
520  					      GFP_NOIO, &mddev->bio_set);
521  			bi->bi_end_io = md_end_flush;
522  			bi->bi_private = rdev;
523  			atomic_inc(&mddev->flush_pending);
524  			submit_bio(bi);
525  			rcu_read_lock();
526  		}
527  	rcu_read_unlock();
528  	if (atomic_dec_and_test(&mddev->flush_pending))
529  		queue_work(md_wq, &mddev->flush_work);
530  }
531  
532  static void md_submit_flush_data(struct work_struct *ws)
533  {
534  	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
535  	struct bio *bio = mddev->flush_bio;
536  
537  	/*
538  	 * must reset flush_bio before calling into md_handle_request to avoid a
539  	 * deadlock: other bios that already passed the md_handle_request suspend
540  	 * check could wait for this flush, while the md_handle_request call below
541  	 * could wait for those bios because of that same suspend check
542  	 */
543  	spin_lock_irq(&mddev->lock);
544  	mddev->prev_flush_start = mddev->start_flush;
545  	mddev->flush_bio = NULL;
546  	spin_unlock_irq(&mddev->lock);
547  	wake_up(&mddev->sb_wait);
548  
549  	if (bio->bi_iter.bi_size == 0) {
550  		/* an empty barrier - all done */
551  		bio_endio(bio);
552  	} else {
553  		bio->bi_opf &= ~REQ_PREFLUSH;
554  
555  		/*
556  		 * make_request() will never return an error here; it only
557  		 * returns an error from raid5_make_request() when driven by dm-raid.
558  		 * Since dm always splits the data and flush operations into
559  		 * two separate IOs, the IO size of a flush submitted by dm is
560  		 * always 0, so make_request() will not be called here.
561  		 */
562  		if (WARN_ON_ONCE(!mddev->pers->make_request(mddev, bio)))
563  			bio_io_error(bio);
564  	}
565  
566  	/* This pairs with the percpu_ref_get() in md_flush_request() */
567  	percpu_ref_put(&mddev->active_io);
568  }
569  
570  /*
571   * Manages consolidation of flushes and submitting any flushes needed for
572   * a bio with REQ_PREFLUSH.  Returns true if the bio is finished or is
573   * being finished in another context.  Returns false if the flushing is
574   * complete but still needs the I/O portion of the bio to be processed.
575   */
576  bool md_flush_request(struct mddev *mddev, struct bio *bio)
577  {
578  	ktime_t req_start = ktime_get_boottime();
579  	spin_lock_irq(&mddev->lock);
580  	/* flush requests wait until ongoing flush completes,
581  	 * hence coalescing all the pending requests.
582  	 */
583  	wait_event_lock_irq(mddev->sb_wait,
584  			    !mddev->flush_bio ||
585  			    ktime_before(req_start, mddev->prev_flush_start),
586  			    mddev->lock);
587  	/* new request after previous flush is completed */
588  	if (ktime_after(req_start, mddev->prev_flush_start)) {
589  		WARN_ON(mddev->flush_bio);
590  		/*
591  		 * Grab a reference to make sure mddev_suspend() will wait for
592  		 * this flush to be done.
593  		 *
594  	 * md_flush_request() is called under md_handle_request() and
595  		 * 'active_io' is already grabbed, hence percpu_ref_is_zero()
596  		 * won't pass, percpu_ref_tryget_live() can't be used because
597  		 * percpu_ref_kill() can be called by mddev_suspend()
598  		 * concurrently.
599  		 */
600  		WARN_ON(percpu_ref_is_zero(&mddev->active_io));
601  		percpu_ref_get(&mddev->active_io);
602  		mddev->flush_bio = bio;
603  		bio = NULL;
604  	}
605  	spin_unlock_irq(&mddev->lock);
606  
607  	if (!bio) {
608  		INIT_WORK(&mddev->flush_work, submit_flushes);
609  		queue_work(md_wq, &mddev->flush_work);
610  	} else {
611  		/* flush was performed for some other bio while we waited. */
612  		if (bio->bi_iter.bi_size == 0)
613  			/* an empty barrier - all done */
614  			bio_endio(bio);
615  		else {
616  			bio->bi_opf &= ~REQ_PREFLUSH;
617  			return false;
618  		}
619  	}
620  	return true;
621  }
622  EXPORT_SYMBOL(md_flush_request);
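/*
 * Sketch of the typical caller pattern in a personality's make_request path
 * (illustrative, not copied verbatim from any particular personality):
 *
 *	if (unlikely(bio->bi_opf & REQ_PREFLUSH) &&
 *	    md_flush_request(mddev, bio))
 *		return true;
 *
 * A true return means the flush code has finished (or will finish) the bio;
 * a false return means the caller must still process the data portion, with
 * REQ_PREFLUSH already cleared.
 */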
623  
624  static inline struct mddev *mddev_get(struct mddev *mddev)
625  {
626  	lockdep_assert_held(&all_mddevs_lock);
627  
628  	if (test_bit(MD_DELETED, &mddev->flags))
629  		return NULL;
630  	atomic_inc(&mddev->active);
631  	return mddev;
632  }
633  
634  static void mddev_delayed_delete(struct work_struct *ws);
635  
636  static void __mddev_put(struct mddev *mddev)
637  {
638  	if (mddev->raid_disks || !list_empty(&mddev->disks) ||
639  	    mddev->ctime || mddev->hold_active)
640  		return;
641  
642  	/* Array is not configured at all, and not held active, so destroy it */
643  	set_bit(MD_DELETED, &mddev->flags);
644  
645  	/*
646  	 * Call queue_work inside the spinlock so that flush_workqueue() after
647  	 * mddev_find will succeed in waiting for the work to be done.
648  	 */
649  	queue_work(md_misc_wq, &mddev->del_work);
650  }
651  
652  void mddev_put(struct mddev *mddev)
653  {
654  	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
655  		return;
656  
657  	__mddev_put(mddev);
658  	spin_unlock(&all_mddevs_lock);
659  }
660  
661  static void md_safemode_timeout(struct timer_list *t);
662  static void md_start_sync(struct work_struct *ws);
663  
664  void mddev_init(struct mddev *mddev)
665  {
666  	mutex_init(&mddev->open_mutex);
667  	mutex_init(&mddev->reconfig_mutex);
668  	mutex_init(&mddev->sync_mutex);
669  	mutex_init(&mddev->bitmap_info.mutex);
670  	INIT_LIST_HEAD(&mddev->disks);
671  	INIT_LIST_HEAD(&mddev->all_mddevs);
672  	INIT_LIST_HEAD(&mddev->deleting);
673  	timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
674  	atomic_set(&mddev->active, 1);
675  	atomic_set(&mddev->openers, 0);
676  	atomic_set(&mddev->sync_seq, 0);
677  	spin_lock_init(&mddev->lock);
678  	atomic_set(&mddev->flush_pending, 0);
679  	init_waitqueue_head(&mddev->sb_wait);
680  	init_waitqueue_head(&mddev->recovery_wait);
681  	mddev->reshape_position = MaxSector;
682  	mddev->reshape_backwards = 0;
683  	mddev->last_sync_action = "none";
684  	mddev->resync_min = 0;
685  	mddev->resync_max = MaxSector;
686  	mddev->level = LEVEL_NONE;
687  
688  	INIT_WORK(&mddev->sync_work, md_start_sync);
689  	INIT_WORK(&mddev->del_work, mddev_delayed_delete);
690  }
691  EXPORT_SYMBOL_GPL(mddev_init);
692  
693  static struct mddev *mddev_find_locked(dev_t unit)
694  {
695  	struct mddev *mddev;
696  
697  	list_for_each_entry(mddev, &all_mddevs, all_mddevs)
698  		if (mddev->unit == unit)
699  			return mddev;
700  
701  	return NULL;
702  }
703  
704  /* find an unused unit number */
705  static dev_t mddev_alloc_unit(void)
706  {
707  	static int next_minor = 512;
708  	int start = next_minor;
709  	bool is_free = false;
710  	dev_t dev = 0;
711  
712  	while (!is_free) {
713  		dev = MKDEV(MD_MAJOR, next_minor);
714  		next_minor++;
715  		if (next_minor > MINORMASK)
716  			next_minor = 0;
717  		if (next_minor == start)
718  			return 0;		/* Oh dear, all in use. */
719  		is_free = !mddev_find_locked(dev);
720  	}
721  
722  	return dev;
723  }
724  
725  static struct mddev *mddev_alloc(dev_t unit)
726  {
727  	struct mddev *new;
728  	int error;
729  
730  	if (unit && MAJOR(unit) != MD_MAJOR)
731  		unit &= ~((1 << MdpMinorShift) - 1);
732  
733  	new = kzalloc(sizeof(*new), GFP_KERNEL);
734  	if (!new)
735  		return ERR_PTR(-ENOMEM);
736  	mddev_init(new);
737  
738  	spin_lock(&all_mddevs_lock);
739  	if (unit) {
740  		error = -EEXIST;
741  		if (mddev_find_locked(unit))
742  			goto out_free_new;
743  		new->unit = unit;
744  		if (MAJOR(unit) == MD_MAJOR)
745  			new->md_minor = MINOR(unit);
746  		else
747  			new->md_minor = MINOR(unit) >> MdpMinorShift;
748  		new->hold_active = UNTIL_IOCTL;
749  	} else {
750  		error = -ENODEV;
751  		new->unit = mddev_alloc_unit();
752  		if (!new->unit)
753  			goto out_free_new;
754  		new->md_minor = MINOR(new->unit);
755  		new->hold_active = UNTIL_STOP;
756  	}
757  
758  	list_add(&new->all_mddevs, &all_mddevs);
759  	spin_unlock(&all_mddevs_lock);
760  	return new;
761  out_free_new:
762  	spin_unlock(&all_mddevs_lock);
763  	kfree(new);
764  	return ERR_PTR(error);
765  }
766  
767  static void mddev_free(struct mddev *mddev)
768  {
769  	spin_lock(&all_mddevs_lock);
770  	list_del(&mddev->all_mddevs);
771  	spin_unlock(&all_mddevs_lock);
772  
773  	kfree(mddev);
774  }
775  
776  static const struct attribute_group md_redundancy_group;
777  
778  void mddev_unlock(struct mddev *mddev)
779  {
780  	struct md_rdev *rdev;
781  	struct md_rdev *tmp;
782  	LIST_HEAD(delete);
783  
784  	if (!list_empty(&mddev->deleting))
785  		list_splice_init(&mddev->deleting, &delete);
786  
787  	if (mddev->to_remove) {
788  		/* These cannot be removed under reconfig_mutex as
789  		 * an access to the files will try to take reconfig_mutex
790  		 * while holding the file unremovable, which leads to
791  		 * a deadlock.
792  		 * So set sysfs_active while the remove is happening,
793  		 * and anything else which might set ->to_remove or may
794  		 * otherwise change the sysfs namespace will fail with
795  		 * -EBUSY if sysfs_active is still set.
796  		 * We set sysfs_active under reconfig_mutex and elsewhere
797  		 * test it under the same mutex to ensure its correct value
798  		 * is seen.
799  		 */
800  		const struct attribute_group *to_remove = mddev->to_remove;
801  		mddev->to_remove = NULL;
802  		mddev->sysfs_active = 1;
803  		mutex_unlock(&mddev->reconfig_mutex);
804  
805  		if (mddev->kobj.sd) {
806  			if (to_remove != &md_redundancy_group)
807  				sysfs_remove_group(&mddev->kobj, to_remove);
808  			if (mddev->pers == NULL ||
809  			    mddev->pers->sync_request == NULL) {
810  				sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
811  				if (mddev->sysfs_action)
812  					sysfs_put(mddev->sysfs_action);
813  				if (mddev->sysfs_completed)
814  					sysfs_put(mddev->sysfs_completed);
815  				if (mddev->sysfs_degraded)
816  					sysfs_put(mddev->sysfs_degraded);
817  				mddev->sysfs_action = NULL;
818  				mddev->sysfs_completed = NULL;
819  				mddev->sysfs_degraded = NULL;
820  			}
821  		}
822  		mddev->sysfs_active = 0;
823  	} else
824  		mutex_unlock(&mddev->reconfig_mutex);
825  
826  	md_wakeup_thread(mddev->thread);
827  	wake_up(&mddev->sb_wait);
828  
829  	list_for_each_entry_safe(rdev, tmp, &delete, same_set) {
830  		list_del_init(&rdev->same_set);
831  		kobject_del(&rdev->kobj);
832  		export_rdev(rdev, mddev);
833  	}
834  }
835  EXPORT_SYMBOL_GPL(mddev_unlock);
836  
837  struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
838  {
839  	struct md_rdev *rdev;
840  
841  	rdev_for_each_rcu(rdev, mddev)
842  		if (rdev->desc_nr == nr)
843  			return rdev;
844  
845  	return NULL;
846  }
847  EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);
848  
849  static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
850  {
851  	struct md_rdev *rdev;
852  
853  	rdev_for_each(rdev, mddev)
854  		if (rdev->bdev->bd_dev == dev)
855  			return rdev;
856  
857  	return NULL;
858  }
859  
860  struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev)
861  {
862  	struct md_rdev *rdev;
863  
864  	rdev_for_each_rcu(rdev, mddev)
865  		if (rdev->bdev->bd_dev == dev)
866  			return rdev;
867  
868  	return NULL;
869  }
870  EXPORT_SYMBOL_GPL(md_find_rdev_rcu);
871  
872  static struct md_personality *find_pers(int level, char *clevel)
873  {
874  	struct md_personality *pers;
875  	list_for_each_entry(pers, &pers_list, list) {
876  		if (level != LEVEL_NONE && pers->level == level)
877  			return pers;
878  		if (strcmp(pers->name, clevel)==0)
879  			return pers;
880  	}
881  	return NULL;
882  }
883  
884  /* return the offset of the super block in 512byte sectors */
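/*
 * (MD_NEW_SIZE_SECTORS, from md_p.h, rounds the device size down to a
 * 64 KiB boundary and steps back one 64 KiB block, so the 0.90 superblock
 * occupies the last aligned 64 KiB of the device.)
 */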
885  static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
886  {
887  	return MD_NEW_SIZE_SECTORS(bdev_nr_sectors(rdev->bdev));
888  }
889  
890  static int alloc_disk_sb(struct md_rdev *rdev)
891  {
892  	rdev->sb_page = alloc_page(GFP_KERNEL);
893  	if (!rdev->sb_page)
894  		return -ENOMEM;
895  	return 0;
896  }
897  
898  void md_rdev_clear(struct md_rdev *rdev)
899  {
900  	if (rdev->sb_page) {
901  		put_page(rdev->sb_page);
902  		rdev->sb_loaded = 0;
903  		rdev->sb_page = NULL;
904  		rdev->sb_start = 0;
905  		rdev->sectors = 0;
906  	}
907  	if (rdev->bb_page) {
908  		put_page(rdev->bb_page);
909  		rdev->bb_page = NULL;
910  	}
911  	badblocks_exit(&rdev->badblocks);
912  }
913  EXPORT_SYMBOL_GPL(md_rdev_clear);
914  
915  static void super_written(struct bio *bio)
916  {
917  	struct md_rdev *rdev = bio->bi_private;
918  	struct mddev *mddev = rdev->mddev;
919  
920  	if (bio->bi_status) {
921  		pr_err("md: %s gets error=%d\n", __func__,
922  		       blk_status_to_errno(bio->bi_status));
923  		md_error(mddev, rdev);
924  		if (!test_bit(Faulty, &rdev->flags)
925  		    && (bio->bi_opf & MD_FAILFAST)) {
926  			set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags);
927  			set_bit(LastDev, &rdev->flags);
928  		}
929  	} else
930  		clear_bit(LastDev, &rdev->flags);
931  
932  	bio_put(bio);
933  
934  	rdev_dec_pending(rdev, mddev);
935  
936  	if (atomic_dec_and_test(&mddev->pending_writes))
937  		wake_up(&mddev->sb_wait);
938  }
939  
940  void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
941  		   sector_t sector, int size, struct page *page)
942  {
943  	/* write first size bytes of page to sector of rdev
944  	 * Increment mddev->pending_writes before returning
945  	 * and decrement it on completion, waking up sb_wait
946  	 * if zero is reached.
947  	 * If an error occurred, call md_error
948  	 */
949  	struct bio *bio;
950  
951  	if (!page)
952  		return;
953  
954  	if (test_bit(Faulty, &rdev->flags))
955  		return;
956  
957  	bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev,
958  			      1,
959  			      REQ_OP_WRITE | REQ_SYNC | REQ_IDLE | REQ_META
960  				  | REQ_PREFLUSH | REQ_FUA,
961  			      GFP_NOIO, &mddev->sync_set);
962  
963  	atomic_inc(&rdev->nr_pending);
964  
965  	bio->bi_iter.bi_sector = sector;
966  	__bio_add_page(bio, page, size, 0);
967  	bio->bi_private = rdev;
968  	bio->bi_end_io = super_written;
969  
970  	if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) &&
971  	    test_bit(FailFast, &rdev->flags) &&
972  	    !test_bit(LastDev, &rdev->flags))
973  		bio->bi_opf |= MD_FAILFAST;
974  
975  	atomic_inc(&mddev->pending_writes);
976  	submit_bio(bio);
977  }
978  
979  int md_super_wait(struct mddev *mddev)
980  {
981  	/* wait for all superblock writes that were scheduled to complete */
982  	wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
983  	if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags))
984  		return -EAGAIN;
985  	return 0;
986  }
987  
988  int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
989  		 struct page *page, blk_opf_t opf, bool metadata_op)
990  {
991  	struct bio bio;
992  	struct bio_vec bvec;
993  
994  	if (metadata_op && rdev->meta_bdev)
995  		bio_init(&bio, rdev->meta_bdev, &bvec, 1, opf);
996  	else
997  		bio_init(&bio, rdev->bdev, &bvec, 1, opf);
998  
999  	if (metadata_op)
1000  		bio.bi_iter.bi_sector = sector + rdev->sb_start;
1001  	else if (rdev->mddev->reshape_position != MaxSector &&
1002  		 (rdev->mddev->reshape_backwards ==
1003  		  (sector >= rdev->mddev->reshape_position)))
1004  		bio.bi_iter.bi_sector = sector + rdev->new_data_offset;
1005  	else
1006  		bio.bi_iter.bi_sector = sector + rdev->data_offset;
1007  	__bio_add_page(&bio, page, size, 0);
1008  
1009  	submit_bio_wait(&bio);
1010  
1011  	return !bio.bi_status;
1012  }
1013  EXPORT_SYMBOL_GPL(sync_page_io);
1014  
1015  static int read_disk_sb(struct md_rdev *rdev, int size)
1016  {
1017  	if (rdev->sb_loaded)
1018  		return 0;
1019  
1020  	if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, true))
1021  		goto fail;
1022  	rdev->sb_loaded = 1;
1023  	return 0;
1024  
1025  fail:
1026  	pr_err("md: disabled device %pg, could not read superblock.\n",
1027  	       rdev->bdev);
1028  	return -EINVAL;
1029  }
1030  
1031  static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
1032  {
1033  	return	sb1->set_uuid0 == sb2->set_uuid0 &&
1034  		sb1->set_uuid1 == sb2->set_uuid1 &&
1035  		sb1->set_uuid2 == sb2->set_uuid2 &&
1036  		sb1->set_uuid3 == sb2->set_uuid3;
1037  }
1038  
1039  static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
1040  {
1041  	int ret;
1042  	mdp_super_t *tmp1, *tmp2;
1043  
1044  	tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
1045  	tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
1046  
1047  	if (!tmp1 || !tmp2) {
1048  		ret = 0;
1049  		goto abort;
1050  	}
1051  
1052  	*tmp1 = *sb1;
1053  	*tmp2 = *sb2;
1054  
1055  	/*
1056  	 * nr_disks is not constant
1057  	 */
1058  	tmp1->nr_disks = 0;
1059  	tmp2->nr_disks = 0;
1060  
1061  	ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
1062  abort:
1063  	kfree(tmp1);
1064  	kfree(tmp2);
1065  	return ret;
1066  }
1067  
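/*
 * Fold a 32-bit sum down to 16 bits by adding the high and low halves; the
 * second pass propagates any carry from the first.  Folding both sides
 * before comparison hides historical differences in how the on-disk 0.90
 * checksum was computed (see the comment in calc_sb_csum()).
 */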
1068  static u32 md_csum_fold(u32 csum)
1069  {
1070  	csum = (csum & 0xffff) + (csum >> 16);
1071  	return (csum & 0xffff) + (csum >> 16);
1072  }
1073  
1074  static unsigned int calc_sb_csum(mdp_super_t *sb)
1075  {
1076  	u64 newcsum = 0;
1077  	u32 *sb32 = (u32*)sb;
1078  	int i;
1079  	unsigned int disk_csum, csum;
1080  
1081  	disk_csum = sb->sb_csum;
1082  	sb->sb_csum = 0;
1083  
1084  	for (i = 0; i < MD_SB_BYTES/4 ; i++)
1085  		newcsum += sb32[i];
1086  	csum = (newcsum & 0xffffffff) + (newcsum>>32);
1087  
1088  #ifdef CONFIG_ALPHA
1089  	/* This used to use csum_partial, which was wrong for several
1090  	 * reasons including that different results are returned on
1091  	 * different architectures.  It isn't critical that we get exactly
1092  	 * the same return value as before (we always csum_fold before
1093  	 * testing, and that removes any differences).  However as we
1094  	 * know that csum_partial always returned a 16bit value on
1095  	 * alphas, do a fold to maximise conformity to previous behaviour.
1096  	 */
1097  	sb->sb_csum = md_csum_fold(disk_csum);
1098  #else
1099  	sb->sb_csum = disk_csum;
1100  #endif
1101  	return csum;
1102  }
1103  
1104  /*
1105   * Handle superblock details.
1106   * We want to be able to handle multiple superblock formats
1107   * so we have a common interface to them all, and an array of
1108   * different handlers.
1109   * We rely on user-space to write the initial superblock, and support
1110   * reading and updating of superblocks.
1111   * Interface methods are:
1112   *   int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
1113   *      loads and validates a superblock on dev.
1114   *      if refdev != NULL, compare superblocks on both devices
1115   *    Return:
1116   *      0 - dev has a superblock that is compatible with refdev
1117   *      1 - dev has a superblock that is compatible and newer than refdev
1118   *          so dev should be used as the refdev in future
1119   *     -EINVAL superblock incompatible or invalid
1120   *     -othererror e.g. -EIO
1121   *
1122   *   int validate_super(struct mddev *mddev, struct md_rdev *dev)
1123   *      Verify that dev is acceptable into mddev.
1124   *       The first time, mddev->raid_disks will be 0, and data from
1125   *       dev should be merged in.  Subsequent calls check that dev
1126   *       is new enough.  Return 0 or -EINVAL
1127   *
1128   *   void sync_super(struct mddev *mddev, struct md_rdev *dev)
1129   *     Update the superblock for rdev with data in mddev
1130   *     This does not write to disc.
1131   *
1132   */
1133  
1134  struct super_type  {
1135  	char		    *name;
1136  	struct module	    *owner;
1137  	int		    (*load_super)(struct md_rdev *rdev,
1138  					  struct md_rdev *refdev,
1139  					  int minor_version);
1140  	int		    (*validate_super)(struct mddev *mddev,
1141  					      struct md_rdev *freshest,
1142  					      struct md_rdev *rdev);
1143  	void		    (*sync_super)(struct mddev *mddev,
1144  					  struct md_rdev *rdev);
1145  	unsigned long long  (*rdev_size_change)(struct md_rdev *rdev,
1146  						sector_t num_sectors);
1147  	int		    (*allow_new_offset)(struct md_rdev *rdev,
1148  						unsigned long long new_offset);
1149  };
1150  
1151  /*
1152   * Check that the given mddev has no bitmap.
1153   *
1154   * This function is called from the run method of all personalities that do not
1155   * support bitmaps. It prints an error message and returns non-zero if mddev
1156   * has a bitmap. Otherwise, it returns 0.
1157   *
1158   */
1159  int md_check_no_bitmap(struct mddev *mddev)
1160  {
1161  	if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
1162  		return 0;
1163  	pr_warn("%s: bitmaps are not supported for %s\n",
1164  		mdname(mddev), mddev->pers->name);
1165  	return 1;
1166  }
1167  EXPORT_SYMBOL(md_check_no_bitmap);
1168  
1169  /*
1170   * load_super for 0.90.0
1171   */
1172  static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1173  {
1174  	mdp_super_t *sb;
1175  	int ret;
1176  	bool spare_disk = true;
1177  
1178  	/*
1179  	 * Calculate the position of the superblock (512byte sectors),
1180  	 * Calculate the position of the superblock (in 512-byte sectors);
1181  	 *
1182  	 * It also happens to be a multiple of 4Kb.
1183  	 */
1184  	rdev->sb_start = calc_dev_sboffset(rdev);
1185  
1186  	ret = read_disk_sb(rdev, MD_SB_BYTES);
1187  	if (ret)
1188  		return ret;
1189  
1190  	ret = -EINVAL;
1191  
1192  	sb = page_address(rdev->sb_page);
1193  
1194  	if (sb->md_magic != MD_SB_MAGIC) {
1195  		pr_warn("md: invalid raid superblock magic on %pg\n",
1196  			rdev->bdev);
1197  		goto abort;
1198  	}
1199  
1200  	if (sb->major_version != 0 ||
1201  	    sb->minor_version < 90 ||
1202  	    sb->minor_version > 91) {
1203  		pr_warn("Bad version number %d.%d on %pg\n",
1204  			sb->major_version, sb->minor_version, rdev->bdev);
1205  		goto abort;
1206  	}
1207  
1208  	if (sb->raid_disks <= 0)
1209  		goto abort;
1210  
1211  	if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
1212  		pr_warn("md: invalid superblock checksum on %pg\n", rdev->bdev);
1213  		goto abort;
1214  	}
1215  
1216  	rdev->preferred_minor = sb->md_minor;
1217  	rdev->data_offset = 0;
1218  	rdev->new_data_offset = 0;
1219  	rdev->sb_size = MD_SB_BYTES;
1220  	rdev->badblocks.shift = -1;
1221  
1222  	if (sb->level == LEVEL_MULTIPATH)
1223  		rdev->desc_nr = -1;
1224  	else
1225  		rdev->desc_nr = sb->this_disk.number;
1226  
1227  	/* not spare disk, or LEVEL_MULTIPATH */
1228  	if (sb->level == LEVEL_MULTIPATH ||
1229  		(rdev->desc_nr >= 0 &&
1230  		 rdev->desc_nr < MD_SB_DISKS &&
1231  		 sb->disks[rdev->desc_nr].state &
1232  		 ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))))
1233  		spare_disk = false;
1234  
1235  	if (!refdev) {
1236  		if (!spare_disk)
1237  			ret = 1;
1238  		else
1239  			ret = 0;
1240  	} else {
1241  		__u64 ev1, ev2;
1242  		mdp_super_t *refsb = page_address(refdev->sb_page);
1243  		if (!md_uuid_equal(refsb, sb)) {
1244  			pr_warn("md: %pg has different UUID to %pg\n",
1245  				rdev->bdev, refdev->bdev);
1246  			goto abort;
1247  		}
1248  		if (!md_sb_equal(refsb, sb)) {
1249  			pr_warn("md: %pg has same UUID but different superblock to %pg\n",
1250  				rdev->bdev, refdev->bdev);
1251  			goto abort;
1252  		}
1253  		ev1 = md_event(sb);
1254  		ev2 = md_event(refsb);
1255  
1256  		if (!spare_disk && ev1 > ev2)
1257  			ret = 1;
1258  		else
1259  			ret = 0;
1260  	}
1261  	rdev->sectors = rdev->sb_start;
1262  	/* Limit to 4TB as metadata cannot record more than that.
1263  	 * (not needed for Linear and RAID0 as metadata doesn't
1264  	 * record this size)
1265  	 */
1266  	if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1)
1267  		rdev->sectors = (sector_t)(2ULL << 32) - 2;
1268  
1269  	if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
1270  		/* "this cannot possibly happen" ... */
1271  		ret = -EINVAL;
1272  
1273   abort:
1274  	return ret;
1275  }
1276  
1277  /*
1278   * validate_super for 0.90.0
1279   * note: we are not using "freshest" for 0.9 superblock
1280   */
1281  static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev)
1282  {
1283  	mdp_disk_t *desc;
1284  	mdp_super_t *sb = page_address(rdev->sb_page);
1285  	__u64 ev1 = md_event(sb);
1286  
1287  	rdev->raid_disk = -1;
1288  	clear_bit(Faulty, &rdev->flags);
1289  	clear_bit(In_sync, &rdev->flags);
1290  	clear_bit(Bitmap_sync, &rdev->flags);
1291  	clear_bit(WriteMostly, &rdev->flags);
1292  
1293  	if (mddev->raid_disks == 0) {
1294  		mddev->major_version = 0;
1295  		mddev->minor_version = sb->minor_version;
1296  		mddev->patch_version = sb->patch_version;
1297  		mddev->external = 0;
1298  		mddev->chunk_sectors = sb->chunk_size >> 9;
1299  		mddev->ctime = sb->ctime;
1300  		mddev->utime = sb->utime;
1301  		mddev->level = sb->level;
1302  		mddev->clevel[0] = 0;
1303  		mddev->layout = sb->layout;
1304  		mddev->raid_disks = sb->raid_disks;
1305  		mddev->dev_sectors = ((sector_t)sb->size) * 2;
1306  		mddev->events = ev1;
1307  		mddev->bitmap_info.offset = 0;
1308  		mddev->bitmap_info.space = 0;
1309  		/* bitmap can use 60 K after the 4K superblocks */
1310  		mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
1311  		mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
1312  		mddev->reshape_backwards = 0;
1313  
1314  		if (mddev->minor_version >= 91) {
1315  			mddev->reshape_position = sb->reshape_position;
1316  			mddev->delta_disks = sb->delta_disks;
1317  			mddev->new_level = sb->new_level;
1318  			mddev->new_layout = sb->new_layout;
1319  			mddev->new_chunk_sectors = sb->new_chunk >> 9;
1320  			if (mddev->delta_disks < 0)
1321  				mddev->reshape_backwards = 1;
1322  		} else {
1323  			mddev->reshape_position = MaxSector;
1324  			mddev->delta_disks = 0;
1325  			mddev->new_level = mddev->level;
1326  			mddev->new_layout = mddev->layout;
1327  			mddev->new_chunk_sectors = mddev->chunk_sectors;
1328  		}
1329  		if (mddev->level == 0)
1330  			mddev->layout = -1;
1331  
1332  		if (sb->state & (1<<MD_SB_CLEAN))
1333  			mddev->recovery_cp = MaxSector;
1334  		else {
1335  			if (sb->events_hi == sb->cp_events_hi &&
1336  				sb->events_lo == sb->cp_events_lo) {
1337  				mddev->recovery_cp = sb->recovery_cp;
1338  			} else
1339  				mddev->recovery_cp = 0;
1340  		}
1341  
1342  		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
1343  		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
1344  		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
1345  		memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
1346  
1347  		mddev->max_disks = MD_SB_DISKS;
1348  
1349  		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
1350  		    mddev->bitmap_info.file == NULL) {
1351  			mddev->bitmap_info.offset =
1352  				mddev->bitmap_info.default_offset;
1353  			mddev->bitmap_info.space =
1354  				mddev->bitmap_info.default_space;
1355  		}
1356  
1357  	} else if (mddev->pers == NULL) {
1358  		/* Insist on good event counter while assembling, except
1359  		 * for spares (which don't need an event count) */
1360  		++ev1;
1361  		if (sb->disks[rdev->desc_nr].state & (
1362  			    (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
1363  			if (ev1 < mddev->events)
1364  				return -EINVAL;
1365  	} else if (mddev->bitmap) {
1366  		/* if adding to array with a bitmap, then we can accept an
1367  		 * older device ... but not too old.
1368  		 */
1369  		if (ev1 < mddev->bitmap->events_cleared)
1370  			return 0;
1371  		if (ev1 < mddev->events)
1372  			set_bit(Bitmap_sync, &rdev->flags);
1373  	} else {
1374  		if (ev1 < mddev->events)
1375  			/* just a hot-add of a new device, leave raid_disk at -1 */
1376  			return 0;
1377  	}
1378  
1379  	if (mddev->level != LEVEL_MULTIPATH) {
1380  		desc = sb->disks + rdev->desc_nr;
1381  
1382  		if (desc->state & (1<<MD_DISK_FAULTY))
1383  			set_bit(Faulty, &rdev->flags);
1384  		else if (desc->state & (1<<MD_DISK_SYNC) /* &&
1385  			    desc->raid_disk < mddev->raid_disks */) {
1386  			set_bit(In_sync, &rdev->flags);
1387  			rdev->raid_disk = desc->raid_disk;
1388  			rdev->saved_raid_disk = desc->raid_disk;
1389  		} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
1390  			/* active but not in sync implies recovery up to
1391  			 * reshape position.  We don't know exactly where
1392  			 * that is, so set to zero for now */
1393  			if (mddev->minor_version >= 91) {
1394  				rdev->recovery_offset = 0;
1395  				rdev->raid_disk = desc->raid_disk;
1396  			}
1397  		}
1398  		if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
1399  			set_bit(WriteMostly, &rdev->flags);
1400  		if (desc->state & (1<<MD_DISK_FAILFAST))
1401  			set_bit(FailFast, &rdev->flags);
1402  	} else /* MULTIPATH are always insync */
1403  		set_bit(In_sync, &rdev->flags);
1404  	return 0;
1405  }
1406  
1407  /*
1408   * sync_super for 0.90.0
1409   */
1410  static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
1411  {
1412  	mdp_super_t *sb;
1413  	struct md_rdev *rdev2;
1414  	int next_spare = mddev->raid_disks;
1415  
1416  	/* make rdev->sb match mddev data..
1417  	 *
1418  	 * 1/ zero out disks
1419  	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
1420  	 * 3/ any empty disks < next_spare become removed
1421  	 *
1422  	 * disks[0] gets initialised to REMOVED because
1423  	 * we cannot be sure from other fields if it has
1424  	 * been initialised or not.
1425  	 */
1426  	int i;
1427  	int active=0, working=0,failed=0,spare=0,nr_disks=0;
1428  
1429  	rdev->sb_size = MD_SB_BYTES;
1430  
1431  	sb = page_address(rdev->sb_page);
1432  
1433  	memset(sb, 0, sizeof(*sb));
1434  
1435  	sb->md_magic = MD_SB_MAGIC;
1436  	sb->major_version = mddev->major_version;
1437  	sb->patch_version = mddev->patch_version;
1438  	sb->gvalid_words  = 0; /* ignored */
1439  	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
1440  	memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
1441  	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
1442  	memcpy(&sb->set_uuid3, mddev->uuid+12,4);
1443  
1444  	sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
1445  	sb->level = mddev->level;
1446  	sb->size = mddev->dev_sectors / 2;
1447  	sb->raid_disks = mddev->raid_disks;
1448  	sb->md_minor = mddev->md_minor;
1449  	sb->not_persistent = 0;
1450  	sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
1451  	sb->state = 0;
1452  	sb->events_hi = (mddev->events>>32);
1453  	sb->events_lo = (u32)mddev->events;
1454  
1455  	if (mddev->reshape_position == MaxSector)
1456  		sb->minor_version = 90;
1457  	else {
1458  		sb->minor_version = 91;
1459  		sb->reshape_position = mddev->reshape_position;
1460  		sb->new_level = mddev->new_level;
1461  		sb->delta_disks = mddev->delta_disks;
1462  		sb->new_layout = mddev->new_layout;
1463  		sb->new_chunk = mddev->new_chunk_sectors << 9;
1464  	}
1465  	mddev->minor_version = sb->minor_version;
1466  	if (mddev->in_sync)
1467  	{
1468  		sb->recovery_cp = mddev->recovery_cp;
1469  		sb->cp_events_hi = (mddev->events>>32);
1470  		sb->cp_events_lo = (u32)mddev->events;
1471  		if (mddev->recovery_cp == MaxSector)
1472  			sb->state = (1<< MD_SB_CLEAN);
1473  	} else
1474  		sb->recovery_cp = 0;
1475  
1476  	sb->layout = mddev->layout;
1477  	sb->chunk_size = mddev->chunk_sectors << 9;
1478  
1479  	if (mddev->bitmap && mddev->bitmap_info.file == NULL)
1480  		sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1481  
1482  	sb->disks[0].state = (1<<MD_DISK_REMOVED);
1483  	rdev_for_each(rdev2, mddev) {
1484  		mdp_disk_t *d;
1485  		int desc_nr;
1486  		int is_active = test_bit(In_sync, &rdev2->flags);
1487  
1488  		if (rdev2->raid_disk >= 0 &&
1489  		    sb->minor_version >= 91)
1490  			/* we have nowhere to store the recovery_offset,
1491  			 * but if it is not below the reshape_position,
1492  			 * we can piggy-back on that.
1493  			 */
1494  			is_active = 1;
1495  		if (rdev2->raid_disk < 0 ||
1496  		    test_bit(Faulty, &rdev2->flags))
1497  			is_active = 0;
1498  		if (is_active)
1499  			desc_nr = rdev2->raid_disk;
1500  		else
1501  			desc_nr = next_spare++;
1502  		rdev2->desc_nr = desc_nr;
1503  		d = &sb->disks[rdev2->desc_nr];
1504  		nr_disks++;
1505  		d->number = rdev2->desc_nr;
1506  		d->major = MAJOR(rdev2->bdev->bd_dev);
1507  		d->minor = MINOR(rdev2->bdev->bd_dev);
1508  		if (is_active)
1509  			d->raid_disk = rdev2->raid_disk;
1510  		else
1511  			d->raid_disk = rdev2->desc_nr; /* compatibility */
1512  		if (test_bit(Faulty, &rdev2->flags))
1513  			d->state = (1<<MD_DISK_FAULTY);
1514  		else if (is_active) {
1515  			d->state = (1<<MD_DISK_ACTIVE);
1516  			if (test_bit(In_sync, &rdev2->flags))
1517  				d->state |= (1<<MD_DISK_SYNC);
1518  			active++;
1519  			working++;
1520  		} else {
1521  			d->state = 0;
1522  			spare++;
1523  			working++;
1524  		}
1525  		if (test_bit(WriteMostly, &rdev2->flags))
1526  			d->state |= (1<<MD_DISK_WRITEMOSTLY);
1527  		if (test_bit(FailFast, &rdev2->flags))
1528  			d->state |= (1<<MD_DISK_FAILFAST);
1529  	}
1530  	/* now set the "removed" and "faulty" bits on any missing devices */
1531  	for (i=0 ; i < mddev->raid_disks ; i++) {
1532  		mdp_disk_t *d = &sb->disks[i];
1533  		if (d->state == 0 && d->number == 0) {
1534  			d->number = i;
1535  			d->raid_disk = i;
1536  			d->state = (1<<MD_DISK_REMOVED);
1537  			d->state |= (1<<MD_DISK_FAULTY);
1538  			failed++;
1539  		}
1540  	}
1541  	sb->nr_disks = nr_disks;
1542  	sb->active_disks = active;
1543  	sb->working_disks = working;
1544  	sb->failed_disks = failed;
1545  	sb->spare_disks = spare;
1546  
1547  	sb->this_disk = sb->disks[rdev->desc_nr];
1548  	sb->sb_csum = calc_sb_csum(sb);
1549  }
1550  
1551  /*
1552   * rdev_size_change for 0.90.0
1553   */
1554  static unsigned long long
1555  super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1556  {
1557  	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1558  		return 0; /* component must fit device */
1559  	if (rdev->mddev->bitmap_info.offset)
1560  		return 0; /* can't move bitmap */
1561  	rdev->sb_start = calc_dev_sboffset(rdev);
1562  	if (!num_sectors || num_sectors > rdev->sb_start)
1563  		num_sectors = rdev->sb_start;
1564  	/* Limit to 4TB as metadata cannot record more than that.
1565  	 * 4TB == 2^32 KB, or 2*2^32 sectors.
1566  	 */
1567  	if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
1568  		num_sectors = (sector_t)(2ULL << 32) - 2;
1569  	do {
1570  		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1571  		       rdev->sb_page);
1572  	} while (md_super_wait(rdev->mddev) < 0);
1573  	return num_sectors;
1574  }
1575  
1576  static int
1577  super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
1578  {
1579  	/* non-zero offset changes not possible with v0.90 */
1580  	return new_offset == 0;
1581  }
1582  
1583  /*
1584   * version 1 superblock
1585   */
1586  
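/*
 * Checksum the v1.x superblock: a 256-byte fixed header plus a 2-byte role
 * entry per device (hence 256 + max_dev * 2 bytes), summed as little-endian
 * 32-bit words with the sb_csum field itself treated as zero.
 */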
1587  static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
1588  {
1589  	__le32 disk_csum;
1590  	u32 csum;
1591  	unsigned long long newcsum;
1592  	int size = 256 + le32_to_cpu(sb->max_dev)*2;
1593  	__le32 *isuper = (__le32*)sb;
1594  
1595  	disk_csum = sb->sb_csum;
1596  	sb->sb_csum = 0;
1597  	newcsum = 0;
1598  	for (; size >= 4; size -= 4)
1599  		newcsum += le32_to_cpu(*isuper++);
1600  
1601  	if (size == 2)
1602  		newcsum += le16_to_cpu(*(__le16*) isuper);
1603  
1604  	csum = (newcsum & 0xffffffff) + (newcsum >> 32);
1605  	sb->sb_csum = disk_csum;
1606  	return cpu_to_le32(csum);
1607  }
1608  
1609  static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1610  {
1611  	struct mdp_superblock_1 *sb;
1612  	int ret;
1613  	sector_t sb_start;
1614  	sector_t sectors;
1615  	int bmask;
1616  	bool spare_disk = true;
1617  
1618  	/*
1619  	 * Calculate the position of the superblock in 512byte sectors.
1620  	 * It is always aligned to a 4K boundary and
1621  	 * depending on minor_version, it can be:
1622  	 * 0: At least 8K, but less than 12K, from end of device
1623  	 * 1: At start of device
1624  	 * 2: 4K from start of device.
1625  	 */
1626  	switch(minor_version) {
1627  	case 0:
1628  		sb_start = bdev_nr_sectors(rdev->bdev) - 8 * 2;
1629  		sb_start &= ~(sector_t)(4*2-1);
1630  		break;
1631  	case 1:
1632  		sb_start = 0;
1633  		break;
1634  	case 2:
1635  		sb_start = 8;
1636  		break;
1637  	default:
1638  		return -EINVAL;
1639  	}
1640  	rdev->sb_start = sb_start;
1641  
1642  	/* superblock is rarely larger than 1K, but it can be larger,
1643  	 * and it is safe to read 4k, so we do that
1644  	 */
1645  	ret = read_disk_sb(rdev, 4096);
1646  	if (ret) return ret;
1647  
1648  	sb = page_address(rdev->sb_page);
1649  
1650  	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1651  	    sb->major_version != cpu_to_le32(1) ||
1652  	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1653  	    le64_to_cpu(sb->super_offset) != rdev->sb_start ||
1654  	    (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1655  		return -EINVAL;
1656  
1657  	if (calc_sb_1_csum(sb) != sb->sb_csum) {
1658  		pr_warn("md: invalid superblock checksum on %pg\n",
1659  			rdev->bdev);
1660  		return -EINVAL;
1661  	}
1662  	if (le64_to_cpu(sb->data_size) < 10) {
1663  		pr_warn("md: data_size too small on %pg\n",
1664  			rdev->bdev);
1665  		return -EINVAL;
1666  	}
1667  	if (sb->pad0 ||
1668  	    sb->pad3[0] ||
1669  	    memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
1670  		/* Some padding is non-zero, might be a new feature */
1671  		return -EINVAL;
1672  
1673  	rdev->preferred_minor = 0xffff;
1674  	rdev->data_offset = le64_to_cpu(sb->data_offset);
1675  	rdev->new_data_offset = rdev->data_offset;
1676  	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
1677  	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
1678  		rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
1679  	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1680  
1681  	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
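	/* round sb_size up to a multiple of the device's logical block size */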
1682  	bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1683  	if (rdev->sb_size & bmask)
1684  		rdev->sb_size = (rdev->sb_size | bmask) + 1;
1685  
1686  	if (minor_version
1687  	    && rdev->data_offset < sb_start + (rdev->sb_size/512))
1688  		return -EINVAL;
1689  	if (minor_version
1690  	    && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
1691  		return -EINVAL;
1692  
1693  	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
1694  		rdev->desc_nr = -1;
1695  	else
1696  		rdev->desc_nr = le32_to_cpu(sb->dev_number);
1697  
1698  	if (!rdev->bb_page) {
1699  		rdev->bb_page = alloc_page(GFP_KERNEL);
1700  		if (!rdev->bb_page)
1701  			return -ENOMEM;
1702  	}
1703  	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
1704  	    rdev->badblocks.count == 0) {
1705  		/* need to load the bad block list.
1706  		 * Currently we limit it to one page.
1707  		 */
1708  		s32 offset;
1709  		sector_t bb_sector;
1710  		__le64 *bbp;
1711  		int i;
1712  		int sectors = le16_to_cpu(sb->bblog_size);
1713  		if (sectors > (PAGE_SIZE / 512))
1714  			return -EINVAL;
1715  		offset = le32_to_cpu(sb->bblog_offset);
1716  		if (offset == 0)
1717  			return -EINVAL;
1718  		bb_sector = (long long)offset;
1719  		if (!sync_page_io(rdev, bb_sector, sectors << 9,
1720  				  rdev->bb_page, REQ_OP_READ, true))
1721  			return -EIO;
1722  		bbp = (__le64 *)page_address(rdev->bb_page);
1723  		rdev->badblocks.shift = sb->bblog_shift;
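		/*
		 * Each 64-bit entry packs the start sector in the upper 54
		 * bits and the length in the low 10 bits; an all-ones entry
		 * marks the end of the list.
		 */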
1724  		for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
1725  			u64 bb = le64_to_cpu(*bbp);
1726  			int count = bb & (0x3ff);
1727  			u64 sector = bb >> 10;
1728  			sector <<= sb->bblog_shift;
1729  			count <<= sb->bblog_shift;
1730  			if (bb + 1 == 0)
1731  				break;
1732  			if (badblocks_set(&rdev->badblocks, sector, count, 1))
1733  				return -EINVAL;
1734  		}
1735  	} else if (sb->bblog_offset != 0)
1736  		rdev->badblocks.shift = 0;
1737  
1738  	if ((le32_to_cpu(sb->feature_map) &
1739  	    (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) {
1740  		rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset);
1741  		rdev->ppl.size = le16_to_cpu(sb->ppl.size);
1742  		rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
1743  	}
1744  
1745  	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) &&
1746  	    sb->level != 0)
1747  		return -EINVAL;
1748  
1749  	/* not spare disk, or LEVEL_MULTIPATH */
1750  	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH) ||
1751  		(rdev->desc_nr >= 0 &&
1752  		rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1753  		(le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
1754  		 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)))
1755  		spare_disk = false;
1756  
1757  	if (!refdev) {
1758  		if (!spare_disk)
1759  			ret = 1;
1760  		else
1761  			ret = 0;
1762  	} else {
1763  		__u64 ev1, ev2;
1764  		struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
1765  
1766  		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1767  		    sb->level != refsb->level ||
1768  		    sb->layout != refsb->layout ||
1769  		    sb->chunksize != refsb->chunksize) {
1770  			pr_warn("md: %pg has strangely different superblock to %pg\n",
1771  				rdev->bdev,
1772  				refdev->bdev);
1773  			return -EINVAL;
1774  		}
1775  		ev1 = le64_to_cpu(sb->events);
1776  		ev2 = le64_to_cpu(refsb->events);
1777  
1778  		if (!spare_disk && ev1 > ev2)
1779  			ret = 1;
1780  		else
1781  			ret = 0;
1782  	}
1783  	if (minor_version)
1784  		sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset;
1785  	else
1786  		sectors = rdev->sb_start;
1787  	if (sectors < le64_to_cpu(sb->data_size))
1788  		return -EINVAL;
1789  	rdev->sectors = le64_to_cpu(sb->data_size);
1790  	return ret;
1791  }
1792  
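/*
 * Initialise @rdev (and, for the first device of an array being assembled,
 * @mddev itself) from the v1.x superblock previously loaded into
 * rdev->sb_page.  @freshest, when set, is the member with the highest event
 * count and is consulted for this device's role if its own events lag behind.
 */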
1793  static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev)
1794  {
1795  	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1796  	__u64 ev1 = le64_to_cpu(sb->events);
1797  
1798  	rdev->raid_disk = -1;
1799  	clear_bit(Faulty, &rdev->flags);
1800  	clear_bit(In_sync, &rdev->flags);
1801  	clear_bit(Bitmap_sync, &rdev->flags);
1802  	clear_bit(WriteMostly, &rdev->flags);
1803  
1804  	if (mddev->raid_disks == 0) {
1805  		mddev->major_version = 1;
1806  		mddev->patch_version = 0;
1807  		mddev->external = 0;
1808  		mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
1809  		mddev->ctime = le64_to_cpu(sb->ctime);
1810  		mddev->utime = le64_to_cpu(sb->utime);
1811  		mddev->level = le32_to_cpu(sb->level);
1812  		mddev->clevel[0] = 0;
1813  		mddev->layout = le32_to_cpu(sb->layout);
1814  		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1815  		mddev->dev_sectors = le64_to_cpu(sb->size);
1816  		mddev->events = ev1;
1817  		mddev->bitmap_info.offset = 0;
1818  		mddev->bitmap_info.space = 0;
1819  		/* Default location for bitmap is 1K after superblock
1820  		 * using 3K - total of 4K
1821  		 */
1822  		mddev->bitmap_info.default_offset = 1024 >> 9;
1823  		mddev->bitmap_info.default_space = (4096-1024) >> 9;
1824  		mddev->reshape_backwards = 0;
1825  
1826  		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1827  		memcpy(mddev->uuid, sb->set_uuid, 16);
1828  
1829  		mddev->max_disks =  (4096-256)/2;
1830  
1831  		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1832  		    mddev->bitmap_info.file == NULL) {
1833  			mddev->bitmap_info.offset =
1834  				(__s32)le32_to_cpu(sb->bitmap_offset);
1835  			/* Metadata doesn't record how much space is available.
1836  			 * For 1.0, we assume we can use up to the superblock
1837  			 * if the bitmap is before it, else up to 4K beyond the superblock.
1838  			 * For others, assume no change is possible.
1839  			 */
1840  			if (mddev->minor_version > 0)
1841  				mddev->bitmap_info.space = 0;
1842  			else if (mddev->bitmap_info.offset > 0)
1843  				mddev->bitmap_info.space =
1844  					8 - mddev->bitmap_info.offset;
1845  			else
1846  				mddev->bitmap_info.space =
1847  					-mddev->bitmap_info.offset;
1848  		}
1849  
1850  		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1851  			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1852  			mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1853  			mddev->new_level = le32_to_cpu(sb->new_level);
1854  			mddev->new_layout = le32_to_cpu(sb->new_layout);
1855  			mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
1856  			if (mddev->delta_disks < 0 ||
1857  			    (mddev->delta_disks == 0 &&
1858  			     (le32_to_cpu(sb->feature_map)
1859  			      & MD_FEATURE_RESHAPE_BACKWARDS)))
1860  				mddev->reshape_backwards = 1;
1861  		} else {
1862  			mddev->reshape_position = MaxSector;
1863  			mddev->delta_disks = 0;
1864  			mddev->new_level = mddev->level;
1865  			mddev->new_layout = mddev->layout;
1866  			mddev->new_chunk_sectors = mddev->chunk_sectors;
1867  		}
1868  
1869  		if (mddev->level == 0 &&
1870  		    !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT))
1871  			mddev->layout = -1;
1872  
1873  		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
1874  			set_bit(MD_HAS_JOURNAL, &mddev->flags);
1875  
1876  		if (le32_to_cpu(sb->feature_map) &
1877  		    (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) {
1878  			if (le32_to_cpu(sb->feature_map) &
1879  			    (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL))
1880  				return -EINVAL;
1881  			if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) &&
1882  			    (le32_to_cpu(sb->feature_map) &
1883  					    MD_FEATURE_MULTIPLE_PPLS))
1884  				return -EINVAL;
1885  			set_bit(MD_HAS_PPL, &mddev->flags);
1886  		}
1887  	} else if (mddev->pers == NULL) {
1888  		/* Insist on a good event counter while assembling, except for
1889  		 * spares (which don't need an event count).
1890  		 * Similar to mdadm, we allow event counter difference of 1
1891  		 * from the freshest device.
1892  		 */
1893  		if (rdev->desc_nr >= 0 &&
1894  		    rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1895  		    (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
1896  		     le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
1897  			if (ev1 + 1 < mddev->events)
1898  				return -EINVAL;
1899  	} else if (mddev->bitmap) {
1900  		/* If adding to array with a bitmap, then we can accept an
1901  		 * older device, but not too old.
1902  		 */
1903  		if (ev1 < mddev->bitmap->events_cleared)
1904  			return 0;
1905  		if (ev1 < mddev->events)
1906  			set_bit(Bitmap_sync, &rdev->flags);
1907  	} else {
1908  		if (ev1 < mddev->events)
1909  			/* just a hot-add of a new device, leave raid_disk at -1 */
1910  			return 0;
1911  	}
1912  	if (mddev->level != LEVEL_MULTIPATH) {
1913  		int role;
1914  		if (rdev->desc_nr < 0 ||
1915  		    rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
1916  			role = MD_DISK_ROLE_SPARE;
1917  			rdev->desc_nr = -1;
1918  		} else if (mddev->pers == NULL && freshest && ev1 < mddev->events) {
1919  			/*
1920  			 * If we are assembling, and our event counter is smaller than the
1921  			 * highest event counter, we cannot trust our superblock about the role.
1922  			 * It could happen that our rdev was marked as Faulty, and all other
1923  			 * superblocks were updated with +1 event counter.
1924  			 * Then, before the next superblock update, which typically happens when
1925  			 * remove_and_add_spares() removes the device from the array, there was
1926  			 * a crash or reboot.
1927  			 * If we allow current rdev without consulting the freshest superblock,
1928  			 * we could cause data corruption.
1929  			 * Note that in this case our event counter is smaller by 1 than the
1930  			 * highest, otherwise, this rdev would not be allowed into array;
1931  			 * both kernel and mdadm allow event counter difference of 1.
1932  			 */
1933  			struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page);
1934  			u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev);
1935  
1936  			if (rdev->desc_nr >= freshest_max_dev) {
1937  				/* this is unexpected, better not proceed */
1938  				pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n",
1939  						mdname(mddev), rdev->bdev, rdev->desc_nr,
1940  						freshest->bdev, freshest_max_dev);
1941  				return -EUCLEAN;
1942  			}
1943  
1944  			role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]);
1945  			pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n",
1946  				     mdname(mddev), rdev->bdev, role, role, freshest->bdev);
1947  		} else {
1948  			role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1949  		}
1950  		switch(role) {
1951  		case MD_DISK_ROLE_SPARE: /* spare */
1952  			break;
1953  		case MD_DISK_ROLE_FAULTY: /* faulty */
1954  			set_bit(Faulty, &rdev->flags);
1955  			break;
1956  		case MD_DISK_ROLE_JOURNAL: /* journal device */
1957  			if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
1958  				/* journal device without journal feature */
1959  				pr_warn("md: journal device provided without journal feature, ignoring the device\n");
1960  				return -EINVAL;
1961  			}
1962  			set_bit(Journal, &rdev->flags);
1963  			rdev->journal_tail = le64_to_cpu(sb->journal_tail);
1964  			rdev->raid_disk = 0;
1965  			break;
1966  		default:
1967  			rdev->saved_raid_disk = role;
1968  			if ((le32_to_cpu(sb->feature_map) &
1969  			     MD_FEATURE_RECOVERY_OFFSET)) {
1970  				rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1971  				if (!(le32_to_cpu(sb->feature_map) &
1972  				      MD_FEATURE_RECOVERY_BITMAP))
1973  					rdev->saved_raid_disk = -1;
1974  			} else {
1975  				/*
1976  				 * If the array is FROZEN, then the device can't
1977  				 * be in_sync with rest of array.
1978  				 */
1979  				if (!test_bit(MD_RECOVERY_FROZEN,
1980  					      &mddev->recovery))
1981  					set_bit(In_sync, &rdev->flags);
1982  			}
1983  			rdev->raid_disk = role;
1984  			break;
1985  		}
1986  		if (sb->devflags & WriteMostly1)
1987  			set_bit(WriteMostly, &rdev->flags);
1988  		if (sb->devflags & FailFast1)
1989  			set_bit(FailFast, &rdev->flags);
1990  		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
1991  			set_bit(Replacement, &rdev->flags);
1992  	} else /* MULTIPATH are always insync */
1993  		set_bit(In_sync, &rdev->flags);
1994  
1995  	return 0;
1996  }
1997  
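/*
 * Refresh the in-memory v1.x superblock in rdev->sb_page so that it reflects
 * the current state of @mddev and @rdev (feature flags, reshape/recovery
 * state, device roles and, when space is reserved, the bad-block list), then
 * recompute sb_csum.
 */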
1998  static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
1999  {
2000  	struct mdp_superblock_1 *sb;
2001  	struct md_rdev *rdev2;
2002  	int max_dev, i;
2003  	/* make rdev->sb match mddev and rdev data. */
2004  
2005  	sb = page_address(rdev->sb_page);
2006  
2007  	sb->feature_map = 0;
2008  	sb->pad0 = 0;
2009  	sb->recovery_offset = cpu_to_le64(0);
2010  	memset(sb->pad3, 0, sizeof(sb->pad3));
2011  
2012  	sb->utime = cpu_to_le64((__u64)mddev->utime);
2013  	sb->events = cpu_to_le64(mddev->events);
2014  	if (mddev->in_sync)
2015  		sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
2016  	else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
2017  		sb->resync_offset = cpu_to_le64(MaxSector);
2018  	else
2019  		sb->resync_offset = cpu_to_le64(0);
2020  
2021  	sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
2022  
2023  	sb->raid_disks = cpu_to_le32(mddev->raid_disks);
2024  	sb->size = cpu_to_le64(mddev->dev_sectors);
2025  	sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
2026  	sb->level = cpu_to_le32(mddev->level);
2027  	sb->layout = cpu_to_le32(mddev->layout);
2028  	if (test_bit(FailFast, &rdev->flags))
2029  		sb->devflags |= FailFast1;
2030  	else
2031  		sb->devflags &= ~FailFast1;
2032  
2033  	if (test_bit(WriteMostly, &rdev->flags))
2034  		sb->devflags |= WriteMostly1;
2035  	else
2036  		sb->devflags &= ~WriteMostly1;
2037  	sb->data_offset = cpu_to_le64(rdev->data_offset);
2038  	sb->data_size = cpu_to_le64(rdev->sectors);
2039  
2040  	if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
2041  		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
2042  		sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
2043  	}
2044  
2045  	if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) &&
2046  	    !test_bit(In_sync, &rdev->flags)) {
2047  		sb->feature_map |=
2048  			cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
2049  		sb->recovery_offset =
2050  			cpu_to_le64(rdev->recovery_offset);
2051  		if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
2052  			sb->feature_map |=
2053  				cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
2054  	}
2055  	/* Note: recovery_offset and journal_tail share space  */
2056  	if (test_bit(Journal, &rdev->flags))
2057  		sb->journal_tail = cpu_to_le64(rdev->journal_tail);
2058  	if (test_bit(Replacement, &rdev->flags))
2059  		sb->feature_map |=
2060  			cpu_to_le32(MD_FEATURE_REPLACEMENT);
2061  
2062  	if (mddev->reshape_position != MaxSector) {
2063  		sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
2064  		sb->reshape_position = cpu_to_le64(mddev->reshape_position);
2065  		sb->new_layout = cpu_to_le32(mddev->new_layout);
2066  		sb->delta_disks = cpu_to_le32(mddev->delta_disks);
2067  		sb->new_level = cpu_to_le32(mddev->new_level);
2068  		sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
2069  		if (mddev->delta_disks == 0 &&
2070  		    mddev->reshape_backwards)
2071  			sb->feature_map
2072  				|= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
2073  		if (rdev->new_data_offset != rdev->data_offset) {
2074  			sb->feature_map
2075  				|= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
2076  			sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
2077  							     - rdev->data_offset));
2078  		}
2079  	}
2080  
2081  	if (mddev_is_clustered(mddev))
2082  		sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED);
2083  
2084  	if (rdev->badblocks.count == 0)
2085  		/* Nothing to do for bad blocks*/ ;
2086  	else if (sb->bblog_offset == 0)
2087  		/* Cannot record bad blocks on this device */
2088  		md_error(mddev, rdev);
2089  	else {
2090  		struct badblocks *bb = &rdev->badblocks;
2091  		__le64 *bbp = (__le64 *)page_address(rdev->bb_page);
2092  		u64 *p = bb->page;
2093  		sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
2094  		if (bb->changed) {
2095  			unsigned seq;
2096  
2097  retry:
2098  			seq = read_seqbegin(&bb->lock);
2099  
2100  			memset(bbp, 0xff, PAGE_SIZE);
2101  
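			/*
			 * Unused slots keep the all-ones pattern from the
			 * memset above; used slots pack the start sector in
			 * the upper 54 bits and the length in the low 10 bits.
			 */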
2102  			for (i = 0 ; i < bb->count ; i++) {
2103  				u64 internal_bb = p[i];
2104  				u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
2105  						| BB_LEN(internal_bb));
2106  				bbp[i] = cpu_to_le64(store_bb);
2107  			}
2108  			bb->changed = 0;
2109  			if (read_seqretry(&bb->lock, seq))
2110  				goto retry;
2111  
2112  			bb->sector = (rdev->sb_start +
2113  				      (int)le32_to_cpu(sb->bblog_offset));
2114  			bb->size = le16_to_cpu(sb->bblog_size);
2115  		}
2116  	}
2117  
2118  	max_dev = 0;
2119  	rdev_for_each(rdev2, mddev)
2120  		if (rdev2->desc_nr+1 > max_dev)
2121  			max_dev = rdev2->desc_nr+1;
2122  
2123  	if (max_dev > le32_to_cpu(sb->max_dev)) {
2124  		int bmask;
2125  		sb->max_dev = cpu_to_le32(max_dev);
2126  		rdev->sb_size = max_dev * 2 + 256;
2127  		bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
2128  		if (rdev->sb_size & bmask)
2129  			rdev->sb_size = (rdev->sb_size | bmask) + 1;
2130  	} else
2131  		max_dev = le32_to_cpu(sb->max_dev);
2132  
2133  	for (i=0; i<max_dev;i++)
2134  		sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
2135  
2136  	if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
2137  		sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);
2138  
2139  	if (test_bit(MD_HAS_PPL, &mddev->flags)) {
2140  		if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags))
2141  			sb->feature_map |=
2142  			    cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS);
2143  		else
2144  			sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL);
2145  		sb->ppl.offset = cpu_to_le16(rdev->ppl.offset);
2146  		sb->ppl.size = cpu_to_le16(rdev->ppl.size);
2147  	}
2148  
2149  	rdev_for_each(rdev2, mddev) {
2150  		i = rdev2->desc_nr;
2151  		if (test_bit(Faulty, &rdev2->flags))
2152  			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
2153  		else if (test_bit(In_sync, &rdev2->flags))
2154  			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
2155  		else if (test_bit(Journal, &rdev2->flags))
2156  			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL);
2157  		else if (rdev2->raid_disk >= 0)
2158  			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
2159  		else
2160  			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
2161  	}
2162  
2163  	sb->sb_csum = calc_sb_1_csum(sb);
2164  }
2165  
2166  static sector_t super_1_choose_bm_space(sector_t dev_size)
2167  {
2168  	sector_t bm_space;
2169  
2170  	/* if the device is bigger than 8Gig, save 64k for bitmap
2171  	 * usage, if bigger than 200Gig, save 128k
2172  	 */
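	/* all sizes here are in 512-byte sectors */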
2173  	if (dev_size < 64*2)
2174  		bm_space = 0;
2175  	else if (dev_size - 64*2 >= 200*1024*1024*2)
2176  		bm_space = 128*2;
2177  	else if (dev_size - 4*2 > 8*1024*1024*2)
2178  		bm_space = 64*2;
2179  	else
2180  		bm_space = 4*2;
2181  	return bm_space;
2182  }
2183  
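/*
 * Work out how many data sectors @rdev can provide when resized to
 * @num_sectors (0 means "as large as possible"), rewrite the superblock at
 * its (possibly relocated) position, and return the resulting size, or 0 if
 * the change is not possible.
 */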
2184  static unsigned long long
2185  super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
2186  {
2187  	struct mdp_superblock_1 *sb;
2188  	sector_t max_sectors;
2189  	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
2190  		return 0; /* component must fit device */
2191  	if (rdev->data_offset != rdev->new_data_offset)
2192  		return 0; /* too confusing */
2193  	if (rdev->sb_start < rdev->data_offset) {
2194  		/* minor versions 1 and 2; superblock before data */
2195  		max_sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset;
2196  		if (!num_sectors || num_sectors > max_sectors)
2197  			num_sectors = max_sectors;
2198  	} else if (rdev->mddev->bitmap_info.offset) {
2199  		/* minor version 0 with bitmap we can't move */
2200  		return 0;
2201  	} else {
2202  		/* minor version 0; superblock after data */
2203  		sector_t sb_start, bm_space;
2204  		sector_t dev_size = bdev_nr_sectors(rdev->bdev);
2205  
2206  		/* 8K is for superblock */
2207  		sb_start = dev_size - 8*2;
2208  		sb_start &= ~(sector_t)(4*2 - 1);
2209  
2210  		bm_space = super_1_choose_bm_space(dev_size);
2211  
2212  		/* Space that can be used to store data must exclude the
2213  		 * superblock, the bitmap space and the bad-block space (4K).
2214  		 */
2215  		max_sectors = sb_start - bm_space - 4*2;
2216  
2217  		if (!num_sectors || num_sectors > max_sectors)
2218  			num_sectors = max_sectors;
2219  		rdev->sb_start = sb_start;
2220  	}
2221  	sb = page_address(rdev->sb_page);
2222  	sb->data_size = cpu_to_le64(num_sectors);
2223  	sb->super_offset = cpu_to_le64(rdev->sb_start);
2224  	sb->sb_csum = calc_sb_1_csum(sb);
2225  	do {
2226  		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
2227  			       rdev->sb_page);
2228  	} while (md_super_wait(rdev->mddev) < 0);
2229  	return num_sectors;
2230  
2231  }
2232  
2233  static int
2234  super_1_allow_new_offset(struct md_rdev *rdev,
2235  			 unsigned long long new_offset)
2236  {
2237  	/* All necessary checks on new >= old have been done */
2238  	struct bitmap *bitmap;
2239  	if (new_offset >= rdev->data_offset)
2240  		return 1;
2241  
2242  	/* with 1.0 metadata, there is no metadata to tread on
2243  	 * so we can always move back */
2244  	if (rdev->mddev->minor_version == 0)
2245  		return 1;
2246  
2247  	/* otherwise we must be sure not to step on
2248  	 * any metadata, so stay:
2249  	 * 36K beyond start of superblock
2250  	 * beyond end of badblocks
2251  	 * beyond write-intent bitmap
2252  	 */
2253  	if (rdev->sb_start + (32+4)*2 > new_offset)
2254  		return 0;
2255  	bitmap = rdev->mddev->bitmap;
2256  	if (bitmap && !rdev->mddev->bitmap_info.file &&
2257  	    rdev->sb_start + rdev->mddev->bitmap_info.offset +
2258  	    bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset)
2259  		return 0;
2260  	if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
2261  		return 0;
2262  
2263  	return 1;
2264  }
2265  
2266  static struct super_type super_types[] = {
2267  	[0] = {
2268  		.name	= "0.90.0",
2269  		.owner	= THIS_MODULE,
2270  		.load_super	    = super_90_load,
2271  		.validate_super	    = super_90_validate,
2272  		.sync_super	    = super_90_sync,
2273  		.rdev_size_change   = super_90_rdev_size_change,
2274  		.allow_new_offset   = super_90_allow_new_offset,
2275  	},
2276  	[1] = {
2277  		.name	= "md-1",
2278  		.owner	= THIS_MODULE,
2279  		.load_super	    = super_1_load,
2280  		.validate_super	    = super_1_validate,
2281  		.sync_super	    = super_1_sync,
2282  		.rdev_size_change   = super_1_rdev_size_change,
2283  		.allow_new_offset   = super_1_allow_new_offset,
2284  	},
2285  };
2286  
2287  static void sync_super(struct mddev *mddev, struct md_rdev *rdev)
2288  {
2289  	if (mddev->sync_super) {
2290  		mddev->sync_super(mddev, rdev);
2291  		return;
2292  	}
2293  
2294  	BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types));
2295  
2296  	super_types[mddev->major_version].sync_super(mddev, rdev);
2297  }
2298  
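/*
 * Return 1 if @mddev1 and @mddev2 share a physical disk among their active
 * (non-faulty, non-journal) members, 0 otherwise.
 */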
2299  static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
2300  {
2301  	struct md_rdev *rdev, *rdev2;
2302  
2303  	rcu_read_lock();
2304  	rdev_for_each_rcu(rdev, mddev1) {
2305  		if (test_bit(Faulty, &rdev->flags) ||
2306  		    test_bit(Journal, &rdev->flags) ||
2307  		    rdev->raid_disk == -1)
2308  			continue;
2309  		rdev_for_each_rcu(rdev2, mddev2) {
2310  			if (test_bit(Faulty, &rdev2->flags) ||
2311  			    test_bit(Journal, &rdev2->flags) ||
2312  			    rdev2->raid_disk == -1)
2313  				continue;
2314  			if (rdev->bdev->bd_disk == rdev2->bdev->bd_disk) {
2315  				rcu_read_unlock();
2316  				return 1;
2317  			}
2318  		}
2319  	}
2320  	rcu_read_unlock();
2321  	return 0;
2322  }
2323  
2324  static LIST_HEAD(pending_raid_disks);
2325  
2326  /*
2327   * Try to register data integrity profile for an mddev
2328   *
2329   * This is called when an array is started and after a disk has been kicked
2330   * from the array. It only succeeds if all working and active component devices
2331   * are integrity capable with matching profiles.
2332   */
2333  int md_integrity_register(struct mddev *mddev)
2334  {
2335  	struct md_rdev *rdev, *reference = NULL;
2336  
2337  	if (list_empty(&mddev->disks))
2338  		return 0; /* nothing to do */
2339  	if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
2340  		return 0; /* shouldn't register, or already is */
2341  	rdev_for_each(rdev, mddev) {
2342  		/* skip spares and non-functional disks */
2343  		if (test_bit(Faulty, &rdev->flags))
2344  			continue;
2345  		if (rdev->raid_disk < 0)
2346  			continue;
2347  		if (!reference) {
2348  			/* Use the first rdev as the reference */
2349  			reference = rdev;
2350  			continue;
2351  		}
2352  		/* does this rdev's profile match the reference profile? */
2353  		if (blk_integrity_compare(reference->bdev->bd_disk,
2354  				rdev->bdev->bd_disk) < 0)
2355  			return -EINVAL;
2356  	}
2357  	if (!reference || !bdev_get_integrity(reference->bdev))
2358  		return 0;
2359  	/*
2360  	 * All component devices are integrity capable and have matching
2361  	 * profiles, register the common profile for the md device.
2362  	 */
2363  	blk_integrity_register(mddev->gendisk,
2364  			       bdev_get_integrity(reference->bdev));
2365  
2366  	pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
2367  	if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) ||
2368  	    (mddev->level != 1 && mddev->level != 10 &&
2369  	     bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) {
2370  		/*
2371  		 * No need to handle the failure of bioset_integrity_create,
2372  		 * because the function is called by md_run() -> pers->run(),
2373  		 * and md_run() calls bioset_exit() -> bioset_integrity_free()
2374  		 * in the failure case.
2375  		 */
2376  		pr_err("md: failed to create integrity pool for %s\n",
2377  		       mdname(mddev));
2378  		return -EINVAL;
2379  	}
2380  	return 0;
2381  }
2382  EXPORT_SYMBOL(md_integrity_register);
2383  
2384  /*
2385   * Attempt to add an rdev, but only if it is consistent with the current
2386   * integrity profile
2387   */
2388  int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
2389  {
2390  	struct blk_integrity *bi_mddev;
2391  
2392  	if (!mddev->gendisk)
2393  		return 0;
2394  
2395  	bi_mddev = blk_get_integrity(mddev->gendisk);
2396  
2397  	if (!bi_mddev) /* nothing to do */
2398  		return 0;
2399  
2400  	if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) {
2401  		pr_err("%s: incompatible integrity profile for %pg\n",
2402  		       mdname(mddev), rdev->bdev);
2403  		return -ENXIO;
2404  	}
2405  
2406  	return 0;
2407  }
2408  EXPORT_SYMBOL(md_integrity_add_rdev);
2409  
2410  static bool rdev_read_only(struct md_rdev *rdev)
2411  {
2412  	return bdev_read_only(rdev->bdev) ||
2413  		(rdev->meta_bdev && bdev_read_only(rdev->meta_bdev));
2414  }
2415  
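/*
 * Attach @rdev to @mddev: reject duplicates and read-only devices on a
 * running array, pick or verify a unique desc_nr, register the rdev's
 * kobject and sysfs links, and add it to the array's device list.
 */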
2416  static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
2417  {
2418  	char b[BDEVNAME_SIZE];
2419  	int err;
2420  
2421  	/* prevent duplicates */
2422  	if (find_rdev(mddev, rdev->bdev->bd_dev))
2423  		return -EEXIST;
2424  
2425  	if (rdev_read_only(rdev) && mddev->pers)
2426  		return -EROFS;
2427  
2428  	/* make sure rdev->sectors exceeds mddev->dev_sectors */
2429  	if (!test_bit(Journal, &rdev->flags) &&
2430  	    rdev->sectors &&
2431  	    (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) {
2432  		if (mddev->pers) {
2433  			/* Cannot change size, so fail
2434  			 * If mddev->level <= 0, then we don't care
2435  			 * about aligning sizes (e.g. linear)
2436  			 */
2437  			if (mddev->level > 0)
2438  				return -ENOSPC;
2439  		} else
2440  			mddev->dev_sectors = rdev->sectors;
2441  	}
2442  
2443  	/* Verify rdev->desc_nr is unique.
2444  	 * If it is -1, assign a free number, else
2445  	 * check number is not in use
2446  	 */
2447  	rcu_read_lock();
2448  	if (rdev->desc_nr < 0) {
2449  		int choice = 0;
2450  		if (mddev->pers)
2451  			choice = mddev->raid_disks;
2452  		while (md_find_rdev_nr_rcu(mddev, choice))
2453  			choice++;
2454  		rdev->desc_nr = choice;
2455  	} else {
2456  		if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) {
2457  			rcu_read_unlock();
2458  			return -EBUSY;
2459  		}
2460  	}
2461  	rcu_read_unlock();
2462  	if (!test_bit(Journal, &rdev->flags) &&
2463  	    mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
2464  		pr_warn("md: %s: array is limited to %d devices\n",
2465  			mdname(mddev), mddev->max_disks);
2466  		return -EBUSY;
2467  	}
2468  	snprintf(b, sizeof(b), "%pg", rdev->bdev);
2469  	strreplace(b, '/', '!');
2470  
2471  	rdev->mddev = mddev;
2472  	pr_debug("md: bind<%s>\n", b);
2473  
2474  	if (mddev->raid_disks)
2475  		mddev_create_serial_pool(mddev, rdev, false);
2476  
2477  	if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
2478  		goto fail;
2479  
2480  	/* failure here is OK */
2481  	err = sysfs_create_link(&rdev->kobj, bdev_kobj(rdev->bdev), "block");
2482  	rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
2483  	rdev->sysfs_unack_badblocks =
2484  		sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks");
2485  	rdev->sysfs_badblocks =
2486  		sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks");
2487  
2488  	list_add_rcu(&rdev->same_set, &mddev->disks);
2489  	bd_link_disk_holder(rdev->bdev, mddev->gendisk);
2490  
2491  	/* May as well allow recovery to be retried once */
2492  	mddev->recovery_disabled++;
2493  
2494  	return 0;
2495  
2496   fail:
2497  	pr_warn("md: failed to register dev-%s for %s\n",
2498  		b, mdname(mddev));
2499  	mddev_destroy_serial_pool(mddev, rdev, false);
2500  	return err;
2501  }
2502  
2503  void md_autodetect_dev(dev_t dev);
2504  
2505  /* just for claiming the bdev */
2506  static struct md_rdev claim_rdev;
2507  
2508  static void export_rdev(struct md_rdev *rdev, struct mddev *mddev)
2509  {
2510  	pr_debug("md: export_rdev(%pg)\n", rdev->bdev);
2511  	md_rdev_clear(rdev);
2512  #ifndef MODULE
2513  	if (test_bit(AutoDetected, &rdev->flags))
2514  		md_autodetect_dev(rdev->bdev->bd_dev);
2515  #endif
2516  	blkdev_put(rdev->bdev,
2517  		   test_bit(Holder, &rdev->flags) ? rdev : &claim_rdev);
2518  	rdev->bdev = NULL;
2519  	kobject_put(&rdev->kobj);
2520  }
2521  
2522  static void md_kick_rdev_from_array(struct md_rdev *rdev)
2523  {
2524  	struct mddev *mddev = rdev->mddev;
2525  
2526  	bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
2527  	list_del_rcu(&rdev->same_set);
2528  	pr_debug("md: unbind<%pg>\n", rdev->bdev);
2529  	mddev_destroy_serial_pool(rdev->mddev, rdev, false);
2530  	rdev->mddev = NULL;
2531  	sysfs_remove_link(&rdev->kobj, "block");
2532  	sysfs_put(rdev->sysfs_state);
2533  	sysfs_put(rdev->sysfs_unack_badblocks);
2534  	sysfs_put(rdev->sysfs_badblocks);
2535  	rdev->sysfs_state = NULL;
2536  	rdev->sysfs_unack_badblocks = NULL;
2537  	rdev->sysfs_badblocks = NULL;
2538  	rdev->badblocks.count = 0;
2539  
2540  	synchronize_rcu();
2541  
2542  	/*
2543  	 * kobject_del() will wait for all in-progress writers to be done, and
2544  	 * those writers hold reconfig_mutex, hence it can't be called under
2545  	 * reconfig_mutex and is delayed to mddev_unlock().
2546  	 */
2547  	list_add(&rdev->same_set, &mddev->deleting);
2548  }
2549  
2550  static void export_array(struct mddev *mddev)
2551  {
2552  	struct md_rdev *rdev;
2553  
2554  	while (!list_empty(&mddev->disks)) {
2555  		rdev = list_first_entry(&mddev->disks, struct md_rdev,
2556  					same_set);
2557  		md_kick_rdev_from_array(rdev);
2558  	}
2559  	mddev->raid_disks = 0;
2560  	mddev->major_version = 0;
2561  }
2562  
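/*
 * Try to mark the array clean: if no writes are pending, set ->in_sync and
 * flag the superblock as changed.  Called with ->lock held; the lock may be
 * dropped and re-taken while writes_pending is switched to atomic mode.
 * Returns the resulting ->in_sync value.
 */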
2563  static bool set_in_sync(struct mddev *mddev)
2564  {
2565  	lockdep_assert_held(&mddev->lock);
2566  	if (!mddev->in_sync) {
2567  		mddev->sync_checkers++;
2568  		spin_unlock(&mddev->lock);
2569  		percpu_ref_switch_to_atomic_sync(&mddev->writes_pending);
2570  		spin_lock(&mddev->lock);
2571  		if (!mddev->in_sync &&
2572  		    percpu_ref_is_zero(&mddev->writes_pending)) {
2573  			mddev->in_sync = 1;
2574  			/*
2575  			 * Ensure ->in_sync is visible before we clear
2576  			 * ->sync_checkers.
2577  			 */
2578  			smp_mb();
2579  			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
2580  			sysfs_notify_dirent_safe(mddev->sysfs_state);
2581  		}
2582  		if (--mddev->sync_checkers == 0)
2583  			percpu_ref_switch_to_percpu(&mddev->writes_pending);
2584  	}
2585  	if (mddev->safemode == 1)
2586  		mddev->safemode = 0;
2587  	return mddev->in_sync;
2588  }
2589  
2590  static void sync_sbs(struct mddev *mddev, int nospares)
2591  {
2592  	/* Update each superblock (in-memory image), but
2593  	 * if we are allowed to, skip spares which already
2594  	 * have the right event counter, or have one earlier
2595  	 * (which would mean they aren't being marked as dirty
2596  	 * with the rest of the array)
2597  	 */
2598  	struct md_rdev *rdev;
2599  	rdev_for_each(rdev, mddev) {
2600  		if (rdev->sb_events == mddev->events ||
2601  		    (nospares &&
2602  		     rdev->raid_disk < 0 &&
2603  		     rdev->sb_events+1 == mddev->events)) {
2604  			/* Don't update this superblock */
2605  			rdev->sb_loaded = 2;
2606  		} else {
2607  			sync_super(mddev, rdev);
2608  			rdev->sb_loaded = 1;
2609  		}
2610  	}
2611  }
2612  
2613  static bool does_sb_need_changing(struct mddev *mddev)
2614  {
2615  	struct md_rdev *rdev = NULL, *iter;
2616  	struct mdp_superblock_1 *sb;
2617  	int role;
2618  
2619  	/* Find a good rdev */
2620  	rdev_for_each(iter, mddev)
2621  		if ((iter->raid_disk >= 0) && !test_bit(Faulty, &iter->flags)) {
2622  			rdev = iter;
2623  			break;
2624  		}
2625  
2626  	/* No good device found. */
2627  	if (!rdev)
2628  		return false;
2629  
2630  	sb = page_address(rdev->sb_page);
2631  	/* Check if a device has become faulty or a spare become active */
2632  	rdev_for_each(rdev, mddev) {
2633  		role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
2634  		/* Device activated? */
2635  		if (role == MD_DISK_ROLE_SPARE && rdev->raid_disk >= 0 &&
2636  		    !test_bit(Faulty, &rdev->flags))
2637  			return true;
2638  		/* Device turned faulty? */
2639  		if (test_bit(Faulty, &rdev->flags) && (role < MD_DISK_ROLE_MAX))
2640  			return true;
2641  	}
2642  
2643  	/* Check if any mddev parameters have changed */
2644  	if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
2645  	    (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
2646  	    (mddev->layout != le32_to_cpu(sb->layout)) ||
2647  	    (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
2648  	    (mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
2649  		return true;
2650  
2651  	return false;
2652  }
2653  
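/*
 * md_update_sb() - write the in-memory superblocks out to all member devices.
 *
 * Picks the next event count (or rolls it back for a pure clean<->dirty
 * transition), refreshes each device's superblock via sync_sbs() and writes
 * it out together with the bitmap superblock, retrying until the writes
 * succeed.  Arrays without persistent metadata only have their sb_flags
 * updated.
 */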
2654  void md_update_sb(struct mddev *mddev, int force_change)
2655  {
2656  	struct md_rdev *rdev;
2657  	int sync_req;
2658  	int nospares = 0;
2659  	int any_badblocks_changed = 0;
2660  	int ret = -1;
2661  
2662  	if (!md_is_rdwr(mddev)) {
2663  		if (force_change)
2664  			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2665  		return;
2666  	}
2667  
2668  repeat:
2669  	if (mddev_is_clustered(mddev)) {
2670  		if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
2671  			force_change = 1;
2672  		if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
2673  			nospares = 1;
2674  		ret = md_cluster_ops->metadata_update_start(mddev);
2675  		/* Has someone else updated the sb? */
2676  		if (!does_sb_need_changing(mddev)) {
2677  			if (ret == 0)
2678  				md_cluster_ops->metadata_update_cancel(mddev);
2679  			bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
2680  							 BIT(MD_SB_CHANGE_DEVS) |
2681  							 BIT(MD_SB_CHANGE_CLEAN));
2682  			return;
2683  		}
2684  	}
2685  
2686  	/*
2687  	 * First make sure individual recovery_offsets are correct
2688  	 * curr_resync_completed can only be used during recovery.
2689  	 * During reshape/resync it might use array-addresses rather
2690  	 * than device addresses.
2691  	 */
2692  	rdev_for_each(rdev, mddev) {
2693  		if (rdev->raid_disk >= 0 &&
2694  		    mddev->delta_disks >= 0 &&
2695  		    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
2696  		    test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
2697  		    !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
2698  		    !test_bit(Journal, &rdev->flags) &&
2699  		    !test_bit(In_sync, &rdev->flags) &&
2700  		    mddev->curr_resync_completed > rdev->recovery_offset)
2701  				rdev->recovery_offset = mddev->curr_resync_completed;
2702  
2703  	}
2704  	if (!mddev->persistent) {
2705  		clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
2706  		clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2707  		if (!mddev->external) {
2708  			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
2709  			rdev_for_each(rdev, mddev) {
2710  				if (rdev->badblocks.changed) {
2711  					rdev->badblocks.changed = 0;
2712  					ack_all_badblocks(&rdev->badblocks);
2713  					md_error(mddev, rdev);
2714  				}
2715  				clear_bit(Blocked, &rdev->flags);
2716  				clear_bit(BlockedBadBlocks, &rdev->flags);
2717  				wake_up(&rdev->blocked_wait);
2718  			}
2719  		}
2720  		wake_up(&mddev->sb_wait);
2721  		return;
2722  	}
2723  
2724  	spin_lock(&mddev->lock);
2725  
2726  	mddev->utime = ktime_get_real_seconds();
2727  
2728  	if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
2729  		force_change = 1;
2730  	if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
2731  		/* just a clean <-> dirty transition; possibly leave spares alone,
2732  		 * though if events isn't the right even/odd, we will have to do
2733  		 * spares after all
2734  		 */
2735  		nospares = 1;
2736  	if (force_change)
2737  		nospares = 0;
2738  	if (mddev->degraded)
2739  		/* If the array is degraded, then skipping spares is both
2740  		 * dangerous and fairly pointless.
2741  		 * Dangerous because a device that was removed from the array
2742  		 * might have an event_count that still looks up-to-date,
2743  		 * so it can be re-added without a resync.
2744  		 * Pointless because if there are any spares to skip,
2745  		 * then a recovery will happen and soon that array won't
2746  		 * be degraded any more and the spare can go back to sleep then.
2747  		 */
2748  		nospares = 0;
2749  
2750  	sync_req = mddev->in_sync;
2751  
2752  	/* If this is just a dirty<->clean transition, and the array is clean
2753  	 * and 'events' is odd, we can roll back to the previous clean state */
2754  	if (nospares
2755  	    && (mddev->in_sync && mddev->recovery_cp == MaxSector)
2756  	    && mddev->can_decrease_events
2757  	    && mddev->events != 1) {
2758  		mddev->events--;
2759  		mddev->can_decrease_events = 0;
2760  	} else {
2761  		/* otherwise we have to go forward and ... */
2762  		mddev->events ++;
2763  		mddev->can_decrease_events = nospares;
2764  	}
2765  
2766  	/*
2767  	 * This 64-bit counter should never wrap.
2768  	 * Either we are in around ~1 trillion A.C., assuming
2769  	 * 1 reboot per second, or we have a bug...
2770  	 */
2771  	WARN_ON(mddev->events == 0);
2772  
2773  	rdev_for_each(rdev, mddev) {
2774  		if (rdev->badblocks.changed)
2775  			any_badblocks_changed++;
2776  		if (test_bit(Faulty, &rdev->flags))
2777  			set_bit(FaultRecorded, &rdev->flags);
2778  	}
2779  
2780  	sync_sbs(mddev, nospares);
2781  	spin_unlock(&mddev->lock);
2782  
2783  	pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
2784  		 mdname(mddev), mddev->in_sync);
2785  
2786  	if (mddev->queue)
2787  		blk_add_trace_msg(mddev->queue, "md md_update_sb");
2788  rewrite:
2789  	md_bitmap_update_sb(mddev->bitmap);
2790  	rdev_for_each(rdev, mddev) {
2791  		if (rdev->sb_loaded != 1)
2792  			continue; /* no noise on spare devices */
2793  
2794  		if (!test_bit(Faulty, &rdev->flags)) {
2795  			md_super_write(mddev,rdev,
2796  				       rdev->sb_start, rdev->sb_size,
2797  				       rdev->sb_page);
2798  			pr_debug("md: (write) %pg's sb offset: %llu\n",
2799  				 rdev->bdev,
2800  				 (unsigned long long)rdev->sb_start);
2801  			rdev->sb_events = mddev->events;
2802  			if (rdev->badblocks.size) {
2803  				md_super_write(mddev, rdev,
2804  					       rdev->badblocks.sector,
2805  					       rdev->badblocks.size << 9,
2806  					       rdev->bb_page);
2807  				rdev->badblocks.size = 0;
2808  			}
2809  
2810  		} else
2811  			pr_debug("md: %pg (skipping faulty)\n",
2812  				 rdev->bdev);
2813  
2814  		if (mddev->level == LEVEL_MULTIPATH)
2815  			/* only need to write one superblock... */
2816  			break;
2817  	}
2818  	if (md_super_wait(mddev) < 0)
2819  		goto rewrite;
2820  	/* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */
2821  
2822  	if (mddev_is_clustered(mddev) && ret == 0)
2823  		md_cluster_ops->metadata_update_finish(mddev);
2824  
2825  	if (mddev->in_sync != sync_req ||
2826  	    !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
2827  			       BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)))
2828  		/* have to write it out again */
2829  		goto repeat;
2830  	wake_up(&mddev->sb_wait);
2831  	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2832  		sysfs_notify_dirent_safe(mddev->sysfs_completed);
2833  
2834  	rdev_for_each(rdev, mddev) {
2835  		if (test_and_clear_bit(FaultRecorded, &rdev->flags))
2836  			clear_bit(Blocked, &rdev->flags);
2837  
2838  		if (any_badblocks_changed)
2839  			ack_all_badblocks(&rdev->badblocks);
2840  		clear_bit(BlockedBadBlocks, &rdev->flags);
2841  		wake_up(&rdev->blocked_wait);
2842  	}
2843  }
2844  EXPORT_SYMBOL(md_update_sb);
2845  
2846  static int add_bound_rdev(struct md_rdev *rdev)
2847  {
2848  	struct mddev *mddev = rdev->mddev;
2849  	int err = 0;
2850  	bool add_journal = test_bit(Journal, &rdev->flags);
2851  
2852  	if (!mddev->pers->hot_remove_disk || add_journal) {
2853  		/* If there is hot_add_disk but no hot_remove_disk
2854  		 * then added disks are for geometry changes,
2855  		 * and should be added immediately.
2856  		 */
2857  		super_types[mddev->major_version].
2858  			validate_super(mddev, NULL/*freshest*/, rdev);
2859  		if (add_journal)
2860  			mddev_suspend(mddev);
2861  		err = mddev->pers->hot_add_disk(mddev, rdev);
2862  		if (add_journal)
2863  			mddev_resume(mddev);
2864  		if (err) {
2865  			md_kick_rdev_from_array(rdev);
2866  			return err;
2867  		}
2868  	}
2869  	sysfs_notify_dirent_safe(rdev->sysfs_state);
2870  
2871  	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2872  	if (mddev->degraded)
2873  		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
2874  	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2875  	md_new_event();
2876  	md_wakeup_thread(mddev->thread);
2877  	return 0;
2878  }
2879  
2880  /* words written to sysfs files may, or may not, be \n terminated.
2881   * We want to accept either case. For this we use cmd_match.
2882   */
2883  static int cmd_match(const char *cmd, const char *str)
2884  {
2885  	/* See if cmd, written into a sysfs file, matches
2886  	 * str.  They must either be the same, or cmd can
2887  	 * have a trailing newline
2888  	 */
2889  	while (*cmd && *str && *cmd == *str) {
2890  		cmd++;
2891  		str++;
2892  	}
2893  	if (*cmd == '\n')
2894  		cmd++;
2895  	if (*str || *cmd)
2896  		return 0;
2897  	return 1;
2898  }
2899  
2900  struct rdev_sysfs_entry {
2901  	struct attribute attr;
2902  	ssize_t (*show)(struct md_rdev *, char *);
2903  	ssize_t (*store)(struct md_rdev *, const char *, size_t);
2904  };
2905  
2906  static ssize_t
2907  state_show(struct md_rdev *rdev, char *page)
2908  {
2909  	char *sep = ",";
2910  	size_t len = 0;
2911  	unsigned long flags = READ_ONCE(rdev->flags);
2912  
2913  	if (test_bit(Faulty, &flags) ||
2914  	    (!test_bit(ExternalBbl, &flags) &&
2915  	    rdev->badblocks.unacked_exist))
2916  		len += sprintf(page+len, "faulty%s", sep);
2917  	if (test_bit(In_sync, &flags))
2918  		len += sprintf(page+len, "in_sync%s", sep);
2919  	if (test_bit(Journal, &flags))
2920  		len += sprintf(page+len, "journal%s", sep);
2921  	if (test_bit(WriteMostly, &flags))
2922  		len += sprintf(page+len, "write_mostly%s", sep);
2923  	if (test_bit(Blocked, &flags) ||
2924  	    (rdev->badblocks.unacked_exist
2925  	     && !test_bit(Faulty, &flags)))
2926  		len += sprintf(page+len, "blocked%s", sep);
2927  	if (!test_bit(Faulty, &flags) &&
2928  	    !test_bit(Journal, &flags) &&
2929  	    !test_bit(In_sync, &flags))
2930  		len += sprintf(page+len, "spare%s", sep);
2931  	if (test_bit(WriteErrorSeen, &flags))
2932  		len += sprintf(page+len, "write_error%s", sep);
2933  	if (test_bit(WantReplacement, &flags))
2934  		len += sprintf(page+len, "want_replacement%s", sep);
2935  	if (test_bit(Replacement, &flags))
2936  		len += sprintf(page+len, "replacement%s", sep);
2937  	if (test_bit(ExternalBbl, &flags))
2938  		len += sprintf(page+len, "external_bbl%s", sep);
2939  	if (test_bit(FailFast, &flags))
2940  		len += sprintf(page+len, "failfast%s", sep);
2941  
2942  	if (len)
2943  		len -= strlen(sep);
2944  
2945  	return len+sprintf(page+len, "\n");
2946  }
2947  
2948  static ssize_t
2949  state_store(struct md_rdev *rdev, const char *buf, size_t len)
2950  {
2951  	/* can write
2952  	 *  faulty  - simulates an error
2953  	 *  remove  - disconnects the device
2954  	 *  writemostly - sets write_mostly
2955  	 *  -writemostly - clears write_mostly
2956  	 *  blocked - sets the Blocked flags
2957  	 *  -blocked - clears the Blocked and possibly simulates an error
2958  	 *  insync - sets Insync providing device isn't active
2959  	 *  -insync - clear Insync for a device with a slot assigned,
2960  	 *            so that it gets rebuilt based on bitmap
2961  	 *  write_error - sets WriteErrorSeen
2962  	 *  -write_error - clears WriteErrorSeen
2963  	 *  {,-}failfast - set/clear FailFast
2964  	 */
2965  
2966  	struct mddev *mddev = rdev->mddev;
2967  	int err = -EINVAL;
2968  	bool need_update_sb = false;
2969  
2970  	if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
2971  		md_error(rdev->mddev, rdev);
2972  
2973  		if (test_bit(MD_BROKEN, &rdev->mddev->flags))
2974  			err = -EBUSY;
2975  		else
2976  			err = 0;
2977  	} else if (cmd_match(buf, "remove")) {
2978  		if (rdev->mddev->pers) {
2979  			clear_bit(Blocked, &rdev->flags);
2980  			remove_and_add_spares(rdev->mddev, rdev);
2981  		}
2982  		if (rdev->raid_disk >= 0)
2983  			err = -EBUSY;
2984  		else {
2985  			err = 0;
2986  			if (mddev_is_clustered(mddev))
2987  				err = md_cluster_ops->remove_disk(mddev, rdev);
2988  
2989  			if (err == 0) {
2990  				md_kick_rdev_from_array(rdev);
2991  				if (mddev->pers) {
2992  					set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2993  					md_wakeup_thread(mddev->thread);
2994  				}
2995  				md_new_event();
2996  			}
2997  		}
2998  	} else if (cmd_match(buf, "writemostly")) {
2999  		set_bit(WriteMostly, &rdev->flags);
3000  		mddev_create_serial_pool(rdev->mddev, rdev, false);
3001  		need_update_sb = true;
3002  		err = 0;
3003  	} else if (cmd_match(buf, "-writemostly")) {
3004  		mddev_destroy_serial_pool(rdev->mddev, rdev, false);
3005  		clear_bit(WriteMostly, &rdev->flags);
3006  		need_update_sb = true;
3007  		err = 0;
3008  	} else if (cmd_match(buf, "blocked")) {
3009  		set_bit(Blocked, &rdev->flags);
3010  		err = 0;
3011  	} else if (cmd_match(buf, "-blocked")) {
3012  		if (!test_bit(Faulty, &rdev->flags) &&
3013  		    !test_bit(ExternalBbl, &rdev->flags) &&
3014  		    rdev->badblocks.unacked_exist) {
3015  			/* metadata handler doesn't understand badblocks,
3016  			 * so we need to fail the device
3017  			 */
3018  			md_error(rdev->mddev, rdev);
3019  		}
3020  		clear_bit(Blocked, &rdev->flags);
3021  		clear_bit(BlockedBadBlocks, &rdev->flags);
3022  		wake_up(&rdev->blocked_wait);
3023  		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
3024  		md_wakeup_thread(rdev->mddev->thread);
3025  
3026  		err = 0;
3027  	} else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
3028  		set_bit(In_sync, &rdev->flags);
3029  		err = 0;
3030  	} else if (cmd_match(buf, "failfast")) {
3031  		set_bit(FailFast, &rdev->flags);
3032  		need_update_sb = true;
3033  		err = 0;
3034  	} else if (cmd_match(buf, "-failfast")) {
3035  		clear_bit(FailFast, &rdev->flags);
3036  		need_update_sb = true;
3037  		err = 0;
3038  	} else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 &&
3039  		   !test_bit(Journal, &rdev->flags)) {
3040  		if (rdev->mddev->pers == NULL) {
3041  			clear_bit(In_sync, &rdev->flags);
3042  			rdev->saved_raid_disk = rdev->raid_disk;
3043  			rdev->raid_disk = -1;
3044  			err = 0;
3045  		}
3046  	} else if (cmd_match(buf, "write_error")) {
3047  		set_bit(WriteErrorSeen, &rdev->flags);
3048  		err = 0;
3049  	} else if (cmd_match(buf, "-write_error")) {
3050  		clear_bit(WriteErrorSeen, &rdev->flags);
3051  		err = 0;
3052  	} else if (cmd_match(buf, "want_replacement")) {
3053  		/* Any non-spare device that is not a replacement can
3054  		 * become want_replacement at any time, but we then need to
3055  		 * check if recovery is needed.
3056  		 */
3057  		if (rdev->raid_disk >= 0 &&
3058  		    !test_bit(Journal, &rdev->flags) &&
3059  		    !test_bit(Replacement, &rdev->flags))
3060  			set_bit(WantReplacement, &rdev->flags);
3061  		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
3062  		md_wakeup_thread(rdev->mddev->thread);
3063  		err = 0;
3064  	} else if (cmd_match(buf, "-want_replacement")) {
3065  		/* Clearing 'want_replacement' is always allowed.
3066  		 * Once replacement starts it is too late though.
3067  		 */
3068  		err = 0;
3069  		clear_bit(WantReplacement, &rdev->flags);
3070  	} else if (cmd_match(buf, "replacement")) {
3071  		/* Can only set a device as a replacement when array has not
3072  		 * yet been started.  Once running, replacement is automatic
3073  		 * from spares, or by assigning 'slot'.
3074  		 */
3075  		if (rdev->mddev->pers)
3076  			err = -EBUSY;
3077  		else {
3078  			set_bit(Replacement, &rdev->flags);
3079  			err = 0;
3080  		}
3081  	} else if (cmd_match(buf, "-replacement")) {
3082  		/* Similarly, can only clear Replacement before start */
3083  		if (rdev->mddev->pers)
3084  			err = -EBUSY;
3085  		else {
3086  			clear_bit(Replacement, &rdev->flags);
3087  			err = 0;
3088  		}
3089  	} else if (cmd_match(buf, "re-add")) {
3090  		if (!rdev->mddev->pers)
3091  			err = -EINVAL;
3092  		else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) &&
3093  				rdev->saved_raid_disk >= 0) {
3094  			/* clear_bit is performed _after_ all the devices
3095  			 * have their local Faulty bit cleared. If any writes
3096  			 * happen in the meantime in the local node, they
3097  			 * will land in the local bitmap, which will be synced
3098  			 * by this node eventually
3099  			 */
3100  			if (!mddev_is_clustered(rdev->mddev) ||
3101  			    (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) {
3102  				clear_bit(Faulty, &rdev->flags);
3103  				err = add_bound_rdev(rdev);
3104  			}
3105  		} else
3106  			err = -EBUSY;
3107  	} else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) {
3108  		set_bit(ExternalBbl, &rdev->flags);
3109  		rdev->badblocks.shift = 0;
3110  		err = 0;
3111  	} else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) {
3112  		clear_bit(ExternalBbl, &rdev->flags);
3113  		err = 0;
3114  	}
3115  	if (need_update_sb)
3116  		md_update_sb(mddev, 1);
3117  	if (!err)
3118  		sysfs_notify_dirent_safe(rdev->sysfs_state);
3119  	return err ? err : len;
3120  }
3121  static struct rdev_sysfs_entry rdev_state =
3122  __ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store);
3123  
3124  static ssize_t
3125  errors_show(struct md_rdev *rdev, char *page)
3126  {
3127  	return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
3128  }
3129  
3130  static ssize_t
3131  errors_store(struct md_rdev *rdev, const char *buf, size_t len)
3132  {
3133  	unsigned int n;
3134  	int rv;
3135  
3136  	rv = kstrtouint(buf, 10, &n);
3137  	if (rv < 0)
3138  		return rv;
3139  	atomic_set(&rdev->corrected_errors, n);
3140  	return len;
3141  }
3142  static struct rdev_sysfs_entry rdev_errors =
3143  __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
3144  
3145  static ssize_t
3146  slot_show(struct md_rdev *rdev, char *page)
3147  {
3148  	if (test_bit(Journal, &rdev->flags))
3149  		return sprintf(page, "journal\n");
3150  	else if (rdev->raid_disk < 0)
3151  		return sprintf(page, "none\n");
3152  	else
3153  		return sprintf(page, "%d\n", rdev->raid_disk);
3154  }
3155  
3156  static ssize_t
3157  slot_store(struct md_rdev *rdev, const char *buf, size_t len)
3158  {
3159  	int slot;
3160  	int err;
3161  
3162  	if (test_bit(Journal, &rdev->flags))
3163  		return -EBUSY;
3164  	if (strncmp(buf, "none", 4)==0)
3165  		slot = -1;
3166  	else {
3167  		err = kstrtouint(buf, 10, (unsigned int *)&slot);
3168  		if (err < 0)
3169  			return err;
3170  		if (slot < 0)
3171  			/* overflow */
3172  			return -ENOSPC;
3173  	}
3174  	if (rdev->mddev->pers && slot == -1) {
3175  		/* Setting 'slot' on an active array requires also
3176  		 * updating the 'rd%d' link, and communicating
3177  		 * with the personality with ->hot_*_disk.
3178  		 * For now we only support removing
3179  		 * failed/spare devices.  This normally happens automatically,
3180  		 * but not when the metadata is externally managed.
3181  		 */
3182  		if (rdev->raid_disk == -1)
3183  			return -EEXIST;
3184  		/* personality does all needed checks */
3185  		if (rdev->mddev->pers->hot_remove_disk == NULL)
3186  			return -EINVAL;
3187  		clear_bit(Blocked, &rdev->flags);
3188  		remove_and_add_spares(rdev->mddev, rdev);
3189  		if (rdev->raid_disk >= 0)
3190  			return -EBUSY;
3191  		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
3192  		md_wakeup_thread(rdev->mddev->thread);
3193  	} else if (rdev->mddev->pers) {
3194  		/* Activating a spare .. or possibly reactivating
3195  		 * if we ever get bitmaps working here.
3196  		 */
3197  		int err;
3198  
3199  		if (rdev->raid_disk != -1)
3200  			return -EBUSY;
3201  
3202  		if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery))
3203  			return -EBUSY;
3204  
3205  		if (rdev->mddev->pers->hot_add_disk == NULL)
3206  			return -EINVAL;
3207  
3208  		if (slot >= rdev->mddev->raid_disks &&
3209  		    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
3210  			return -ENOSPC;
3211  
3212  		rdev->raid_disk = slot;
3213  		if (test_bit(In_sync, &rdev->flags))
3214  			rdev->saved_raid_disk = slot;
3215  		else
3216  			rdev->saved_raid_disk = -1;
3217  		clear_bit(In_sync, &rdev->flags);
3218  		clear_bit(Bitmap_sync, &rdev->flags);
3219  		err = rdev->mddev->pers->hot_add_disk(rdev->mddev, rdev);
3220  		if (err) {
3221  			rdev->raid_disk = -1;
3222  			return err;
3223  		} else
3224  			sysfs_notify_dirent_safe(rdev->sysfs_state);
3225  		/* failure here is OK */;
3226  		sysfs_link_rdev(rdev->mddev, rdev);
3227  		/* don't wakeup anyone, leave that to userspace. */
3228  	} else {
3229  		if (slot >= rdev->mddev->raid_disks &&
3230  		    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
3231  			return -ENOSPC;
3232  		rdev->raid_disk = slot;
3233  		/* assume it is working */
3234  		clear_bit(Faulty, &rdev->flags);
3235  		clear_bit(WriteMostly, &rdev->flags);
3236  		set_bit(In_sync, &rdev->flags);
3237  		sysfs_notify_dirent_safe(rdev->sysfs_state);
3238  	}
3239  	return len;
3240  }
3241  
3242  static struct rdev_sysfs_entry rdev_slot =
3243  __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
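/*
 * Illustrative usage of the 'slot' attribute (the sysfs path below is an
 * assumption for illustration, not taken from this file):
 *
 *   - reading /sys/block/md0/md/dev-sda/slot returns "none" for a spare,
 *     "journal" for a journal device, or the raid_disk number (e.g. "2");
 *   - on an active, externally managed array, writing "none" removes a
 *     failed/spare device via ->hot_remove_disk, and writing "3" hot-adds
 *     the device into slot 3 via ->hot_add_disk;
 *   - on an inactive array the value is simply recorded and the device is
 *     assumed to be in-sync.
 */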
3244  
3245  static ssize_t
3246  offset_show(struct md_rdev *rdev, char *page)
3247  {
3248  	return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
3249  }
3250  
3251  static ssize_t
3252  offset_store(struct md_rdev *rdev, const char *buf, size_t len)
3253  {
3254  	unsigned long long offset;
3255  	if (kstrtoull(buf, 10, &offset) < 0)
3256  		return -EINVAL;
3257  	if (rdev->mddev->pers && rdev->raid_disk >= 0)
3258  		return -EBUSY;
3259  	if (rdev->sectors && rdev->mddev->external)
3260  		/* Must set offset before size, so overlap checks
3261  		 * can be sane */
3262  		return -EBUSY;
3263  	rdev->data_offset = offset;
3264  	rdev->new_data_offset = offset;
3265  	return len;
3266  }
3267  
3268  static struct rdev_sysfs_entry rdev_offset =
3269  __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
3270  
3271  static ssize_t new_offset_show(struct md_rdev *rdev, char *page)
3272  {
3273  	return sprintf(page, "%llu\n",
3274  		       (unsigned long long)rdev->new_data_offset);
3275  }
3276  
3277  static ssize_t new_offset_store(struct md_rdev *rdev,
3278  				const char *buf, size_t len)
3279  {
3280  	unsigned long long new_offset;
3281  	struct mddev *mddev = rdev->mddev;
3282  
3283  	if (kstrtoull(buf, 10, &new_offset) < 0)
3284  		return -EINVAL;
3285  
3286  	if (mddev->sync_thread ||
3287  	    test_bit(MD_RECOVERY_RUNNING,&mddev->recovery))
3288  		return -EBUSY;
3289  	if (new_offset == rdev->data_offset)
3290  		/* reset is always permitted */
3291  		;
3292  	else if (new_offset > rdev->data_offset) {
3293  		/* must not push array size beyond rdev_sectors */
3294  		if (new_offset - rdev->data_offset
3295  		    + mddev->dev_sectors > rdev->sectors)
3296  				return -E2BIG;
3297  	}
3298  	/* Metadata worries about other space details. */
3299  
3300  	/* decreasing the offset is inconsistent with a backwards
3301  	 * reshape.
3302  	 */
3303  	if (new_offset < rdev->data_offset &&
3304  	    mddev->reshape_backwards)
3305  		return -EINVAL;
3306  	/* Increasing offset is inconsistent with forwards
3307  	 * reshape.  reshape_direction should be set to
3308  	 * 'backwards' first.
3309  	 */
3310  	if (new_offset > rdev->data_offset &&
3311  	    !mddev->reshape_backwards)
3312  		return -EINVAL;
3313  
3314  	if (mddev->pers && mddev->persistent &&
3315  	    !super_types[mddev->major_version]
3316  	    .allow_new_offset(rdev, new_offset))
3317  		return -E2BIG;
3318  	rdev->new_data_offset = new_offset;
3319  	if (new_offset > rdev->data_offset)
3320  		mddev->reshape_backwards = 1;
3321  	else if (new_offset < rdev->data_offset)
3322  		mddev->reshape_backwards = 0;
3323  
3324  	return len;
3325  }
3326  static struct rdev_sysfs_entry rdev_new_offset =
3327  __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);
3328  
3329  static ssize_t
3330  rdev_size_show(struct md_rdev *rdev, char *page)
3331  {
3332  	return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
3333  }
3334  
3335  static int md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b)
3336  {
3337  	/* check if two start/length pairs overlap */
3338  	if (a->data_offset + a->sectors <= b->data_offset)
3339  		return false;
3340  	if (b->data_offset + b->sectors <= a->data_offset)
3341  		return false;
3342  	return true;
3343  }
3344  
3345  static bool md_rdev_overlaps(struct md_rdev *rdev)
3346  {
3347  	struct mddev *mddev;
3348  	struct md_rdev *rdev2;
3349  
3350  	spin_lock(&all_mddevs_lock);
3351  	list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
3352  		if (test_bit(MD_DELETED, &mddev->flags))
3353  			continue;
3354  		rdev_for_each(rdev2, mddev) {
3355  			if (rdev != rdev2 && rdev->bdev == rdev2->bdev &&
3356  			    md_rdevs_overlap(rdev, rdev2)) {
3357  				spin_unlock(&all_mddevs_lock);
3358  				return true;
3359  			}
3360  		}
3361  	}
3362  	spin_unlock(&all_mddevs_lock);
3363  	return false;
3364  }
3365  
3366  static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
3367  {
3368  	unsigned long long blocks;
3369  	sector_t new;
3370  
3371  	if (kstrtoull(buf, 10, &blocks) < 0)
3372  		return -EINVAL;
3373  
3374  	if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
3375  		return -EINVAL; /* sector conversion overflow */
3376  
3377  	new = blocks * 2;
3378  	if (new != blocks * 2)
3379  		return -EINVAL; /* unsigned long long to sector_t overflow */
3380  
3381  	*sectors = new;
3382  	return 0;
3383  }
3384  
3385  static ssize_t
3386  rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3387  {
3388  	struct mddev *my_mddev = rdev->mddev;
3389  	sector_t oldsectors = rdev->sectors;
3390  	sector_t sectors;
3391  
3392  	if (test_bit(Journal, &rdev->flags))
3393  		return -EBUSY;
3394  	if (strict_blocks_to_sectors(buf, &sectors) < 0)
3395  		return -EINVAL;
3396  	if (rdev->data_offset != rdev->new_data_offset)
3397  		return -EINVAL; /* too confusing */
3398  	if (my_mddev->pers && rdev->raid_disk >= 0) {
3399  		if (my_mddev->persistent) {
3400  			sectors = super_types[my_mddev->major_version].
3401  				rdev_size_change(rdev, sectors);
3402  			if (!sectors)
3403  				return -EBUSY;
3404  		} else if (!sectors)
3405  			sectors = bdev_nr_sectors(rdev->bdev) -
3406  				rdev->data_offset;
3407  		if (!my_mddev->pers->resize)
3408  			/* Cannot change size for RAID0 or Linear etc */
3409  			return -EINVAL;
3410  	}
3411  	if (sectors < my_mddev->dev_sectors)
3412  		return -EINVAL; /* component must fit device */
3413  
3414  	rdev->sectors = sectors;
3415  
3416  	/*
3417  	 * Check that all other rdevs with the same bdev do not overlap.  This
3418  	 * check does not provide a hard guarantee, it just helps avoid
3419  	 * dangerous mistakes.
3420  	 */
3421  	if (sectors > oldsectors && my_mddev->external &&
3422  	    md_rdev_overlaps(rdev)) {
3423  		/*
3424  		 * Someone else could have slipped in a size change here, but
3425  		 * doing so is just silly.  We put oldsectors back because we
3426  		 * know it is safe, and trust userspace not to race with itself.
3427  		 */
3428  		rdev->sectors = oldsectors;
3429  		return -EBUSY;
3430  	}
3431  	return len;
3432  }
3433  
3434  static struct rdev_sysfs_entry rdev_size =
3435  __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
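/*
 * Illustrative example of the per-device 'size' attribute (values chosen
 * for illustration): the file works in 1K blocks, so a component whose
 * rdev->sectors is 2097152 (1 GiB) reads back as "1048576".  Writing
 * "1048576" requests 2097152 sectors, and writing "0" to an active,
 * non-persistent member sizes it to the whole bdev minus data_offset.
 */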
3436  
3437  static ssize_t recovery_start_show(struct md_rdev *rdev, char *page)
3438  {
3439  	unsigned long long recovery_start = rdev->recovery_offset;
3440  
3441  	if (test_bit(In_sync, &rdev->flags) ||
3442  	    recovery_start == MaxSector)
3443  		return sprintf(page, "none\n");
3444  
3445  	return sprintf(page, "%llu\n", recovery_start);
3446  }
3447  
3448  static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len)
3449  {
3450  	unsigned long long recovery_start;
3451  
3452  	if (cmd_match(buf, "none"))
3453  		recovery_start = MaxSector;
3454  	else if (kstrtoull(buf, 10, &recovery_start))
3455  		return -EINVAL;
3456  
3457  	if (rdev->mddev->pers &&
3458  	    rdev->raid_disk >= 0)
3459  		return -EBUSY;
3460  
3461  	rdev->recovery_offset = recovery_start;
3462  	if (recovery_start == MaxSector)
3463  		set_bit(In_sync, &rdev->flags);
3464  	else
3465  		clear_bit(In_sync, &rdev->flags);
3466  	return len;
3467  }
3468  
3469  static struct rdev_sysfs_entry rdev_recovery_start =
3470  __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
3471  
3472  /* sysfs access to the bad block list.
3473   * We present two files.
3474   * 'bad_blocks' lists sector numbers and lengths of ranges that
3475   *    are recorded as bad.  The list is truncated to fit within
3476   *    the one-page limit of sysfs.
3477   *    Writing "sector length" to this file adds an acknowledged
3478   *    bad block range.
3479   * 'unacknowledged_bad_blocks' lists bad blocks that have not yet
3480   *    been acknowledged.  Writing to this file adds bad blocks
3481   *    without acknowledging them.  This is largely for testing.
3482   */
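/*
 * Illustrative example (sector values are arbitrary): writing "4630272 8"
 * to 'bad_blocks' records an acknowledged bad range of 8 sectors starting
 * at sector 4630272, and reading the file back then includes a
 * "4630272 8" line.  Writing the same string to
 * 'unacknowledged_bad_blocks' records the range without acknowledging it.
 */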
3483  static ssize_t bb_show(struct md_rdev *rdev, char *page)
3484  {
3485  	return badblocks_show(&rdev->badblocks, page, 0);
3486  }
3487  static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len)
3488  {
3489  	int rv = badblocks_store(&rdev->badblocks, page, len, 0);
3490  	/* Maybe that ack was all we needed */
3491  	if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
3492  		wake_up(&rdev->blocked_wait);
3493  	return rv;
3494  }
3495  static struct rdev_sysfs_entry rdev_bad_blocks =
3496  __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
3497  
3498  static ssize_t ubb_show(struct md_rdev *rdev, char *page)
3499  {
3500  	return badblocks_show(&rdev->badblocks, page, 1);
3501  }
3502  static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
3503  {
3504  	return badblocks_store(&rdev->badblocks, page, len, 1);
3505  }
3506  static struct rdev_sysfs_entry rdev_unack_bad_blocks =
3507  __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
3508  
3509  static ssize_t
3510  ppl_sector_show(struct md_rdev *rdev, char *page)
3511  {
3512  	return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector);
3513  }
3514  
3515  static ssize_t
3516  ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len)
3517  {
3518  	unsigned long long sector;
3519  
3520  	if (kstrtoull(buf, 10, &sector) < 0)
3521  		return -EINVAL;
3522  	if (sector != (sector_t)sector)
3523  		return -EINVAL;
3524  
3525  	if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
3526  	    rdev->raid_disk >= 0)
3527  		return -EBUSY;
3528  
3529  	if (rdev->mddev->persistent) {
3530  		if (rdev->mddev->major_version == 0)
3531  			return -EINVAL;
3532  		if ((sector > rdev->sb_start &&
3533  		     sector - rdev->sb_start > S16_MAX) ||
3534  		    (sector < rdev->sb_start &&
3535  		     rdev->sb_start - sector > -S16_MIN))
3536  			return -EINVAL;
3537  		rdev->ppl.offset = sector - rdev->sb_start;
3538  	} else if (!rdev->mddev->external) {
3539  		return -EBUSY;
3540  	}
3541  	rdev->ppl.sector = sector;
3542  	return len;
3543  }
3544  
3545  static struct rdev_sysfs_entry rdev_ppl_sector =
3546  __ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store);
3547  
3548  static ssize_t
3549  ppl_size_show(struct md_rdev *rdev, char *page)
3550  {
3551  	return sprintf(page, "%u\n", rdev->ppl.size);
3552  }
3553  
3554  static ssize_t
3555  ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3556  {
3557  	unsigned int size;
3558  
3559  	if (kstrtouint(buf, 10, &size) < 0)
3560  		return -EINVAL;
3561  
3562  	if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
3563  	    rdev->raid_disk >= 0)
3564  		return -EBUSY;
3565  
3566  	if (rdev->mddev->persistent) {
3567  		if (rdev->mddev->major_version == 0)
3568  			return -EINVAL;
3569  		if (size > U16_MAX)
3570  			return -EINVAL;
3571  	} else if (!rdev->mddev->external) {
3572  		return -EBUSY;
3573  	}
3574  	rdev->ppl.size = size;
3575  	return len;
3576  }
3577  
3578  static struct rdev_sysfs_entry rdev_ppl_size =
3579  __ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store);
3580  
3581  static struct attribute *rdev_default_attrs[] = {
3582  	&rdev_state.attr,
3583  	&rdev_errors.attr,
3584  	&rdev_slot.attr,
3585  	&rdev_offset.attr,
3586  	&rdev_new_offset.attr,
3587  	&rdev_size.attr,
3588  	&rdev_recovery_start.attr,
3589  	&rdev_bad_blocks.attr,
3590  	&rdev_unack_bad_blocks.attr,
3591  	&rdev_ppl_sector.attr,
3592  	&rdev_ppl_size.attr,
3593  	NULL,
3594  };
3595  ATTRIBUTE_GROUPS(rdev_default);
3596  static ssize_t
3597  rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3598  {
3599  	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3600  	struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3601  
3602  	if (!entry->show)
3603  		return -EIO;
3604  	if (!rdev->mddev)
3605  		return -ENODEV;
3606  	return entry->show(rdev, page);
3607  }
3608  
3609  static ssize_t
3610  rdev_attr_store(struct kobject *kobj, struct attribute *attr,
3611  	      const char *page, size_t length)
3612  {
3613  	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3614  	struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3615  	struct kernfs_node *kn = NULL;
3616  	ssize_t rv;
3617  	struct mddev *mddev = rdev->mddev;
3618  
3619  	if (!entry->store)
3620  		return -EIO;
3621  	if (!capable(CAP_SYS_ADMIN))
3622  		return -EACCES;
3623  
3624  	if (entry->store == state_store && cmd_match(page, "remove"))
3625  		kn = sysfs_break_active_protection(kobj, attr);
3626  
3627  	rv = mddev ? mddev_lock(mddev) : -ENODEV;
3628  	if (!rv) {
3629  		if (rdev->mddev == NULL)
3630  			rv = -ENODEV;
3631  		else
3632  			rv = entry->store(rdev, page, length);
3633  		mddev_unlock(mddev);
3634  	}
3635  
3636  	if (kn)
3637  		sysfs_unbreak_active_protection(kn);
3638  
3639  	return rv;
3640  }
3641  
3642  static void rdev_free(struct kobject *ko)
3643  {
3644  	struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj);
3645  	kfree(rdev);
3646  }
3647  static const struct sysfs_ops rdev_sysfs_ops = {
3648  	.show		= rdev_attr_show,
3649  	.store		= rdev_attr_store,
3650  };
3651  static const struct kobj_type rdev_ktype = {
3652  	.release	= rdev_free,
3653  	.sysfs_ops	= &rdev_sysfs_ops,
3654  	.default_groups	= rdev_default_groups,
3655  };
3656  
3657  int md_rdev_init(struct md_rdev *rdev)
3658  {
3659  	rdev->desc_nr = -1;
3660  	rdev->saved_raid_disk = -1;
3661  	rdev->raid_disk = -1;
3662  	rdev->flags = 0;
3663  	rdev->data_offset = 0;
3664  	rdev->new_data_offset = 0;
3665  	rdev->sb_events = 0;
3666  	rdev->last_read_error = 0;
3667  	rdev->sb_loaded = 0;
3668  	rdev->bb_page = NULL;
3669  	atomic_set(&rdev->nr_pending, 0);
3670  	atomic_set(&rdev->read_errors, 0);
3671  	atomic_set(&rdev->corrected_errors, 0);
3672  
3673  	INIT_LIST_HEAD(&rdev->same_set);
3674  	init_waitqueue_head(&rdev->blocked_wait);
3675  
3676  	/* Add space to store bad block list.
3677  	 * This reserves the space even on arrays where it cannot
3678  	 * be used - I wonder if that matters
3679  	 */
3680  	return badblocks_init(&rdev->badblocks, 0);
3681  }
3682  EXPORT_SYMBOL_GPL(md_rdev_init);
3683  
3684  /*
3685   * Import a device. If 'super_format' >= 0, then sanity check the superblock
3686   *
3687   * mark the device faulty if:
3688   *
3689   *   - the device is nonexistent (zero size)
3690   *   - the device has no valid superblock
3691   *
3692   * a faulty rdev _never_ has rdev->sb set.
3693   */
3694  static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
3695  {
3696  	struct md_rdev *rdev;
3697  	struct md_rdev *holder;
3698  	sector_t size;
3699  	int err;
3700  
3701  	rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
3702  	if (!rdev)
3703  		return ERR_PTR(-ENOMEM);
3704  
3705  	err = md_rdev_init(rdev);
3706  	if (err)
3707  		goto out_free_rdev;
3708  	err = alloc_disk_sb(rdev);
3709  	if (err)
3710  		goto out_clear_rdev;
3711  
3712  	if (super_format == -2) {
3713  		holder = &claim_rdev;
3714  	} else {
3715  		holder = rdev;
3716  		set_bit(Holder, &rdev->flags);
3717  	}
3718  
3719  	rdev->bdev = blkdev_get_by_dev(newdev, BLK_OPEN_READ | BLK_OPEN_WRITE,
3720  				       holder, NULL);
3721  	if (IS_ERR(rdev->bdev)) {
3722  		pr_warn("md: could not open device unknown-block(%u,%u).\n",
3723  			MAJOR(newdev), MINOR(newdev));
3724  		err = PTR_ERR(rdev->bdev);
3725  		goto out_clear_rdev;
3726  	}
3727  
3728  	kobject_init(&rdev->kobj, &rdev_ktype);
3729  
3730  	size = bdev_nr_bytes(rdev->bdev) >> BLOCK_SIZE_BITS;
3731  	if (!size) {
3732  		pr_warn("md: %pg has zero or unknown size, marking faulty!\n",
3733  			rdev->bdev);
3734  		err = -EINVAL;
3735  		goto out_blkdev_put;
3736  	}
3737  
3738  	if (super_format >= 0) {
3739  		err = super_types[super_format].
3740  			load_super(rdev, NULL, super_minor);
3741  		if (err == -EINVAL) {
3742  			pr_warn("md: %pg does not have a valid v%d.%d superblock, not importing!\n",
3743  				rdev->bdev,
3744  				super_format, super_minor);
3745  			goto out_blkdev_put;
3746  		}
3747  		if (err < 0) {
3748  			pr_warn("md: could not read %pg's sb, not importing!\n",
3749  				rdev->bdev);
3750  			goto out_blkdev_put;
3751  		}
3752  	}
3753  
3754  	return rdev;
3755  
3756  out_blkdev_put:
3757  	blkdev_put(rdev->bdev, holder);
3758  out_clear_rdev:
3759  	md_rdev_clear(rdev);
3760  out_free_rdev:
3761  	kfree(rdev);
3762  	return ERR_PTR(err);
3763  }
3764  
3765  /*
3766   * Check a full RAID array for plausibility
3767   */
3768  
3769  static int analyze_sbs(struct mddev *mddev)
3770  {
3771  	int i;
3772  	struct md_rdev *rdev, *freshest, *tmp;
3773  
3774  	freshest = NULL;
3775  	rdev_for_each_safe(rdev, tmp, mddev)
3776  		switch (super_types[mddev->major_version].
3777  			load_super(rdev, freshest, mddev->minor_version)) {
3778  		case 1:
3779  			freshest = rdev;
3780  			break;
3781  		case 0:
3782  			break;
3783  		default:
3784  			pr_warn("md: fatal superblock inconsistency in %pg -- removing from array\n",
3785  				rdev->bdev);
3786  			md_kick_rdev_from_array(rdev);
3787  		}
3788  
3789  	/* Cannot find a valid fresh disk */
3790  	if (!freshest) {
3791  		pr_warn("md: cannot find a valid disk\n");
3792  		return -EINVAL;
3793  	}
3794  
3795  	super_types[mddev->major_version].
3796  		validate_super(mddev, NULL/*freshest*/, freshest);
3797  
3798  	i = 0;
3799  	rdev_for_each_safe(rdev, tmp, mddev) {
3800  		if (mddev->max_disks &&
3801  		    (rdev->desc_nr >= mddev->max_disks ||
3802  		     i > mddev->max_disks)) {
3803  			pr_warn("md: %s: %pg: only %d devices permitted\n",
3804  				mdname(mddev), rdev->bdev,
3805  				mddev->max_disks);
3806  			md_kick_rdev_from_array(rdev);
3807  			continue;
3808  		}
3809  		if (rdev != freshest) {
3810  			if (super_types[mddev->major_version].
3811  			    validate_super(mddev, freshest, rdev)) {
3812  				pr_warn("md: kicking non-fresh %pg from array!\n",
3813  					rdev->bdev);
3814  				md_kick_rdev_from_array(rdev);
3815  				continue;
3816  			}
3817  		}
3818  		if (mddev->level == LEVEL_MULTIPATH) {
3819  			rdev->desc_nr = i++;
3820  			rdev->raid_disk = rdev->desc_nr;
3821  			set_bit(In_sync, &rdev->flags);
3822  		} else if (rdev->raid_disk >=
3823  			    (mddev->raid_disks - min(0, mddev->delta_disks)) &&
3824  			   !test_bit(Journal, &rdev->flags)) {
3825  			rdev->raid_disk = -1;
3826  			clear_bit(In_sync, &rdev->flags);
3827  		}
3828  	}
3829  
3830  	return 0;
3831  }
3832  
3833  /* Read a fixed-point number.
3834   * Numbers in sysfs attributes should be in "standard" units where
3835   * possible, so time should be in seconds.
3836   * However we internally use a much smaller unit such as
3837   * milliseconds or jiffies.
3838   * This function takes a decimal number with a possible fractional
3839   * component, and produces an integer which is the result of
3840   * multiplying that number by 10^'scale',
3841   * all without any floating-point arithmetic.
3842   */
3843  int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
3844  {
3845  	unsigned long result = 0;
3846  	long decimals = -1;
3847  	while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
3848  		if (*cp == '.')
3849  			decimals = 0;
3850  		else if (decimals < scale) {
3851  			unsigned int value;
3852  			value = *cp - '0';
3853  			result = result * 10 + value;
3854  			if (decimals >= 0)
3855  				decimals++;
3856  		}
3857  		cp++;
3858  	}
3859  	if (*cp == '\n')
3860  		cp++;
3861  	if (*cp)
3862  		return -EINVAL;
3863  	if (decimals < 0)
3864  		decimals = 0;
3865  	*res = result * int_pow(10, scale - decimals);
3866  	return 0;
3867  }
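/*
 * Worked examples (illustrative only):
 *   strict_strtoul_scaled("5", &res, 3)     -> res = 5000
 *   strict_strtoul_scaled("0.25", &res, 3)  -> res = 250
 *   strict_strtoul_scaled("1.5\n", &res, 3) -> res = 1500
 *   strict_strtoul_scaled("1.2.3", &res, 3) -> -EINVAL
 * i.e. "seconds with an optional fractional part" become an integer
 * number of milliseconds when scale == 3.
 */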
3868  
3869  static ssize_t
3870  safe_delay_show(struct mddev *mddev, char *page)
3871  {
3872  	unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ;
3873  
3874  	return sprintf(page, "%u.%03u\n", msec/1000, msec%1000);
3875  }
3876  static ssize_t
3877  safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
3878  {
3879  	unsigned long msec;
3880  
3881  	if (mddev_is_clustered(mddev)) {
3882  		pr_warn("md: Safemode is disabled for clustered mode\n");
3883  		return -EINVAL;
3884  	}
3885  
3886  	if (strict_strtoul_scaled(cbuf, &msec, 3) < 0 || msec > UINT_MAX / HZ)
3887  		return -EINVAL;
3888  	if (msec == 0)
3889  		mddev->safemode_delay = 0;
3890  	else {
3891  		unsigned long old_delay = mddev->safemode_delay;
3892  		unsigned long new_delay = (msec*HZ)/1000;
3893  
3894  		if (new_delay == 0)
3895  			new_delay = 1;
3896  		mddev->safemode_delay = new_delay;
3897  		if (new_delay < old_delay || old_delay == 0)
3898  			mod_timer(&mddev->safemode_timer, jiffies+1);
3899  	}
3900  	return len;
3901  }
3902  static struct md_sysfs_entry md_safe_delay =
3903  __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
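/*
 * Illustrative example: writing "0.2" to 'safe_mode_delay' parses as 200
 * milliseconds, so safemode_delay becomes (200 * HZ) / 1000 jiffies
 * (e.g. 50 jiffies with HZ=250), and reading the file back shows "0.200".
 * Writing "0" clears the delay entirely.
 */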
3904  
3905  static ssize_t
3906  level_show(struct mddev *mddev, char *page)
3907  {
3908  	struct md_personality *p;
3909  	int ret;
3910  	spin_lock(&mddev->lock);
3911  	p = mddev->pers;
3912  	if (p)
3913  		ret = sprintf(page, "%s\n", p->name);
3914  	else if (mddev->clevel[0])
3915  		ret = sprintf(page, "%s\n", mddev->clevel);
3916  	else if (mddev->level != LEVEL_NONE)
3917  		ret = sprintf(page, "%d\n", mddev->level);
3918  	else
3919  		ret = 0;
3920  	spin_unlock(&mddev->lock);
3921  	return ret;
3922  }
3923  
3924  static ssize_t
3925  level_store(struct mddev *mddev, const char *buf, size_t len)
3926  {
3927  	char clevel[16];
3928  	ssize_t rv;
3929  	size_t slen = len;
3930  	struct md_personality *pers, *oldpers;
3931  	long level;
3932  	void *priv, *oldpriv;
3933  	struct md_rdev *rdev;
3934  
3935  	if (slen == 0 || slen >= sizeof(clevel))
3936  		return -EINVAL;
3937  
3938  	rv = mddev_lock(mddev);
3939  	if (rv)
3940  		return rv;
3941  
3942  	if (mddev->pers == NULL) {
3943  		strncpy(mddev->clevel, buf, slen);
3944  		if (mddev->clevel[slen-1] == '\n')
3945  			slen--;
3946  		mddev->clevel[slen] = 0;
3947  		mddev->level = LEVEL_NONE;
3948  		rv = len;
3949  		goto out_unlock;
3950  	}
3951  	rv = -EROFS;
3952  	if (!md_is_rdwr(mddev))
3953  		goto out_unlock;
3954  
3955  	/* request to change the personality.  Need to ensure:
3956  	 *  - array is not engaged in resync/recovery/reshape
3957  	 *  - old personality can be suspended
3958  	 *  - new personality will accept the array.
3959  	 */
3960  
3961  	rv = -EBUSY;
3962  	if (mddev->sync_thread ||
3963  	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
3964  	    mddev->reshape_position != MaxSector ||
3965  	    mddev->sysfs_active)
3966  		goto out_unlock;
3967  
3968  	rv = -EINVAL;
3969  	if (!mddev->pers->quiesce) {
3970  		pr_warn("md: %s: %s does not support online personality change\n",
3971  			mdname(mddev), mddev->pers->name);
3972  		goto out_unlock;
3973  	}
3974  
3975  	/* Now find the new personality */
3976  	strncpy(clevel, buf, slen);
3977  	if (clevel[slen-1] == '\n')
3978  		slen--;
3979  	clevel[slen] = 0;
3980  	if (kstrtol(clevel, 10, &level))
3981  		level = LEVEL_NONE;
3982  
3983  	if (request_module("md-%s", clevel) != 0)
3984  		request_module("md-level-%s", clevel);
3985  	spin_lock(&pers_lock);
3986  	pers = find_pers(level, clevel);
3987  	if (!pers || !try_module_get(pers->owner)) {
3988  		spin_unlock(&pers_lock);
3989  		pr_warn("md: personality %s not loaded\n", clevel);
3990  		rv = -EINVAL;
3991  		goto out_unlock;
3992  	}
3993  	spin_unlock(&pers_lock);
3994  
3995  	if (pers == mddev->pers) {
3996  		/* Nothing to do! */
3997  		module_put(pers->owner);
3998  		rv = len;
3999  		goto out_unlock;
4000  	}
4001  	if (!pers->takeover) {
4002  		module_put(pers->owner);
4003  		pr_warn("md: %s: %s does not support personality takeover\n",
4004  			mdname(mddev), clevel);
4005  		rv = -EINVAL;
4006  		goto out_unlock;
4007  	}
4008  
4009  	rdev_for_each(rdev, mddev)
4010  		rdev->new_raid_disk = rdev->raid_disk;
4011  
4012  	/* ->takeover must set new_* and/or delta_disks
4013  	 * if it succeeds, and may set them when it fails.
4014  	 */
4015  	priv = pers->takeover(mddev);
4016  	if (IS_ERR(priv)) {
4017  		mddev->new_level = mddev->level;
4018  		mddev->new_layout = mddev->layout;
4019  		mddev->new_chunk_sectors = mddev->chunk_sectors;
4020  		mddev->raid_disks -= mddev->delta_disks;
4021  		mddev->delta_disks = 0;
4022  		mddev->reshape_backwards = 0;
4023  		module_put(pers->owner);
4024  		pr_warn("md: %s: %s would not accept array\n",
4025  			mdname(mddev), clevel);
4026  		rv = PTR_ERR(priv);
4027  		goto out_unlock;
4028  	}
4029  
4030  	/* Looks like we have a winner */
4031  	mddev_suspend(mddev);
4032  	mddev_detach(mddev);
4033  
4034  	spin_lock(&mddev->lock);
4035  	oldpers = mddev->pers;
4036  	oldpriv = mddev->private;
4037  	mddev->pers = pers;
4038  	mddev->private = priv;
4039  	strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
4040  	mddev->level = mddev->new_level;
4041  	mddev->layout = mddev->new_layout;
4042  	mddev->chunk_sectors = mddev->new_chunk_sectors;
4043  	mddev->delta_disks = 0;
4044  	mddev->reshape_backwards = 0;
4045  	mddev->degraded = 0;
4046  	spin_unlock(&mddev->lock);
4047  
4048  	if (oldpers->sync_request == NULL &&
4049  	    mddev->external) {
4050  		/* We are converting from a no-redundancy array
4051  		 * to a redundancy array and metadata is managed
4052  		 * externally so we need to be sure that writes
4053  		 * won't block due to a need to transition
4054  		 *      clean->dirty
4055  		 * until external management is started.
4056  		 */
4057  		mddev->in_sync = 0;
4058  		mddev->safemode_delay = 0;
4059  		mddev->safemode = 0;
4060  	}
4061  
4062  	oldpers->free(mddev, oldpriv);
4063  
4064  	if (oldpers->sync_request == NULL &&
4065  	    pers->sync_request != NULL) {
4066  		/* need to add the md_redundancy_group */
4067  		if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
4068  			pr_warn("md: cannot register extra attributes for %s\n",
4069  				mdname(mddev));
4070  		mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
4071  		mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed");
4072  		mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded");
4073  	}
4074  	if (oldpers->sync_request != NULL &&
4075  	    pers->sync_request == NULL) {
4076  		/* need to remove the md_redundancy_group */
4077  		if (mddev->to_remove == NULL)
4078  			mddev->to_remove = &md_redundancy_group;
4079  	}
4080  
4081  	module_put(oldpers->owner);
4082  
4083  	rdev_for_each(rdev, mddev) {
4084  		if (rdev->raid_disk < 0)
4085  			continue;
4086  		if (rdev->new_raid_disk >= mddev->raid_disks)
4087  			rdev->new_raid_disk = -1;
4088  		if (rdev->new_raid_disk == rdev->raid_disk)
4089  			continue;
4090  		sysfs_unlink_rdev(mddev, rdev);
4091  	}
4092  	rdev_for_each(rdev, mddev) {
4093  		if (rdev->raid_disk < 0)
4094  			continue;
4095  		if (rdev->new_raid_disk == rdev->raid_disk)
4096  			continue;
4097  		rdev->raid_disk = rdev->new_raid_disk;
4098  		if (rdev->raid_disk < 0)
4099  			clear_bit(In_sync, &rdev->flags);
4100  		else {
4101  			if (sysfs_link_rdev(mddev, rdev))
4102  				pr_warn("md: cannot register rd%d for %s after level change\n",
4103  					rdev->raid_disk, mdname(mddev));
4104  		}
4105  	}
4106  
4107  	if (pers->sync_request == NULL) {
4108  		/* this is now an array without redundancy, so
4109  		 * it must always be in_sync
4110  		 */
4111  		mddev->in_sync = 1;
4112  		del_timer_sync(&mddev->safemode_timer);
4113  	}
4114  	blk_set_stacking_limits(&mddev->queue->limits);
4115  	pers->run(mddev);
4116  	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
4117  	mddev_resume(mddev);
4118  	if (!mddev->thread)
4119  		md_update_sb(mddev, 1);
4120  	sysfs_notify_dirent_safe(mddev->sysfs_level);
4121  	md_new_event();
4122  	rv = len;
4123  out_unlock:
4124  	mddev_unlock(mddev);
4125  	return rv;
4126  }
4127  
4128  static struct md_sysfs_entry md_level =
4129  __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
4130  
4131  static ssize_t
4132  layout_show(struct mddev *mddev, char *page)
4133  {
4134  	/* just a number, not meaningful for all levels */
4135  	if (mddev->reshape_position != MaxSector &&
4136  	    mddev->layout != mddev->new_layout)
4137  		return sprintf(page, "%d (%d)\n",
4138  			       mddev->new_layout, mddev->layout);
4139  	return sprintf(page, "%d\n", mddev->layout);
4140  }
4141  
4142  static ssize_t
4143  layout_store(struct mddev *mddev, const char *buf, size_t len)
4144  {
4145  	unsigned int n;
4146  	int err;
4147  
4148  	err = kstrtouint(buf, 10, &n);
4149  	if (err < 0)
4150  		return err;
4151  	err = mddev_lock(mddev);
4152  	if (err)
4153  		return err;
4154  
4155  	if (mddev->pers) {
4156  		if (mddev->pers->check_reshape == NULL)
4157  			err = -EBUSY;
4158  		else if (!md_is_rdwr(mddev))
4159  			err = -EROFS;
4160  		else {
4161  			mddev->new_layout = n;
4162  			err = mddev->pers->check_reshape(mddev);
4163  			if (err)
4164  				mddev->new_layout = mddev->layout;
4165  		}
4166  	} else {
4167  		mddev->new_layout = n;
4168  		if (mddev->reshape_position == MaxSector)
4169  			mddev->layout = n;
4170  	}
4171  	mddev_unlock(mddev);
4172  	return err ?: len;
4173  }
4174  static struct md_sysfs_entry md_layout =
4175  __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
4176  
4177  static ssize_t
4178  raid_disks_show(struct mddev *mddev, char *page)
4179  {
4180  	if (mddev->raid_disks == 0)
4181  		return 0;
4182  	if (mddev->reshape_position != MaxSector &&
4183  	    mddev->delta_disks != 0)
4184  		return sprintf(page, "%d (%d)\n", mddev->raid_disks,
4185  			       mddev->raid_disks - mddev->delta_disks);
4186  	return sprintf(page, "%d\n", mddev->raid_disks);
4187  }
4188  
4189  static int update_raid_disks(struct mddev *mddev, int raid_disks);
4190  
4191  static ssize_t
4192  raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
4193  {
4194  	unsigned int n;
4195  	int err;
4196  
4197  	err = kstrtouint(buf, 10, &n);
4198  	if (err < 0)
4199  		return err;
4200  
4201  	err = mddev_lock(mddev);
4202  	if (err)
4203  		return err;
4204  	if (mddev->pers)
4205  		err = update_raid_disks(mddev, n);
4206  	else if (mddev->reshape_position != MaxSector) {
4207  		struct md_rdev *rdev;
4208  		int olddisks = mddev->raid_disks - mddev->delta_disks;
4209  
4210  		err = -EINVAL;
4211  		rdev_for_each(rdev, mddev) {
4212  			if (olddisks < n &&
4213  			    rdev->data_offset < rdev->new_data_offset)
4214  				goto out_unlock;
4215  			if (olddisks > n &&
4216  			    rdev->data_offset > rdev->new_data_offset)
4217  				goto out_unlock;
4218  		}
4219  		err = 0;
4220  		mddev->delta_disks = n - olddisks;
4221  		mddev->raid_disks = n;
4222  		mddev->reshape_backwards = (mddev->delta_disks < 0);
4223  	} else
4224  		mddev->raid_disks = n;
4225  out_unlock:
4226  	mddev_unlock(mddev);
4227  	return err ? err : len;
4228  }
4229  static struct md_sysfs_entry md_raid_disks =
4230  __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
4231  
4232  static ssize_t
4233  uuid_show(struct mddev *mddev, char *page)
4234  {
4235  	return sprintf(page, "%pU\n", mddev->uuid);
4236  }
4237  static struct md_sysfs_entry md_uuid =
4238  __ATTR(uuid, S_IRUGO, uuid_show, NULL);
4239  
4240  static ssize_t
4241  chunk_size_show(struct mddev *mddev, char *page)
4242  {
4243  	if (mddev->reshape_position != MaxSector &&
4244  	    mddev->chunk_sectors != mddev->new_chunk_sectors)
4245  		return sprintf(page, "%d (%d)\n",
4246  			       mddev->new_chunk_sectors << 9,
4247  			       mddev->chunk_sectors << 9);
4248  	return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
4249  }
4250  
4251  static ssize_t
4252  chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
4253  {
4254  	unsigned long n;
4255  	int err;
4256  
4257  	err = kstrtoul(buf, 10, &n);
4258  	if (err < 0)
4259  		return err;
4260  
4261  	err = mddev_lock(mddev);
4262  	if (err)
4263  		return err;
4264  	if (mddev->pers) {
4265  		if (mddev->pers->check_reshape == NULL)
4266  			err = -EBUSY;
4267  		else if (!md_is_rdwr(mddev))
4268  			err = -EROFS;
4269  		else {
4270  			mddev->new_chunk_sectors = n >> 9;
4271  			err = mddev->pers->check_reshape(mddev);
4272  			if (err)
4273  				mddev->new_chunk_sectors = mddev->chunk_sectors;
4274  		}
4275  	} else {
4276  		mddev->new_chunk_sectors = n >> 9;
4277  		if (mddev->reshape_position == MaxSector)
4278  			mddev->chunk_sectors = n >> 9;
4279  	}
4280  	mddev_unlock(mddev);
4281  	return err ?: len;
4282  }
4283  static struct md_sysfs_entry md_chunk_size =
4284  __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
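/*
 * Illustrative example: 'chunk_size' is expressed in bytes but stored
 * internally in 512-byte sectors, so writing "131072" sets
 * new_chunk_sectors to 256 and the file reads back "131072" once the
 * change takes effect.  During a reshape both values are shown, e.g.
 * "524288 (131072)" for a 512K chunk being reshaped from 128K.
 */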
4285  
4286  static ssize_t
4287  resync_start_show(struct mddev *mddev, char *page)
4288  {
4289  	if (mddev->recovery_cp == MaxSector)
4290  		return sprintf(page, "none\n");
4291  	return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
4292  }
4293  
4294  static ssize_t
4295  resync_start_store(struct mddev *mddev, const char *buf, size_t len)
4296  {
4297  	unsigned long long n;
4298  	int err;
4299  
4300  	if (cmd_match(buf, "none"))
4301  		n = MaxSector;
4302  	else {
4303  		err = kstrtoull(buf, 10, &n);
4304  		if (err < 0)
4305  			return err;
4306  		if (n != (sector_t)n)
4307  			return -EINVAL;
4308  	}
4309  
4310  	err = mddev_lock(mddev);
4311  	if (err)
4312  		return err;
4313  	if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
4314  		err = -EBUSY;
4315  
4316  	if (!err) {
4317  		mddev->recovery_cp = n;
4318  		if (mddev->pers)
4319  			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
4320  	}
4321  	mddev_unlock(mddev);
4322  	return err ?: len;
4323  }
4324  static struct md_sysfs_entry md_resync_start =
4325  __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR,
4326  		resync_start_show, resync_start_store);
4327  
4328  /*
4329   * The array state can be:
4330   *
4331   * clear
4332   *     No devices, no size, no level
4333   *     Equivalent to STOP_ARRAY ioctl
4334   * inactive
4335   *     May have some settings, but array is not active
4336   *        all IO results in error
4337   *     When written, doesn't tear down array, but just stops it
4338   * suspended (not supported yet)
4339   *     All IO requests will block. The array can be reconfigured.
4340   *     Writing this, if accepted, will block until array is quiescent
4341   * readonly
4342   *     no resync can happen.  no superblocks get written.
4343   *     write requests fail
4344   * read-auto
4345   *     like readonly, but behaves like 'clean' on a write request.
4346   *
4347   * clean - no pending writes, but otherwise active.
4348   *     When written to inactive array, starts without resync
4349   *     If a write request arrives then
4350   *       if metadata is known, mark 'dirty' and switch to 'active'.
4351   *       if not known, block and switch to write-pending
4352   *     If written to an active array that has pending writes, then fails.
4353   * active
4354   *     fully active: IO and resync can be happening.
4355   *     When written to inactive array, starts with resync
4356   *
4357   * write-pending
4358   *     clean, but writes are blocked waiting for 'active' to be written.
4359   *
4360   * active-idle
4361   *     like active, but no writes have been seen for a while (100msec).
4362   *
4363   * broken
4364   *     Array is failed. It's useful because mounted arrays aren't stopped
4365   *     when the array fails, so this state will at least alert the user that
4366   *     something is wrong.
4367   */
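/*
 * Illustrative examples of 'array_state' transitions:
 *   - reading the file on an idle, in-sync array returns "clean";
 *   - writing "readonly" to an active array goes through md_set_readonly();
 *   - writing "active" to a clean array clears MD_SB_CHANGE_PENDING and
 *     wakes anyone waiting on sb_wait;
 *   - "write-pending", "active-idle" and "broken" can be read but never
 *     written.
 */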
4368  enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
4369  		   write_pending, active_idle, broken, bad_word};
4370  static char *array_states[] = {
4371  	"clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
4372  	"write-pending", "active-idle", "broken", NULL };
4373  
4374  static int match_word(const char *word, char **list)
4375  {
4376  	int n;
4377  	for (n=0; list[n]; n++)
4378  		if (cmd_match(word, list[n]))
4379  			break;
4380  	return n;
4381  }
4382  
4383  static ssize_t
4384  array_state_show(struct mddev *mddev, char *page)
4385  {
4386  	enum array_state st = inactive;
4387  
4388  	if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) {
4389  		switch(mddev->ro) {
4390  		case MD_RDONLY:
4391  			st = readonly;
4392  			break;
4393  		case MD_AUTO_READ:
4394  			st = read_auto;
4395  			break;
4396  		case MD_RDWR:
4397  			spin_lock(&mddev->lock);
4398  			if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
4399  				st = write_pending;
4400  			else if (mddev->in_sync)
4401  				st = clean;
4402  			else if (mddev->safemode)
4403  				st = active_idle;
4404  			else
4405  				st = active;
4406  			spin_unlock(&mddev->lock);
4407  		}
4408  
4409  		if (test_bit(MD_BROKEN, &mddev->flags) && st == clean)
4410  			st = broken;
4411  	} else {
4412  		if (list_empty(&mddev->disks) &&
4413  		    mddev->raid_disks == 0 &&
4414  		    mddev->dev_sectors == 0)
4415  			st = clear;
4416  		else
4417  			st = inactive;
4418  	}
4419  	return sprintf(page, "%s\n", array_states[st]);
4420  }
4421  
4422  static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev);
4423  static int md_set_readonly(struct mddev *mddev, struct block_device *bdev);
4424  static int restart_array(struct mddev *mddev);
4425  
4426  static ssize_t
4427  array_state_store(struct mddev *mddev, const char *buf, size_t len)
4428  {
4429  	int err = 0;
4430  	enum array_state st = match_word(buf, array_states);
4431  
4432  	if (mddev->pers && (st == active || st == clean) &&
4433  	    mddev->ro != MD_RDONLY) {
4434  		/* don't take reconfig_mutex when toggling between
4435  		 * clean and active
4436  		 */
4437  		spin_lock(&mddev->lock);
4438  		if (st == active) {
4439  			restart_array(mddev);
4440  			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
4441  			md_wakeup_thread(mddev->thread);
4442  			wake_up(&mddev->sb_wait);
4443  		} else /* st == clean */ {
4444  			restart_array(mddev);
4445  			if (!set_in_sync(mddev))
4446  				err = -EBUSY;
4447  		}
4448  		if (!err)
4449  			sysfs_notify_dirent_safe(mddev->sysfs_state);
4450  		spin_unlock(&mddev->lock);
4451  		return err ?: len;
4452  	}
4453  	err = mddev_lock(mddev);
4454  	if (err)
4455  		return err;
4456  	err = -EINVAL;
4457  	switch(st) {
4458  	case bad_word:
4459  		break;
4460  	case clear:
4461  		/* stopping an active array */
4462  		err = do_md_stop(mddev, 0, NULL);
4463  		break;
4464  	case inactive:
4465  		/* stopping an active array */
4466  		if (mddev->pers)
4467  			err = do_md_stop(mddev, 2, NULL);
4468  		else
4469  			err = 0; /* already inactive */
4470  		break;
4471  	case suspended:
4472  		break; /* not supported yet */
4473  	case readonly:
4474  		if (mddev->pers)
4475  			err = md_set_readonly(mddev, NULL);
4476  		else {
4477  			mddev->ro = MD_RDONLY;
4478  			set_disk_ro(mddev->gendisk, 1);
4479  			err = do_md_run(mddev);
4480  		}
4481  		break;
4482  	case read_auto:
4483  		if (mddev->pers) {
4484  			if (md_is_rdwr(mddev))
4485  				err = md_set_readonly(mddev, NULL);
4486  			else if (mddev->ro == MD_RDONLY)
4487  				err = restart_array(mddev);
4488  			if (err == 0) {
4489  				mddev->ro = MD_AUTO_READ;
4490  				set_disk_ro(mddev->gendisk, 0);
4491  			}
4492  		} else {
4493  			mddev->ro = MD_AUTO_READ;
4494  			err = do_md_run(mddev);
4495  		}
4496  		break;
4497  	case clean:
4498  		if (mddev->pers) {
4499  			err = restart_array(mddev);
4500  			if (err)
4501  				break;
4502  			spin_lock(&mddev->lock);
4503  			if (!set_in_sync(mddev))
4504  				err = -EBUSY;
4505  			spin_unlock(&mddev->lock);
4506  		} else
4507  			err = -EINVAL;
4508  		break;
4509  	case active:
4510  		if (mddev->pers) {
4511  			err = restart_array(mddev);
4512  			if (err)
4513  				break;
4514  			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
4515  			wake_up(&mddev->sb_wait);
4516  			err = 0;
4517  		} else {
4518  			mddev->ro = MD_RDWR;
4519  			set_disk_ro(mddev->gendisk, 0);
4520  			err = do_md_run(mddev);
4521  		}
4522  		break;
4523  	case write_pending:
4524  	case active_idle:
4525  	case broken:
4526  		/* these cannot be set */
4527  		break;
4528  	}
4529  
4530  	if (!err) {
4531  		if (mddev->hold_active == UNTIL_IOCTL)
4532  			mddev->hold_active = 0;
4533  		sysfs_notify_dirent_safe(mddev->sysfs_state);
4534  	}
4535  	mddev_unlock(mddev);
4536  	return err ?: len;
4537  }
4538  static struct md_sysfs_entry md_array_state =
4539  __ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
4540  
4541  static ssize_t
4542  max_corrected_read_errors_show(struct mddev *mddev, char *page) {
4543  	return sprintf(page, "%d\n",
4544  		       atomic_read(&mddev->max_corr_read_errors));
4545  }
4546  
4547  static ssize_t
4548  max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len)
4549  {
4550  	unsigned int n;
4551  	int rv;
4552  
4553  	rv = kstrtouint(buf, 10, &n);
4554  	if (rv < 0)
4555  		return rv;
4556  	if (n > INT_MAX)
4557  		return -EINVAL;
4558  	atomic_set(&mddev->max_corr_read_errors, n);
4559  	return len;
4560  }
4561  
4562  static struct md_sysfs_entry max_corr_read_errors =
4563  __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
4564  	max_corrected_read_errors_store);
4565  
4566  static ssize_t
4567  null_show(struct mddev *mddev, char *page)
4568  {
4569  	return -EINVAL;
4570  }
4571  
4572  static ssize_t
4573  new_dev_store(struct mddev *mddev, const char *buf, size_t len)
4574  {
4575  	/* buf must be %d:%d\n? giving major and minor numbers */
4576  	/* The new device is added to the array.
4577  	 * If the array has a persistent superblock, we read the
4578  	 * superblock to initialise info and check validity.
4579  	 * Otherwise, only checking done is that in bind_rdev_to_array,
4580  	 * which mainly checks size.
4581  	 */
4582  	char *e;
4583  	int major = simple_strtoul(buf, &e, 10);
4584  	int minor;
4585  	dev_t dev;
4586  	struct md_rdev *rdev;
4587  	int err;
4588  
4589  	if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
4590  		return -EINVAL;
4591  	minor = simple_strtoul(e+1, &e, 10);
4592  	if (*e && *e != '\n')
4593  		return -EINVAL;
4594  	dev = MKDEV(major, minor);
4595  	if (major != MAJOR(dev) ||
4596  	    minor != MINOR(dev))
4597  		return -EOVERFLOW;
4598  
4599  	err = mddev_lock(mddev);
4600  	if (err)
4601  		return err;
4602  	if (mddev->persistent) {
4603  		rdev = md_import_device(dev, mddev->major_version,
4604  					mddev->minor_version);
4605  		if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
4606  			struct md_rdev *rdev0
4607  				= list_entry(mddev->disks.next,
4608  					     struct md_rdev, same_set);
4609  			err = super_types[mddev->major_version]
4610  				.load_super(rdev, rdev0, mddev->minor_version);
4611  			if (err < 0)
4612  				goto out;
4613  		}
4614  	} else if (mddev->external)
4615  		rdev = md_import_device(dev, -2, -1);
4616  	else
4617  		rdev = md_import_device(dev, -1, -1);
4618  
4619  	if (IS_ERR(rdev)) {
4620  		mddev_unlock(mddev);
4621  		return PTR_ERR(rdev);
4622  	}
4623  	err = bind_rdev_to_array(rdev, mddev);
4624   out:
4625  	if (err)
4626  		export_rdev(rdev, mddev);
4627  	mddev_unlock(mddev);
4628  	if (!err)
4629  		md_new_event();
4630  	return err ? err : len;
4631  }
4632  
4633  static struct md_sysfs_entry md_new_device =
4634  __ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
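/*
 * Illustrative example (device numbers are arbitrary): writing "8:32" to
 * 'new_dev' imports the block device with major 8, minor 32 (typically
 * /dev/sdc) and binds it to the array, reading its superblock first when
 * the array uses persistent metadata.
 */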
4635  
4636  static ssize_t
4637  bitmap_store(struct mddev *mddev, const char *buf, size_t len)
4638  {
4639  	char *end;
4640  	unsigned long chunk, end_chunk;
4641  	int err;
4642  
4643  	err = mddev_lock(mddev);
4644  	if (err)
4645  		return err;
4646  	if (!mddev->bitmap)
4647  		goto out;
4648  	/* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
4649  	while (*buf) {
4650  		chunk = end_chunk = simple_strtoul(buf, &end, 0);
4651  		if (buf == end) break;
4652  		if (*end == '-') { /* range */
4653  			buf = end + 1;
4654  			end_chunk = simple_strtoul(buf, &end, 0);
4655  			if (buf == end) break;
4656  		}
4657  		if (*end && !isspace(*end)) break;
4658  		md_bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
4659  		buf = skip_spaces(end);
4660  	}
4661  	md_bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
4662  out:
4663  	mddev_unlock(mddev);
4664  	return len;
4665  }
4666  
4667  static struct md_sysfs_entry md_bitmap =
4668  __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
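/*
 * Illustrative example (chunk numbers are arbitrary): writing
 * "100 200-205" to 'bitmap_set_bits' marks bitmap chunk 100 and chunks
 * 200 through 205 dirty, forcing those regions to be resynced; the bits
 * are then flushed to the on-disk bitmap.
 */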
4669  
4670  static ssize_t
4671  size_show(struct mddev *mddev, char *page)
4672  {
4673  	return sprintf(page, "%llu\n",
4674  		(unsigned long long)mddev->dev_sectors / 2);
4675  }
4676  
4677  static int update_size(struct mddev *mddev, sector_t num_sectors);
4678  
4679  static ssize_t
4680  size_store(struct mddev *mddev, const char *buf, size_t len)
4681  {
4682  	/* If array is inactive, we can reduce the component size, but
4683  	 * not increase it (except from 0).
4684  	 * If array is active, we can try an on-line resize
4685  	 */
4686  	sector_t sectors;
4687  	int err = strict_blocks_to_sectors(buf, &sectors);
4688  
4689  	if (err < 0)
4690  		return err;
4691  	err = mddev_lock(mddev);
4692  	if (err)
4693  		return err;
4694  	if (mddev->pers) {
4695  		err = update_size(mddev, sectors);
4696  		if (err == 0)
4697  			md_update_sb(mddev, 1);
4698  	} else {
4699  		if (mddev->dev_sectors == 0 ||
4700  		    mddev->dev_sectors > sectors)
4701  			mddev->dev_sectors = sectors;
4702  		else
4703  			err = -ENOSPC;
4704  	}
4705  	mddev_unlock(mddev);
4706  	return err ? err : len;
4707  }
4708  
4709  static struct md_sysfs_entry md_size =
4710  __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
4711  
4712  /* Metadata version.
4713   * This is one of
4714   *   'none' for arrays with no metadata (good luck...)
4715   *   'external' for arrays with externally managed metadata,
4716   * or N.M for internally known formats
4717   */
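/*
 * Illustrative examples: writing "1.2" selects in-kernel v1.2 metadata
 * (major_version 1, minor_version 2); writing "external:imsm" marks the
 * metadata as externally managed with type "imsm"; writing "none" clears
 * both.  Any of these is refused with -EBUSY once devices are attached,
 * except for updates to an already-external metadata type.
 */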
4718  static ssize_t
4719  metadata_show(struct mddev *mddev, char *page)
4720  {
4721  	if (mddev->persistent)
4722  		return sprintf(page, "%d.%d\n",
4723  			       mddev->major_version, mddev->minor_version);
4724  	else if (mddev->external)
4725  		return sprintf(page, "external:%s\n", mddev->metadata_type);
4726  	else
4727  		return sprintf(page, "none\n");
4728  }
4729  
4730  static ssize_t
4731  metadata_store(struct mddev *mddev, const char *buf, size_t len)
4732  {
4733  	int major, minor;
4734  	char *e;
4735  	int err;
4736  	/* Changing the details of 'external' metadata is
4737  	 * always permitted.  Otherwise there must be
4738  	 * no devices attached to the array.
4739  	 */
4740  
4741  	err = mddev_lock(mddev);
4742  	if (err)
4743  		return err;
4744  	err = -EBUSY;
4745  	if (mddev->external && strncmp(buf, "external:", 9) == 0)
4746  		;
4747  	else if (!list_empty(&mddev->disks))
4748  		goto out_unlock;
4749  
4750  	err = 0;
4751  	if (cmd_match(buf, "none")) {
4752  		mddev->persistent = 0;
4753  		mddev->external = 0;
4754  		mddev->major_version = 0;
4755  		mddev->minor_version = 90;
4756  		goto out_unlock;
4757  	}
4758  	if (strncmp(buf, "external:", 9) == 0) {
4759  		size_t namelen = len-9;
4760  		if (namelen >= sizeof(mddev->metadata_type))
4761  			namelen = sizeof(mddev->metadata_type)-1;
4762  		strncpy(mddev->metadata_type, buf+9, namelen);
4763  		mddev->metadata_type[namelen] = 0;
4764  		if (namelen && mddev->metadata_type[namelen-1] == '\n')
4765  			mddev->metadata_type[--namelen] = 0;
4766  		mddev->persistent = 0;
4767  		mddev->external = 1;
4768  		mddev->major_version = 0;
4769  		mddev->minor_version = 90;
4770  		goto out_unlock;
4771  	}
4772  	major = simple_strtoul(buf, &e, 10);
4773  	err = -EINVAL;
4774  	if (e==buf || *e != '.')
4775  		goto out_unlock;
4776  	buf = e+1;
4777  	minor = simple_strtoul(buf, &e, 10);
4778  	if (e==buf || (*e && *e != '\n') )
4779  		goto out_unlock;
4780  	err = -ENOENT;
4781  	if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
4782  		goto out_unlock;
4783  	mddev->major_version = major;
4784  	mddev->minor_version = minor;
4785  	mddev->persistent = 1;
4786  	mddev->external = 0;
4787  	err = 0;
4788  out_unlock:
4789  	mddev_unlock(mddev);
4790  	return err ?: len;
4791  }
4792  
4793  static struct md_sysfs_entry md_metadata =
4794  __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
4795  
4796  static ssize_t
4797  action_show(struct mddev *mddev, char *page)
4798  {
4799  	char *type = "idle";
4800  	unsigned long recovery = mddev->recovery;
4801  	if (test_bit(MD_RECOVERY_FROZEN, &recovery))
4802  		type = "frozen";
4803  	else if (test_bit(MD_RECOVERY_RUNNING, &recovery) ||
4804  	    (md_is_rdwr(mddev) && test_bit(MD_RECOVERY_NEEDED, &recovery))) {
4805  		if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
4806  			type = "reshape";
4807  		else if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
4808  			if (!test_bit(MD_RECOVERY_REQUESTED, &recovery))
4809  				type = "resync";
4810  			else if (test_bit(MD_RECOVERY_CHECK, &recovery))
4811  				type = "check";
4812  			else
4813  				type = "repair";
4814  		} else if (test_bit(MD_RECOVERY_RECOVER, &recovery))
4815  			type = "recover";
4816  		else if (mddev->reshape_position != MaxSector)
4817  			type = "reshape";
4818  	}
4819  	return sprintf(page, "%s\n", type);
4820  }
4821  
4822  static void stop_sync_thread(struct mddev *mddev)
4823  {
4824  	if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4825  		return;
4826  
4827  	if (mddev_lock(mddev))
4828  		return;
4829  
4830  	/*
4831  	 * Check again in case MD_RECOVERY_RUNNING is cleared before lock is
4832  	 * held.
4833  	 */
4834  	if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
4835  		mddev_unlock(mddev);
4836  		return;
4837  	}
4838  
4839  	if (work_pending(&mddev->sync_work))
4840  		flush_workqueue(md_misc_wq);
4841  
4842  	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4843  	/*
4844  	 * Thread might be blocked waiting for metadata update which will now
4845  	 * never happen
4846  	 */
4847  	md_wakeup_thread_directly(mddev->sync_thread);
4848  
4849  	mddev_unlock(mddev);
4850  }
4851  
4852  static void idle_sync_thread(struct mddev *mddev)
4853  {
4854  	int sync_seq = atomic_read(&mddev->sync_seq);
4855  
4856  	mutex_lock(&mddev->sync_mutex);
4857  	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4858  	stop_sync_thread(mddev);
4859  
4860  	wait_event(resync_wait, sync_seq != atomic_read(&mddev->sync_seq) ||
4861  			!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery));
4862  
4863  	mutex_unlock(&mddev->sync_mutex);
4864  }
4865  
4866  static void frozen_sync_thread(struct mddev *mddev)
4867  {
4868  	mutex_lock(&mddev->sync_mutex);
4869  	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4870  	stop_sync_thread(mddev);
4871  
4872  	wait_event(resync_wait, mddev->sync_thread == NULL &&
4873  			!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery));
4874  
4875  	mutex_unlock(&mddev->sync_mutex);
4876  }
4877  
4878  static ssize_t
4879  action_store(struct mddev *mddev, const char *page, size_t len)
4880  {
4881  	if (!mddev->pers || !mddev->pers->sync_request)
4882  		return -EINVAL;
4883  
4884  
4885  	if (cmd_match(page, "idle"))
4886  		idle_sync_thread(mddev);
4887  	else if (cmd_match(page, "frozen"))
4888  		frozen_sync_thread(mddev);
4889  	else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4890  		return -EBUSY;
4891  	else if (cmd_match(page, "resync"))
4892  		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4893  	else if (cmd_match(page, "recover")) {
4894  		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4895  		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
4896  	} else if (cmd_match(page, "reshape")) {
4897  		int err;
4898  		if (mddev->pers->start_reshape == NULL)
4899  			return -EINVAL;
4900  		err = mddev_lock(mddev);
4901  		if (!err) {
4902  			if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
4903  				err =  -EBUSY;
4904  			} else if (mddev->reshape_position == MaxSector ||
4905  				   mddev->pers->check_reshape == NULL ||
4906  				   mddev->pers->check_reshape(mddev)) {
4907  				clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4908  				err = mddev->pers->start_reshape(mddev);
4909  			} else {
4910  				/*
4911  				 * If reshape is still in progress, and
4912  				 * md_check_recovery() can continue to reshape,
4913  				 * don't restart reshape because data can be
4914  				 * corrupted for raid456.
4915  				 */
4916  				clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4917  			}
4918  			mddev_unlock(mddev);
4919  		}
4920  		if (err)
4921  			return err;
4922  		sysfs_notify_dirent_safe(mddev->sysfs_degraded);
4923  	} else {
4924  		if (cmd_match(page, "check"))
4925  			set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4926  		else if (!cmd_match(page, "repair"))
4927  			return -EINVAL;
4928  		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4929  		set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
4930  		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4931  	}
4932  	if (mddev->ro == MD_AUTO_READ) {
4933  		/* A write to sync_action is enough to justify
4934  		 * canceling read-auto mode
4935  		 */
4936  		mddev->ro = MD_RDWR;
4937  		md_wakeup_thread(mddev->sync_thread);
4938  	}
4939  	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4940  	md_wakeup_thread(mddev->thread);
4941  	sysfs_notify_dirent_safe(mddev->sysfs_action);
4942  	return len;
4943  }
4944  
4945  static struct md_sysfs_entry md_scan_mode =
4946  __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
4947  
4948  static ssize_t
4949  last_sync_action_show(struct mddev *mddev, char *page)
4950  {
4951  	return sprintf(page, "%s\n", mddev->last_sync_action);
4952  }
4953  
4954  static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action);
4955  
4956  static ssize_t
4957  mismatch_cnt_show(struct mddev *mddev, char *page)
4958  {
4959  	return sprintf(page, "%llu\n",
4960  		       (unsigned long long)
4961  		       atomic64_read(&mddev->resync_mismatches));
4962  }
4963  
4964  static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
4965  
4966  static ssize_t
4967  sync_min_show(struct mddev *mddev, char *page)
4968  {
4969  	return sprintf(page, "%d (%s)\n", speed_min(mddev),
4970  		       mddev->sync_speed_min ? "local": "system");
4971  }
4972  
4973  static ssize_t
4974  sync_min_store(struct mddev *mddev, const char *buf, size_t len)
4975  {
4976  	unsigned int min;
4977  	int rv;
4978  
4979  	if (strncmp(buf, "system", 6)==0) {
4980  		min = 0;
4981  	} else {
4982  		rv = kstrtouint(buf, 10, &min);
4983  		if (rv < 0)
4984  			return rv;
4985  		if (min == 0)
4986  			return -EINVAL;
4987  	}
4988  	mddev->sync_speed_min = min;
4989  	return len;
4990  }
4991  
4992  static struct md_sysfs_entry md_sync_min =
4993  __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
4994  
4995  static ssize_t
4996  sync_max_show(struct mddev *mddev, char *page)
4997  {
4998  	return sprintf(page, "%d (%s)\n", speed_max(mddev),
4999  		       mddev->sync_speed_max ? "local": "system");
5000  }
5001  
5002  static ssize_t
5003  sync_max_store(struct mddev *mddev, const char *buf, size_t len)
5004  {
5005  	unsigned int max;
5006  	int rv;
5007  
5008  	if (strncmp(buf, "system", 6)==0) {
5009  		max = 0;
5010  	} else {
5011  		rv = kstrtouint(buf, 10, &max);
5012  		if (rv < 0)
5013  			return rv;
5014  		if (max == 0)
5015  			return -EINVAL;
5016  	}
5017  	mddev->sync_speed_max = max;
5018  	return len;
5019  }
5020  
5021  static struct md_sysfs_entry md_sync_max =
5022  __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
5023  
5024  static ssize_t
5025  degraded_show(struct mddev *mddev, char *page)
5026  {
5027  	return sprintf(page, "%d\n", mddev->degraded);
5028  }
5029  static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
5030  
5031  static ssize_t
5032  sync_force_parallel_show(struct mddev *mddev, char *page)
5033  {
5034  	return sprintf(page, "%d\n", mddev->parallel_resync);
5035  }
5036  
5037  static ssize_t
5038  sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len)
5039  {
5040  	long n;
5041  
5042  	if (kstrtol(buf, 10, &n))
5043  		return -EINVAL;
5044  
5045  	if (n != 0 && n != 1)
5046  		return -EINVAL;
5047  
5048  	mddev->parallel_resync = n;
5049  
5050  	if (mddev->sync_thread)
5051  		wake_up(&resync_wait);
5052  
5053  	return len;
5054  }
5055  
5056  /* force parallel resync, even with shared block devices */
5057  static struct md_sysfs_entry md_sync_force_parallel =
5058  __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
5059         sync_force_parallel_show, sync_force_parallel_store);
5060  
5061  static ssize_t
5062  sync_speed_show(struct mddev *mddev, char *page)
5063  {
5064  	unsigned long resync, dt, db;
5065  	if (mddev->curr_resync == MD_RESYNC_NONE)
5066  		return sprintf(page, "none\n");
5067  	resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
5068  	dt = (jiffies - mddev->resync_mark) / HZ;
5069  	if (!dt) dt++;
5070  	db = resync - mddev->resync_mark_cnt;
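	/* db is in 512-byte sectors, so db/dt/2 below is the rate in KiB/sec. */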
5071  	return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
5072  }
5073  
5074  static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
5075  
5076  static ssize_t
5077  sync_completed_show(struct mddev *mddev, char *page)
5078  {
5079  	unsigned long long max_sectors, resync;
5080  
5081  	if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5082  		return sprintf(page, "none\n");
5083  
5084  	if (mddev->curr_resync == MD_RESYNC_YIELDED ||
5085  	    mddev->curr_resync == MD_RESYNC_DELAYED)
5086  		return sprintf(page, "delayed\n");
5087  
5088  	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
5089  	    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
5090  		max_sectors = mddev->resync_max_sectors;
5091  	else
5092  		max_sectors = mddev->dev_sectors;
5093  
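	/* Both values below are counted in 512-byte sectors. */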
5094  	resync = mddev->curr_resync_completed;
5095  	return sprintf(page, "%llu / %llu\n", resync, max_sectors);
5096  }
5097  
5098  static struct md_sysfs_entry md_sync_completed =
5099  	__ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL);
5100  
5101  static ssize_t
5102  min_sync_show(struct mddev *mddev, char *page)
5103  {
5104  	return sprintf(page, "%llu\n",
5105  		       (unsigned long long)mddev->resync_min);
5106  }
5107  static ssize_t
5108  min_sync_store(struct mddev *mddev, const char *buf, size_t len)
5109  {
5110  	unsigned long long min;
5111  	int err;
5112  
5113  	if (kstrtoull(buf, 10, &min))
5114  		return -EINVAL;
5115  
5116  	spin_lock(&mddev->lock);
5117  	err = -EINVAL;
5118  	if (min > mddev->resync_max)
5119  		goto out_unlock;
5120  
5121  	err = -EBUSY;
5122  	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5123  		goto out_unlock;
5124  
5125  	/* Round down to multiple of 4K for safety */
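	/* (8 sectors * 512 bytes = 4096 bytes) */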
5126  	mddev->resync_min = round_down(min, 8);
5127  	err = 0;
5128  
5129  out_unlock:
5130  	spin_unlock(&mddev->lock);
5131  	return err ?: len;
5132  }
5133  
5134  static struct md_sysfs_entry md_min_sync =
5135  __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
5136  
5137  static ssize_t
5138  max_sync_show(struct mddev *mddev, char *page)
5139  {
5140  	if (mddev->resync_max == MaxSector)
5141  		return sprintf(page, "max\n");
5142  	else
5143  		return sprintf(page, "%llu\n",
5144  			       (unsigned long long)mddev->resync_max);
5145  }
5146  static ssize_t
5147  max_sync_store(struct mddev *mddev, const char *buf, size_t len)
5148  {
5149  	int err;
5150  	spin_lock(&mddev->lock);
5151  	if (strncmp(buf, "max", 3) == 0)
5152  		mddev->resync_max = MaxSector;
5153  	else {
5154  		unsigned long long max;
5155  		int chunk;
5156  
5157  		err = -EINVAL;
5158  		if (kstrtoull(buf, 10, &max))
5159  			goto out_unlock;
5160  		if (max < mddev->resync_min)
5161  			goto out_unlock;
5162  
5163  		err = -EBUSY;
5164  		if (max < mddev->resync_max && md_is_rdwr(mddev) &&
5165  		    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5166  			goto out_unlock;
5167  
5168  		/* Must be a multiple of chunk_size */
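		/* e.g. chunk_sectors == 128 (64 KiB chunks) requires a multiple of 128. */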
5169  		chunk = mddev->chunk_sectors;
5170  		if (chunk) {
5171  			sector_t temp = max;
5172  
5173  			err = -EINVAL;
5174  			if (sector_div(temp, chunk))
5175  				goto out_unlock;
5176  		}
5177  		mddev->resync_max = max;
5178  	}
5179  	wake_up(&mddev->recovery_wait);
5180  	err = 0;
5181  out_unlock:
5182  	spin_unlock(&mddev->lock);
5183  	return err ?: len;
5184  }
5185  
5186  static struct md_sysfs_entry md_max_sync =
5187  __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
5188  
5189  static ssize_t
5190  suspend_lo_show(struct mddev *mddev, char *page)
5191  {
5192  	return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
5193  }
5194  
5195  static ssize_t
5196  suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
5197  {
5198  	unsigned long long new;
5199  	int err;
5200  
5201  	err = kstrtoull(buf, 10, &new);
5202  	if (err < 0)
5203  		return err;
5204  	if (new != (sector_t)new)
5205  		return -EINVAL;
5206  
5207  	err = mddev_lock(mddev);
5208  	if (err)
5209  		return err;
5210  	err = -EINVAL;
5211  	if (mddev->pers == NULL ||
5212  	    mddev->pers->quiesce == NULL)
5213  		goto unlock;
5214  	mddev_suspend(mddev);
5215  	mddev->suspend_lo = new;
5216  	mddev_resume(mddev);
5217  
5218  	err = 0;
5219  unlock:
5220  	mddev_unlock(mddev);
5221  	return err ?: len;
5222  }
5223  static struct md_sysfs_entry md_suspend_lo =
5224  __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
5225  
5226  static ssize_t
5227  suspend_hi_show(struct mddev *mddev, char *page)
5228  {
5229  	return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
5230  }
5231  
5232  static ssize_t
5233  suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
5234  {
5235  	unsigned long long new;
5236  	int err;
5237  
5238  	err = kstrtoull(buf, 10, &new);
5239  	if (err < 0)
5240  		return err;
5241  	if (new != (sector_t)new)
5242  		return -EINVAL;
5243  
5244  	err = mddev_lock(mddev);
5245  	if (err)
5246  		return err;
5247  	err = -EINVAL;
5248  	if (mddev->pers == NULL)
5249  		goto unlock;
5250  
5251  	mddev_suspend(mddev);
5252  	mddev->suspend_hi = new;
5253  	mddev_resume(mddev);
5254  
5255  	err = 0;
5256  unlock:
5257  	mddev_unlock(mddev);
5258  	return err ?: len;
5259  }
5260  static struct md_sysfs_entry md_suspend_hi =
5261  __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
5262  
5263  static ssize_t
5264  reshape_position_show(struct mddev *mddev, char *page)
5265  {
5266  	if (mddev->reshape_position != MaxSector)
5267  		return sprintf(page, "%llu\n",
5268  			       (unsigned long long)mddev->reshape_position);
5269  	strcpy(page, "none\n");
5270  	return 5;
5271  }
5272  
5273  static ssize_t
5274  reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
5275  {
5276  	struct md_rdev *rdev;
5277  	unsigned long long new;
5278  	int err;
5279  
5280  	err = kstrtoull(buf, 10, &new);
5281  	if (err < 0)
5282  		return err;
5283  	if (new != (sector_t)new)
5284  		return -EINVAL;
5285  	err = mddev_lock(mddev);
5286  	if (err)
5287  		return err;
5288  	err = -EBUSY;
5289  	if (mddev->pers)
5290  		goto unlock;
5291  	mddev->reshape_position = new;
5292  	mddev->delta_disks = 0;
5293  	mddev->reshape_backwards = 0;
5294  	mddev->new_level = mddev->level;
5295  	mddev->new_layout = mddev->layout;
5296  	mddev->new_chunk_sectors = mddev->chunk_sectors;
5297  	rdev_for_each(rdev, mddev)
5298  		rdev->new_data_offset = rdev->data_offset;
5299  	err = 0;
5300  unlock:
5301  	mddev_unlock(mddev);
5302  	return err ?: len;
5303  }
5304  
5305  static struct md_sysfs_entry md_reshape_position =
5306  __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
5307         reshape_position_store);
5308  
5309  static ssize_t
5310  reshape_direction_show(struct mddev *mddev, char *page)
5311  {
5312  	return sprintf(page, "%s\n",
5313  		       mddev->reshape_backwards ? "backwards" : "forwards");
5314  }
5315  
5316  static ssize_t
5317  reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
5318  {
5319  	int backwards = 0;
5320  	int err;
5321  
5322  	if (cmd_match(buf, "forwards"))
5323  		backwards = 0;
5324  	else if (cmd_match(buf, "backwards"))
5325  		backwards = 1;
5326  	else
5327  		return -EINVAL;
5328  	if (mddev->reshape_backwards == backwards)
5329  		return len;
5330  
5331  	err = mddev_lock(mddev);
5332  	if (err)
5333  		return err;
5334  	/* check if we are allowed to change */
5335  	if (mddev->delta_disks)
5336  		err = -EBUSY;
5337  	else if (mddev->persistent &&
5338  	    mddev->major_version == 0)
5339  		err =  -EINVAL;
5340  	else
5341  		mddev->reshape_backwards = backwards;
5342  	mddev_unlock(mddev);
5343  	return err ?: len;
5344  }
5345  
5346  static struct md_sysfs_entry md_reshape_direction =
5347  __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show,
5348         reshape_direction_store);
5349  
5350  static ssize_t
5351  array_size_show(struct mddev *mddev, char *page)
5352  {
5353  	if (mddev->external_size)
5354  		return sprintf(page, "%llu\n",
5355  			       (unsigned long long)mddev->array_sectors/2);
5356  	else
5357  		return sprintf(page, "default\n");
5358  }
5359  
5360  static ssize_t
5361  array_size_store(struct mddev *mddev, const char *buf, size_t len)
5362  {
5363  	sector_t sectors;
5364  	int err;
5365  
5366  	err = mddev_lock(mddev);
5367  	if (err)
5368  		return err;
5369  
5370  	/* cluster raid doesn't support changing array_sectors */
5371  	if (mddev_is_clustered(mddev)) {
5372  		mddev_unlock(mddev);
5373  		return -EINVAL;
5374  	}
5375  
5376  	if (strncmp(buf, "default", 7) == 0) {
5377  		if (mddev->pers)
5378  			sectors = mddev->pers->size(mddev, 0, 0);
5379  		else
5380  			sectors = mddev->array_sectors;
5381  
5382  		mddev->external_size = 0;
5383  	} else {
5384  		if (strict_blocks_to_sectors(buf, &sectors) < 0)
5385  			err = -EINVAL;
5386  		else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
5387  			err = -E2BIG;
5388  		else
5389  			mddev->external_size = 1;
5390  	}
5391  
5392  	if (!err) {
5393  		mddev->array_sectors = sectors;
5394  		if (mddev->pers)
5395  			set_capacity_and_notify(mddev->gendisk,
5396  						mddev->array_sectors);
5397  	}
5398  	mddev_unlock(mddev);
5399  	return err ?: len;
5400  }
5401  
5402  static struct md_sysfs_entry md_array_size =
5403  __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
5404         array_size_store);
5405  
5406  static ssize_t
5407  consistency_policy_show(struct mddev *mddev, char *page)
5408  {
5409  	int ret;
5410  
5411  	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
5412  		ret = sprintf(page, "journal\n");
5413  	} else if (test_bit(MD_HAS_PPL, &mddev->flags)) {
5414  		ret = sprintf(page, "ppl\n");
5415  	} else if (mddev->bitmap) {
5416  		ret = sprintf(page, "bitmap\n");
5417  	} else if (mddev->pers) {
5418  		if (mddev->pers->sync_request)
5419  			ret = sprintf(page, "resync\n");
5420  		else
5421  			ret = sprintf(page, "none\n");
5422  	} else {
5423  		ret = sprintf(page, "unknown\n");
5424  	}
5425  
5426  	return ret;
5427  }
5428  
5429  static ssize_t
5430  consistency_policy_store(struct mddev *mddev, const char *buf, size_t len)
5431  {
5432  	int err = 0;
5433  
5434  	if (mddev->pers) {
5435  		if (mddev->pers->change_consistency_policy)
5436  			err = mddev->pers->change_consistency_policy(mddev, buf);
5437  		else
5438  			err = -EBUSY;
5439  	} else if (mddev->external && strncmp(buf, "ppl", 3) == 0) {
5440  		set_bit(MD_HAS_PPL, &mddev->flags);
5441  	} else {
5442  		err = -EINVAL;
5443  	}
5444  
5445  	return err ? err : len;
5446  }
5447  
5448  static struct md_sysfs_entry md_consistency_policy =
5449  __ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show,
5450         consistency_policy_store);
5451  
5452  static ssize_t fail_last_dev_show(struct mddev *mddev, char *page)
5453  {
5454  	return sprintf(page, "%d\n", mddev->fail_last_dev);
5455  }
5456  
5457  /*
5458   * Setting fail_last_dev to true allows the last device to be forcibly
5459   * removed from RAID1/RAID10.
5460   */
5461  static ssize_t
5462  fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len)
5463  {
5464  	int ret;
5465  	bool value;
5466  
5467  	ret = kstrtobool(buf, &value);
5468  	if (ret)
5469  		return ret;
5470  
5471  	if (value != mddev->fail_last_dev)
5472  		mddev->fail_last_dev = value;
5473  
5474  	return len;
5475  }
5476  static struct md_sysfs_entry md_fail_last_dev =
5477  __ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show,
5478         fail_last_dev_store);
5479  
5480  static ssize_t serialize_policy_show(struct mddev *mddev, char *page)
5481  {
5482  	if (mddev->pers == NULL || (mddev->pers->level != 1))
5483  		return sprintf(page, "n/a\n");
5484  	else
5485  		return sprintf(page, "%d\n", mddev->serialize_policy);
5486  }
5487  
5488  /*
5489   * Setting serialize_policy to true enforces that write IO is not reordered
5490   * for raid1.
5491   */
5492  static ssize_t
5493  serialize_policy_store(struct mddev *mddev, const char *buf, size_t len)
5494  {
5495  	int err;
5496  	bool value;
5497  
5498  	err = kstrtobool(buf, &value);
5499  	if (err)
5500  		return err;
5501  
5502  	if (value == mddev->serialize_policy)
5503  		return len;
5504  
5505  	err = mddev_lock(mddev);
5506  	if (err)
5507  		return err;
5508  	if (mddev->pers == NULL || (mddev->pers->level != 1)) {
5509  		pr_err("md: serialize_policy is only effective for raid1\n");
5510  		err = -EINVAL;
5511  		goto unlock;
5512  	}
5513  
5514  	mddev_suspend(mddev);
5515  	if (value)
5516  		mddev_create_serial_pool(mddev, NULL, true);
5517  	else
5518  		mddev_destroy_serial_pool(mddev, NULL, true);
5519  	mddev->serialize_policy = value;
5520  	mddev_resume(mddev);
5521  unlock:
5522  	mddev_unlock(mddev);
5523  	return err ?: len;
5524  }
5525  
5526  static struct md_sysfs_entry md_serialize_policy =
5527  __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show,
5528         serialize_policy_store);
5529  
5530  
5531  static struct attribute *md_default_attrs[] = {
5532  	&md_level.attr,
5533  	&md_layout.attr,
5534  	&md_raid_disks.attr,
5535  	&md_uuid.attr,
5536  	&md_chunk_size.attr,
5537  	&md_size.attr,
5538  	&md_resync_start.attr,
5539  	&md_metadata.attr,
5540  	&md_new_device.attr,
5541  	&md_safe_delay.attr,
5542  	&md_array_state.attr,
5543  	&md_reshape_position.attr,
5544  	&md_reshape_direction.attr,
5545  	&md_array_size.attr,
5546  	&max_corr_read_errors.attr,
5547  	&md_consistency_policy.attr,
5548  	&md_fail_last_dev.attr,
5549  	&md_serialize_policy.attr,
5550  	NULL,
5551  };
5552  
5553  static const struct attribute_group md_default_group = {
5554  	.attrs = md_default_attrs,
5555  };
5556  
5557  static struct attribute *md_redundancy_attrs[] = {
5558  	&md_scan_mode.attr,
5559  	&md_last_scan_mode.attr,
5560  	&md_mismatches.attr,
5561  	&md_sync_min.attr,
5562  	&md_sync_max.attr,
5563  	&md_sync_speed.attr,
5564  	&md_sync_force_parallel.attr,
5565  	&md_sync_completed.attr,
5566  	&md_min_sync.attr,
5567  	&md_max_sync.attr,
5568  	&md_suspend_lo.attr,
5569  	&md_suspend_hi.attr,
5570  	&md_bitmap.attr,
5571  	&md_degraded.attr,
5572  	NULL,
5573  };
5574  static const struct attribute_group md_redundancy_group = {
5575  	.name = NULL,
5576  	.attrs = md_redundancy_attrs,
5577  };
5578  
5579  static const struct attribute_group *md_attr_groups[] = {
5580  	&md_default_group,
5581  	&md_bitmap_group,
5582  	NULL,
5583  };
5584  
5585  static ssize_t
5586  md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
5587  {
5588  	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
5589  	struct mddev *mddev = container_of(kobj, struct mddev, kobj);
5590  	ssize_t rv;
5591  
5592  	if (!entry->show)
5593  		return -EIO;
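	/*
	 * Take a reference under all_mddevs_lock so the mddev cannot go away
	 * while ->show() runs; an array that is being deleted reports -EBUSY.
	 */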
5594  	spin_lock(&all_mddevs_lock);
5595  	if (!mddev_get(mddev)) {
5596  		spin_unlock(&all_mddevs_lock);
5597  		return -EBUSY;
5598  	}
5599  	spin_unlock(&all_mddevs_lock);
5600  
5601  	rv = entry->show(mddev, page);
5602  	mddev_put(mddev);
5603  	return rv;
5604  }
5605  
5606  static ssize_t
5607  md_attr_store(struct kobject *kobj, struct attribute *attr,
5608  	      const char *page, size_t length)
5609  {
5610  	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
5611  	struct mddev *mddev = container_of(kobj, struct mddev, kobj);
5612  	ssize_t rv;
5613  
5614  	if (!entry->store)
5615  		return -EIO;
5616  	if (!capable(CAP_SYS_ADMIN))
5617  		return -EACCES;
5618  	spin_lock(&all_mddevs_lock);
5619  	if (!mddev_get(mddev)) {
5620  		spin_unlock(&all_mddevs_lock);
5621  		return -EBUSY;
5622  	}
5623  	spin_unlock(&all_mddevs_lock);
5624  	rv = entry->store(mddev, page, length);
5625  	mddev_put(mddev);
5626  	return rv;
5627  }
5628  
5629  static void md_kobj_release(struct kobject *ko)
5630  {
5631  	struct mddev *mddev = container_of(ko, struct mddev, kobj);
5632  
5633  	if (mddev->sysfs_state)
5634  		sysfs_put(mddev->sysfs_state);
5635  	if (mddev->sysfs_level)
5636  		sysfs_put(mddev->sysfs_level);
5637  
5638  	del_gendisk(mddev->gendisk);
5639  	put_disk(mddev->gendisk);
5640  }
5641  
5642  static const struct sysfs_ops md_sysfs_ops = {
5643  	.show	= md_attr_show,
5644  	.store	= md_attr_store,
5645  };
5646  static const struct kobj_type md_ktype = {
5647  	.release	= md_kobj_release,
5648  	.sysfs_ops	= &md_sysfs_ops,
5649  	.default_groups	= md_attr_groups,
5650  };
5651  
5652  int mdp_major = 0;
5653  
5654  static void mddev_delayed_delete(struct work_struct *ws)
5655  {
5656  	struct mddev *mddev = container_of(ws, struct mddev, del_work);
5657  
5658  	kobject_put(&mddev->kobj);
5659  }
5660  
5661  static void no_op(struct percpu_ref *r) {}
5662  
5663  int mddev_init_writes_pending(struct mddev *mddev)
5664  {
5665  	if (mddev->writes_pending.percpu_count_ptr)
5666  		return 0;
5667  	if (percpu_ref_init(&mddev->writes_pending, no_op,
5668  			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL) < 0)
5669  		return -ENOMEM;
5670  	/* We want to start with the refcount at zero */
5671  	percpu_ref_put(&mddev->writes_pending);
5672  	return 0;
5673  }
5674  EXPORT_SYMBOL_GPL(mddev_init_writes_pending);
5675  
5676  struct mddev *md_alloc(dev_t dev, char *name)
5677  {
5678  	/*
5679  	 * If dev is zero, name is the name of a device to allocate with
5680  	 * an arbitrary minor number.  It will be "md_???"
5681  	 * If dev is non-zero it must be a device number with a MAJOR of
5682  	 * MD_MAJOR or mdp_major.  In this case, if "name" is NULL, then
5683  	 * the device is being created by opening a node in /dev.
5684  	 * If "name" is not NULL, the device is being created by
5685  	 * writing to /sys/module/md_mod/parameters/new_array.
5686  	 */
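	/*
	 * Callers visible in this file (illustrative): md_probe() passes
	 * (dev, NULL) when a /dev/mdX node is opened with create_on_open set,
	 * and add_named_array() passes (0, "md_<name>") when an "md_*" name is
	 * written to /sys/module/md_mod/parameters/new_array.
	 */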
5687  	static DEFINE_MUTEX(disks_mutex);
5688  	struct mddev *mddev;
5689  	struct gendisk *disk;
5690  	int partitioned;
5691  	int shift;
5692  	int unit;
5693  	int error;
5694  
5695  	/*
5696  	 * Wait for any previous instance of this device to be completely
5697  	 * removed (mddev_delayed_delete).
5698  	 */
5699  	flush_workqueue(md_misc_wq);
5700  
5701  	mutex_lock(&disks_mutex);
5702  	mddev = mddev_alloc(dev);
5703  	if (IS_ERR(mddev)) {
5704  		error = PTR_ERR(mddev);
5705  		goto out_unlock;
5706  	}
5707  
5708  	partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
5709  	shift = partitioned ? MdpMinorShift : 0;
5710  	unit = MINOR(mddev->unit) >> shift;
5711  
5712  	if (name && !dev) {
5713  		/* Need to ensure that 'name' is not a duplicate.
5714  		 */
5715  		struct mddev *mddev2;
5716  		spin_lock(&all_mddevs_lock);
5717  
5718  		list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
5719  			if (mddev2->gendisk &&
5720  			    strcmp(mddev2->gendisk->disk_name, name) == 0) {
5721  				spin_unlock(&all_mddevs_lock);
5722  				error = -EEXIST;
5723  				goto out_free_mddev;
5724  			}
5725  		spin_unlock(&all_mddevs_lock);
5726  	}
5727  	if (name && dev)
5728  		/*
5729  		 * Creating /dev/mdNNN via "newarray", so adjust hold_active.
5730  		 */
5731  		mddev->hold_active = UNTIL_STOP;
5732  
5733  	error = -ENOMEM;
5734  	disk = blk_alloc_disk(NUMA_NO_NODE);
5735  	if (!disk)
5736  		goto out_free_mddev;
5737  
5738  	disk->major = MAJOR(mddev->unit);
5739  	disk->first_minor = unit << shift;
5740  	disk->minors = 1 << shift;
5741  	if (name)
5742  		strcpy(disk->disk_name, name);
5743  	else if (partitioned)
5744  		sprintf(disk->disk_name, "md_d%d", unit);
5745  	else
5746  		sprintf(disk->disk_name, "md%d", unit);
5747  	disk->fops = &md_fops;
5748  	disk->private_data = mddev;
5749  
5750  	mddev->queue = disk->queue;
5751  	blk_set_stacking_limits(&mddev->queue->limits);
5752  	blk_queue_write_cache(mddev->queue, true, true);
5753  	disk->events |= DISK_EVENT_MEDIA_CHANGE;
5754  	mddev->gendisk = disk;
5755  	error = add_disk(disk);
5756  	if (error)
5757  		goto out_put_disk;
5758  
5759  	kobject_init(&mddev->kobj, &md_ktype);
5760  	error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md");
5761  	if (error) {
5762  		/*
5763  		 * The disk is already live at this point.  Clear the hold flag
5764  		 * and let mddev_put take care of the deletion, as it isn't any
5765  		 * different from a normal close on last release now.
5766  		 */
5767  		mddev->hold_active = 0;
5768  		mutex_unlock(&disks_mutex);
5769  		mddev_put(mddev);
5770  		return ERR_PTR(error);
5771  	}
5772  
5773  	kobject_uevent(&mddev->kobj, KOBJ_ADD);
5774  	mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
5775  	mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level");
5776  	mutex_unlock(&disks_mutex);
5777  	return mddev;
5778  
5779  out_put_disk:
5780  	put_disk(disk);
5781  out_free_mddev:
5782  	mddev_free(mddev);
5783  out_unlock:
5784  	mutex_unlock(&disks_mutex);
5785  	return ERR_PTR(error);
5786  }
5787  
5788  static int md_alloc_and_put(dev_t dev, char *name)
5789  {
5790  	struct mddev *mddev = md_alloc(dev, name);
5791  
5792  	if (IS_ERR(mddev))
5793  		return PTR_ERR(mddev);
5794  	mddev_put(mddev);
5795  	return 0;
5796  }
5797  
5798  static void md_probe(dev_t dev)
5799  {
5800  	if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512)
5801  		return;
5802  	if (create_on_open)
5803  		md_alloc_and_put(dev, NULL);
5804  }
5805  
5806  static int add_named_array(const char *val, const struct kernel_param *kp)
5807  {
5808  	/*
5809  	 * val must be "md_*" or "mdNNN".
5810  	 * For "md_*" we allocate an array with a large free minor number, and
5811  	 * set the name to val.  val must not already be an active name.
5812  	 * For "mdNNN" we allocate an array with the minor number NNN
5813  	 * which must not already be in use.
5814  	 */
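	/*
	 * For example (illustrative names): writing "md_home" to
	 * /sys/module/md_mod/parameters/new_array allocates a named array,
	 * while writing "md127" allocates minor 127.
	 */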
5815  	int len = strlen(val);
5816  	char buf[DISK_NAME_LEN];
5817  	unsigned long devnum;
5818  
5819  	while (len && val[len-1] == '\n')
5820  		len--;
5821  	if (len >= DISK_NAME_LEN)
5822  		return -E2BIG;
5823  	strscpy(buf, val, len+1);
5824  	if (strncmp(buf, "md_", 3) == 0)
5825  		return md_alloc_and_put(0, buf);
5826  	if (strncmp(buf, "md", 2) == 0 &&
5827  	    isdigit(buf[2]) &&
5828  	    kstrtoul(buf+2, 10, &devnum) == 0 &&
5829  	    devnum <= MINORMASK)
5830  		return md_alloc_and_put(MKDEV(MD_MAJOR, devnum), NULL);
5831  
5832  	return -EINVAL;
5833  }
5834  
5835  static void md_safemode_timeout(struct timer_list *t)
5836  {
5837  	struct mddev *mddev = from_timer(mddev, t, safemode_timer);
5838  
5839  	mddev->safemode = 1;
5840  	if (mddev->external)
5841  		sysfs_notify_dirent_safe(mddev->sysfs_state);
5842  
5843  	md_wakeup_thread(mddev->thread);
5844  }
5845  
5846  static int start_dirty_degraded;
5847  static void active_io_release(struct percpu_ref *ref)
5848  {
5849  	struct mddev *mddev = container_of(ref, struct mddev, active_io);
5850  
5851  	wake_up(&mddev->sb_wait);
5852  }
5853  
5854  int md_run(struct mddev *mddev)
5855  {
5856  	int err;
5857  	struct md_rdev *rdev;
5858  	struct md_personality *pers;
5859  	bool nowait = true;
5860  
5861  	if (list_empty(&mddev->disks))
5862  		/* cannot run an array with no devices.. */
5863  		return -EINVAL;
5864  
5865  	if (mddev->pers)
5866  		return -EBUSY;
5867  	/* Cannot run until previous stop completes properly */
5868  	if (mddev->sysfs_active)
5869  		return -EBUSY;
5870  
5871  	/*
5872  	 * Analyze all RAID superblock(s)
5873  	 */
5874  	if (!mddev->raid_disks) {
5875  		if (!mddev->persistent)
5876  			return -EINVAL;
5877  		err = analyze_sbs(mddev);
5878  		if (err)
5879  			return -EINVAL;
5880  	}
5881  
5882  	if (mddev->level != LEVEL_NONE)
5883  		request_module("md-level-%d", mddev->level);
5884  	else if (mddev->clevel[0])
5885  		request_module("md-%s", mddev->clevel);
5886  
5887  	/*
5888  	 * Drop all container device buffers, from now on
5889  	 * the only valid external interface is through the md
5890  	 * device.
5891  	 */
5892  	mddev->has_superblocks = false;
5893  	rdev_for_each(rdev, mddev) {
5894  		if (test_bit(Faulty, &rdev->flags))
5895  			continue;
5896  		sync_blockdev(rdev->bdev);
5897  		invalidate_bdev(rdev->bdev);
5898  		if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) {
5899  			mddev->ro = MD_RDONLY;
5900  			if (mddev->gendisk)
5901  				set_disk_ro(mddev->gendisk, 1);
5902  		}
5903  
5904  		if (rdev->sb_page)
5905  			mddev->has_superblocks = true;
5906  
5907  		/* Perform some consistency tests on the device.
5908  		 * We don't want the data to overlap the metadata;
5909  		 * internal bitmap issues have been handled elsewhere.
5910  		 */
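		/*
		 * Two layouts are handled below: metadata placed after the data
		 * (rdev->data_offset < rdev->sb_start) and metadata placed at the
		 * start of the device, before the data.
		 */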
5911  		if (rdev->meta_bdev) {
5912  			/* Nothing to check */;
5913  		} else if (rdev->data_offset < rdev->sb_start) {
5914  			if (mddev->dev_sectors &&
5915  			    rdev->data_offset + mddev->dev_sectors
5916  			    > rdev->sb_start) {
5917  				pr_warn("md: %s: data overlaps metadata\n",
5918  					mdname(mddev));
5919  				return -EINVAL;
5920  			}
5921  		} else {
5922  			if (rdev->sb_start + rdev->sb_size/512
5923  			    > rdev->data_offset) {
5924  				pr_warn("md: %s: metadata overlaps data\n",
5925  					mdname(mddev));
5926  				return -EINVAL;
5927  			}
5928  		}
5929  		sysfs_notify_dirent_safe(rdev->sysfs_state);
5930  		nowait = nowait && bdev_nowait(rdev->bdev);
5931  	}
5932  
5933  	err = percpu_ref_init(&mddev->active_io, active_io_release,
5934  				PERCPU_REF_ALLOW_REINIT, GFP_KERNEL);
5935  	if (err)
5936  		return err;
5937  
5938  	if (!bioset_initialized(&mddev->bio_set)) {
5939  		err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
5940  		if (err)
5941  			goto exit_active_io;
5942  	}
5943  	if (!bioset_initialized(&mddev->sync_set)) {
5944  		err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
5945  		if (err)
5946  			goto exit_bio_set;
5947  	}
5948  
5949  	if (!bioset_initialized(&mddev->io_clone_set)) {
5950  		err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE,
5951  				  offsetof(struct md_io_clone, bio_clone), 0);
5952  		if (err)
5953  			goto exit_sync_set;
5954  	}
5955  
5956  	spin_lock(&pers_lock);
5957  	pers = find_pers(mddev->level, mddev->clevel);
5958  	if (!pers || !try_module_get(pers->owner)) {
5959  		spin_unlock(&pers_lock);
5960  		if (mddev->level != LEVEL_NONE)
5961  			pr_warn("md: personality for level %d is not loaded!\n",
5962  				mddev->level);
5963  		else
5964  			pr_warn("md: personality for level %s is not loaded!\n",
5965  				mddev->clevel);
5966  		err = -EINVAL;
5967  		goto abort;
5968  	}
5969  	spin_unlock(&pers_lock);
5970  	if (mddev->level != pers->level) {
5971  		mddev->level = pers->level;
5972  		mddev->new_level = pers->level;
5973  	}
5974  	strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
5975  
5976  	if (mddev->reshape_position != MaxSector &&
5977  	    pers->start_reshape == NULL) {
5978  		/* This personality cannot handle reshaping... */
5979  		module_put(pers->owner);
5980  		err = -EINVAL;
5981  		goto abort;
5982  	}
5983  
5984  	if (pers->sync_request) {
5985  		/* Warn if this is a potentially silly
5986  		 * configuration.
5987  		 */
5988  		struct md_rdev *rdev2;
5989  		int warned = 0;
5990  
5991  		rdev_for_each(rdev, mddev)
5992  			rdev_for_each(rdev2, mddev) {
5993  				if (rdev < rdev2 &&
5994  				    rdev->bdev->bd_disk ==
5995  				    rdev2->bdev->bd_disk) {
5996  					pr_warn("%s: WARNING: %pg appears to be on the same physical disk as %pg.\n",
5997  						mdname(mddev),
5998  						rdev->bdev,
5999  						rdev2->bdev);
6000  					warned = 1;
6001  				}
6002  			}
6003  
6004  		if (warned)
6005  			pr_warn("True protection against single-disk failure might be compromised.\n");
6006  	}
6007  
6008  	mddev->recovery = 0;
6009  	/* may be overridden by personality */
6010  	mddev->resync_max_sectors = mddev->dev_sectors;
6011  
6012  	mddev->ok_start_degraded = start_dirty_degraded;
6013  
6014  	if (start_readonly && md_is_rdwr(mddev))
6015  		mddev->ro = MD_AUTO_READ; /* read-only, but switch on first write */
6016  
6017  	err = pers->run(mddev);
6018  	if (err)
6019  		pr_warn("md: pers->run() failed ...\n");
6020  	else if (pers->size(mddev, 0, 0) < mddev->array_sectors) {
6021  		WARN_ONCE(!mddev->external_size,
6022  			  "%s: default size too small, but 'external_size' not in effect?\n",
6023  			  __func__);
6024  		pr_warn("md: invalid array_size %llu > default size %llu\n",
6025  			(unsigned long long)mddev->array_sectors / 2,
6026  			(unsigned long long)pers->size(mddev, 0, 0) / 2);
6027  		err = -EINVAL;
6028  	}
6029  	if (err == 0 && pers->sync_request &&
6030  	    (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
6031  		struct bitmap *bitmap;
6032  
6033  		bitmap = md_bitmap_create(mddev, -1);
6034  		if (IS_ERR(bitmap)) {
6035  			err = PTR_ERR(bitmap);
6036  			pr_warn("%s: failed to create bitmap (%d)\n",
6037  				mdname(mddev), err);
6038  		} else
6039  			mddev->bitmap = bitmap;
6040  
6041  	}
6042  	if (err)
6043  		goto bitmap_abort;
6044  
6045  	if (mddev->bitmap_info.max_write_behind > 0) {
6046  		bool create_pool = false;
6047  
6048  		rdev_for_each(rdev, mddev) {
6049  			if (test_bit(WriteMostly, &rdev->flags) &&
6050  			    rdev_init_serial(rdev))
6051  				create_pool = true;
6052  		}
6053  		if (create_pool && mddev->serial_info_pool == NULL) {
6054  			mddev->serial_info_pool =
6055  				mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
6056  						    sizeof(struct serial_info));
6057  			if (!mddev->serial_info_pool) {
6058  				err = -ENOMEM;
6059  				goto bitmap_abort;
6060  			}
6061  		}
6062  	}
6063  
6064  	if (mddev->queue) {
6065  		bool nonrot = true;
6066  
6067  		rdev_for_each(rdev, mddev) {
6068  			if (rdev->raid_disk >= 0 && !bdev_nonrot(rdev->bdev)) {
6069  				nonrot = false;
6070  				break;
6071  			}
6072  		}
6073  		if (mddev->degraded)
6074  			nonrot = false;
6075  		if (nonrot)
6076  			blk_queue_flag_set(QUEUE_FLAG_NONROT, mddev->queue);
6077  		else
6078  			blk_queue_flag_clear(QUEUE_FLAG_NONROT, mddev->queue);
6079  		blk_queue_flag_set(QUEUE_FLAG_IO_STAT, mddev->queue);
6080  
6081  		/* Set the NOWAIT flag if all underlying devices support it */
6082  		if (nowait)
6083  			blk_queue_flag_set(QUEUE_FLAG_NOWAIT, mddev->queue);
6084  	}
6085  	if (pers->sync_request) {
6086  		if (mddev->kobj.sd &&
6087  		    sysfs_create_group(&mddev->kobj, &md_redundancy_group))
6088  			pr_warn("md: cannot register extra attributes for %s\n",
6089  				mdname(mddev));
6090  		mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
6091  		mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed");
6092  		mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded");
6093  	} else if (mddev->ro == MD_AUTO_READ)
6094  		mddev->ro = MD_RDWR;
6095  
6096  	atomic_set(&mddev->max_corr_read_errors,
6097  		   MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
6098  	mddev->safemode = 0;
6099  	if (mddev_is_clustered(mddev))
6100  		mddev->safemode_delay = 0;
6101  	else
6102  		mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
6103  	mddev->in_sync = 1;
6104  	smp_wmb();
6105  	spin_lock(&mddev->lock);
6106  	mddev->pers = pers;
6107  	spin_unlock(&mddev->lock);
6108  	rdev_for_each(rdev, mddev)
6109  		if (rdev->raid_disk >= 0)
6110  			sysfs_link_rdev(mddev, rdev); /* failure here is OK */
6111  
6112  	if (mddev->degraded && md_is_rdwr(mddev))
6113  		/* This ensures that recovering status is reported immediately
6114  		 * via sysfs - until a lack of spares is confirmed.
6115  		 */
6116  		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6117  	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6118  
6119  	if (mddev->sb_flags)
6120  		md_update_sb(mddev, 0);
6121  
6122  	md_new_event();
6123  	return 0;
6124  
6125  bitmap_abort:
6126  	mddev_detach(mddev);
6127  	if (mddev->private)
6128  		pers->free(mddev, mddev->private);
6129  	mddev->private = NULL;
6130  	module_put(pers->owner);
6131  	md_bitmap_destroy(mddev);
6132  abort:
6133  	bioset_exit(&mddev->io_clone_set);
6134  exit_sync_set:
6135  	bioset_exit(&mddev->sync_set);
6136  exit_bio_set:
6137  	bioset_exit(&mddev->bio_set);
6138  exit_active_io:
6139  	percpu_ref_exit(&mddev->active_io);
6140  	return err;
6141  }
6142  EXPORT_SYMBOL_GPL(md_run);
6143  
6144  int do_md_run(struct mddev *mddev)
6145  {
6146  	int err;
6147  
6148  	set_bit(MD_NOT_READY, &mddev->flags);
6149  	err = md_run(mddev);
6150  	if (err)
6151  		goto out;
6152  	err = md_bitmap_load(mddev);
6153  	if (err) {
6154  		md_bitmap_destroy(mddev);
6155  		goto out;
6156  	}
6157  
6158  	if (mddev_is_clustered(mddev))
6159  		md_allow_write(mddev);
6160  
6161  	/* run start up tasks that require md_thread */
6162  	md_start(mddev);
6163  
6164  	md_wakeup_thread(mddev->thread);
6165  	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
6166  
6167  	set_capacity_and_notify(mddev->gendisk, mddev->array_sectors);
6168  	clear_bit(MD_NOT_READY, &mddev->flags);
6169  	mddev->changed = 1;
6170  	kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
6171  	sysfs_notify_dirent_safe(mddev->sysfs_state);
6172  	sysfs_notify_dirent_safe(mddev->sysfs_action);
6173  	sysfs_notify_dirent_safe(mddev->sysfs_degraded);
6174  out:
6175  	clear_bit(MD_NOT_READY, &mddev->flags);
6176  	return err;
6177  }
6178  
6179  int md_start(struct mddev *mddev)
6180  {
6181  	int ret = 0;
6182  
6183  	if (mddev->pers->start) {
6184  		set_bit(MD_RECOVERY_WAIT, &mddev->recovery);
6185  		md_wakeup_thread(mddev->thread);
6186  		ret = mddev->pers->start(mddev);
6187  		clear_bit(MD_RECOVERY_WAIT, &mddev->recovery);
6188  		md_wakeup_thread(mddev->sync_thread);
6189  	}
6190  	return ret;
6191  }
6192  EXPORT_SYMBOL_GPL(md_start);
6193  
6194  static int restart_array(struct mddev *mddev)
6195  {
6196  	struct gendisk *disk = mddev->gendisk;
6197  	struct md_rdev *rdev;
6198  	bool has_journal = false;
6199  	bool has_readonly = false;
6200  
6201  	/* Complain if it has no devices */
6202  	if (list_empty(&mddev->disks))
6203  		return -ENXIO;
6204  	if (!mddev->pers)
6205  		return -EINVAL;
6206  	if (md_is_rdwr(mddev))
6207  		return -EBUSY;
6208  
6209  	rcu_read_lock();
6210  	rdev_for_each_rcu(rdev, mddev) {
6211  		if (test_bit(Journal, &rdev->flags) &&
6212  		    !test_bit(Faulty, &rdev->flags))
6213  			has_journal = true;
6214  		if (rdev_read_only(rdev))
6215  			has_readonly = true;
6216  	}
6217  	rcu_read_unlock();
6218  	if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal)
6219  		/* Don't restart rw with journal missing/faulty */
6220  			return -EINVAL;
6221  	if (has_readonly)
6222  		return -EROFS;
6223  
6224  	mddev->safemode = 0;
6225  	mddev->ro = MD_RDWR;
6226  	set_disk_ro(disk, 0);
6227  	pr_debug("md: %s switched to read-write mode.\n", mdname(mddev));
6228  	/* Kick recovery or resync if necessary */
6229  	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6230  	md_wakeup_thread(mddev->thread);
6231  	md_wakeup_thread(mddev->sync_thread);
6232  	sysfs_notify_dirent_safe(mddev->sysfs_state);
6233  	return 0;
6234  }
6235  
6236  static void md_clean(struct mddev *mddev)
6237  {
6238  	mddev->array_sectors = 0;
6239  	mddev->external_size = 0;
6240  	mddev->dev_sectors = 0;
6241  	mddev->raid_disks = 0;
6242  	mddev->recovery_cp = 0;
6243  	mddev->resync_min = 0;
6244  	mddev->resync_max = MaxSector;
6245  	mddev->reshape_position = MaxSector;
6246  	/* we still need mddev->external in export_rdev, do not clear it yet */
6247  	mddev->persistent = 0;
6248  	mddev->level = LEVEL_NONE;
6249  	mddev->clevel[0] = 0;
6250  	/*
6251  	 * Don't clear MD_CLOSING, or mddev can be opened again.
6252  	 * 'hold_active != 0' means mddev is still in the creation
6253  	 * process and will be used later.
6254  	 */
6255  	if (mddev->hold_active)
6256  		mddev->flags = 0;
6257  	else
6258  		mddev->flags &= BIT_ULL_MASK(MD_CLOSING);
6259  	mddev->sb_flags = 0;
6260  	mddev->ro = MD_RDWR;
6261  	mddev->metadata_type[0] = 0;
6262  	mddev->chunk_sectors = 0;
6263  	mddev->ctime = mddev->utime = 0;
6264  	mddev->layout = 0;
6265  	mddev->max_disks = 0;
6266  	mddev->events = 0;
6267  	mddev->can_decrease_events = 0;
6268  	mddev->delta_disks = 0;
6269  	mddev->reshape_backwards = 0;
6270  	mddev->new_level = LEVEL_NONE;
6271  	mddev->new_layout = 0;
6272  	mddev->new_chunk_sectors = 0;
6273  	mddev->curr_resync = MD_RESYNC_NONE;
6274  	atomic64_set(&mddev->resync_mismatches, 0);
6275  	mddev->suspend_lo = mddev->suspend_hi = 0;
6276  	mddev->sync_speed_min = mddev->sync_speed_max = 0;
6277  	mddev->recovery = 0;
6278  	mddev->in_sync = 0;
6279  	mddev->changed = 0;
6280  	mddev->degraded = 0;
6281  	mddev->safemode = 0;
6282  	mddev->private = NULL;
6283  	mddev->cluster_info = NULL;
6284  	mddev->bitmap_info.offset = 0;
6285  	mddev->bitmap_info.default_offset = 0;
6286  	mddev->bitmap_info.default_space = 0;
6287  	mddev->bitmap_info.chunksize = 0;
6288  	mddev->bitmap_info.daemon_sleep = 0;
6289  	mddev->bitmap_info.max_write_behind = 0;
6290  	mddev->bitmap_info.nodes = 0;
6291  }
6292  
6293  static void __md_stop_writes(struct mddev *mddev)
6294  {
6295  	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6296  	if (work_pending(&mddev->sync_work))
6297  		flush_workqueue(md_misc_wq);
6298  	if (mddev->sync_thread) {
6299  		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6300  		md_reap_sync_thread(mddev);
6301  	}
6302  
6303  	del_timer_sync(&mddev->safemode_timer);
6304  
6305  	if (mddev->pers && mddev->pers->quiesce) {
6306  		mddev->pers->quiesce(mddev, 1);
6307  		mddev->pers->quiesce(mddev, 0);
6308  	}
6309  	md_bitmap_flush(mddev);
6310  
6311  	if (md_is_rdwr(mddev) &&
6312  	    ((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
6313  	     mddev->sb_flags)) {
6314  		/* mark array as shutdown cleanly */
6315  		if (!mddev_is_clustered(mddev))
6316  			mddev->in_sync = 1;
6317  		md_update_sb(mddev, 1);
6318  	}
6319  	/* disable policy to guarantee rdevs free resources for serialization */
6320  	mddev->serialize_policy = 0;
6321  	mddev_destroy_serial_pool(mddev, NULL, true);
6322  }
6323  
6324  void md_stop_writes(struct mddev *mddev)
6325  {
6326  	mddev_lock_nointr(mddev);
6327  	__md_stop_writes(mddev);
6328  	mddev_unlock(mddev);
6329  }
6330  EXPORT_SYMBOL_GPL(md_stop_writes);
6331  
6332  static void mddev_detach(struct mddev *mddev)
6333  {
6334  	md_bitmap_wait_behind_writes(mddev);
6335  	if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) {
6336  		mddev->pers->quiesce(mddev, 1);
6337  		mddev->pers->quiesce(mddev, 0);
6338  	}
6339  	md_unregister_thread(mddev, &mddev->thread);
6340  	if (mddev->queue)
6341  		blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
6342  }
6343  
6344  static void __md_stop(struct mddev *mddev)
6345  {
6346  	struct md_personality *pers = mddev->pers;
6347  	md_bitmap_destroy(mddev);
6348  	mddev_detach(mddev);
6349  	/* Ensure ->event_work is done */
6350  	if (mddev->event_work.func)
6351  		flush_workqueue(md_misc_wq);
6352  	spin_lock(&mddev->lock);
6353  	mddev->pers = NULL;
6354  	spin_unlock(&mddev->lock);
6355  	if (mddev->private)
6356  		pers->free(mddev, mddev->private);
6357  	mddev->private = NULL;
6358  	if (pers->sync_request && mddev->to_remove == NULL)
6359  		mddev->to_remove = &md_redundancy_group;
6360  	module_put(pers->owner);
6361  	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6362  
6363  	percpu_ref_exit(&mddev->active_io);
6364  	bioset_exit(&mddev->bio_set);
6365  	bioset_exit(&mddev->sync_set);
6366  	bioset_exit(&mddev->io_clone_set);
6367  }
6368  
6369  void md_stop(struct mddev *mddev)
6370  {
6371  	lockdep_assert_held(&mddev->reconfig_mutex);
6372  
6373  	/* Stop the array and free any attached data structures.
6374  	 * This is called from dm-raid.
6375  	 */
6376  	__md_stop_writes(mddev);
6377  	__md_stop(mddev);
6378  	percpu_ref_exit(&mddev->writes_pending);
6379  }
6380  
6381  EXPORT_SYMBOL_GPL(md_stop);
6382  
6383  static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
6384  {
6385  	int err = 0;
6386  	int did_freeze = 0;
6387  
6388  	if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
6389  		return -EBUSY;
6390  
6391  	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
6392  		did_freeze = 1;
6393  		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6394  		md_wakeup_thread(mddev->thread);
6395  	}
6396  	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
6397  		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6398  
6399  	/*
6400  	 * Thread might be blocked waiting for metadata update which will now
6401  	 * never happen
6402  	 */
6403  	md_wakeup_thread_directly(mddev->sync_thread);
6404  
6405  	mddev_unlock(mddev);
6406  	wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING,
6407  					  &mddev->recovery));
6408  	wait_event(mddev->sb_wait,
6409  		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
6410  	mddev_lock_nointr(mddev);
6411  
6412  	mutex_lock(&mddev->open_mutex);
6413  	if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
6414  	    mddev->sync_thread ||
6415  	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
6416  		pr_warn("md: %s still in use.\n",mdname(mddev));
6417  		err = -EBUSY;
6418  		goto out;
6419  	}
6420  
6421  	if (mddev->pers) {
6422  		__md_stop_writes(mddev);
6423  
6424  		if (mddev->ro == MD_RDONLY) {
6425  			err  = -ENXIO;
6426  			goto out;
6427  		}
6428  
6429  		mddev->ro = MD_RDONLY;
6430  		set_disk_ro(mddev->gendisk, 1);
6431  	}
6432  
6433  out:
6434  	if ((mddev->pers && !err) || did_freeze) {
6435  		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6436  		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6437  		md_wakeup_thread(mddev->thread);
6438  		sysfs_notify_dirent_safe(mddev->sysfs_state);
6439  	}
6440  
6441  	mutex_unlock(&mddev->open_mutex);
6442  	return err;
6443  }
6444  
6445  /* mode:
6446   *   0 - completely stop and disassemble array
6447   *   2 - stop but do not disassemble array
6448   */
6449  static int do_md_stop(struct mddev *mddev, int mode,
6450  		      struct block_device *bdev)
6451  {
6452  	struct gendisk *disk = mddev->gendisk;
6453  	struct md_rdev *rdev;
6454  	int did_freeze = 0;
6455  
6456  	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
6457  		did_freeze = 1;
6458  		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6459  		md_wakeup_thread(mddev->thread);
6460  	}
6461  	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
6462  		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6463  
6464  	/*
6465  	 * Thread might be blocked waiting for metadata update which will now
6466  	 * never happen
6467  	 */
6468  	md_wakeup_thread_directly(mddev->sync_thread);
6469  
6470  	mddev_unlock(mddev);
6471  	wait_event(resync_wait, (mddev->sync_thread == NULL &&
6472  				 !test_bit(MD_RECOVERY_RUNNING,
6473  					   &mddev->recovery)));
6474  	mddev_lock_nointr(mddev);
6475  
6476  	mutex_lock(&mddev->open_mutex);
6477  	if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
6478  	    mddev->sysfs_active ||
6479  	    mddev->sync_thread ||
6480  	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
6481  		pr_warn("md: %s still in use.\n",mdname(mddev));
6482  		mutex_unlock(&mddev->open_mutex);
6483  		if (did_freeze) {
6484  			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6485  			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6486  			md_wakeup_thread(mddev->thread);
6487  		}
6488  		return -EBUSY;
6489  	}
6490  	if (mddev->pers) {
6491  		if (!md_is_rdwr(mddev))
6492  			set_disk_ro(disk, 0);
6493  
6494  		__md_stop_writes(mddev);
6495  		__md_stop(mddev);
6496  
6497  		/* tell userspace to handle 'inactive' */
6498  		sysfs_notify_dirent_safe(mddev->sysfs_state);
6499  
6500  		rdev_for_each(rdev, mddev)
6501  			if (rdev->raid_disk >= 0)
6502  				sysfs_unlink_rdev(mddev, rdev);
6503  
6504  		set_capacity_and_notify(disk, 0);
6505  		mutex_unlock(&mddev->open_mutex);
6506  		mddev->changed = 1;
6507  
6508  		if (!md_is_rdwr(mddev))
6509  			mddev->ro = MD_RDWR;
6510  	} else
6511  		mutex_unlock(&mddev->open_mutex);
6512  	/*
6513  	 * Free resources if final stop
6514  	 */
6515  	if (mode == 0) {
6516  		pr_info("md: %s stopped.\n", mdname(mddev));
6517  
6518  		if (mddev->bitmap_info.file) {
6519  			struct file *f = mddev->bitmap_info.file;
6520  			spin_lock(&mddev->lock);
6521  			mddev->bitmap_info.file = NULL;
6522  			spin_unlock(&mddev->lock);
6523  			fput(f);
6524  		}
6525  		mddev->bitmap_info.offset = 0;
6526  
6527  		export_array(mddev);
6528  
6529  		md_clean(mddev);
6530  		if (mddev->hold_active == UNTIL_STOP)
6531  			mddev->hold_active = 0;
6532  	}
6533  	md_new_event();
6534  	sysfs_notify_dirent_safe(mddev->sysfs_state);
6535  	return 0;
6536  }
6537  
6538  #ifndef MODULE
6539  static void autorun_array(struct mddev *mddev)
6540  {
6541  	struct md_rdev *rdev;
6542  	int err;
6543  
6544  	if (list_empty(&mddev->disks))
6545  		return;
6546  
6547  	pr_info("md: running: ");
6548  
6549  	rdev_for_each(rdev, mddev) {
6550  		pr_cont("<%pg>", rdev->bdev);
6551  	}
6552  	pr_cont("\n");
6553  
6554  	err = do_md_run(mddev);
6555  	if (err) {
6556  		pr_warn("md: do_md_run() returned %d\n", err);
6557  		do_md_stop(mddev, 0, NULL);
6558  	}
6559  }
6560  
6561  /*
6562   * let's try to run arrays based on all disks that have arrived
6563   * until now. (those are in pending_raid_disks)
6564   *
6565   * the method: pick the first pending disk, collect all disks with
6566   * the same UUID, remove all from the pending list and put them into
6567   * the 'same_array' list. Then order this list based on superblock
6568   * update time (freshest comes first), kick out 'old' disks and
6569   * compare superblocks. If everything's fine then run it.
6570   *
6571   * If "unit" is allocated, then bump its reference count
6572   */
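/*
 * Illustration of the method above (hypothetical devices): if sda1, sdb1 and
 * sdc1 arrive with the same array UUID but sdc1 carries an older superblock,
 * the two fresh members are collected onto one candidate list, the stale one
 * is kicked out, and the remaining pair is bound to a newly allocated mddev
 * and run.
 */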
6573  static void autorun_devices(int part)
6574  {
6575  	struct md_rdev *rdev0, *rdev, *tmp;
6576  	struct mddev *mddev;
6577  
6578  	pr_info("md: autorun ...\n");
6579  	while (!list_empty(&pending_raid_disks)) {
6580  		int unit;
6581  		dev_t dev;
6582  		LIST_HEAD(candidates);
6583  		rdev0 = list_entry(pending_raid_disks.next,
6584  					 struct md_rdev, same_set);
6585  
6586  		pr_debug("md: considering %pg ...\n", rdev0->bdev);
6587  		INIT_LIST_HEAD(&candidates);
6588  		rdev_for_each_list(rdev, tmp, &pending_raid_disks)
6589  			if (super_90_load(rdev, rdev0, 0) >= 0) {
6590  				pr_debug("md:  adding %pg ...\n",
6591  					 rdev->bdev);
6592  				list_move(&rdev->same_set, &candidates);
6593  			}
6594  		/*
6595  		 * now we have a set of devices, with all of them having
6596  		 * mostly sane superblocks. It's time to allocate the
6597  		 * mddev.
6598  		 */
6599  		if (part) {
6600  			dev = MKDEV(mdp_major,
6601  				    rdev0->preferred_minor << MdpMinorShift);
6602  			unit = MINOR(dev) >> MdpMinorShift;
6603  		} else {
6604  			dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
6605  			unit = MINOR(dev);
6606  		}
6607  		if (rdev0->preferred_minor != unit) {
6608  			pr_warn("md: unit number in %pg is bad: %d\n",
6609  				rdev0->bdev, rdev0->preferred_minor);
6610  			break;
6611  		}
6612  
6613  		mddev = md_alloc(dev, NULL);
6614  		if (IS_ERR(mddev))
6615  			break;
6616  
6617  		if (mddev_lock(mddev))
6618  			pr_warn("md: %s locked, cannot run\n", mdname(mddev));
6619  		else if (mddev->raid_disks || mddev->major_version
6620  			 || !list_empty(&mddev->disks)) {
6621  			pr_warn("md: %s already running, cannot run %pg\n",
6622  				mdname(mddev), rdev0->bdev);
6623  			mddev_unlock(mddev);
6624  		} else {
6625  			pr_debug("md: created %s\n", mdname(mddev));
6626  			mddev->persistent = 1;
6627  			rdev_for_each_list(rdev, tmp, &candidates) {
6628  				list_del_init(&rdev->same_set);
6629  				if (bind_rdev_to_array(rdev, mddev))
6630  					export_rdev(rdev, mddev);
6631  			}
6632  			autorun_array(mddev);
6633  			mddev_unlock(mddev);
6634  		}
6635  		/* on success, candidates will be empty; on error
6636  		 * it won't be...
6637  		 */
6638  		rdev_for_each_list(rdev, tmp, &candidates) {
6639  			list_del_init(&rdev->same_set);
6640  			export_rdev(rdev, mddev);
6641  		}
6642  		mddev_put(mddev);
6643  	}
6644  	pr_info("md: ... autorun DONE.\n");
6645  }
6646  #endif /* !MODULE */
6647  
6648  static int get_version(void __user *arg)
6649  {
6650  	mdu_version_t ver;
6651  
6652  	ver.major = MD_MAJOR_VERSION;
6653  	ver.minor = MD_MINOR_VERSION;
6654  	ver.patchlevel = MD_PATCHLEVEL_VERSION;
6655  
6656  	if (copy_to_user(arg, &ver, sizeof(ver)))
6657  		return -EFAULT;
6658  
6659  	return 0;
6660  }
6661  
6662  static int get_array_info(struct mddev *mddev, void __user *arg)
6663  {
6664  	mdu_array_info_t info;
6665  	int nr,working,insync,failed,spare;
6666  	struct md_rdev *rdev;
6667  
6668  	nr = working = insync = failed = spare = 0;
6669  	rcu_read_lock();
6670  	rdev_for_each_rcu(rdev, mddev) {
6671  		nr++;
6672  		if (test_bit(Faulty, &rdev->flags))
6673  			failed++;
6674  		else {
6675  			working++;
6676  			if (test_bit(In_sync, &rdev->flags))
6677  				insync++;
6678  			else if (test_bit(Journal, &rdev->flags))
6679  				/* TODO: add journal count to md_u.h */
6680  				;
6681  			else
6682  				spare++;
6683  		}
6684  	}
6685  	rcu_read_unlock();
6686  
6687  	info.major_version = mddev->major_version;
6688  	info.minor_version = mddev->minor_version;
6689  	info.patch_version = MD_PATCHLEVEL_VERSION;
6690  	info.ctime         = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
6691  	info.level         = mddev->level;
6692  	info.size          = mddev->dev_sectors / 2;
6693  	if (info.size != mddev->dev_sectors / 2) /* overflow */
6694  		info.size = -1;
6695  	info.nr_disks      = nr;
6696  	info.raid_disks    = mddev->raid_disks;
6697  	info.md_minor      = mddev->md_minor;
6698  	info.not_persistent = !mddev->persistent;
6699  
6700  	info.utime         = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
6701  	info.state         = 0;
6702  	if (mddev->in_sync)
6703  		info.state = (1<<MD_SB_CLEAN);
6704  	if (mddev->bitmap && mddev->bitmap_info.offset)
6705  		info.state |= (1<<MD_SB_BITMAP_PRESENT);
6706  	if (mddev_is_clustered(mddev))
6707  		info.state |= (1<<MD_SB_CLUSTERED);
6708  	info.active_disks  = insync;
6709  	info.working_disks = working;
6710  	info.failed_disks  = failed;
6711  	info.spare_disks   = spare;
6712  
6713  	info.layout        = mddev->layout;
6714  	info.chunk_size    = mddev->chunk_sectors << 9;
6715  
6716  	if (copy_to_user(arg, &info, sizeof(info)))
6717  		return -EFAULT;
6718  
6719  	return 0;
6720  }
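/*
 * Illustrative user-space counterpart (a sketch, not copied from mdadm):
 * GET_ARRAY_INFO fills an mdu_array_info_t.  Sizes are reported in KiB
 * (dev_sectors / 2 above), and a size of -1 flags an overflowed >2TiB
 * component.
 *
 *	mdu_array_info_t array;
 *	int fd = open("/dev/md0", O_RDONLY);
 *
 *	if (ioctl(fd, GET_ARRAY_INFO, &array) == 0)
 *		printf("level %d, %d raid disks, %d active\n",
 *		       array.level, array.raid_disks, array.active_disks);
 */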
6721  
6722  static int get_bitmap_file(struct mddev *mddev, void __user * arg)
6723  {
6724  	mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
6725  	char *ptr;
6726  	int err;
6727  
6728  	file = kzalloc(sizeof(*file), GFP_NOIO);
6729  	if (!file)
6730  		return -ENOMEM;
6731  
6732  	err = 0;
6733  	spin_lock(&mddev->lock);
6734  	/* bitmap enabled */
6735  	if (mddev->bitmap_info.file) {
6736  		ptr = file_path(mddev->bitmap_info.file, file->pathname,
6737  				sizeof(file->pathname));
6738  		if (IS_ERR(ptr))
6739  			err = PTR_ERR(ptr);
6740  		else
6741  			memmove(file->pathname, ptr,
6742  				sizeof(file->pathname)-(ptr-file->pathname));
6743  	}
6744  	spin_unlock(&mddev->lock);
6745  
6746  	if (err == 0 &&
6747  	    copy_to_user(arg, file, sizeof(*file)))
6748  		err = -EFAULT;
6749  
6750  	kfree(file);
6751  	return err;
6752  }
6753  
6754  static int get_disk_info(struct mddev *mddev, void __user * arg)
6755  {
6756  	mdu_disk_info_t info;
6757  	struct md_rdev *rdev;
6758  
6759  	if (copy_from_user(&info, arg, sizeof(info)))
6760  		return -EFAULT;
6761  
6762  	rcu_read_lock();
6763  	rdev = md_find_rdev_nr_rcu(mddev, info.number);
6764  	if (rdev) {
6765  		info.major = MAJOR(rdev->bdev->bd_dev);
6766  		info.minor = MINOR(rdev->bdev->bd_dev);
6767  		info.raid_disk = rdev->raid_disk;
6768  		info.state = 0;
6769  		if (test_bit(Faulty, &rdev->flags))
6770  			info.state |= (1<<MD_DISK_FAULTY);
6771  		else if (test_bit(In_sync, &rdev->flags)) {
6772  			info.state |= (1<<MD_DISK_ACTIVE);
6773  			info.state |= (1<<MD_DISK_SYNC);
6774  		}
6775  		if (test_bit(Journal, &rdev->flags))
6776  			info.state |= (1<<MD_DISK_JOURNAL);
6777  		if (test_bit(WriteMostly, &rdev->flags))
6778  			info.state |= (1<<MD_DISK_WRITEMOSTLY);
6779  		if (test_bit(FailFast, &rdev->flags))
6780  			info.state |= (1<<MD_DISK_FAILFAST);
6781  	} else {
6782  		info.major = info.minor = 0;
6783  		info.raid_disk = -1;
6784  		info.state = (1<<MD_DISK_REMOVED);
6785  	}
6786  	rcu_read_unlock();
6787  
6788  	if (copy_to_user(arg, &info, sizeof(info)))
6789  		return -EFAULT;
6790  
6791  	return 0;
6792  }
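/*
 * Illustrative only: user space probes member slots by filling in
 * info.number before the call; a slot that holds no device comes back with
 * MD_DISK_REMOVED set, as coded above.  Sketch (fd is an already-open
 * descriptor for the array device):
 *
 *	mdu_disk_info_t disk = { .number = 0 };
 *
 *	if (ioctl(fd, GET_DISK_INFO, &disk) == 0 &&
 *	    !(disk.state & (1 << MD_DISK_REMOVED)))
 *		printf("slot %d -> %d:%d (raid_disk %d)\n", disk.number,
 *		       disk.major, disk.minor, disk.raid_disk);
 */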
6793  
6794  int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
6795  {
6796  	struct md_rdev *rdev;
6797  	dev_t dev = MKDEV(info->major,info->minor);
6798  
6799  	if (mddev_is_clustered(mddev) &&
6800  		!(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) {
6801  		pr_warn("%s: Cannot add to clustered mddev.\n",
6802  			mdname(mddev));
6803  		return -EINVAL;
6804  	}
6805  
6806  	if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
6807  		return -EOVERFLOW;
6808  
6809  	if (!mddev->raid_disks) {
6810  		int err;
6811  		/* expecting a device which has a superblock */
6812  		rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
6813  		if (IS_ERR(rdev)) {
6814  			pr_warn("md: md_import_device returned %ld\n",
6815  				PTR_ERR(rdev));
6816  			return PTR_ERR(rdev);
6817  		}
6818  		if (!list_empty(&mddev->disks)) {
6819  			struct md_rdev *rdev0
6820  				= list_entry(mddev->disks.next,
6821  					     struct md_rdev, same_set);
6822  			err = super_types[mddev->major_version]
6823  				.load_super(rdev, rdev0, mddev->minor_version);
6824  			if (err < 0) {
6825  				pr_warn("md: %pg has different UUID to %pg\n",
6826  					rdev->bdev,
6827  					rdev0->bdev);
6828  				export_rdev(rdev, mddev);
6829  				return -EINVAL;
6830  			}
6831  		}
6832  		err = bind_rdev_to_array(rdev, mddev);
6833  		if (err)
6834  			export_rdev(rdev, mddev);
6835  		return err;
6836  	}
6837  
6838  	/*
6839  	 * md_add_new_disk can be used once the array is assembled
6840  	 * to add "hot spares".  They must already have a superblock
6841  	 * written
6842  	 */
6843  	if (mddev->pers) {
6844  		int err;
6845  		if (!mddev->pers->hot_add_disk) {
6846  			pr_warn("%s: personality does not support diskops!\n",
6847  				mdname(mddev));
6848  			return -EINVAL;
6849  		}
6850  		if (mddev->persistent)
6851  			rdev = md_import_device(dev, mddev->major_version,
6852  						mddev->minor_version);
6853  		else
6854  			rdev = md_import_device(dev, -1, -1);
6855  		if (IS_ERR(rdev)) {
6856  			pr_warn("md: md_import_device returned %ld\n",
6857  				PTR_ERR(rdev));
6858  			return PTR_ERR(rdev);
6859  		}
6860  		/* set saved_raid_disk if appropriate */
6861  		if (!mddev->persistent) {
6862  			if (info->state & (1<<MD_DISK_SYNC)  &&
6863  			    info->raid_disk < mddev->raid_disks) {
6864  				rdev->raid_disk = info->raid_disk;
6865  				clear_bit(Bitmap_sync, &rdev->flags);
6866  			} else
6867  				rdev->raid_disk = -1;
6868  			rdev->saved_raid_disk = rdev->raid_disk;
6869  		} else
6870  			super_types[mddev->major_version].
6871  				validate_super(mddev, NULL/*freshest*/, rdev);
6872  		if ((info->state & (1<<MD_DISK_SYNC)) &&
6873  		     rdev->raid_disk != info->raid_disk) {
6874  			/* This was a hot-add request, but the event counts
6875  			 * don't match, so reject it.
6876  			 */
6877  			export_rdev(rdev, mddev);
6878  			return -EINVAL;
6879  		}
6880  
6881  		clear_bit(In_sync, &rdev->flags); /* just to be sure */
6882  		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
6883  			set_bit(WriteMostly, &rdev->flags);
6884  		else
6885  			clear_bit(WriteMostly, &rdev->flags);
6886  		if (info->state & (1<<MD_DISK_FAILFAST))
6887  			set_bit(FailFast, &rdev->flags);
6888  		else
6889  			clear_bit(FailFast, &rdev->flags);
6890  
6891  		if (info->state & (1<<MD_DISK_JOURNAL)) {
6892  			struct md_rdev *rdev2;
6893  			bool has_journal = false;
6894  
6895  			/* make sure there is no existing journal disk */
6896  			rdev_for_each(rdev2, mddev) {
6897  				if (test_bit(Journal, &rdev2->flags)) {
6898  					has_journal = true;
6899  					break;
6900  				}
6901  			}
6902  			if (has_journal || mddev->bitmap) {
6903  				export_rdev(rdev, mddev);
6904  				return -EBUSY;
6905  			}
6906  			set_bit(Journal, &rdev->flags);
6907  		}
6908  		/*
6909  		 * check whether the device shows up in other nodes
6910  		 */
6911  		if (mddev_is_clustered(mddev)) {
6912  			if (info->state & (1 << MD_DISK_CANDIDATE))
6913  				set_bit(Candidate, &rdev->flags);
6914  			else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
6915  				/* --add initiated by this node */
6916  				err = md_cluster_ops->add_new_disk(mddev, rdev);
6917  				if (err) {
6918  					export_rdev(rdev, mddev);
6919  					return err;
6920  				}
6921  			}
6922  		}
6923  
6924  		rdev->raid_disk = -1;
6925  		err = bind_rdev_to_array(rdev, mddev);
6926  
6927  		if (err)
6928  			export_rdev(rdev, mddev);
6929  
6930  		if (mddev_is_clustered(mddev)) {
6931  			if (info->state & (1 << MD_DISK_CANDIDATE)) {
6932  				if (!err) {
6933  					err = md_cluster_ops->new_disk_ack(mddev,
6934  						err == 0);
6935  					if (err)
6936  						md_kick_rdev_from_array(rdev);
6937  				}
6938  			} else {
6939  				if (err)
6940  					md_cluster_ops->add_new_disk_cancel(mddev);
6941  				else
6942  					err = add_bound_rdev(rdev);
6943  			}
6944  
6945  		} else if (!err)
6946  			err = add_bound_rdev(rdev);
6947  
6948  		return err;
6949  	}
6950  
6951  	/* otherwise, md_add_new_disk is only allowed
6952  	 * for major_version==0 superblocks
6953  	 */
6954  	if (mddev->major_version != 0) {
6955  		pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev));
6956  		return -EINVAL;
6957  	}
6958  
6959  	if (!(info->state & (1<<MD_DISK_FAULTY))) {
6960  		int err;
6961  		rdev = md_import_device(dev, -1, 0);
6962  		if (IS_ERR(rdev)) {
6963  			pr_warn("md: error, md_import_device() returned %ld\n",
6964  				PTR_ERR(rdev));
6965  			return PTR_ERR(rdev);
6966  		}
6967  		rdev->desc_nr = info->number;
6968  		if (info->raid_disk < mddev->raid_disks)
6969  			rdev->raid_disk = info->raid_disk;
6970  		else
6971  			rdev->raid_disk = -1;
6972  
6973  		if (rdev->raid_disk < mddev->raid_disks)
6974  			if (info->state & (1<<MD_DISK_SYNC))
6975  				set_bit(In_sync, &rdev->flags);
6976  
6977  		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
6978  			set_bit(WriteMostly, &rdev->flags);
6979  		if (info->state & (1<<MD_DISK_FAILFAST))
6980  			set_bit(FailFast, &rdev->flags);
6981  
6982  		if (!mddev->persistent) {
6983  			pr_debug("md: nonpersistent superblock ...\n");
6984  			rdev->sb_start = bdev_nr_sectors(rdev->bdev);
6985  		} else
6986  			rdev->sb_start = calc_dev_sboffset(rdev);
6987  		rdev->sectors = rdev->sb_start;
6988  
6989  		err = bind_rdev_to_array(rdev, mddev);
6990  		if (err) {
6991  			export_rdev(rdev, mddev);
6992  			return err;
6993  		}
6994  	}
6995  
6996  	return 0;
6997  }
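/*
 * Illustrative only: a sketch of the hot-spare case handled above, loosely
 * modelled on what an assembly tool does (not copied from mdadm; mdfd is an
 * already-open descriptor for the array, headers such as <sys/stat.h> and
 * <sys/sysmacros.h> assumed):
 *
 *	struct stat st;
 *	mdu_disk_info_t disk = { 0 };
 *
 *	stat("/dev/sdc1", &st);
 *	disk.major = major(st.st_rdev);
 *	disk.minor = minor(st.st_rdev);
 *	disk.raid_disk = -1;		// let md pick a slot
 *	if (ioctl(mdfd, ADD_NEW_DISK, &disk) < 0)
 *		perror("ADD_NEW_DISK");
 */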
6998  
6999  static int hot_remove_disk(struct mddev *mddev, dev_t dev)
7000  {
7001  	struct md_rdev *rdev;
7002  
7003  	if (!mddev->pers)
7004  		return -ENODEV;
7005  
7006  	rdev = find_rdev(mddev, dev);
7007  	if (!rdev)
7008  		return -ENXIO;
7009  
7010  	if (rdev->raid_disk < 0)
7011  		goto kick_rdev;
7012  
7013  	clear_bit(Blocked, &rdev->flags);
7014  	remove_and_add_spares(mddev, rdev);
7015  
7016  	if (rdev->raid_disk >= 0)
7017  		goto busy;
7018  
7019  kick_rdev:
7020  	if (mddev_is_clustered(mddev)) {
7021  		if (md_cluster_ops->remove_disk(mddev, rdev))
7022  			goto busy;
7023  	}
7024  
7025  	md_kick_rdev_from_array(rdev);
7026  	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
7027  	if (mddev->thread)
7028  		md_wakeup_thread(mddev->thread);
7029  	else
7030  		md_update_sb(mddev, 1);
7031  	md_new_event();
7032  
7033  	return 0;
7034  busy:
7035  	pr_debug("md: cannot remove active disk %pg from %s ...\n",
7036  		 rdev->bdev, mdname(mddev));
7037  	return -EBUSY;
7038  }
7039  
7040  static int hot_add_disk(struct mddev *mddev, dev_t dev)
7041  {
7042  	int err;
7043  	struct md_rdev *rdev;
7044  
7045  	if (!mddev->pers)
7046  		return -ENODEV;
7047  
7048  	if (mddev->major_version != 0) {
7049  		pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n",
7050  			mdname(mddev));
7051  		return -EINVAL;
7052  	}
7053  	if (!mddev->pers->hot_add_disk) {
7054  		pr_warn("%s: personality does not support diskops!\n",
7055  			mdname(mddev));
7056  		return -EINVAL;
7057  	}
7058  
7059  	rdev = md_import_device(dev, -1, 0);
7060  	if (IS_ERR(rdev)) {
7061  		pr_warn("md: error, md_import_device() returned %ld\n",
7062  			PTR_ERR(rdev));
7063  		return -EINVAL;
7064  	}
7065  
7066  	if (mddev->persistent)
7067  		rdev->sb_start = calc_dev_sboffset(rdev);
7068  	else
7069  		rdev->sb_start = bdev_nr_sectors(rdev->bdev);
7070  
7071  	rdev->sectors = rdev->sb_start;
7072  
7073  	if (test_bit(Faulty, &rdev->flags)) {
7074  		pr_warn("md: can not hot-add faulty %pg disk to %s!\n",
7075  			rdev->bdev, mdname(mddev));
7076  		err = -EINVAL;
7077  		goto abort_export;
7078  	}
7079  
7080  	clear_bit(In_sync, &rdev->flags);
7081  	rdev->desc_nr = -1;
7082  	rdev->saved_raid_disk = -1;
7083  	err = bind_rdev_to_array(rdev, mddev);
7084  	if (err)
7085  		goto abort_export;
7086  
7087  	/*
7088  	 * The rest had better be atomic, because disk failures can be
7089  	 * noticed in interrupt context ...
7090  	 */
7091  
7092  	rdev->raid_disk = -1;
7093  
7094  	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
7095  	if (!mddev->thread)
7096  		md_update_sb(mddev, 1);
7097  	/*
7098  	 * If the new disk does not support REQ_NOWAIT,
7099  	 * disable on the whole MD.
7100  	 */
7101  	if (!bdev_nowait(rdev->bdev)) {
7102  		pr_info("%s: Disabling nowait because %pg does not support nowait\n",
7103  			mdname(mddev), rdev->bdev);
7104  		blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->queue);
7105  	}
7106  	/*
7107  	 * Kick recovery, maybe this spare has to be added to the
7108  	 * array immediately.
7109  	 */
7110  	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7111  	md_wakeup_thread(mddev->thread);
7112  	md_new_event();
7113  	return 0;
7114  
7115  abort_export:
7116  	export_rdev(rdev, mddev);
7117  	return err;
7118  }
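/*
 * Illustrative only: HOT_ADD_DISK takes the component's device number as
 * the ioctl argument (md_ioctl unpacks it with new_decode_dev()), so a
 * user-space sketch looks roughly like this (mdfd as above):
 *
 *	struct stat st;
 *
 *	stat("/dev/sdc1", &st);
 *	if (ioctl(mdfd, HOT_ADD_DISK, (unsigned long)st.st_rdev) < 0)
 *		perror("HOT_ADD_DISK");
 *
 * Only meaningful for version-0 superblock arrays, as the check above
 * enforces.
 */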
7119  
7120  static int set_bitmap_file(struct mddev *mddev, int fd)
7121  {
7122  	int err = 0;
7123  
7124  	if (mddev->pers) {
7125  		if (!mddev->pers->quiesce || !mddev->thread)
7126  			return -EBUSY;
7127  		if (mddev->recovery || mddev->sync_thread)
7128  			return -EBUSY;
7129  		/* we should be able to change the bitmap.. */
7130  	}
7131  
7132  	if (fd >= 0) {
7133  		struct inode *inode;
7134  		struct file *f;
7135  
7136  		if (mddev->bitmap || mddev->bitmap_info.file)
7137  			return -EEXIST; /* cannot add when bitmap is present */
7138  
7139  		if (!IS_ENABLED(CONFIG_MD_BITMAP_FILE)) {
7140  			pr_warn("%s: bitmap files not supported by this kernel\n",
7141  				mdname(mddev));
7142  			return -EINVAL;
7143  		}
7144  		pr_warn("%s: using deprecated bitmap file support\n",
7145  			mdname(mddev));
7146  
7147  		f = fget(fd);
7148  
7149  		if (f == NULL) {
7150  			pr_warn("%s: error: failed to get bitmap file\n",
7151  				mdname(mddev));
7152  			return -EBADF;
7153  		}
7154  
7155  		inode = f->f_mapping->host;
7156  		if (!S_ISREG(inode->i_mode)) {
7157  			pr_warn("%s: error: bitmap file must be a regular file\n",
7158  				mdname(mddev));
7159  			err = -EBADF;
7160  		} else if (!(f->f_mode & FMODE_WRITE)) {
7161  			pr_warn("%s: error: bitmap file must open for write\n",
7162  				mdname(mddev));
7163  			err = -EBADF;
7164  		} else if (atomic_read(&inode->i_writecount) != 1) {
7165  			pr_warn("%s: error: bitmap file is already in use\n",
7166  				mdname(mddev));
7167  			err = -EBUSY;
7168  		}
7169  		if (err) {
7170  			fput(f);
7171  			return err;
7172  		}
7173  		mddev->bitmap_info.file = f;
7174  		mddev->bitmap_info.offset = 0; /* file overrides offset */
7175  	} else if (mddev->bitmap == NULL)
7176  		return -ENOENT; /* cannot remove what isn't there */
7177  	err = 0;
7178  	if (mddev->pers) {
7179  		if (fd >= 0) {
7180  			struct bitmap *bitmap;
7181  
7182  			bitmap = md_bitmap_create(mddev, -1);
7183  			mddev_suspend(mddev);
7184  			if (!IS_ERR(bitmap)) {
7185  				mddev->bitmap = bitmap;
7186  				err = md_bitmap_load(mddev);
7187  			} else
7188  				err = PTR_ERR(bitmap);
7189  			if (err) {
7190  				md_bitmap_destroy(mddev);
7191  				fd = -1;
7192  			}
7193  			mddev_resume(mddev);
7194  		} else if (fd < 0) {
7195  			mddev_suspend(mddev);
7196  			md_bitmap_destroy(mddev);
7197  			mddev_resume(mddev);
7198  		}
7199  	}
7200  	if (fd < 0) {
7201  		struct file *f = mddev->bitmap_info.file;
7202  		if (f) {
7203  			spin_lock(&mddev->lock);
7204  			mddev->bitmap_info.file = NULL;
7205  			spin_unlock(&mddev->lock);
7206  			fput(f);
7207  		}
7208  	}
7209  
7210  	return err;
7211  }
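/*
 * Illustrative only: the fd convention above means user space attaches a
 * file-backed bitmap by passing an open, writable descriptor and removes
 * it again by passing -1.  Sketch (assumes the file already contains a
 * bitmap prepared by the usual tooling; mdfd as above):
 *
 *	int bmfd = open("/mnt/safe/md0-bitmap", O_RDWR);
 *
 *	ioctl(mdfd, SET_BITMAP_FILE, bmfd);	// attach external bitmap
 *	...
 *	ioctl(mdfd, SET_BITMAP_FILE, -1);	// drop it again
 */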
7212  
7213  /*
7214   * md_set_array_info is used two different ways
7215   * The original usage is when creating a new array.
7216   * In this usage, raid_disks is > 0 and it together with
7217   *  level, size, not_persistent, layout, chunksize determine the
7218   *  shape of the array.
7219   *  This will always create an array with a type-0.90.0 superblock.
7220   * The newer usage is when assembling an array.
7221   *  In this case raid_disks will be 0, and the major_version field is
7222   *  used to determine which style super-blocks are to be found on the devices.
7223   *  The minor and patch _version numbers are also kept in case the
7224   *  super_block handler wishes to interpret them.
7225   */
7226  int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info)
7227  {
7228  	if (info->raid_disks == 0) {
7229  		/* just setting version number for superblock loading */
7230  		if (info->major_version < 0 ||
7231  		    info->major_version >= ARRAY_SIZE(super_types) ||
7232  		    super_types[info->major_version].name == NULL) {
7233  			/* maybe try to auto-load a module? */
7234  			pr_warn("md: superblock version %d not known\n",
7235  				info->major_version);
7236  			return -EINVAL;
7237  		}
7238  		mddev->major_version = info->major_version;
7239  		mddev->minor_version = info->minor_version;
7240  		mddev->patch_version = info->patch_version;
7241  		mddev->persistent = !info->not_persistent;
7242  		/* ensure mddev_put doesn't delete this now that there
7243  		 * is some minimal configuration.
7244  		 */
7245  		mddev->ctime         = ktime_get_real_seconds();
7246  		return 0;
7247  	}
7248  	mddev->major_version = MD_MAJOR_VERSION;
7249  	mddev->minor_version = MD_MINOR_VERSION;
7250  	mddev->patch_version = MD_PATCHLEVEL_VERSION;
7251  	mddev->ctime         = ktime_get_real_seconds();
7252  
7253  	mddev->level         = info->level;
7254  	mddev->clevel[0]     = 0;
7255  	mddev->dev_sectors   = 2 * (sector_t)info->size;
7256  	mddev->raid_disks    = info->raid_disks;
7257  	/* don't set md_minor, it is determined by which /dev/md* was
7258  	 * opened
7259  	 */
7260  	if (info->state & (1<<MD_SB_CLEAN))
7261  		mddev->recovery_cp = MaxSector;
7262  	else
7263  		mddev->recovery_cp = 0;
7264  	mddev->persistent    = !info->not_persistent;
7265  	mddev->external	     = 0;
7266  
7267  	mddev->layout        = info->layout;
7268  	if (mddev->level == 0)
7269  		/* Cannot trust RAID0 layout info here */
7270  		mddev->layout = -1;
7271  	mddev->chunk_sectors = info->chunk_size >> 9;
7272  
7273  	if (mddev->persistent) {
7274  		mddev->max_disks = MD_SB_DISKS;
7275  		mddev->flags = 0;
7276  		mddev->sb_flags = 0;
7277  	}
7278  	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
7279  
7280  	mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
7281  	mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
7282  	mddev->bitmap_info.offset = 0;
7283  
7284  	mddev->reshape_position = MaxSector;
7285  
7286  	/*
7287  	 * Generate a 128 bit UUID
7288  	 */
7289  	get_random_bytes(mddev->uuid, 16);
7290  
7291  	mddev->new_level = mddev->level;
7292  	mddev->new_chunk_sectors = mddev->chunk_sectors;
7293  	mddev->new_layout = mddev->layout;
7294  	mddev->delta_disks = 0;
7295  	mddev->reshape_backwards = 0;
7296  
7297  	return 0;
7298  }
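/*
 * Illustrative only: the two SET_ARRAY_INFO modes described above as seen
 * from user space (a sketch; the field values are examples, not
 * requirements, and mdfd is an already-open descriptor for the array):
 *
 *	mdu_array_info_t info = { 0 };
 *
 *	// assembly: just pick the superblock format, raid_disks == 0
 *	info.major_version = 1;
 *	info.minor_version = 2;
 *	ioctl(mdfd, SET_ARRAY_INFO, &info);
 *
 *	// creation: raid_disks > 0 implies a type-0.90.0 superblock
 *	memset(&info, 0, sizeof(info));
 *	info.level = 1;
 *	info.raid_disks = 2;
 *	ioctl(mdfd, SET_ARRAY_INFO, &info);
 */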
7299  
7300  void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
7301  {
7302  	lockdep_assert_held(&mddev->reconfig_mutex);
7303  
7304  	if (mddev->external_size)
7305  		return;
7306  
7307  	mddev->array_sectors = array_sectors;
7308  }
7309  EXPORT_SYMBOL(md_set_array_sectors);
7310  
7311  static int update_size(struct mddev *mddev, sector_t num_sectors)
7312  {
7313  	struct md_rdev *rdev;
7314  	int rv;
7315  	int fit = (num_sectors == 0);
7316  	sector_t old_dev_sectors = mddev->dev_sectors;
7317  
7318  	if (mddev->pers->resize == NULL)
7319  		return -EINVAL;
7320  	/* The "num_sectors" is the number of sectors of each device that
7321  	 * is used.  This can only make sense for arrays with redundancy.
7322  	 * linear and raid0 always use whatever space is available. We can only
7323  	 * consider changing this number if no resync or reconstruction is
7324  	 * happening, and if the new size is acceptable. It must fit before the
7325  	 * sb_start or, if that is <data_offset, it must fit before the size
7326  	 * of each device.  If num_sectors is zero, we find the largest size
7327  	 * that fits.
7328  	 */
7329  	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
7330  	    mddev->sync_thread)
7331  		return -EBUSY;
7332  	if (!md_is_rdwr(mddev))
7333  		return -EROFS;
7334  
7335  	rdev_for_each(rdev, mddev) {
7336  		sector_t avail = rdev->sectors;
7337  
7338  		if (fit && (num_sectors == 0 || num_sectors > avail))
7339  			num_sectors = avail;
7340  		if (avail < num_sectors)
7341  			return -ENOSPC;
7342  	}
7343  	rv = mddev->pers->resize(mddev, num_sectors);
7344  	if (!rv) {
7345  		if (mddev_is_clustered(mddev))
7346  			md_cluster_ops->update_size(mddev, old_dev_sectors);
7347  		else if (mddev->queue) {
7348  			set_capacity_and_notify(mddev->gendisk,
7349  						mddev->array_sectors);
7350  		}
7351  	}
7352  	return rv;
7353  }
7354  
7355  static int update_raid_disks(struct mddev *mddev, int raid_disks)
7356  {
7357  	int rv;
7358  	struct md_rdev *rdev;
7359  	/* change the number of raid disks */
7360  	if (mddev->pers->check_reshape == NULL)
7361  		return -EINVAL;
7362  	if (!md_is_rdwr(mddev))
7363  		return -EROFS;
7364  	if (raid_disks <= 0 ||
7365  	    (mddev->max_disks && raid_disks >= mddev->max_disks))
7366  		return -EINVAL;
7367  	if (mddev->sync_thread ||
7368  	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
7369  	    test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) ||
7370  	    mddev->reshape_position != MaxSector)
7371  		return -EBUSY;
7372  
7373  	rdev_for_each(rdev, mddev) {
7374  		if (mddev->raid_disks < raid_disks &&
7375  		    rdev->data_offset < rdev->new_data_offset)
7376  			return -EINVAL;
7377  		if (mddev->raid_disks > raid_disks &&
7378  		    rdev->data_offset > rdev->new_data_offset)
7379  			return -EINVAL;
7380  	}
7381  
7382  	mddev->delta_disks = raid_disks - mddev->raid_disks;
7383  	if (mddev->delta_disks < 0)
7384  		mddev->reshape_backwards = 1;
7385  	else if (mddev->delta_disks > 0)
7386  		mddev->reshape_backwards = 0;
7387  
7388  	rv = mddev->pers->check_reshape(mddev);
7389  	if (rv < 0) {
7390  		mddev->delta_disks = 0;
7391  		mddev->reshape_backwards = 0;
7392  	}
7393  	return rv;
7394  }
7395  
7396  /*
7397   * update_array_info is used to change the configuration of an
7398   * on-line array.
7399   * The version, ctime, level, size, raid_disks, not_persistent, layout, chunk_size
7400   * fields in the info are checked against the array.
7401   * Any differences that cannot be handled will cause an error.
7402   * Normally, only one change can be managed at a time.
7403   */
7404  static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
7405  {
7406  	int rv = 0;
7407  	int cnt = 0;
7408  	int state = 0;
7409  
7410  	/* calculate expected state, ignoring low bits */
7411  	if (mddev->bitmap && mddev->bitmap_info.offset)
7412  		state |= (1 << MD_SB_BITMAP_PRESENT);
7413  
7414  	if (mddev->major_version != info->major_version ||
7415  	    mddev->minor_version != info->minor_version ||
7416  /*	    mddev->patch_version != info->patch_version || */
7417  	    mddev->ctime         != info->ctime         ||
7418  	    mddev->level         != info->level         ||
7419  /*	    mddev->layout        != info->layout        || */
7420  	    mddev->persistent	 != !info->not_persistent ||
7421  	    mddev->chunk_sectors != info->chunk_size >> 9 ||
7422  	    /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
7423  	    ((state^info->state) & 0xfffffe00)
7424  		)
7425  		return -EINVAL;
7426  	/* Check there is only one change */
7427  	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
7428  		cnt++;
7429  	if (mddev->raid_disks != info->raid_disks)
7430  		cnt++;
7431  	if (mddev->layout != info->layout)
7432  		cnt++;
7433  	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
7434  		cnt++;
7435  	if (cnt == 0)
7436  		return 0;
7437  	if (cnt > 1)
7438  		return -EINVAL;
7439  
7440  	if (mddev->layout != info->layout) {
7441  		/* Change layout
7442  		 * we don't need to do anything at the md level, the
7443  		 * personality will take care of it all.
7444  		 */
7445  		if (mddev->pers->check_reshape == NULL)
7446  			return -EINVAL;
7447  		else {
7448  			mddev->new_layout = info->layout;
7449  			rv = mddev->pers->check_reshape(mddev);
7450  			if (rv)
7451  				mddev->new_layout = mddev->layout;
7452  			return rv;
7453  		}
7454  	}
7455  	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
7456  		rv = update_size(mddev, (sector_t)info->size * 2);
7457  
7458  	if (mddev->raid_disks    != info->raid_disks)
7459  		rv = update_raid_disks(mddev, info->raid_disks);
7460  
7461  	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
7462  		if (mddev->pers->quiesce == NULL || mddev->thread == NULL) {
7463  			rv = -EINVAL;
7464  			goto err;
7465  		}
7466  		if (mddev->recovery || mddev->sync_thread) {
7467  			rv = -EBUSY;
7468  			goto err;
7469  		}
7470  		if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
7471  			struct bitmap *bitmap;
7472  			/* add the bitmap */
7473  			if (mddev->bitmap) {
7474  				rv = -EEXIST;
7475  				goto err;
7476  			}
7477  			if (mddev->bitmap_info.default_offset == 0) {
7478  				rv = -EINVAL;
7479  				goto err;
7480  			}
7481  			mddev->bitmap_info.offset =
7482  				mddev->bitmap_info.default_offset;
7483  			mddev->bitmap_info.space =
7484  				mddev->bitmap_info.default_space;
7485  			bitmap = md_bitmap_create(mddev, -1);
7486  			mddev_suspend(mddev);
7487  			if (!IS_ERR(bitmap)) {
7488  				mddev->bitmap = bitmap;
7489  				rv = md_bitmap_load(mddev);
7490  			} else
7491  				rv = PTR_ERR(bitmap);
7492  			if (rv)
7493  				md_bitmap_destroy(mddev);
7494  			mddev_resume(mddev);
7495  		} else {
7496  			/* remove the bitmap */
7497  			if (!mddev->bitmap) {
7498  				rv = -ENOENT;
7499  				goto err;
7500  			}
7501  			if (mddev->bitmap->storage.file) {
7502  				rv = -EINVAL;
7503  				goto err;
7504  			}
7505  			if (mddev->bitmap_info.nodes) {
7506  				/* take the PW lock on all the bitmaps */
7507  				if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) {
7508  					pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n");
7509  					rv = -EPERM;
7510  					md_cluster_ops->unlock_all_bitmaps(mddev);
7511  					goto err;
7512  				}
7513  
7514  				mddev->bitmap_info.nodes = 0;
7515  				md_cluster_ops->leave(mddev);
7516  				module_put(md_cluster_mod);
7517  				mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
7518  			}
7519  			mddev_suspend(mddev);
7520  			md_bitmap_destroy(mddev);
7521  			mddev_resume(mddev);
7522  			mddev->bitmap_info.offset = 0;
7523  		}
7524  	}
7525  	md_update_sb(mddev, 1);
7526  	return rv;
7527  err:
7528  	return rv;
7529  }
7530  
7531  static int set_disk_faulty(struct mddev *mddev, dev_t dev)
7532  {
7533  	struct md_rdev *rdev;
7534  	int err = 0;
7535  
7536  	if (mddev->pers == NULL)
7537  		return -ENODEV;
7538  
7539  	rcu_read_lock();
7540  	rdev = md_find_rdev_rcu(mddev, dev);
7541  	if (!rdev)
7542  		err =  -ENODEV;
7543  	else {
7544  		md_error(mddev, rdev);
7545  		if (test_bit(MD_BROKEN, &mddev->flags))
7546  			err = -EBUSY;
7547  	}
7548  	rcu_read_unlock();
7549  	return err;
7550  }
7551  
7552  /*
7553   * We have a problem here: there is no easy way to give a CHS
7554   * virtual geometry. We currently pretend that we have a 2 heads
7555   * 4 sectors (with a BIG number of cylinders...). This drives
7556   * dosfs just mad... ;-)
7557   */
7558  static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
7559  {
7560  	struct mddev *mddev = bdev->bd_disk->private_data;
7561  
7562  	geo->heads = 2;
7563  	geo->sectors = 4;
7564  	geo->cylinders = mddev->array_sectors / 8;
7565  	return 0;
7566  }
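/*
 * Worked example of the fake geometry above: a 1 GiB array has
 * array_sectors == 2097152, so HDIO_GETGEO reports 2 heads x 4 sectors x
 * 262144 cylinders, i.e. 2 * 4 * 262144 * 512 bytes == 1 GiB.
 */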
7567  
7568  static inline bool md_ioctl_valid(unsigned int cmd)
7569  {
7570  	switch (cmd) {
7571  	case ADD_NEW_DISK:
7572  	case GET_ARRAY_INFO:
7573  	case GET_BITMAP_FILE:
7574  	case GET_DISK_INFO:
7575  	case HOT_ADD_DISK:
7576  	case HOT_REMOVE_DISK:
7577  	case RAID_VERSION:
7578  	case RESTART_ARRAY_RW:
7579  	case RUN_ARRAY:
7580  	case SET_ARRAY_INFO:
7581  	case SET_BITMAP_FILE:
7582  	case SET_DISK_FAULTY:
7583  	case STOP_ARRAY:
7584  	case STOP_ARRAY_RO:
7585  	case CLUSTERED_DISK_NACK:
7586  		return true;
7587  	default:
7588  		return false;
7589  	}
7590  }
7591  
7592  static int __md_set_array_info(struct mddev *mddev, void __user *argp)
7593  {
7594  	mdu_array_info_t info;
7595  	int err;
7596  
7597  	if (!argp)
7598  		memset(&info, 0, sizeof(info));
7599  	else if (copy_from_user(&info, argp, sizeof(info)))
7600  		return -EFAULT;
7601  
7602  	if (mddev->pers) {
7603  		err = update_array_info(mddev, &info);
7604  		if (err)
7605  			pr_warn("md: couldn't update array info. %d\n", err);
7606  		return err;
7607  	}
7608  
7609  	if (!list_empty(&mddev->disks)) {
7610  		pr_warn("md: array %s already has disks!\n", mdname(mddev));
7611  		return -EBUSY;
7612  	}
7613  
7614  	if (mddev->raid_disks) {
7615  		pr_warn("md: array %s already initialised!\n", mdname(mddev));
7616  		return -EBUSY;
7617  	}
7618  
7619  	err = md_set_array_info(mddev, &info);
7620  	if (err)
7621  		pr_warn("md: couldn't set array info. %d\n", err);
7622  
7623  	return err;
7624  }
7625  
7626  static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
7627  			unsigned int cmd, unsigned long arg)
7628  {
7629  	int err = 0;
7630  	void __user *argp = (void __user *)arg;
7631  	struct mddev *mddev = NULL;
7632  
7633  	if (!md_ioctl_valid(cmd))
7634  		return -ENOTTY;
7635  
7636  	switch (cmd) {
7637  	case RAID_VERSION:
7638  	case GET_ARRAY_INFO:
7639  	case GET_DISK_INFO:
7640  		break;
7641  	default:
7642  		if (!capable(CAP_SYS_ADMIN))
7643  			return -EACCES;
7644  	}
7645  
7646  	/*
7647  	 * Commands dealing with the RAID driver but not any
7648  	 * particular array:
7649  	 */
7650  	switch (cmd) {
7651  	case RAID_VERSION:
7652  		err = get_version(argp);
7653  		goto out;
7654  	default:;
7655  	}
7656  
7657  	/*
7658  	 * Commands creating/starting a new array:
7659  	 */
7660  
7661  	mddev = bdev->bd_disk->private_data;
7662  
7663  	/* Some actions do not require the mutex */
7664  	switch (cmd) {
7665  	case GET_ARRAY_INFO:
7666  		if (!mddev->raid_disks && !mddev->external)
7667  			err = -ENODEV;
7668  		else
7669  			err = get_array_info(mddev, argp);
7670  		goto out;
7671  
7672  	case GET_DISK_INFO:
7673  		if (!mddev->raid_disks && !mddev->external)
7674  			err = -ENODEV;
7675  		else
7676  			err = get_disk_info(mddev, argp);
7677  		goto out;
7678  
7679  	case SET_DISK_FAULTY:
7680  		err = set_disk_faulty(mddev, new_decode_dev(arg));
7681  		goto out;
7682  
7683  	case GET_BITMAP_FILE:
7684  		err = get_bitmap_file(mddev, argp);
7685  		goto out;
7686  
7687  	}
7688  
7689  	if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) {
7690  		/* Need to flush page cache, and ensure no-one else opens
7691  		 * and writes
7692  		 */
7693  		mutex_lock(&mddev->open_mutex);
7694  		if (mddev->pers && atomic_read(&mddev->openers) > 1) {
7695  			mutex_unlock(&mddev->open_mutex);
7696  			err = -EBUSY;
7697  			goto out;
7698  		}
7699  		if (test_and_set_bit(MD_CLOSING, &mddev->flags)) {
7700  			mutex_unlock(&mddev->open_mutex);
7701  			err = -EBUSY;
7702  			goto out;
7703  		}
7704  		mutex_unlock(&mddev->open_mutex);
7705  		sync_blockdev(bdev);
7706  	}
7707  	err = mddev_lock(mddev);
7708  	if (err) {
7709  		pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n",
7710  			 err, cmd);
7711  		goto out;
7712  	}
7713  
7714  	if (cmd == SET_ARRAY_INFO) {
7715  		err = __md_set_array_info(mddev, argp);
7716  		goto unlock;
7717  	}
7718  
7719  	/*
7720  	 * Commands querying/configuring an existing array:
7721  	 */
7722  	/* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
7723  	 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
7724  	if ((!mddev->raid_disks && !mddev->external)
7725  	    && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
7726  	    && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
7727  	    && cmd != GET_BITMAP_FILE) {
7728  		err = -ENODEV;
7729  		goto unlock;
7730  	}
7731  
7732  	/*
7733  	 * Commands even a read-only array can execute:
7734  	 */
7735  	switch (cmd) {
7736  	case RESTART_ARRAY_RW:
7737  		err = restart_array(mddev);
7738  		goto unlock;
7739  
7740  	case STOP_ARRAY:
7741  		err = do_md_stop(mddev, 0, bdev);
7742  		goto unlock;
7743  
7744  	case STOP_ARRAY_RO:
7745  		err = md_set_readonly(mddev, bdev);
7746  		goto unlock;
7747  
7748  	case HOT_REMOVE_DISK:
7749  		err = hot_remove_disk(mddev, new_decode_dev(arg));
7750  		goto unlock;
7751  
7752  	case ADD_NEW_DISK:
7753  		/* We can support ADD_NEW_DISK on read-only arrays
7754  		 * only if we are re-adding a preexisting device.
7755  		 * So require mddev->pers and MD_DISK_SYNC.
7756  		 */
7757  		if (mddev->pers) {
7758  			mdu_disk_info_t info;
7759  			if (copy_from_user(&info, argp, sizeof(info)))
7760  				err = -EFAULT;
7761  			else if (!(info.state & (1<<MD_DISK_SYNC)))
7762  				/* Need to clear read-only for this */
7763  				break;
7764  			else
7765  				err = md_add_new_disk(mddev, &info);
7766  			goto unlock;
7767  		}
7768  		break;
7769  	}
7770  
7771  	/*
7772  	 * The remaining ioctls are changing the state of the
7773  	 * superblock, so we do not allow them on read-only arrays.
7774  	 */
7775  	if (!md_is_rdwr(mddev) && mddev->pers) {
7776  		if (mddev->ro != MD_AUTO_READ) {
7777  			err = -EROFS;
7778  			goto unlock;
7779  		}
7780  		mddev->ro = MD_RDWR;
7781  		sysfs_notify_dirent_safe(mddev->sysfs_state);
7782  		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7783  		/* mddev_unlock will wake thread */
7784  		/* If a device failed while we were read-only, we
7785  		 * need to make sure the metadata is updated now.
7786  		 */
7787  		if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) {
7788  			mddev_unlock(mddev);
7789  			wait_event(mddev->sb_wait,
7790  				   !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) &&
7791  				   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
7792  			mddev_lock_nointr(mddev);
7793  		}
7794  	}
7795  
7796  	switch (cmd) {
7797  	case ADD_NEW_DISK:
7798  	{
7799  		mdu_disk_info_t info;
7800  		if (copy_from_user(&info, argp, sizeof(info)))
7801  			err = -EFAULT;
7802  		else
7803  			err = md_add_new_disk(mddev, &info);
7804  		goto unlock;
7805  	}
7806  
7807  	case CLUSTERED_DISK_NACK:
7808  		if (mddev_is_clustered(mddev))
7809  			md_cluster_ops->new_disk_ack(mddev, false);
7810  		else
7811  			err = -EINVAL;
7812  		goto unlock;
7813  
7814  	case HOT_ADD_DISK:
7815  		err = hot_add_disk(mddev, new_decode_dev(arg));
7816  		goto unlock;
7817  
7818  	case RUN_ARRAY:
7819  		err = do_md_run(mddev);
7820  		goto unlock;
7821  
7822  	case SET_BITMAP_FILE:
7823  		err = set_bitmap_file(mddev, (int)arg);
7824  		goto unlock;
7825  
7826  	default:
7827  		err = -EINVAL;
7828  		goto unlock;
7829  	}
7830  
7831  unlock:
7832  	if (mddev->hold_active == UNTIL_IOCTL &&
7833  	    err != -EINVAL)
7834  		mddev->hold_active = 0;
7835  	mddev_unlock(mddev);
7836  out:
7837  	if (cmd == STOP_ARRAY_RO || (err && cmd == STOP_ARRAY))
7838  		clear_bit(MD_CLOSING, &mddev->flags);
7839  	return err;
7840  }
7841  #ifdef CONFIG_COMPAT
7842  static int md_compat_ioctl(struct block_device *bdev, blk_mode_t mode,
7843  		    unsigned int cmd, unsigned long arg)
7844  {
7845  	switch (cmd) {
7846  	case HOT_REMOVE_DISK:
7847  	case HOT_ADD_DISK:
7848  	case SET_DISK_FAULTY:
7849  	case SET_BITMAP_FILE:
7850  		/* These take an integer arg, do not convert */
7851  		break;
7852  	default:
7853  		arg = (unsigned long)compat_ptr(arg);
7854  		break;
7855  	}
7856  
7857  	return md_ioctl(bdev, mode, cmd, arg);
7858  }
7859  #endif /* CONFIG_COMPAT */
7860  
7861  static int md_set_read_only(struct block_device *bdev, bool ro)
7862  {
7863  	struct mddev *mddev = bdev->bd_disk->private_data;
7864  	int err;
7865  
7866  	err = mddev_lock(mddev);
7867  	if (err)
7868  		return err;
7869  
7870  	if (!mddev->raid_disks && !mddev->external) {
7871  		err = -ENODEV;
7872  		goto out_unlock;
7873  	}
7874  
7875  	/*
7876  	 * Transitioning to read-auto need only happen for arrays that call
7877  	 * md_write_start and which are not ready for writes yet.
7878  	 */
7879  	if (!ro && mddev->ro == MD_RDONLY && mddev->pers) {
7880  		err = restart_array(mddev);
7881  		if (err)
7882  			goto out_unlock;
7883  		mddev->ro = MD_AUTO_READ;
7884  	}
7885  
7886  out_unlock:
7887  	mddev_unlock(mddev);
7888  	return err;
7889  }
7890  
7891  static int md_open(struct gendisk *disk, blk_mode_t mode)
7892  {
7893  	struct mddev *mddev;
7894  	int err;
7895  
7896  	spin_lock(&all_mddevs_lock);
7897  	mddev = mddev_get(disk->private_data);
7898  	spin_unlock(&all_mddevs_lock);
7899  	if (!mddev)
7900  		return -ENODEV;
7901  
7902  	err = mutex_lock_interruptible(&mddev->open_mutex);
7903  	if (err)
7904  		goto out;
7905  
7906  	err = -ENODEV;
7907  	if (test_bit(MD_CLOSING, &mddev->flags))
7908  		goto out_unlock;
7909  
7910  	atomic_inc(&mddev->openers);
7911  	mutex_unlock(&mddev->open_mutex);
7912  
7913  	disk_check_media_change(disk);
7914  	return 0;
7915  
7916  out_unlock:
7917  	mutex_unlock(&mddev->open_mutex);
7918  out:
7919  	mddev_put(mddev);
7920  	return err;
7921  }
7922  
7923  static void md_release(struct gendisk *disk)
7924  {
7925  	struct mddev *mddev = disk->private_data;
7926  
7927  	BUG_ON(!mddev);
7928  	atomic_dec(&mddev->openers);
7929  	mddev_put(mddev);
7930  }
7931  
7932  static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing)
7933  {
7934  	struct mddev *mddev = disk->private_data;
7935  	unsigned int ret = 0;
7936  
7937  	if (mddev->changed)
7938  		ret = DISK_EVENT_MEDIA_CHANGE;
7939  	mddev->changed = 0;
7940  	return ret;
7941  }
7942  
7943  static void md_free_disk(struct gendisk *disk)
7944  {
7945  	struct mddev *mddev = disk->private_data;
7946  
7947  	percpu_ref_exit(&mddev->writes_pending);
7948  	mddev_free(mddev);
7949  }
7950  
7951  const struct block_device_operations md_fops =
7952  {
7953  	.owner		= THIS_MODULE,
7954  	.submit_bio	= md_submit_bio,
7955  	.open		= md_open,
7956  	.release	= md_release,
7957  	.ioctl		= md_ioctl,
7958  #ifdef CONFIG_COMPAT
7959  	.compat_ioctl	= md_compat_ioctl,
7960  #endif
7961  	.getgeo		= md_getgeo,
7962  	.check_events	= md_check_events,
7963  	.set_read_only	= md_set_read_only,
7964  	.free_disk	= md_free_disk,
7965  };
7966  
7967  static int md_thread(void *arg)
7968  {
7969  	struct md_thread *thread = arg;
7970  
7971  	/*
7972  	 * md_thread is a 'system-thread', its priority should be very
7973  	 * high. We avoid resource deadlocks individually in each
7974  	 * raid personality. (RAID5 does preallocation) We also use RR and
7975  	 * the very same RT priority as kswapd, thus we will never get
7976  	 * into a priority inversion deadlock.
7977  	 *
7978  	 * we definitely have to have equal or higher priority than
7979  	 * bdflush, otherwise bdflush will deadlock if there are too
7980  	 * many dirty RAID5 blocks.
7981  	 */
7982  
7983  	allow_signal(SIGKILL);
7984  	while (!kthread_should_stop()) {
7985  
7986  		/* We need to wait INTERRUPTIBLE so that
7987  		 * we don't add to the load-average.
7988  		 * That means we need to be sure no signals are
7989  		 * pending
7990  		 */
7991  		if (signal_pending(current))
7992  			flush_signals(current);
7993  
7994  		wait_event_interruptible_timeout
7995  			(thread->wqueue,
7996  			 test_bit(THREAD_WAKEUP, &thread->flags)
7997  			 || kthread_should_stop() || kthread_should_park(),
7998  			 thread->timeout);
7999  
8000  		clear_bit(THREAD_WAKEUP, &thread->flags);
8001  		if (kthread_should_park())
8002  			kthread_parkme();
8003  		if (!kthread_should_stop())
8004  			thread->run(thread);
8005  	}
8006  
8007  	return 0;
8008  }
8009  
8010  static void md_wakeup_thread_directly(struct md_thread __rcu *thread)
8011  {
8012  	struct md_thread *t;
8013  
8014  	rcu_read_lock();
8015  	t = rcu_dereference(thread);
8016  	if (t)
8017  		wake_up_process(t->tsk);
8018  	rcu_read_unlock();
8019  }
8020  
8021  void md_wakeup_thread(struct md_thread __rcu *thread)
8022  {
8023  	struct md_thread *t;
8024  
8025  	rcu_read_lock();
8026  	t = rcu_dereference(thread);
8027  	if (t) {
8028  		pr_debug("md: waking up MD thread %s.\n", t->tsk->comm);
8029  		set_bit(THREAD_WAKEUP, &t->flags);
8030  		wake_up(&t->wqueue);
8031  	}
8032  	rcu_read_unlock();
8033  }
8034  EXPORT_SYMBOL(md_wakeup_thread);
8035  
8036  struct md_thread *md_register_thread(void (*run) (struct md_thread *),
8037  		struct mddev *mddev, const char *name)
8038  {
8039  	struct md_thread *thread;
8040  
8041  	thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL);
8042  	if (!thread)
8043  		return NULL;
8044  
8045  	init_waitqueue_head(&thread->wqueue);
8046  
8047  	thread->run = run;
8048  	thread->mddev = mddev;
8049  	thread->timeout = MAX_SCHEDULE_TIMEOUT;
8050  	thread->tsk = kthread_run(md_thread, thread,
8051  				  "%s_%s",
8052  				  mdname(thread->mddev),
8053  				  name);
8054  	if (IS_ERR(thread->tsk)) {
8055  		kfree(thread);
8056  		return NULL;
8057  	}
8058  	return thread;
8059  }
8060  EXPORT_SYMBOL(md_register_thread);
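/*
 * Illustrative only: a personality typically registers its main worker
 * while being set up and tears it down symmetrically.  A rough sketch of
 * the usual shape (see raid1.c for the real wiring; conf/raid1d stand in
 * for the personality's own state and run function):
 *
 *	struct md_thread *t = md_register_thread(raid1d, mddev, "raid1");
 *
 *	if (!t)
 *		goto abort;
 *	rcu_assign_pointer(conf->thread, t);
 *	...
 *	md_unregister_thread(mddev, &conf->thread);
 */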
8061  
8062  void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp)
8063  {
8064  	struct md_thread *thread = rcu_dereference_protected(*threadp,
8065  					lockdep_is_held(&mddev->reconfig_mutex));
8066  
8067  	if (!thread)
8068  		return;
8069  
8070  	rcu_assign_pointer(*threadp, NULL);
8071  	synchronize_rcu();
8072  
8073  	pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
8074  	kthread_stop(thread->tsk);
8075  	kfree(thread);
8076  }
8077  EXPORT_SYMBOL(md_unregister_thread);
8078  
8079  void md_error(struct mddev *mddev, struct md_rdev *rdev)
8080  {
8081  	if (!rdev || test_bit(Faulty, &rdev->flags))
8082  		return;
8083  
8084  	if (!mddev->pers || !mddev->pers->error_handler)
8085  		return;
8086  	mddev->pers->error_handler(mddev, rdev);
8087  
8088  	if (mddev->pers->level == 0 || mddev->pers->level == LEVEL_LINEAR)
8089  		return;
8090  
8091  	if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags))
8092  		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
8093  	sysfs_notify_dirent_safe(rdev->sysfs_state);
8094  	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8095  	if (!test_bit(MD_BROKEN, &mddev->flags)) {
8096  		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8097  		md_wakeup_thread(mddev->thread);
8098  	}
8099  	if (mddev->event_work.func)
8100  		queue_work(md_misc_wq, &mddev->event_work);
8101  	md_new_event();
8102  }
8103  EXPORT_SYMBOL(md_error);
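/*
 * Illustrative only: personalities call md_error() when they notice a
 * component failing, typically from a bio completion handler.  A minimal
 * sketch (names are placeholders, not an existing personality):
 *
 *	static void example_end_write_request(struct bio *bio)
 *	{
 *		struct md_rdev *rdev = bio->bi_private;
 *
 *		if (bio->bi_status)
 *			md_error(rdev->mddev, rdev);
 *		bio_put(bio);
 *	}
 */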
8104  
8105  /* seq_file implementation /proc/mdstat */
8106  
8107  static void status_unused(struct seq_file *seq)
8108  {
8109  	int i = 0;
8110  	struct md_rdev *rdev;
8111  
8112  	seq_printf(seq, "unused devices: ");
8113  
8114  	list_for_each_entry(rdev, &pending_raid_disks, same_set) {
8115  		i++;
8116  		seq_printf(seq, "%pg ", rdev->bdev);
8117  	}
8118  	if (!i)
8119  		seq_printf(seq, "<none>");
8120  
8121  	seq_printf(seq, "\n");
8122  }
8123  
8124  static void status_personalities(struct seq_file *seq)
8125  {
8126  	struct md_personality *pers;
8127  
8128  	seq_puts(seq, "Personalities : ");
8129  	spin_lock(&pers_lock);
8130  	list_for_each_entry(pers, &pers_list, list)
8131  		seq_printf(seq, "[%s] ", pers->name);
8132  
8133  	spin_unlock(&pers_lock);
8134  	seq_puts(seq, "\n");
8135  }
8136  
8137  static int status_resync(struct seq_file *seq, struct mddev *mddev)
8138  {
8139  	sector_t max_sectors, resync, res;
8140  	unsigned long dt, db = 0;
8141  	sector_t rt, curr_mark_cnt, resync_mark_cnt;
8142  	int scale, recovery_active;
8143  	unsigned int per_milli;
8144  
8145  	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
8146  	    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
8147  		max_sectors = mddev->resync_max_sectors;
8148  	else
8149  		max_sectors = mddev->dev_sectors;
8150  
8151  	resync = mddev->curr_resync;
8152  	if (resync < MD_RESYNC_ACTIVE) {
8153  		if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
8154  			/* Still cleaning up */
8155  			resync = max_sectors;
8156  	} else if (resync > max_sectors) {
8157  		resync = max_sectors;
8158  	} else {
8159  		res = atomic_read(&mddev->recovery_active);
8160  		/*
8161  		 * Resync has started, but the subtraction has overflowed or
8162  		 * yielded one of the special values. Force it to active to
8163  		 * ensure the status reports an active resync.
8164  		 */
8165  		if (resync < res || resync - res < MD_RESYNC_ACTIVE)
8166  			resync = MD_RESYNC_ACTIVE;
8167  		else
8168  			resync -= res;
8169  	}
8170  
8171  	if (resync == MD_RESYNC_NONE) {
8172  		if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) {
8173  			struct md_rdev *rdev;
8174  
8175  			rdev_for_each(rdev, mddev)
8176  				if (rdev->raid_disk >= 0 &&
8177  				    !test_bit(Faulty, &rdev->flags) &&
8178  				    rdev->recovery_offset != MaxSector &&
8179  				    rdev->recovery_offset) {
8180  					seq_printf(seq, "\trecover=REMOTE");
8181  					return 1;
8182  				}
8183  			if (mddev->reshape_position != MaxSector)
8184  				seq_printf(seq, "\treshape=REMOTE");
8185  			else
8186  				seq_printf(seq, "\tresync=REMOTE");
8187  			return 1;
8188  		}
8189  		if (mddev->recovery_cp < MaxSector) {
8190  			seq_printf(seq, "\tresync=PENDING");
8191  			return 1;
8192  		}
8193  		return 0;
8194  	}
8195  	if (resync < MD_RESYNC_ACTIVE) {
8196  		seq_printf(seq, "\tresync=DELAYED");
8197  		return 1;
8198  	}
8199  
8200  	WARN_ON(max_sectors == 0);
8201  	/* Pick 'scale' such that (resync>>scale)*1000 will fit
8202  	 * in a sector_t, and (max_sectors>>scale) will fit in a
8203  	 * u32, as those are the requirements for sector_div.
8204  	 * Thus 'scale' must be at least 10
8205  	 */
8206  	scale = 10;
8207  	if (sizeof(sector_t) > sizeof(unsigned long)) {
8208  		while ( max_sectors/2 > (1ULL<<(scale+32)))
8209  			scale++;
8210  	}
8211  	res = (resync>>scale)*1000;
8212  	sector_div(res, (u32)((max_sectors>>scale)+1));
8213  
8214  	per_milli = res;
8215  	{
8216  		int i, x = per_milli/50, y = 20-x;
8217  		seq_printf(seq, "[");
8218  		for (i = 0; i < x; i++)
8219  			seq_printf(seq, "=");
8220  		seq_printf(seq, ">");
8221  		for (i = 0; i < y; i++)
8222  			seq_printf(seq, ".");
8223  		seq_printf(seq, "] ");
8224  	}
8225  	seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
8226  		   (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
8227  		    "reshape" :
8228  		    (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
8229  		     "check" :
8230  		     (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
8231  		      "resync" : "recovery"))),
8232  		   per_milli/10, per_milli % 10,
8233  		   (unsigned long long) resync/2,
8234  		   (unsigned long long) max_sectors/2);
8235  
8236  	/*
8237  	 * dt: time from mark until now
8238  	 * db: blocks written from mark until now
8239  	 * rt: remaining time
8240  	 *
8241  	 * rt is a sector_t, which is always 64bit now. We are keeping
8242  	 * the original algorithm, but it is not really necessary.
8243  	 *
8244  	 * Original algorithm:
8245  	 *   So we divide before multiply in case it is 32bit and close
8246  	 *   to the limit.
8247  	 *   We scale the divisor (db) by 32 to avoid losing precision
8248  	 *   near the end of resync when the number of remaining sectors
8249  	 *   is close to 'db'.
8250  	 *   We then divide rt by 32 after multiplying by db to compensate.
8251  	 *   The '+1' avoids division by zero if db is very small.
8252  	 */
8253  	dt = ((jiffies - mddev->resync_mark) / HZ);
8254  	if (!dt) dt++;
8255  
8256  	curr_mark_cnt = mddev->curr_mark_cnt;
8257  	recovery_active = atomic_read(&mddev->recovery_active);
8258  	resync_mark_cnt = mddev->resync_mark_cnt;
8259  
8260  	if (curr_mark_cnt >= (recovery_active + resync_mark_cnt))
8261  		db = curr_mark_cnt - (recovery_active + resync_mark_cnt);
8262  
8263  	rt = max_sectors - resync;    /* number of remaining sectors */
8264  	rt = div64_u64(rt, db/32+1);
8265  	rt *= dt;
8266  	rt >>= 5;
8267  
8268  	seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
8269  		   ((unsigned long)rt % 60)/6);
8270  
8271  	seq_printf(seq, " speed=%ldK/sec", db/2/dt);
8272  	return 1;
8273  }
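/*
 * Worked example of the arithmetic above: with max_sectors = 10,000,000,
 * resync = 1,000,000 and db = 20,480 sectors written in dt = 1s, per_milli
 * comes out as 99 (" =  9.9% (500000/5000000)"), speed = db/2/dt =
 * 10240K/sec, and finish = ((9,000,000 / 641) * 1) >> 5 = 438s, which is
 * printed as "finish=7.3min".
 */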
8274  
8275  static void *md_seq_start(struct seq_file *seq, loff_t *pos)
8276  	__acquires(&all_mddevs_lock)
8277  {
8278  	seq->poll_event = atomic_read(&md_event_count);
8279  	spin_lock(&all_mddevs_lock);
8280  
8281  	return seq_list_start_head(&all_mddevs, *pos);
8282  }
8283  
8284  static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
8285  {
8286  	return seq_list_next(v, &all_mddevs, pos);
8287  }
8288  
8289  static void md_seq_stop(struct seq_file *seq, void *v)
8290  	__releases(&all_mddevs_lock)
8291  {
8292  	spin_unlock(&all_mddevs_lock);
8293  }
8294  
8295  static void md_bitmap_status(struct seq_file *seq, struct mddev *mddev)
8296  {
8297  	struct md_bitmap_stats stats;
8298  	unsigned long used_pages;
8299  	unsigned long chunk_kb;
8300  	int err;
8301  
8302  	err = md_bitmap_get_stats(mddev->bitmap, &stats);
8303  	if (err)
8304  		return;
8305  
8306  	chunk_kb = mddev->bitmap_info.chunksize >> 10;
8307  	used_pages = stats.pages - stats.missing_pages;
8308  
8309  	seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], %lu%s chunk",
8310  		   used_pages, stats.pages, used_pages << (PAGE_SHIFT - 10),
8311  		   chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize,
8312  		   chunk_kb ? "KB" : "B");
8313  
8314  	if (stats.file) {
8315  		seq_puts(seq, ", file: ");
8316  		seq_file_path(seq, stats.file, " \t\n");
8317  	}
8318  
8319  	seq_putc(seq, '\n');
8320  }
8321  
8322  static int md_seq_show(struct seq_file *seq, void *v)
8323  {
8324  	struct mddev *mddev;
8325  	sector_t sectors;
8326  	struct md_rdev *rdev;
8327  
8328  	if (v == &all_mddevs) {
8329  		status_personalities(seq);
8330  		if (list_empty(&all_mddevs))
8331  			status_unused(seq);
8332  		return 0;
8333  	}
8334  
8335  	mddev = list_entry(v, struct mddev, all_mddevs);
8336  	if (!mddev_get(mddev))
8337  		return 0;
8338  
8339  	spin_unlock(&all_mddevs_lock);
8340  
8341  	/* prevent the bitmap from being freed while we check it */
8342  	mutex_lock(&mddev->bitmap_info.mutex);
8343  
8344  	spin_lock(&mddev->lock);
8345  	if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
8346  		seq_printf(seq, "%s : %sactive", mdname(mddev),
8347  						mddev->pers ? "" : "in");
8348  		if (mddev->pers) {
8349  			if (mddev->ro == MD_RDONLY)
8350  				seq_printf(seq, " (read-only)");
8351  			if (mddev->ro == MD_AUTO_READ)
8352  				seq_printf(seq, " (auto-read-only)");
8353  			seq_printf(seq, " %s", mddev->pers->name);
8354  		}
8355  
8356  		sectors = 0;
8357  		rcu_read_lock();
8358  		rdev_for_each_rcu(rdev, mddev) {
8359  			seq_printf(seq, " %pg[%d]", rdev->bdev, rdev->desc_nr);
8360  
8361  			if (test_bit(WriteMostly, &rdev->flags))
8362  				seq_printf(seq, "(W)");
8363  			if (test_bit(Journal, &rdev->flags))
8364  				seq_printf(seq, "(J)");
8365  			if (test_bit(Faulty, &rdev->flags)) {
8366  				seq_printf(seq, "(F)");
8367  				continue;
8368  			}
8369  			if (rdev->raid_disk < 0)
8370  				seq_printf(seq, "(S)"); /* spare */
8371  			if (test_bit(Replacement, &rdev->flags))
8372  				seq_printf(seq, "(R)");
8373  			sectors += rdev->sectors;
8374  		}
8375  		rcu_read_unlock();
8376  
8377  		if (!list_empty(&mddev->disks)) {
8378  			if (mddev->pers)
8379  				seq_printf(seq, "\n      %llu blocks",
8380  					   (unsigned long long)
8381  					   mddev->array_sectors / 2);
8382  			else
8383  				seq_printf(seq, "\n      %llu blocks",
8384  					   (unsigned long long)sectors / 2);
8385  		}
8386  		if (mddev->persistent) {
8387  			if (mddev->major_version != 0 ||
8388  			    mddev->minor_version != 90) {
8389  				seq_printf(seq," super %d.%d",
8390  					   mddev->major_version,
8391  					   mddev->minor_version);
8392  			}
8393  		} else if (mddev->external)
8394  			seq_printf(seq, " super external:%s",
8395  				   mddev->metadata_type);
8396  		else
8397  			seq_printf(seq, " super non-persistent");
8398  
8399  		if (mddev->pers) {
8400  			mddev->pers->status(seq, mddev);
8401  			seq_printf(seq, "\n      ");
8402  			if (mddev->pers->sync_request) {
8403  				if (status_resync(seq, mddev))
8404  					seq_printf(seq, "\n      ");
8405  			}
8406  		} else
8407  			seq_printf(seq, "\n       ");
8408  
8409  		md_bitmap_status(seq, mddev);
8410  
8411  		seq_printf(seq, "\n");
8412  	}
8413  	spin_unlock(&mddev->lock);
8414  	mutex_unlock(&mddev->bitmap_info.mutex);
8415  	spin_lock(&all_mddevs_lock);
8416  
8417  	if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs))
8418  		status_unused(seq);
8419  
8420  	if (atomic_dec_and_test(&mddev->active))
8421  		__mddev_put(mddev);
8422  
8423  	return 0;
8424  }
8425  
8426  static const struct seq_operations md_seq_ops = {
8427  	.start  = md_seq_start,
8428  	.next   = md_seq_next,
8429  	.stop   = md_seq_stop,
8430  	.show   = md_seq_show,
8431  };
8432  
8433  static int md_seq_open(struct inode *inode, struct file *file)
8434  {
8435  	struct seq_file *seq;
8436  	int error;
8437  
8438  	error = seq_open(file, &md_seq_ops);
8439  	if (error)
8440  		return error;
8441  
8442  	seq = file->private_data;
8443  	seq->poll_event = atomic_read(&md_event_count);
8444  	return error;
8445  }
8446  
8447  static int md_unloading;
8448  static __poll_t mdstat_poll(struct file *filp, poll_table *wait)
8449  {
8450  	struct seq_file *seq = filp->private_data;
8451  	__poll_t mask;
8452  
8453  	if (md_unloading)
8454  		return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;
8455  	poll_wait(filp, &md_event_waiters, wait);
8456  
8457  	/* always allow read */
8458  	mask = EPOLLIN | EPOLLRDNORM;
8459  
8460  	if (seq->poll_event != atomic_read(&md_event_count))
8461  		mask |= EPOLLERR | EPOLLPRI;
8462  	return mask;
8463  }
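/*
 * Illustrative only: the poll semantics above let a monitor block on
 * /proc/mdstat and wake whenever md_new_event() bumps md_event_count.
 * Sketch of the usual user-space pattern (re-read the file after each
 * wakeup before polling again):
 *
 *	struct pollfd pfd = { .events = POLLPRI };
 *	char buf[4096];
 *
 *	pfd.fd = open("/proc/mdstat", O_RDONLY);
 *	read(pfd.fd, buf, sizeof(buf));
 *	poll(&pfd, 1, -1);		// returns with POLLPRI|POLLERR set
 */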
8464  
8465  static const struct proc_ops mdstat_proc_ops = {
8466  	.proc_open	= md_seq_open,
8467  	.proc_read	= seq_read,
8468  	.proc_lseek	= seq_lseek,
8469  	.proc_release	= seq_release,
8470  	.proc_poll	= mdstat_poll,
8471  };
8472  
8473  int register_md_personality(struct md_personality *p)
8474  {
8475  	pr_debug("md: %s personality registered for level %d\n",
8476  		 p->name, p->level);
8477  	spin_lock(&pers_lock);
8478  	list_add_tail(&p->list, &pers_list);
8479  	spin_unlock(&pers_lock);
8480  	return 0;
8481  }
8482  EXPORT_SYMBOL(register_md_personality);
8483  
8484  int unregister_md_personality(struct md_personality *p)
8485  {
8486  	pr_debug("md: %s personality unregistered\n", p->name);
8487  	spin_lock(&pers_lock);
8488  	list_del_init(&p->list);
8489  	spin_unlock(&pers_lock);
8490  	return 0;
8491  }
8492  EXPORT_SYMBOL(unregister_md_personality);
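
/*
 * Illustrative sketch of how a personality module is expected to use the
 * two helpers above.  The "example" personality below is hypothetical;
 * the real users are raid0/raid1/raid456 etc. in their module init/exit
 * paths:
 *
 *	static struct md_personality example_personality = {
 *		.name	= "example",
 *		.level	= 42,		(hypothetical level number)
 *		.owner	= THIS_MODULE,
 *		(->make_request, ->run, ->free, ... as required)
 *	};
 *
 *	static int __init example_init(void)
 *	{
 *		return register_md_personality(&example_personality);
 *	}
 *
 *	static void __exit example_exit(void)
 *	{
 *		unregister_md_personality(&example_personality);
 *	}
 */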
8493  
8494  int register_md_cluster_operations(struct md_cluster_operations *ops,
8495  				   struct module *module)
8496  {
8497  	int ret = 0;
8498  	spin_lock(&pers_lock);
8499  	if (md_cluster_ops != NULL)
8500  		ret = -EALREADY;
8501  	else {
8502  		md_cluster_ops = ops;
8503  		md_cluster_mod = module;
8504  	}
8505  	spin_unlock(&pers_lock);
8506  	return ret;
8507  }
8508  EXPORT_SYMBOL(register_md_cluster_operations);
8509  
8510  int unregister_md_cluster_operations(void)
8511  {
8512  	spin_lock(&pers_lock);
8513  	md_cluster_ops = NULL;
8514  	spin_unlock(&pers_lock);
8515  	return 0;
8516  }
8517  EXPORT_SYMBOL(unregister_md_cluster_operations);
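
/*
 * The md-cluster module registers its operation table through the two
 * helpers above when it is loaded.  md_setup_cluster() below then pins
 * that module with try_module_get() for as long as a clustered array is
 * using it, and md_cluster_stop() drops the reference again.
 */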
8518  
8519  int md_setup_cluster(struct mddev *mddev, int nodes)
8520  {
8521  	int ret;
8522  	if (!md_cluster_ops)
8523  		request_module("md-cluster");
8524  	spin_lock(&pers_lock);
8525  	/* ensure module won't be unloaded */
8526  	if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
8527  		pr_warn("can't find md-cluster module or get its reference.\n");
8528  		spin_unlock(&pers_lock);
8529  		return -ENOENT;
8530  	}
8531  	spin_unlock(&pers_lock);
8532  
8533  	ret = md_cluster_ops->join(mddev, nodes);
8534  	if (!ret)
8535  		mddev->safemode_delay = 0;
8536  	return ret;
8537  }
8538  
8539  void md_cluster_stop(struct mddev *mddev)
8540  {
8541  	if (!md_cluster_ops)
8542  		return;
8543  	md_cluster_ops->leave(mddev);
8544  	module_put(md_cluster_mod);
8545  }
8546  
8547  static int is_mddev_idle(struct mddev *mddev, int init)
8548  {
8549  	struct md_rdev *rdev;
8550  	int idle;
8551  	int curr_events;
8552  
8553  	idle = 1;
8554  	rcu_read_lock();
8555  	rdev_for_each_rcu(rdev, mddev) {
8556  		struct gendisk *disk = rdev->bdev->bd_disk;
8557  		curr_events = (int)part_stat_read_accum(disk->part0, sectors) -
8558  			      atomic_read(&disk->sync_io);
8559  		/* sync IO will cause sync_io to increase before the disk_stats
8560  		 * as sync_io is counted when a request starts, and
8561  		 * disk_stats is counted when it completes.
8562  		 * So resync activity will cause curr_events to be smaller than
8563  		 * when there was no such activity.
8564  		 * non-sync IO will cause disk_stat to increase without
8565  		 * increasing sync_io so curr_events will (eventually)
8566  		 * be larger than it was before.  Once it becomes
8567  		 * substantially larger, the test below will cause
8568  		 * the array to appear non-idle, and resync will slow
8569  		 * down.
8570  		 * If there is a lot of outstanding resync activity when
8571  		 * we set last_event to curr_events, then all that activity
8572  		 * completing might cause the array to appear non-idle
8573  		 * and resync will be slowed down even though there might
8574  		 * not have been non-resync activity.  This will only
8575  		 * happen once though.  'last_events' will soon reflect
8576  		 * the state where there are few or no outstanding
8577  		 * resync requests, and further resync activity will
8578  		 * always make curr_events less than last_events.
8579  		 *
8580  		 */
8581  		if (init || curr_events - rdev->last_events > 64) {
8582  			rdev->last_events = curr_events;
8583  			idle = 0;
8584  		}
8585  	}
8586  	rcu_read_unlock();
8587  	return idle;
8588  }
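
/*
 * Worked example of the accounting above (illustrative numbers only): if
 * part_stat reports that a member disk has transferred 10000 sectors in
 * total and 9950 of those were issued as resync IO (counted in ->sync_io),
 * curr_events is 50.  Once curr_events climbs more than 64 sectors past the
 * recorded rdev->last_events - which in the steady state only non-resync IO
 * causes - the array is treated as non-idle and the resync slows down.
 */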
8589  
8590  void md_done_sync(struct mddev *mddev, int blocks, int ok)
8591  {
8592  	/* another "blocks" (512byte) blocks have been synced */
8593  	atomic_sub(blocks, &mddev->recovery_active);
8594  	wake_up(&mddev->recovery_wait);
8595  	if (!ok) {
8596  		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8597  		set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
8598  		md_wakeup_thread(mddev->thread);
8599  		// stop recovery, signal do_sync ....
8600  	}
8601  }
8602  EXPORT_SYMBOL(md_done_sync);
8603  
8604  /* md_write_start(mddev, bi)
8605   * If we need to update some array metadata (e.g. 'active' flag
8606   * in superblock) before writing, schedule a superblock update
8607   * and wait for it to complete.
8608   * A return value of 'false' means that the write wasn't recorded
8609   * and cannot proceed as the array is being suspend.
8610   */
8611  bool md_write_start(struct mddev *mddev, struct bio *bi)
8612  {
8613  	int did_change = 0;
8614  
8615  	if (bio_data_dir(bi) != WRITE)
8616  		return true;
8617  
8618  	BUG_ON(mddev->ro == MD_RDONLY);
8619  	if (mddev->ro == MD_AUTO_READ) {
8620  		/* need to switch to read/write */
8621  		mddev->ro = MD_RDWR;
8622  		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8623  		md_wakeup_thread(mddev->thread);
8624  		md_wakeup_thread(mddev->sync_thread);
8625  		did_change = 1;
8626  	}
8627  	rcu_read_lock();
8628  	percpu_ref_get(&mddev->writes_pending);
8629  	smp_mb(); /* Match smp_mb in set_in_sync() */
8630  	if (mddev->safemode == 1)
8631  		mddev->safemode = 0;
8632  	/* sync_checkers is always 0 when writes_pending is in per-cpu mode */
8633  	if (mddev->in_sync || mddev->sync_checkers) {
8634  		spin_lock(&mddev->lock);
8635  		if (mddev->in_sync) {
8636  			mddev->in_sync = 0;
8637  			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
8638  			set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
8639  			md_wakeup_thread(mddev->thread);
8640  			did_change = 1;
8641  		}
8642  		spin_unlock(&mddev->lock);
8643  	}
8644  	rcu_read_unlock();
8645  	if (did_change)
8646  		sysfs_notify_dirent_safe(mddev->sysfs_state);
8647  	if (!mddev->has_superblocks)
8648  		return true;
8649  	wait_event(mddev->sb_wait,
8650  		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) ||
8651  		   is_md_suspended(mddev));
8652  	if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
8653  		percpu_ref_put(&mddev->writes_pending);
8654  		return false;
8655  	}
8656  	return true;
8657  }
8658  EXPORT_SYMBOL(md_write_start);
8659  
8660  /* md_write_inc can only be called when md_write_start() has
8661   * already been called at least once for the current request.
8662   * It increments the counter and is useful when a single request
8663   * is split into several parts.  Each part causes an increment and
8664   * so needs a matching md_write_end().
8665   * Unlike md_write_start(), it is safe to call md_write_inc() inside
8666   * a spinlocked region.
8667   */
8668  void md_write_inc(struct mddev *mddev, struct bio *bi)
8669  {
8670  	if (bio_data_dir(bi) != WRITE)
8671  		return;
8672  	WARN_ON_ONCE(mddev->in_sync || !md_is_rdwr(mddev));
8673  	percpu_ref_get(&mddev->writes_pending);
8674  }
8675  EXPORT_SYMBOL(md_write_inc);
8676  
8677  void md_write_end(struct mddev *mddev)
8678  {
8679  	percpu_ref_put(&mddev->writes_pending);
8680  
8681  	if (mddev->safemode == 2)
8682  		md_wakeup_thread(mddev->thread);
8683  	else if (mddev->safemode_delay)
8684  		/* The roundup() ensures this only performs locking once
8685  		 * every ->safemode_delay jiffies
8686  		 */
8687  		mod_timer(&mddev->safemode_timer,
8688  			  roundup(jiffies, mddev->safemode_delay) +
8689  			  mddev->safemode_delay);
8690  }
8691  
8692  EXPORT_SYMBOL(md_write_end);
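
/*
 * Illustrative pairing of the three helpers above, as seen from a
 * personality's write path (a sketch, not a real caller):
 *
 *	if (!md_write_start(mddev, bio))
 *		return false;		(array suspended, write not recorded)
 *	...
 *	md_write_inc(mddev, bio);	(once for each additional split of the bio)
 *	...
 *	md_write_end(mddev);		(on completion, once per accounted part)
 */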
8693  
8694  /* This is used by raid0 and raid10 */
8695  void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
8696  			struct bio *bio, sector_t start, sector_t size)
8697  {
8698  	struct bio *discard_bio = NULL;
8699  
8700  	if (__blkdev_issue_discard(rdev->bdev, start, size, GFP_NOIO,
8701  			&discard_bio) || !discard_bio)
8702  		return;
8703  
8704  	bio_chain(discard_bio, bio);
8705  	bio_clone_blkg_association(discard_bio, bio);
8706  	if (mddev->gendisk)
8707  		trace_block_bio_remap(discard_bio,
8708  				disk_devt(mddev->gendisk),
8709  				bio->bi_iter.bi_sector);
8710  	submit_bio_noacct(discard_bio);
8711  }
8712  EXPORT_SYMBOL_GPL(md_submit_discard_bio);
8713  
8714  static void md_bitmap_start(struct mddev *mddev,
8715  			    struct md_io_clone *md_io_clone)
8716  {
8717  	if (mddev->pers->bitmap_sector)
8718  		mddev->pers->bitmap_sector(mddev, &md_io_clone->offset,
8719  					   &md_io_clone->sectors);
8720  
8721  	md_bitmap_startwrite(mddev->bitmap, md_io_clone->offset,
8722  			     md_io_clone->sectors);
8723  }
8724  
8725  static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone)
8726  {
8727  	md_bitmap_endwrite(mddev->bitmap, md_io_clone->offset,
8728  			   md_io_clone->sectors);
8729  }
8730  
8731  static void md_end_clone_io(struct bio *bio)
8732  {
8733  	struct md_io_clone *md_io_clone = bio->bi_private;
8734  	struct bio *orig_bio = md_io_clone->orig_bio;
8735  	struct mddev *mddev = md_io_clone->mddev;
8736  
8737  	if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap)
8738  		md_bitmap_end(mddev, md_io_clone);
8739  
8740  	if (bio->bi_status && !orig_bio->bi_status)
8741  		orig_bio->bi_status = bio->bi_status;
8742  
8743  	if (md_io_clone->start_time)
8744  		bio_end_io_acct(orig_bio, md_io_clone->start_time);
8745  
8746  	bio_put(bio);
8747  	bio_endio(orig_bio);
8748  	percpu_ref_put(&mddev->active_io);
8749  }
8750  
8751  static void md_clone_bio(struct mddev *mddev, struct bio **bio)
8752  {
8753  	struct block_device *bdev = (*bio)->bi_bdev;
8754  	struct md_io_clone *md_io_clone;
8755  	struct bio *clone =
8756  		bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set);
8757  
8758  	md_io_clone = container_of(clone, struct md_io_clone, bio_clone);
8759  	md_io_clone->orig_bio = *bio;
8760  	md_io_clone->mddev = mddev;
8761  	if (blk_queue_io_stat(bdev->bd_disk->queue))
8762  		md_io_clone->start_time = bio_start_io_acct(*bio);
8763  
8764  	if (bio_data_dir(*bio) == WRITE && mddev->bitmap) {
8765  		md_io_clone->offset = (*bio)->bi_iter.bi_sector;
8766  		md_io_clone->sectors = bio_sectors(*bio);
8767  		md_bitmap_start(mddev, md_io_clone);
8768  	}
8769  
8770  	clone->bi_end_io = md_end_clone_io;
8771  	clone->bi_private = md_io_clone;
8772  	*bio = clone;
8773  }
8774  
8775  void md_account_bio(struct mddev *mddev, struct bio **bio)
8776  {
8777  	percpu_ref_get(&mddev->active_io);
8778  	md_clone_bio(mddev, bio);
8779  }
8780  EXPORT_SYMBOL_GPL(md_account_bio);
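
/*
 * Lifecycle of the clone set up above: md_account_bio() takes a reference
 * on ->active_io and replaces the caller's bio with a clone whose
 * ->bi_end_io is md_end_clone_io().  When the clone completes, that
 * handler copies any error status back to the original bio, finishes the
 * bitmap write and IO accounting started in md_clone_bio(), ends the
 * original bio and drops the ->active_io reference.
 */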
8781  
8782  /* md_allow_write(mddev)
8783   * Calling this ensures that the array is marked 'active' so that writes
8784   * may proceed without blocking.  It is important to call this before
8785   * attempting a GFP_KERNEL allocation while holding the mddev lock.
8786   * Must be called with mddev_lock held.
8787   */
8788  void md_allow_write(struct mddev *mddev)
8789  {
8790  	if (!mddev->pers)
8791  		return;
8792  	if (!md_is_rdwr(mddev))
8793  		return;
8794  	if (!mddev->pers->sync_request)
8795  		return;
8796  
8797  	spin_lock(&mddev->lock);
8798  	if (mddev->in_sync) {
8799  		mddev->in_sync = 0;
8800  		set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
8801  		set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
8802  		if (mddev->safemode_delay &&
8803  		    mddev->safemode == 0)
8804  			mddev->safemode = 1;
8805  		spin_unlock(&mddev->lock);
8806  		md_update_sb(mddev, 0);
8807  		sysfs_notify_dirent_safe(mddev->sysfs_state);
8808  		/* wait for the dirty state to be recorded in the metadata */
8809  		wait_event(mddev->sb_wait,
8810  			   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
8811  	} else
8812  		spin_unlock(&mddev->lock);
8813  }
8814  EXPORT_SYMBOL_GPL(md_allow_write);
8815  
8816  #define SYNC_MARKS	10
8817  #define	SYNC_MARK_STEP	(3*HZ)
8818  #define UPDATE_FREQUENCY (5*60*HZ)
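
/*
 * Resync speed is averaged over a sliding window of SYNC_MARKS samples
 * taken every SYNC_MARK_STEP jiffies (roughly the last 30 seconds), and
 * curr_resync_completed is refreshed at least once every UPDATE_FREQUENCY
 * jiffies (5 minutes).
 */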
8819  void md_do_sync(struct md_thread *thread)
8820  {
8821  	struct mddev *mddev = thread->mddev;
8822  	struct mddev *mddev2;
8823  	unsigned int currspeed = 0, window;
8824  	sector_t max_sectors,j, io_sectors, recovery_done;
8825  	unsigned long mark[SYNC_MARKS];
8826  	unsigned long update_time;
8827  	sector_t mark_cnt[SYNC_MARKS];
8828  	int last_mark,m;
8829  	sector_t last_check;
8830  	int skipped = 0;
8831  	struct md_rdev *rdev;
8832  	char *desc, *action = NULL;
8833  	struct blk_plug plug;
8834  	int ret;
8835  
8836  	/* just in case the thread restarts... */
8837  	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
8838  	    test_bit(MD_RECOVERY_WAIT, &mddev->recovery))
8839  		return;
8840  	if (!md_is_rdwr(mddev)) {/* never try to sync a read-only array */
8841  		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8842  		return;
8843  	}
8844  
8845  	if (mddev_is_clustered(mddev)) {
8846  		ret = md_cluster_ops->resync_start(mddev);
8847  		if (ret)
8848  			goto skip;
8849  
8850  		set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags);
8851  		if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
8852  			test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ||
8853  			test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
8854  		     && ((unsigned long long)mddev->curr_resync_completed
8855  			 < (unsigned long long)mddev->resync_max_sectors))
8856  			goto skip;
8857  	}
8858  
8859  	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
8860  		if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
8861  			desc = "data-check";
8862  			action = "check";
8863  		} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
8864  			desc = "requested-resync";
8865  			action = "repair";
8866  		} else
8867  			desc = "resync";
8868  	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
8869  		desc = "reshape";
8870  	else
8871  		desc = "recovery";
8872  
8873  	mddev->last_sync_action = action ?: desc;
8874  
8875  	/*
8876  	 * Before starting a resync we must have set curr_resync to
8877  	 * MD_RESYNC_DELAYED (2), and then checked that every "conflicting" array
8878  	 * has curr_resync less than ours.  When we find one that is the same or
8879  	 * higher we wait on resync_wait.  To avoid deadlock, we reduce curr_resync to
8880  	 * MD_RESYNC_YIELDED (1) if we choose to yield (based arbitrarily on the mddev address).
8881  	 * This will mean we have to start checking from the beginning again.
8882  	 *
8883  	 */
8884  
8885  	do {
8886  		int mddev2_minor = -1;
8887  		mddev->curr_resync = MD_RESYNC_DELAYED;
8888  
8889  	try_again:
8890  		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8891  			goto skip;
8892  		spin_lock(&all_mddevs_lock);
8893  		list_for_each_entry(mddev2, &all_mddevs, all_mddevs) {
8894  			if (test_bit(MD_DELETED, &mddev2->flags))
8895  				continue;
8896  			if (mddev2 == mddev)
8897  				continue;
8898  			if (!mddev->parallel_resync
8899  			&&  mddev2->curr_resync
8900  			&&  match_mddev_units(mddev, mddev2)) {
8901  				DEFINE_WAIT(wq);
8902  				if (mddev < mddev2 &&
8903  				    mddev->curr_resync == MD_RESYNC_DELAYED) {
8904  					/* arbitrarily yield */
8905  					mddev->curr_resync = MD_RESYNC_YIELDED;
8906  					wake_up(&resync_wait);
8907  				}
8908  				if (mddev > mddev2 &&
8909  				    mddev->curr_resync == MD_RESYNC_YIELDED)
8910  					/* no need to wait here, we can wait the next
8911  					 * time 'round when curr_resync == MD_RESYNC_DELAYED (2)
8912  					 */
8913  					continue;
8914  				/* We need to wait 'interruptible' so as not to
8915  				 * contribute to the load average, and not to
8916  				 * be caught by 'softlockup'
8917  				 */
8918  				prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
8919  				if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
8920  				    mddev2->curr_resync >= mddev->curr_resync) {
8921  					if (mddev2_minor != mddev2->md_minor) {
8922  						mddev2_minor = mddev2->md_minor;
8923  						pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n",
8924  							desc, mdname(mddev),
8925  							mdname(mddev2));
8926  					}
8927  					spin_unlock(&all_mddevs_lock);
8928  
8929  					if (signal_pending(current))
8930  						flush_signals(current);
8931  					schedule();
8932  					finish_wait(&resync_wait, &wq);
8933  					goto try_again;
8934  				}
8935  				finish_wait(&resync_wait, &wq);
8936  			}
8937  		}
8938  		spin_unlock(&all_mddevs_lock);
8939  	} while (mddev->curr_resync < MD_RESYNC_DELAYED);
8940  
8941  	j = 0;
8942  	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
8943  		/* resync follows the size requested by the personality,
8944  		 * which defaults to physical size, but can be virtual size
8945  		 */
8946  		max_sectors = mddev->resync_max_sectors;
8947  		atomic64_set(&mddev->resync_mismatches, 0);
8948  		/* we don't use the checkpoint if there's a bitmap */
8949  		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
8950  			j = mddev->resync_min;
8951  		else if (!mddev->bitmap)
8952  			j = mddev->recovery_cp;
8953  
8954  	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
8955  		max_sectors = mddev->resync_max_sectors;
8956  		/*
8957  		 * If the original node aborts reshaping then we continue the
8958  		 * reshaping, so set j again to avoid restarting the reshape
8959  		 * from the very beginning
8960  		 */
8961  		if (mddev_is_clustered(mddev) &&
8962  		    mddev->reshape_position != MaxSector)
8963  			j = mddev->reshape_position;
8964  	} else {
8965  		/* recovery follows the physical size of devices */
8966  		max_sectors = mddev->dev_sectors;
8967  		j = MaxSector;
8968  		rcu_read_lock();
8969  		rdev_for_each_rcu(rdev, mddev)
8970  			if (rdev->raid_disk >= 0 &&
8971  			    !test_bit(Journal, &rdev->flags) &&
8972  			    !test_bit(Faulty, &rdev->flags) &&
8973  			    !test_bit(In_sync, &rdev->flags) &&
8974  			    rdev->recovery_offset < j)
8975  				j = rdev->recovery_offset;
8976  		rcu_read_unlock();
8977  
8978  		/* If there is a bitmap, we need to make sure all
8979  		 * writes that started before we added a spare
8980  		 * complete before we start doing a recovery.
8981  		 * Otherwise the write might complete and (via
8982  		 * bitmap_endwrite) set a bit in the bitmap after the
8983  		 * recovery has checked that bit and skipped that
8984  		 * region.
8985  		 */
8986  		if (mddev->bitmap) {
8987  			mddev->pers->quiesce(mddev, 1);
8988  			mddev->pers->quiesce(mddev, 0);
8989  		}
8990  	}
8991  
8992  	pr_info("md: %s of RAID array %s\n", desc, mdname(mddev));
8993  	pr_debug("md: minimum _guaranteed_  speed: %d KB/sec/disk.\n", speed_min(mddev));
8994  	pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n",
8995  		 speed_max(mddev), desc);
8996  
8997  	is_mddev_idle(mddev, 1); /* this initializes IO event counters */
8998  
8999  	io_sectors = 0;
9000  	for (m = 0; m < SYNC_MARKS; m++) {
9001  		mark[m] = jiffies;
9002  		mark_cnt[m] = io_sectors;
9003  	}
9004  	last_mark = 0;
9005  	mddev->resync_mark = mark[last_mark];
9006  	mddev->resync_mark_cnt = mark_cnt[last_mark];
9007  
9008  	/*
9009  	 * Tune reconstruction:
9010  	 */
9011  	window = 32 * (PAGE_SIZE / 512);
9012  	pr_debug("md: using %dk window, over a total of %lluk.\n",
9013  		 window/2, (unsigned long long)max_sectors/2);
9014  
9015  	atomic_set(&mddev->recovery_active, 0);
9016  	last_check = 0;
9017  
9018  	if (j >= MD_RESYNC_ACTIVE) {
9019  		pr_debug("md: resuming %s of %s from checkpoint.\n",
9020  			 desc, mdname(mddev));
9021  		mddev->curr_resync = j;
9022  	} else
9023  		mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */
9024  	mddev->curr_resync_completed = j;
9025  	sysfs_notify_dirent_safe(mddev->sysfs_completed);
9026  	md_new_event();
9027  	update_time = jiffies;
9028  
9029  	blk_start_plug(&plug);
9030  	while (j < max_sectors) {
9031  		sector_t sectors;
9032  
9033  		skipped = 0;
9034  
9035  		if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9036  		    ((mddev->curr_resync > mddev->curr_resync_completed &&
9037  		      (mddev->curr_resync - mddev->curr_resync_completed)
9038  		      > (max_sectors >> 4)) ||
9039  		     time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
9040  		     (j - mddev->curr_resync_completed)*2
9041  		     >= mddev->resync_max - mddev->curr_resync_completed ||
9042  		     mddev->curr_resync_completed > mddev->resync_max
9043  			    )) {
9044  			/* time to update curr_resync_completed */
9045  			wait_event(mddev->recovery_wait,
9046  				   atomic_read(&mddev->recovery_active) == 0);
9047  			mddev->curr_resync_completed = j;
9048  			if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
9049  			    j > mddev->recovery_cp)
9050  				mddev->recovery_cp = j;
9051  			update_time = jiffies;
9052  			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
9053  			sysfs_notify_dirent_safe(mddev->sysfs_completed);
9054  		}
9055  
9056  		while (j >= mddev->resync_max &&
9057  		       !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
9058  			/* As this condition is controlled by user-space,
9059  			 * we can block indefinitely, so use '_interruptible'
9060  			 * to avoid triggering warnings.
9061  			 */
9062  			flush_signals(current); /* just in case */
9063  			wait_event_interruptible(mddev->recovery_wait,
9064  						 mddev->resync_max > j
9065  						 || test_bit(MD_RECOVERY_INTR,
9066  							     &mddev->recovery));
9067  		}
9068  
9069  		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
9070  			break;
9071  
9072  		sectors = mddev->pers->sync_request(mddev, j, &skipped);
9073  		if (sectors == 0) {
9074  			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
9075  			break;
9076  		}
9077  
9078  		if (!skipped) { /* actual IO requested */
9079  			io_sectors += sectors;
9080  			atomic_add(sectors, &mddev->recovery_active);
9081  		}
9082  
9083  		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
9084  			break;
9085  
9086  		j += sectors;
9087  		if (j > max_sectors)
9088  			/* when skipping, extra large numbers can be returned. */
9089  			j = max_sectors;
9090  		if (j >= MD_RESYNC_ACTIVE)
9091  			mddev->curr_resync = j;
9092  		mddev->curr_mark_cnt = io_sectors;
9093  		if (last_check == 0)
9094  			/* this is the earliest that rebuild will be
9095  			 * visible in /proc/mdstat
9096  			 */
9097  			md_new_event();
9098  
9099  		if (last_check + window > io_sectors || j == max_sectors)
9100  			continue;
9101  
9102  		last_check = io_sectors;
9103  	repeat:
9104  		if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
9105  			/* step marks */
9106  			int next = (last_mark+1) % SYNC_MARKS;
9107  
9108  			mddev->resync_mark = mark[next];
9109  			mddev->resync_mark_cnt = mark_cnt[next];
9110  			mark[next] = jiffies;
9111  			mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
9112  			last_mark = next;
9113  		}
9114  
9115  		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
9116  			break;
9117  
9118  		/*
9119  		 * this loop exits only if either we are slower than the 'hard'
9120  		 * speed limit, or the system was IO-idle for
9121  		 * a jiffy.
9122  		 * the system might be non-idle CPU-wise, but we only care
9123  		 * about not overloading the IO subsystem. (things like an
9124  		 * e2fsck being done on the RAID array should execute fast)
9125  		 */
9126  		cond_resched();
9127  
9128  		recovery_done = io_sectors - atomic_read(&mddev->recovery_active);
9129  		currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2
9130  			/((jiffies-mddev->resync_mark)/HZ +1) +1;
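		/*
		 * Illustrative arithmetic: with 20480 sectors (10 MiB) completed
		 * since the last mark and 4 seconds elapsed, currspeed is
		 * 10240 / (4 + 1) + 1 = 2049 KB/sec.  The two "+1"s guard against
		 * a division by zero and a reported speed of zero right after a
		 * mark is taken.
		 */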
9131  
9132  		if (currspeed > speed_min(mddev)) {
9133  			if (currspeed > speed_max(mddev)) {
9134  				msleep(500);
9135  				goto repeat;
9136  			}
9137  			if (!is_mddev_idle(mddev, 0)) {
9138  				/*
9139  				 * Give other IO more of a chance.
9140  				 * The faster the devices, the less we wait.
9141  				 */
9142  				wait_event(mddev->recovery_wait,
9143  					   !atomic_read(&mddev->recovery_active));
9144  			}
9145  		}
9146  	}
9147  	pr_info("md: %s: %s %s.\n",mdname(mddev), desc,
9148  		test_bit(MD_RECOVERY_INTR, &mddev->recovery)
9149  		? "interrupted" : "done");
9150  	/*
9151  	 * this also signals 'finished resyncing' to md_stop
9152  	 */
9153  	blk_finish_plug(&plug);
9154  	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
9155  
9156  	if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9157  	    !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
9158  	    mddev->curr_resync >= MD_RESYNC_ACTIVE) {
9159  		mddev->curr_resync_completed = mddev->curr_resync;
9160  		sysfs_notify_dirent_safe(mddev->sysfs_completed);
9161  	}
9162  	mddev->pers->sync_request(mddev, max_sectors, &skipped);
9163  
9164  	if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
9165  	    mddev->curr_resync > MD_RESYNC_ACTIVE) {
9166  		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
9167  			if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
9168  				if (mddev->curr_resync >= mddev->recovery_cp) {
9169  					pr_debug("md: checkpointing %s of %s.\n",
9170  						 desc, mdname(mddev));
9171  					if (test_bit(MD_RECOVERY_ERROR,
9172  						&mddev->recovery))
9173  						mddev->recovery_cp =
9174  							mddev->curr_resync_completed;
9175  					else
9176  						mddev->recovery_cp =
9177  							mddev->curr_resync;
9178  				}
9179  			} else
9180  				mddev->recovery_cp = MaxSector;
9181  		} else {
9182  			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
9183  				mddev->curr_resync = MaxSector;
9184  			if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9185  			    test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) {
9186  				rcu_read_lock();
9187  				rdev_for_each_rcu(rdev, mddev)
9188  					if (rdev->raid_disk >= 0 &&
9189  					    mddev->delta_disks >= 0 &&
9190  					    !test_bit(Journal, &rdev->flags) &&
9191  					    !test_bit(Faulty, &rdev->flags) &&
9192  					    !test_bit(In_sync, &rdev->flags) &&
9193  					    rdev->recovery_offset < mddev->curr_resync)
9194  						rdev->recovery_offset = mddev->curr_resync;
9195  				rcu_read_unlock();
9196  			}
9197  		}
9198  	}
9199   skip:
9200  	/* set CHANGE_PENDING here since maybe another update is needed,
9201  	 * so other nodes are informed. It should be harmless for normal
9202  	 * (non-clustered) raid */
9203  	set_mask_bits(&mddev->sb_flags, 0,
9204  		      BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS));
9205  
9206  	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9207  			!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
9208  			mddev->delta_disks > 0 &&
9209  			mddev->pers->finish_reshape &&
9210  			mddev->pers->size &&
9211  			mddev->queue) {
9212  		mddev_lock_nointr(mddev);
9213  		md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
9214  		mddev_unlock(mddev);
9215  		if (!mddev_is_clustered(mddev))
9216  			set_capacity_and_notify(mddev->gendisk,
9217  						mddev->array_sectors);
9218  	}
9219  
9220  	spin_lock(&mddev->lock);
9221  	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
9222  		/* We completed so min/max setting can be forgotten if used. */
9223  		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
9224  			mddev->resync_min = 0;
9225  		mddev->resync_max = MaxSector;
9226  	} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
9227  		mddev->resync_min = mddev->curr_resync_completed;
9228  	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
9229  	mddev->curr_resync = MD_RESYNC_NONE;
9230  	spin_unlock(&mddev->lock);
9231  
9232  	wake_up(&resync_wait);
9233  	wake_up(&mddev->sb_wait);
9234  	md_wakeup_thread(mddev->thread);
9235  	return;
9236  }
9237  EXPORT_SYMBOL_GPL(md_do_sync);
9238  
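/*
 * Remove any Faulty, unblocked devices that have no pending IO, then try
 * to (re-)add usable spares via ->hot_add_disk().  Returns a count of
 * devices that recovery could make use of; when 'this' is non-NULL the
 * scan is restricted to that one rdev.
 */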
9239  static int remove_and_add_spares(struct mddev *mddev,
9240  				 struct md_rdev *this)
9241  {
9242  	struct md_rdev *rdev;
9243  	int spares = 0;
9244  	int removed = 0;
9245  	bool remove_some = false;
9246  
9247  	if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
9248  		/* Mustn't remove devices when resync thread is running */
9249  		return 0;
9250  
9251  	rdev_for_each(rdev, mddev) {
9252  		if ((this == NULL || rdev == this) &&
9253  		    rdev->raid_disk >= 0 &&
9254  		    !test_bit(Blocked, &rdev->flags) &&
9255  		    test_bit(Faulty, &rdev->flags) &&
9256  		    atomic_read(&rdev->nr_pending)==0) {
9257  			/* Faulty non-Blocked devices with nr_pending == 0
9258  			 * never get nr_pending incremented,
9259  			 * never get Faulty cleared, and never get Blocked set.
9260  			 * So we can synchronize_rcu now rather than once per device
9261  			 */
9262  			remove_some = true;
9263  			set_bit(RemoveSynchronized, &rdev->flags);
9264  		}
9265  	}
9266  
9267  	if (remove_some)
9268  		synchronize_rcu();
9269  	rdev_for_each(rdev, mddev) {
9270  		if ((this == NULL || rdev == this) &&
9271  		    rdev->raid_disk >= 0 &&
9272  		    !test_bit(Blocked, &rdev->flags) &&
9273  		    ((test_bit(RemoveSynchronized, &rdev->flags) ||
9274  		     (!test_bit(In_sync, &rdev->flags) &&
9275  		      !test_bit(Journal, &rdev->flags))) &&
9276  		    atomic_read(&rdev->nr_pending)==0)) {
9277  			if (mddev->pers->hot_remove_disk(
9278  				    mddev, rdev) == 0) {
9279  				sysfs_unlink_rdev(mddev, rdev);
9280  				rdev->saved_raid_disk = rdev->raid_disk;
9281  				rdev->raid_disk = -1;
9282  				removed++;
9283  			}
9284  		}
9285  		if (remove_some && test_bit(RemoveSynchronized, &rdev->flags))
9286  			clear_bit(RemoveSynchronized, &rdev->flags);
9287  	}
9288  
9289  	if (removed && mddev->kobj.sd)
9290  		sysfs_notify_dirent_safe(mddev->sysfs_degraded);
9291  
9292  	if (this && removed)
9293  		goto no_add;
9294  
9295  	rdev_for_each(rdev, mddev) {
9296  		if (this && this != rdev)
9297  			continue;
9298  		if (test_bit(Candidate, &rdev->flags))
9299  			continue;
9300  		if (rdev->raid_disk >= 0 &&
9301  		    !test_bit(In_sync, &rdev->flags) &&
9302  		    !test_bit(Journal, &rdev->flags) &&
9303  		    !test_bit(Faulty, &rdev->flags))
9304  			spares++;
9305  		if (rdev->raid_disk >= 0)
9306  			continue;
9307  		if (test_bit(Faulty, &rdev->flags))
9308  			continue;
9309  		if (!test_bit(Journal, &rdev->flags)) {
9310  			if (!md_is_rdwr(mddev) &&
9311  			    !(rdev->saved_raid_disk >= 0 &&
9312  			      !test_bit(Bitmap_sync, &rdev->flags)))
9313  				continue;
9314  
9315  			rdev->recovery_offset = 0;
9316  		}
9317  		if (mddev->pers->hot_add_disk(mddev, rdev) == 0) {
9318  			/* failure here is OK */
9319  			sysfs_link_rdev(mddev, rdev);
9320  			if (!test_bit(Journal, &rdev->flags))
9321  				spares++;
9322  			md_new_event();
9323  			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
9324  		}
9325  	}
9326  no_add:
9327  	if (removed)
9328  		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
9329  	return spares;
9330  }
9331  
9332  static void md_start_sync(struct work_struct *ws)
9333  {
9334  	struct mddev *mddev = container_of(ws, struct mddev, sync_work);
9335  
9336  	rcu_assign_pointer(mddev->sync_thread,
9337  			   md_register_thread(md_do_sync, mddev, "resync"));
9338  	if (!mddev->sync_thread) {
9339  		pr_warn("%s: could not start resync thread...\n",
9340  			mdname(mddev));
9341  		/* leave the spares where they are, it shouldn't hurt */
9342  		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
9343  		clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
9344  		clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
9345  		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
9346  		clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
9347  		wake_up(&resync_wait);
9348  		if (test_and_clear_bit(MD_RECOVERY_RECOVER,
9349  				       &mddev->recovery))
9350  			if (mddev->sysfs_action)
9351  				sysfs_notify_dirent_safe(mddev->sysfs_action);
9352  	} else
9353  		md_wakeup_thread(mddev->sync_thread);
9354  	sysfs_notify_dirent_safe(mddev->sysfs_action);
9355  	md_new_event();
9356  }
9357  
9358  /*
9359   * This routine is regularly called by all per-raid-array threads to
9360   * deal with generic issues like resync and super-block update.
9361   * Raid personalities that don't have a thread (linear/raid0) do not
9362   * need this as they never do any recovery or update the superblock.
9363   *
9364   * It does not do any resync itself, but rather "forks" off other threads
9365   * to do that as needed.
9366   * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
9367   * "->recovery" and create a thread at ->sync_thread.
9368   * When the thread finishes it sets MD_RECOVERY_DONE
9369   * and wakes up this thread which will reap the sync thread and finish up.
9370   * This thread also removes any faulty devices (with nr_pending == 0).
9371   *
9372   * The overall approach is:
9373   *  1/ if the superblock needs updating, update it.
9374   *  2/ If a recovery thread is running, don't do anything else.
9375   *  3/ If recovery has finished, clean up, possibly marking spares active.
9376   *  4/ If there are any faulty devices, remove them.
9377   *  5/ If array is degraded, try to add spare devices
9378   *  6/ If array has spares or is not in-sync, start a resync thread.
9379   */
9380  void md_check_recovery(struct mddev *mddev)
9381  {
9382  	if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) {
9383  		/* Write superblock - thread that called mddev_suspend()
9384  		 * holds reconfig_mutex for us.
9385  		 */
9386  		set_bit(MD_UPDATING_SB, &mddev->flags);
9387  		smp_mb__after_atomic();
9388  		if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags))
9389  			md_update_sb(mddev, 0);
9390  		clear_bit_unlock(MD_UPDATING_SB, &mddev->flags);
9391  		wake_up(&mddev->sb_wait);
9392  	}
9393  
9394  	if (is_md_suspended(mddev))
9395  		return;
9396  
9397  	if (mddev->bitmap)
9398  		md_bitmap_daemon_work(mddev);
9399  
9400  	if (signal_pending(current)) {
9401  		if (mddev->pers->sync_request && !mddev->external) {
9402  			pr_debug("md: %s in immediate safe mode\n",
9403  				 mdname(mddev));
9404  			mddev->safemode = 2;
9405  		}
9406  		flush_signals(current);
9407  	}
9408  
9409  	if (!md_is_rdwr(mddev) &&
9410  	    !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
9411  		return;
9412  	if ( ! (
9413  		(mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) ||
9414  		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
9415  		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
9416  		(mddev->external == 0 && mddev->safemode == 1) ||
9417  		(mddev->safemode == 2
9418  		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
9419  		))
9420  		return;
9421  
9422  	if (mddev_trylock(mddev)) {
9423  		int spares = 0;
9424  		bool try_set_sync = mddev->safemode != 0;
9425  
9426  		if (!mddev->external && mddev->safemode == 1)
9427  			mddev->safemode = 0;
9428  
9429  		if (!md_is_rdwr(mddev)) {
9430  			struct md_rdev *rdev;
9431  			if (!mddev->external && mddev->in_sync)
9432  				/* 'Blocked' flag not needed as failed devices
9433  				 * will be recorded if array switched to read/write.
9434  				 * Leaving it set will prevent the device
9435  				 * from being removed.
9436  				 */
9437  				rdev_for_each(rdev, mddev)
9438  					clear_bit(Blocked, &rdev->flags);
9439  			/* On a read-only array we can:
9440  			 * - remove failed devices
9441  			 * - add already-in_sync devices if the array itself
9442  			 *   is in-sync.
9443  			 * As we only add devices that are already in-sync,
9444  			 * we can activate the spares immediately.
9445  			 */
9446  			remove_and_add_spares(mddev, NULL);
9447  			/* There is no thread, but we need to call
9448  			 * ->spare_active and clear saved_raid_disk
9449  			 */
9450  			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
9451  			md_reap_sync_thread(mddev);
9452  			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9453  			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
9454  			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
9455  			goto unlock;
9456  		}
9457  
9458  		if (mddev_is_clustered(mddev)) {
9459  			struct md_rdev *rdev, *tmp;
9460  			/* kick the device if another node issued a
9461  			 * remove disk.
9462  			 */
9463  			rdev_for_each_safe(rdev, tmp, mddev) {
9464  				if (test_and_clear_bit(ClusterRemove, &rdev->flags) &&
9465  						rdev->raid_disk < 0)
9466  					md_kick_rdev_from_array(rdev);
9467  			}
9468  		}
9469  
9470  		if (try_set_sync && !mddev->external && !mddev->in_sync) {
9471  			spin_lock(&mddev->lock);
9472  			set_in_sync(mddev);
9473  			spin_unlock(&mddev->lock);
9474  		}
9475  
9476  		if (mddev->sb_flags)
9477  			md_update_sb(mddev, 0);
9478  
9479  		/*
9480  		 * Never start a new sync thread if MD_RECOVERY_RUNNING is
9481  		 * still set.
9482  		 */
9483  		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
9484  			if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
9485  				/* resync/recovery still happening */
9486  				clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
9487  				goto unlock;
9488  			}
9489  
9490  			if (WARN_ON_ONCE(!mddev->sync_thread))
9491  				goto unlock;
9492  
9493  			md_reap_sync_thread(mddev);
9494  			goto unlock;
9495  		}
9496  
9497  		/* Set RUNNING before clearing NEEDED to avoid
9498  		 * any transients in the value of "sync_action".
9499  		 */
9500  		mddev->curr_resync_completed = 0;
9501  		spin_lock(&mddev->lock);
9502  		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
9503  		spin_unlock(&mddev->lock);
9504  		/* Clear some bits that don't mean anything, but
9505  		 * might be left set
9506  		 */
9507  		clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
9508  		clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
9509  
9510  		if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
9511  		    test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
9512  			goto not_running;
9513  		/* no recovery is running.
9514  		 * remove any failed drives, then
9515  		 * add spares if possible.
9516  		 * Spares are also removed and re-added, to allow
9517  		 * the personality to fail the re-add.
9518  		 */
9519  
9520  		if (mddev->reshape_position != MaxSector) {
9521  			if (mddev->pers->check_reshape == NULL ||
9522  			    mddev->pers->check_reshape(mddev) != 0)
9523  				/* Cannot proceed */
9524  				goto not_running;
9525  			set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
9526  			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9527  		} else if ((spares = remove_and_add_spares(mddev, NULL))) {
9528  			clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
9529  			clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
9530  			clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
9531  			set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9532  		} else if (mddev->recovery_cp < MaxSector) {
9533  			set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
9534  			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9535  		} else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
9536  			/* nothing to be done ... */
9537  			goto not_running;
9538  
9539  		if (mddev->pers->sync_request) {
9540  			if (spares) {
9541  				/* We are adding a device or devices to an array
9542  				 * which has the bitmap stored on all devices.
9543  				 * So make sure all bitmap pages get written
9544  				 */
9545  				md_bitmap_write_all(mddev->bitmap);
9546  			}
9547  			queue_work(md_misc_wq, &mddev->sync_work);
9548  			goto unlock;
9549  		}
9550  	not_running:
9551  		if (!mddev->sync_thread) {
9552  			clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
9553  			wake_up(&resync_wait);
9554  			if (test_and_clear_bit(MD_RECOVERY_RECOVER,
9555  					       &mddev->recovery))
9556  				if (mddev->sysfs_action)
9557  					sysfs_notify_dirent_safe(mddev->sysfs_action);
9558  		}
9559  	unlock:
9560  		wake_up(&mddev->sb_wait);
9561  		mddev_unlock(mddev);
9562  	}
9563  }
9564  EXPORT_SYMBOL(md_check_recovery);
9565  
9566  void md_reap_sync_thread(struct mddev *mddev)
9567  {
9568  	struct md_rdev *rdev;
9569  	sector_t old_dev_sectors = mddev->dev_sectors;
9570  	bool is_reshaped = false;
9571  
9572  	/* resync has finished, collect result */
9573  	md_unregister_thread(mddev, &mddev->sync_thread);
9574  	atomic_inc(&mddev->sync_seq);
9575  
9576  	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
9577  	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
9578  	    mddev->degraded != mddev->raid_disks) {
9579  		/* success...*/
9580  		/* activate any spares */
9581  		if (mddev->pers->spare_active(mddev)) {
9582  			sysfs_notify_dirent_safe(mddev->sysfs_degraded);
9583  			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
9584  		}
9585  	}
9586  	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9587  	    mddev->pers->finish_reshape) {
9588  		mddev->pers->finish_reshape(mddev);
9589  		if (mddev_is_clustered(mddev))
9590  			is_reshaped = true;
9591  	}
9592  
9593  	/* If array is no-longer degraded, then any saved_raid_disk
9594  	 * information must be scrapped.
9595  	 */
9596  	if (!mddev->degraded)
9597  		rdev_for_each(rdev, mddev)
9598  			rdev->saved_raid_disk = -1;
9599  
9600  	md_update_sb(mddev, 1);
9601  	/* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can
9602  	 * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by
9603  	 * clustered raid */
9604  	if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags))
9605  		md_cluster_ops->resync_finish(mddev);
9606  	clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
9607  	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
9608  	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
9609  	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
9610  	clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
9611  	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
9612  	/*
9613  	 * We call md_cluster_ops->update_size here because sync_size could
9614  	 * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared,
9615  	 * so it is time to update size across cluster.
9616  	 */
9617  	if (mddev_is_clustered(mddev) && is_reshaped
9618  				      && !test_bit(MD_CLOSING, &mddev->flags))
9619  		md_cluster_ops->update_size(mddev, old_dev_sectors);
9620  	/* flag recovery needed just to double check */
9621  	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
9622  	sysfs_notify_dirent_safe(mddev->sysfs_completed);
9623  	sysfs_notify_dirent_safe(mddev->sysfs_action);
9624  	md_new_event();
9625  	if (mddev->event_work.func)
9626  		queue_work(md_misc_wq, &mddev->event_work);
9627  	wake_up(&resync_wait);
9628  }
9629  EXPORT_SYMBOL(md_reap_sync_thread);
9630  
9631  void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
9632  {
9633  	sysfs_notify_dirent_safe(rdev->sysfs_state);
9634  	wait_event_timeout(rdev->blocked_wait,
9635  			   !test_bit(Blocked, &rdev->flags) &&
9636  			   !test_bit(BlockedBadBlocks, &rdev->flags),
9637  			   msecs_to_jiffies(5000));
9638  	rdev_dec_pending(rdev, mddev);
9639  }
9640  EXPORT_SYMBOL(md_wait_for_blocked_rdev);
9641  
9642  void md_finish_reshape(struct mddev *mddev)
9643  {
9644  	/* called by the personality module when a reshape completes. */
9645  	struct md_rdev *rdev;
9646  
9647  	rdev_for_each(rdev, mddev) {
9648  		if (rdev->data_offset > rdev->new_data_offset)
9649  			rdev->sectors += rdev->data_offset - rdev->new_data_offset;
9650  		else
9651  			rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
9652  		rdev->data_offset = rdev->new_data_offset;
9653  	}
9654  }
9655  EXPORT_SYMBOL(md_finish_reshape);
9656  
9657  /* Bad block management */
9658  
9659  /* Returns 1 on success, 0 on failure */
9660  int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
9661  		       int is_new)
9662  {
9663  	struct mddev *mddev = rdev->mddev;
9664  	int rv;
9665  	if (is_new)
9666  		s += rdev->new_data_offset;
9667  	else
9668  		s += rdev->data_offset;
9669  	rv = badblocks_set(&rdev->badblocks, s, sectors, 0);
9670  	if (rv == 0) {
9671  		/* Make sure they get written out promptly */
9672  		if (test_bit(ExternalBbl, &rdev->flags))
9673  			sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks);
9674  		sysfs_notify_dirent_safe(rdev->sysfs_state);
9675  		set_mask_bits(&mddev->sb_flags, 0,
9676  			      BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
9677  		md_wakeup_thread(rdev->mddev->thread);
9678  		return 1;
9679  	} else
9680  		return 0;
9681  }
9682  EXPORT_SYMBOL_GPL(rdev_set_badblocks);
9683  
9684  int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
9685  			 int is_new)
9686  {
9687  	int rv;
9688  	if (is_new)
9689  		s += rdev->new_data_offset;
9690  	else
9691  		s += rdev->data_offset;
9692  	rv = badblocks_clear(&rdev->badblocks, s, sectors);
9693  	if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags))
9694  		sysfs_notify_dirent_safe(rdev->sysfs_badblocks);
9695  	return rv;
9696  }
9697  EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
9698  
9699  static int md_notify_reboot(struct notifier_block *this,
9700  			    unsigned long code, void *x)
9701  {
9702  	struct mddev *mddev, *n;
9703  	int need_delay = 0;
9704  
9705  	spin_lock(&all_mddevs_lock);
9706  	list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
9707  		if (!mddev_get(mddev))
9708  			continue;
9709  		spin_unlock(&all_mddevs_lock);
9710  		if (mddev_trylock(mddev)) {
9711  			if (mddev->pers)
9712  				__md_stop_writes(mddev);
9713  			if (mddev->persistent)
9714  				mddev->safemode = 2;
9715  			mddev_unlock(mddev);
9716  		}
9717  		need_delay = 1;
9718  		mddev_put(mddev);
9719  		spin_lock(&all_mddevs_lock);
9720  	}
9721  	spin_unlock(&all_mddevs_lock);
9722  
9723  	/*
9724  	 * certain more exotic SCSI devices are known to be
9725  	 * volatile wrt too early system reboots. While the
9726  	 * right place to handle this issue is the given
9727  	 * driver, we do want to have a safe RAID driver ...
9728  	 */
9729  	if (need_delay)
9730  		msleep(1000);
9731  
9732  	return NOTIFY_DONE;
9733  }
9734  
9735  static struct notifier_block md_notifier = {
9736  	.notifier_call	= md_notify_reboot,
9737  	.next		= NULL,
9738  	.priority	= INT_MAX, /* before any real devices */
9739  };
9740  
9741  static void md_geninit(void)
9742  {
9743  	pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
9744  
9745  	proc_create("mdstat", S_IRUGO, NULL, &mdstat_proc_ops);
9746  }
9747  
9748  static int __init md_init(void)
9749  {
9750  	int ret = -ENOMEM;
9751  
9752  	md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
9753  	if (!md_wq)
9754  		goto err_wq;
9755  
9756  	md_misc_wq = alloc_workqueue("md_misc", 0, 0);
9757  	if (!md_misc_wq)
9758  		goto err_misc_wq;
9759  
9760  	md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND,
9761  				       0);
9762  	if (!md_bitmap_wq)
9763  		goto err_bitmap_wq;
9764  
9765  	ret = __register_blkdev(MD_MAJOR, "md", md_probe);
9766  	if (ret < 0)
9767  		goto err_md;
9768  
9769  	ret = __register_blkdev(0, "mdp", md_probe);
9770  	if (ret < 0)
9771  		goto err_mdp;
9772  	mdp_major = ret;
9773  
9774  	register_reboot_notifier(&md_notifier);
9775  	raid_table_header = register_sysctl("dev/raid", raid_table);
9776  
9777  	md_geninit();
9778  	return 0;
9779  
9780  err_mdp:
9781  	unregister_blkdev(MD_MAJOR, "md");
9782  err_md:
9783  	destroy_workqueue(md_bitmap_wq);
9784  err_bitmap_wq:
9785  	destroy_workqueue(md_misc_wq);
9786  err_misc_wq:
9787  	destroy_workqueue(md_wq);
9788  err_wq:
9789  	return ret;
9790  }
9791  
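/*
 * Called on a cluster node after another node updated the superblock:
 * resize the array if the recorded size changed, activate or fail devices
 * whose role changed, adjust the number of raid disks, and pick up a
 * reshape that was started (or finished) remotely.
 */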
9792  static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
9793  {
9794  	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
9795  	struct md_rdev *rdev2, *tmp;
9796  	int role, ret;
9797  
9798  	/*
9799  	 * If size is changed in another node then we need to
9800  	 * do resize as well.
9801  	 */
9802  	if (mddev->dev_sectors != le64_to_cpu(sb->size)) {
9803  		ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
9804  		if (ret)
9805  			pr_info("md-cluster: resize failed\n");
9806  		else
9807  			md_bitmap_update_sb(mddev->bitmap);
9808  	}
9809  
9810  	/* Check for change of roles in the active devices */
9811  	rdev_for_each_safe(rdev2, tmp, mddev) {
9812  		if (test_bit(Faulty, &rdev2->flags))
9813  			continue;
9814  
9815  		/* Check if the roles changed */
9816  		role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);
9817  
9818  		if (test_bit(Candidate, &rdev2->flags)) {
9819  			if (role == MD_DISK_ROLE_FAULTY) {
9820  				pr_info("md: Removing Candidate device %pg because add failed\n",
9821  					rdev2->bdev);
9822  				md_kick_rdev_from_array(rdev2);
9823  				continue;
9824  			}
9825  			else
9826  				clear_bit(Candidate, &rdev2->flags);
9827  		}
9828  
9829  		if (role != rdev2->raid_disk) {
9830  			/*
9831  			 * the device just got activated on the other node, unless a reshape is happening.
9832  			 */
9833  			if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE &&
9834  			    !(le32_to_cpu(sb->feature_map) &
9835  			      MD_FEATURE_RESHAPE_ACTIVE)) {
9836  				rdev2->saved_raid_disk = role;
9837  				ret = remove_and_add_spares(mddev, rdev2);
9838  				pr_info("Activated spare: %pg\n",
9839  					rdev2->bdev);
9840  				/* wake up mddev->thread here, so the array can
9841  				 * perform a resync with the newly activated disk */
9842  				set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
9843  				md_wakeup_thread(mddev->thread);
9844  			}
9845  			/* device faulty
9846  			 * We just want to do the minimum to mark the disk
9847  			 * as faulty. The recovery is performed by the
9848  			 * one who initiated the error.
9849  			 */
9850  			if (role == MD_DISK_ROLE_FAULTY ||
9851  			    role == MD_DISK_ROLE_JOURNAL) {
9852  				md_error(mddev, rdev2);
9853  				clear_bit(Blocked, &rdev2->flags);
9854  			}
9855  		}
9856  	}
9857  
9858  	if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) {
9859  		ret = update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
9860  		if (ret)
9861  			pr_warn("md: updating array disks failed. %d\n", ret);
9862  	}
9863  
9864  	/*
9865  	 * Since mddev->delta_disks has already been updated in update_raid_disks,
9866  	 * it is time to check for reshape.
9867  	 */
9868  	if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
9869  	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
9870  		/*
9871  		 * reshape is happening in the remote node, we need to
9872  		 * update reshape_position and call start_reshape.
9873  		 */
9874  		mddev->reshape_position = le64_to_cpu(sb->reshape_position);
9875  		if (mddev->pers->update_reshape_pos)
9876  			mddev->pers->update_reshape_pos(mddev);
9877  		if (mddev->pers->start_reshape)
9878  			mddev->pers->start_reshape(mddev);
9879  	} else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
9880  		   mddev->reshape_position != MaxSector &&
9881  		   !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
9882  		/* reshape is just done in another node. */
9883  		mddev->reshape_position = MaxSector;
9884  		if (mddev->pers->update_reshape_pos)
9885  			mddev->pers->update_reshape_pos(mddev);
9886  	}
9887  
9888  	/* Finally set the event to be up to date */
9889  	mddev->events = le64_to_cpu(sb->events);
9890  }
9891  
9892  static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
9893  {
9894  	int err;
9895  	struct page *swapout = rdev->sb_page;
9896  	struct mdp_superblock_1 *sb;
9897  
9898  	/* Store the sb page of the rdev in the swapout temporary
9899  	 * variable in case we err in the future
9900  	 */
9901  	rdev->sb_page = NULL;
9902  	err = alloc_disk_sb(rdev);
9903  	if (err == 0) {
9904  		ClearPageUptodate(rdev->sb_page);
9905  		rdev->sb_loaded = 0;
9906  		err = super_types[mddev->major_version].
9907  			load_super(rdev, NULL, mddev->minor_version);
9908  	}
9909  	if (err < 0) {
9910  		pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
9911  				__func__, __LINE__, rdev->desc_nr, err);
9912  		if (rdev->sb_page)
9913  			put_page(rdev->sb_page);
9914  		rdev->sb_page = swapout;
9915  		rdev->sb_loaded = 1;
9916  		return err;
9917  	}
9918  
9919  	sb = page_address(rdev->sb_page);
9920  	/* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET
9921  	 * is not set
9922  	 */
9923  
9924  	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
9925  		rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
9926  
9927  	/* The other node finished recovery, call spare_active to set
9928  	 * device In_sync and mddev->degraded
9929  	 */
9930  	if (rdev->recovery_offset == MaxSector &&
9931  	    !test_bit(In_sync, &rdev->flags) &&
9932  	    mddev->pers->spare_active(mddev))
9933  		sysfs_notify_dirent_safe(mddev->sysfs_degraded);
9934  
9935  	put_page(swapout);
9936  	return 0;
9937  }
9938  
9939  void md_reload_sb(struct mddev *mddev, int nr)
9940  {
9941  	struct md_rdev *rdev = NULL, *iter;
9942  	int err;
9943  
9944  	/* Find the rdev */
9945  	rdev_for_each_rcu(iter, mddev) {
9946  		if (iter->desc_nr == nr) {
9947  			rdev = iter;
9948  			break;
9949  		}
9950  	}
9951  
9952  	if (!rdev) {
9953  		pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
9954  		return;
9955  	}
9956  
9957  	err = read_rdev(mddev, rdev);
9958  	if (err < 0)
9959  		return;
9960  
9961  	check_sb_changes(mddev, rdev);
9962  
9963  	/* Read all rdev's to update recovery_offset */
9964  	rdev_for_each_rcu(rdev, mddev) {
9965  		if (!test_bit(Faulty, &rdev->flags))
9966  			read_rdev(mddev, rdev);
9967  	}
9968  }
9969  EXPORT_SYMBOL(md_reload_sb);
9970  
9971  #ifndef MODULE
9972  
9973  /*
9974   * Searches all registered partitions for autorun RAID arrays
9975   * at boot time.
9976   */
9977  
9978  static DEFINE_MUTEX(detected_devices_mutex);
9979  static LIST_HEAD(all_detected_devices);
9980  struct detected_devices_node {
9981  	struct list_head list;
9982  	dev_t dev;
9983  };
9984  
9985  void md_autodetect_dev(dev_t dev)
9986  {
9987  	struct detected_devices_node *node_detected_dev;
9988  
9989  	node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
9990  	if (node_detected_dev) {
9991  		node_detected_dev->dev = dev;
9992  		mutex_lock(&detected_devices_mutex);
9993  		list_add_tail(&node_detected_dev->list, &all_detected_devices);
9994  		mutex_unlock(&detected_devices_mutex);
9995  	}
9996  }
9997  
9998  void md_autostart_arrays(int part)
9999  {
10000  	struct md_rdev *rdev;
10001  	struct detected_devices_node *node_detected_dev;
10002  	dev_t dev;
10003  	int i_scanned, i_passed;
10004  
10005  	i_scanned = 0;
10006  	i_passed = 0;
10007  
10008  	pr_info("md: Autodetecting RAID arrays.\n");
10009  
10010  	mutex_lock(&detected_devices_mutex);
10011  	while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
10012  		i_scanned++;
10013  		node_detected_dev = list_entry(all_detected_devices.next,
10014  					struct detected_devices_node, list);
10015  		list_del(&node_detected_dev->list);
10016  		dev = node_detected_dev->dev;
10017  		kfree(node_detected_dev);
10018  		mutex_unlock(&detected_devices_mutex);
10019  		rdev = md_import_device(dev,0, 90);
10020  		mutex_lock(&detected_devices_mutex);
10021  		if (IS_ERR(rdev))
10022  			continue;
10023  
10024  		if (test_bit(Faulty, &rdev->flags))
10025  			continue;
10026  
10027  		set_bit(AutoDetected, &rdev->flags);
10028  		list_add(&rdev->same_set, &pending_raid_disks);
10029  		i_passed++;
10030  	}
10031  	mutex_unlock(&detected_devices_mutex);
10032  
10033  	pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed);
10034  
10035  	autorun_devices(part);
10036  }
10037  
10038  #endif /* !MODULE */
10039  
10040  static __exit void md_exit(void)
10041  {
10042  	struct mddev *mddev, *n;
10043  	int delay = 1;
10044  
10045  	unregister_blkdev(MD_MAJOR,"md");
10046  	unregister_blkdev(mdp_major, "mdp");
10047  	unregister_reboot_notifier(&md_notifier);
10048  	unregister_sysctl_table(raid_table_header);
10049  
10050  	/* We cannot unload the modules while some process is
10051  	 * waiting for us in select() or poll() - wake them up
10052  	 */
10053  	md_unloading = 1;
10054  	while (waitqueue_active(&md_event_waiters)) {
10055  		/* not safe to leave yet */
10056  		wake_up(&md_event_waiters);
10057  		msleep(delay);
10058  		delay += delay;
10059  	}
10060  	remove_proc_entry("mdstat", NULL);
10061  
10062  	spin_lock(&all_mddevs_lock);
10063  	list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
10064  		if (!mddev_get(mddev))
10065  			continue;
10066  		spin_unlock(&all_mddevs_lock);
10067  		export_array(mddev);
10068  		mddev->ctime = 0;
10069  		mddev->hold_active = 0;
10070  		/*
10071  		 * As the mddev is now fully clear, mddev_put will schedule
10072  		 * the mddev for destruction by a workqueue, and the
10073  		 * destroy_workqueue() below will wait for that to complete.
10074  		 */
10075  		mddev_put(mddev);
10076  		spin_lock(&all_mddevs_lock);
10077  	}
10078  	spin_unlock(&all_mddevs_lock);
10079  
10080  	destroy_workqueue(md_misc_wq);
10081  	destroy_workqueue(md_bitmap_wq);
10082  	destroy_workqueue(md_wq);
10083  }
10084  
10085  subsys_initcall(md_init);
10086  module_exit(md_exit)
10087  
10088  static int get_ro(char *buffer, const struct kernel_param *kp)
10089  {
10090  	return sprintf(buffer, "%d\n", start_readonly);
10091  }
10092  static int set_ro(const char *val, const struct kernel_param *kp)
10093  {
10094  	return kstrtouint(val, 10, (unsigned int *)&start_readonly);
10095  }
10096  
10097  module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
10098  module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
10099  module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
10100  module_param(create_on_open, bool, S_IRUSR|S_IWUSR);
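
/*
 * These parameters can also be tuned at runtime through sysfs, e.g.
 * /sys/module/md_mod/parameters/start_ro (assuming the usual md_mod module
 * name); writing 1 there makes newly assembled arrays start auto-read-only
 * until the first write arrives.
 */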
10101  
10102  MODULE_LICENSE("GPL");
10103  MODULE_DESCRIPTION("MD RAID framework");
10104  MODULE_ALIAS("md");
10105  MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
10106