xref: /openbmc/linux/block/genhd.c (revision a44d924c)
1  /*
2   *  gendisk handling
3   */
4  
5  #include <linux/module.h>
6  #include <linux/fs.h>
7  #include <linux/genhd.h>
8  #include <linux/kdev_t.h>
9  #include <linux/kernel.h>
10  #include <linux/blkdev.h>
11  #include <linux/backing-dev.h>
12  #include <linux/init.h>
13  #include <linux/spinlock.h>
14  #include <linux/proc_fs.h>
15  #include <linux/seq_file.h>
16  #include <linux/slab.h>
17  #include <linux/kmod.h>
18  #include <linux/kobj_map.h>
19  #include <linux/mutex.h>
20  #include <linux/idr.h>
21  #include <linux/log2.h>
22  #include <linux/pm_runtime.h>
23  #include <linux/badblocks.h>
24  
25  #include "blk.h"
26  
27  static DEFINE_MUTEX(block_class_lock);
28  struct kobject *block_depr;
29  
30  /* for extended dynamic devt allocation, currently only one major is used */
31  #define NR_EXT_DEVT		(1 << MINORBITS)
32  
33  /* For extended devt allocation.  ext_devt_lock prevents look up
34   * results from going away underneath its user.
35   */
36  static DEFINE_SPINLOCK(ext_devt_lock);
37  static DEFINE_IDR(ext_devt_idr);
38  
39  static const struct device_type disk_type;
40  
41  static void disk_check_events(struct disk_events *ev,
42  			      unsigned int *clearing_ptr);
43  static void disk_alloc_events(struct gendisk *disk);
44  static void disk_add_events(struct gendisk *disk);
45  static void disk_del_events(struct gendisk *disk);
46  static void disk_release_events(struct gendisk *disk);
47  
48  void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw)
49  {
50  	if (queue_is_mq(q))
51  		return;
52  
53  	part_stat_local_inc(part, in_flight[rw]);
54  	if (part->partno)
55  		part_stat_local_inc(&part_to_disk(part)->part0, in_flight[rw]);
56  }
57  
58  void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw)
59  {
60  	if (queue_is_mq(q))
61  		return;
62  
63  	part_stat_local_dec(part, in_flight[rw]);
64  	if (part->partno)
65  		part_stat_local_dec(&part_to_disk(part)->part0, in_flight[rw]);
66  }
67  
68  unsigned int part_in_flight(struct request_queue *q, struct hd_struct *part)
69  {
70  	int cpu;
71  	unsigned int inflight;
72  
73  	if (queue_is_mq(q)) {
74  		return blk_mq_in_flight(q, part);
75  	}
76  
77  	inflight = 0;
78  	for_each_possible_cpu(cpu) {
79  		inflight += part_stat_local_read_cpu(part, in_flight[0], cpu) +
80  			    part_stat_local_read_cpu(part, in_flight[1], cpu);
81  	}
82  	if ((int)inflight < 0)
83  		inflight = 0;
84  
85  	return inflight;
86  }
87  
88  void part_in_flight_rw(struct request_queue *q, struct hd_struct *part,
89  		       unsigned int inflight[2])
90  {
91  	int cpu;
92  
93  	if (queue_is_mq(q)) {
94  		blk_mq_in_flight_rw(q, part, inflight);
95  		return;
96  	}
97  
98  	inflight[0] = 0;
99  	inflight[1] = 0;
100  	for_each_possible_cpu(cpu) {
101  		inflight[0] += part_stat_local_read_cpu(part, in_flight[0], cpu);
102  		inflight[1] += part_stat_local_read_cpu(part, in_flight[1], cpu);
103  	}
104  	if ((int)inflight[0] < 0)
105  		inflight[0] = 0;
106  	if ((int)inflight[1] < 0)
107  		inflight[1] = 0;
108  }
109  
110  struct hd_struct *__disk_get_part(struct gendisk *disk, int partno)
111  {
112  	struct disk_part_tbl *ptbl = rcu_dereference(disk->part_tbl);
113  
114  	if (unlikely(partno < 0 || partno >= ptbl->len))
115  		return NULL;
116  	return rcu_dereference(ptbl->part[partno]);
117  }
118  
119  /**
120   * disk_get_part - get partition
121   * @disk: disk to look partition from
122   * @partno: partition number
123   *
124   * Look for partition @partno from @disk.  If found, increment
125   * reference count and return it.
126   *
127   * CONTEXT:
128   * Don't care.
129   *
130   * RETURNS:
131   * Pointer to the found partition on success, NULL if not found.
132   */
133  struct hd_struct *disk_get_part(struct gendisk *disk, int partno)
134  {
135  	struct hd_struct *part;
136  
137  	rcu_read_lock();
138  	part = __disk_get_part(disk, partno);
139  	if (part)
140  		get_device(part_to_dev(part));
141  	rcu_read_unlock();
142  
143  	return part;
144  }
145  EXPORT_SYMBOL_GPL(disk_get_part);
146  
147  /**
148   * disk_part_iter_init - initialize partition iterator
149   * @piter: iterator to initialize
150   * @disk: disk to iterate over
151   * @flags: DISK_PITER_* flags
152   *
153   * Initialize @piter so that it iterates over partitions of @disk.
154   *
155   * CONTEXT:
156   * Don't care.
157   */
158  void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk,
159  			  unsigned int flags)
160  {
161  	struct disk_part_tbl *ptbl;
162  
163  	rcu_read_lock();
164  	ptbl = rcu_dereference(disk->part_tbl);
165  
166  	piter->disk = disk;
167  	piter->part = NULL;
168  
169  	if (flags & DISK_PITER_REVERSE)
170  		piter->idx = ptbl->len - 1;
171  	else if (flags & (DISK_PITER_INCL_PART0 | DISK_PITER_INCL_EMPTY_PART0))
172  		piter->idx = 0;
173  	else
174  		piter->idx = 1;
175  
176  	piter->flags = flags;
177  
178  	rcu_read_unlock();
179  }
180  EXPORT_SYMBOL_GPL(disk_part_iter_init);
181  
182  /**
183   * disk_part_iter_next - proceed iterator to the next partition and return it
184   * @piter: iterator of interest
185   *
186   * Proceed @piter to the next partition and return it.
187   *
188   * CONTEXT:
189   * Don't care.
190   */
191  struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter)
192  {
193  	struct disk_part_tbl *ptbl;
194  	int inc, end;
195  
196  	/* put the last partition */
197  	disk_put_part(piter->part);
198  	piter->part = NULL;
199  
200  	/* get part_tbl */
201  	rcu_read_lock();
202  	ptbl = rcu_dereference(piter->disk->part_tbl);
203  
204  	/* determine iteration parameters */
205  	if (piter->flags & DISK_PITER_REVERSE) {
206  		inc = -1;
207  		if (piter->flags & (DISK_PITER_INCL_PART0 |
208  				    DISK_PITER_INCL_EMPTY_PART0))
209  			end = -1;
210  		else
211  			end = 0;
212  	} else {
213  		inc = 1;
214  		end = ptbl->len;
215  	}
216  
217  	/* iterate to the next partition */
218  	for (; piter->idx != end; piter->idx += inc) {
219  		struct hd_struct *part;
220  
221  		part = rcu_dereference(ptbl->part[piter->idx]);
222  		if (!part)
223  			continue;
224  		if (!part_nr_sects_read(part) &&
225  		    !(piter->flags & DISK_PITER_INCL_EMPTY) &&
226  		    !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 &&
227  		      piter->idx == 0))
228  			continue;
229  
230  		get_device(part_to_dev(part));
231  		piter->part = part;
232  		piter->idx += inc;
233  		break;
234  	}
235  
236  	rcu_read_unlock();
237  
238  	return piter->part;
239  }
240  EXPORT_SYMBOL_GPL(disk_part_iter_next);
241  
242  /**
243   * disk_part_iter_exit - finish up partition iteration
244   * @piter: iter of interest
245   *
246   * Called when iteration is over.  Cleans up @piter.
247   *
248   * CONTEXT:
249   * Don't care.
250   */
251  void disk_part_iter_exit(struct disk_part_iter *piter)
252  {
253  	disk_put_part(piter->part);
254  	piter->part = NULL;
255  }
256  EXPORT_SYMBOL_GPL(disk_part_iter_exit);
257  
258  static inline int sector_in_part(struct hd_struct *part, sector_t sector)
259  {
260  	return part->start_sect <= sector &&
261  		sector < part->start_sect + part_nr_sects_read(part);
262  }
263  
264  /**
265   * disk_map_sector_rcu - map sector to partition
266   * @disk: gendisk of interest
267   * @sector: sector to map
268   *
269   * Find out which partition @sector maps to on @disk.  This is
270   * primarily used for stats accounting.
271   *
272   * CONTEXT:
273   * RCU read locked.  The returned partition pointer is valid only
274   * while preemption is disabled.
275   *
276   * RETURNS:
277   * Found partition on success, part0 is returned if no partition matches
278   */
279  struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector)
280  {
281  	struct disk_part_tbl *ptbl;
282  	struct hd_struct *part;
283  	int i;
284  
285  	ptbl = rcu_dereference(disk->part_tbl);
286  
287  	part = rcu_dereference(ptbl->last_lookup);
288  	if (part && sector_in_part(part, sector))
289  		return part;
290  
291  	for (i = 1; i < ptbl->len; i++) {
292  		part = rcu_dereference(ptbl->part[i]);
293  
294  		if (part && sector_in_part(part, sector)) {
295  			rcu_assign_pointer(ptbl->last_lookup, part);
296  			return part;
297  		}
298  	}
299  	return &disk->part0;
300  }
301  EXPORT_SYMBOL_GPL(disk_map_sector_rcu);
302  
/*
 * Table of registered block majors: BLKDEV_MAJOR_HASH_SIZE buckets, each
 * a singly linked chain of blk_major_name entries.
 *
 * Can be deleted altogether. Later.
 */
#define BLKDEV_MAJOR_HASH_SIZE 255

struct blk_major_name {
	struct blk_major_name *next;
	int major;
	char name[16];
};

static struct blk_major_name *major_names[BLKDEV_MAJOR_HASH_SIZE];
313  
314  /* index in the above - for now: assume no multimajor ranges */
315  static inline int major_to_index(unsigned major)
316  {
317  	return major % BLKDEV_MAJOR_HASH_SIZE;
318  }
319  
#ifdef CONFIG_PROC_FS
/*
 * Print "<major> <name>" to @seqf for every registered entry whose major
 * equals @offset.  The chain is walked with block_class_lock held.
 */
void blkdev_show(struct seq_file *seqf, off_t offset)
{
	struct blk_major_name *entry;

	mutex_lock(&block_class_lock);
	entry = major_names[major_to_index(offset)];
	while (entry) {
		if (entry->major == offset)
			seq_printf(seqf, "%3d %s\n", entry->major, entry->name);
		entry = entry->next;
	}
	mutex_unlock(&block_class_lock);
}
#endif /* CONFIG_PROC_FS */
332  
333  /**
334   * register_blkdev - register a new block device
335   *
336   * @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If
337   *         @major = 0, try to allocate any unused major number.
338   * @name: the name of the new block device as a zero terminated string
339   *
340   * The @name must be unique within the system.
341   *
342   * The return value depends on the @major input parameter:
343   *
344   *  - if a major device number was requested in range [1..BLKDEV_MAJOR_MAX-1]
345   *    then the function returns zero on success, or a negative error code
346   *  - if any unused major number was requested with @major = 0 parameter
347   *    then the return value is the allocated major number in range
348   *    [1..BLKDEV_MAJOR_MAX-1] or a negative error code otherwise
349   *
350   * See Documentation/admin-guide/devices.txt for the list of allocated
351   * major numbers.
352   */
353  int register_blkdev(unsigned int major, const char *name)
354  {
355  	struct blk_major_name **n, *p;
356  	int index, ret = 0;
357  
358  	mutex_lock(&block_class_lock);
359  
360  	/* temporary */
361  	if (major == 0) {
362  		for (index = ARRAY_SIZE(major_names)-1; index > 0; index--) {
363  			if (major_names[index] == NULL)
364  				break;
365  		}
366  
367  		if (index == 0) {
368  			printk("register_blkdev: failed to get major for %s\n",
369  			       name);
370  			ret = -EBUSY;
371  			goto out;
372  		}
373  		major = index;
374  		ret = major;
375  	}
376  
377  	if (major >= BLKDEV_MAJOR_MAX) {
378  		pr_err("register_blkdev: major requested (%u) is greater than the maximum (%u) for %s\n",
379  		       major, BLKDEV_MAJOR_MAX-1, name);
380  
381  		ret = -EINVAL;
382  		goto out;
383  	}
384  
385  	p = kmalloc(sizeof(struct blk_major_name), GFP_KERNEL);
386  	if (p == NULL) {
387  		ret = -ENOMEM;
388  		goto out;
389  	}
390  
391  	p->major = major;
392  	strlcpy(p->name, name, sizeof(p->name));
393  	p->next = NULL;
394  	index = major_to_index(major);
395  
396  	for (n = &major_names[index]; *n; n = &(*n)->next) {
397  		if ((*n)->major == major)
398  			break;
399  	}
400  	if (!*n)
401  		*n = p;
402  	else
403  		ret = -EBUSY;
404  
405  	if (ret < 0) {
406  		printk("register_blkdev: cannot get major %u for %s\n",
407  		       major, name);
408  		kfree(p);
409  	}
410  out:
411  	mutex_unlock(&block_class_lock);
412  	return ret;
413  }
414  
415  EXPORT_SYMBOL(register_blkdev);
416  
417  void unregister_blkdev(unsigned int major, const char *name)
418  {
419  	struct blk_major_name **n;
420  	struct blk_major_name *p = NULL;
421  	int index = major_to_index(major);
422  
423  	mutex_lock(&block_class_lock);
424  	for (n = &major_names[index]; *n; n = &(*n)->next)
425  		if ((*n)->major == major)
426  			break;
427  	if (!*n || strcmp((*n)->name, name)) {
428  		WARN_ON(1);
429  	} else {
430  		p = *n;
431  		*n = p->next;
432  	}
433  	mutex_unlock(&block_class_lock);
434  	kfree(p);
435  }
436  
437  EXPORT_SYMBOL(unregister_blkdev);
438  
/* devt -> kobject map; set up in genhd_device_init(), used for region lookups */
static struct kobj_map *bdev_map;
440  
/**
 * blk_mangle_minor - scatter minor numbers apart
 * @minor: minor number to mangle
 *
 * When CONFIG_DEBUG_BLOCK_EXT_DEVT is enabled, mirror the low MINORBITS
 * bits of @minor (bit i trades places with bit MINORBITS - 1 - i) so that
 * consecutively allocated minors end up far apart.  Mangling twice gives
 * the original value.  Without the option @minor is returned unchanged.
 *
 * RETURNS:
 * Mangled value.
 *
 * CONTEXT:
 * Don't care.
 */
static int blk_mangle_minor(int minor)
{
#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT
	int lo_pos;

	for (lo_pos = 0; lo_pos < MINORBITS / 2; lo_pos++) {
		int hi_pos = MINORBITS - 1 - lo_pos;
		int shift = hi_pos - lo_pos;
		int lo_bit = minor & (1 << lo_pos);
		int hi_bit = minor & (1 << hi_pos);

		/* clear the pair, then put each bit in the other's place */
		minor ^= lo_bit | hi_bit;
		minor |= (lo_bit << shift) | (hi_bit >> shift);
	}
#endif
	return minor;
}
472  
473  /**
474   * blk_alloc_devt - allocate a dev_t for a partition
475   * @part: partition to allocate dev_t for
476   * @devt: out parameter for resulting dev_t
477   *
478   * Allocate a dev_t for block device.
479   *
480   * RETURNS:
481   * 0 on success, allocated dev_t is returned in *@devt.  -errno on
482   * failure.
483   *
484   * CONTEXT:
485   * Might sleep.
486   */
487  int blk_alloc_devt(struct hd_struct *part, dev_t *devt)
488  {
489  	struct gendisk *disk = part_to_disk(part);
490  	int idx;
491  
492  	/* in consecutive minor range? */
493  	if (part->partno < disk->minors) {
494  		*devt = MKDEV(disk->major, disk->first_minor + part->partno);
495  		return 0;
496  	}
497  
498  	/* allocate ext devt */
499  	idr_preload(GFP_KERNEL);
500  
501  	spin_lock_bh(&ext_devt_lock);
502  	idx = idr_alloc(&ext_devt_idr, part, 0, NR_EXT_DEVT, GFP_NOWAIT);
503  	spin_unlock_bh(&ext_devt_lock);
504  
505  	idr_preload_end();
506  	if (idx < 0)
507  		return idx == -ENOSPC ? -EBUSY : idx;
508  
509  	*devt = MKDEV(BLOCK_EXT_MAJOR, blk_mangle_minor(idx));
510  	return 0;
511  }
512  
513  /**
514   * blk_free_devt - free a dev_t
515   * @devt: dev_t to free
516   *
517   * Free @devt which was allocated using blk_alloc_devt().
518   *
519   * CONTEXT:
520   * Might sleep.
521   */
522  void blk_free_devt(dev_t devt)
523  {
524  	if (devt == MKDEV(0, 0))
525  		return;
526  
527  	if (MAJOR(devt) == BLOCK_EXT_MAJOR) {
528  		spin_lock_bh(&ext_devt_lock);
529  		idr_remove(&ext_devt_idr, blk_mangle_minor(MINOR(devt)));
530  		spin_unlock_bh(&ext_devt_lock);
531  	}
532  }
533  
534  static char *bdevt_str(dev_t devt, char *buf)
535  {
536  	if (MAJOR(devt) <= 0xff && MINOR(devt) <= 0xff) {
537  		char tbuf[BDEVT_SIZE];
538  		snprintf(tbuf, BDEVT_SIZE, "%02x%02x", MAJOR(devt), MINOR(devt));
539  		snprintf(buf, BDEVT_SIZE, "%-9s", tbuf);
540  	} else
541  		snprintf(buf, BDEVT_SIZE, "%03x:%05x", MAJOR(devt), MINOR(devt));
542  
543  	return buf;
544  }
545  
546  /*
547   * Register device numbers dev..(dev+range-1)
548   * range must be nonzero
549   * The hash chain is sorted on range, so that subranges can override.
550   */
551  void blk_register_region(dev_t devt, unsigned long range, struct module *module,
552  			 struct kobject *(*probe)(dev_t, int *, void *),
553  			 int (*lock)(dev_t, void *), void *data)
554  {
555  	kobj_map(bdev_map, devt, range, module, probe, lock, data);
556  }
557  
558  EXPORT_SYMBOL(blk_register_region);
559  
560  void blk_unregister_region(dev_t devt, unsigned long range)
561  {
562  	kobj_unmap(bdev_map, devt, range);
563  }
564  
565  EXPORT_SYMBOL(blk_unregister_region);
566  
567  static struct kobject *exact_match(dev_t devt, int *partno, void *data)
568  {
569  	struct gendisk *p = data;
570  
571  	return &disk_to_dev(p)->kobj;
572  }
573  
574  static int exact_lock(dev_t devt, void *data)
575  {
576  	struct gendisk *p = data;
577  
578  	if (!get_disk_and_module(p))
579  		return -1;
580  	return 0;
581  }
582  
583  static void register_disk(struct device *parent, struct gendisk *disk,
584  			  const struct attribute_group **groups)
585  {
586  	struct device *ddev = disk_to_dev(disk);
587  	struct block_device *bdev;
588  	struct disk_part_iter piter;
589  	struct hd_struct *part;
590  	int err;
591  
592  	ddev->parent = parent;
593  
594  	dev_set_name(ddev, "%s", disk->disk_name);
595  
596  	/* delay uevents, until we scanned partition table */
597  	dev_set_uevent_suppress(ddev, 1);
598  
599  	if (groups) {
600  		WARN_ON(ddev->groups);
601  		ddev->groups = groups;
602  	}
603  	if (device_add(ddev))
604  		return;
605  	if (!sysfs_deprecated) {
606  		err = sysfs_create_link(block_depr, &ddev->kobj,
607  					kobject_name(&ddev->kobj));
608  		if (err) {
609  			device_del(ddev);
610  			return;
611  		}
612  	}
613  
614  	/*
615  	 * avoid probable deadlock caused by allocating memory with
616  	 * GFP_KERNEL in runtime_resume callback of its all ancestor
617  	 * devices
618  	 */
619  	pm_runtime_set_memalloc_noio(ddev, true);
620  
621  	disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
622  	disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
623  
624  	if (disk->flags & GENHD_FL_HIDDEN) {
625  		dev_set_uevent_suppress(ddev, 0);
626  		return;
627  	}
628  
629  	/* No minors to use for partitions */
630  	if (!disk_part_scan_enabled(disk))
631  		goto exit;
632  
633  	/* No such device (e.g., media were just removed) */
634  	if (!get_capacity(disk))
635  		goto exit;
636  
637  	bdev = bdget_disk(disk, 0);
638  	if (!bdev)
639  		goto exit;
640  
641  	bdev->bd_invalidated = 1;
642  	err = blkdev_get(bdev, FMODE_READ, NULL);
643  	if (err < 0)
644  		goto exit;
645  	blkdev_put(bdev, FMODE_READ);
646  
647  exit:
648  	/* announce disk after possible partitions are created */
649  	dev_set_uevent_suppress(ddev, 0);
650  	kobject_uevent(&ddev->kobj, KOBJ_ADD);
651  
652  	/* announce possible partitions */
653  	disk_part_iter_init(&piter, disk, 0);
654  	while ((part = disk_part_iter_next(&piter)))
655  		kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
656  	disk_part_iter_exit(&piter);
657  
658  	err = sysfs_create_link(&ddev->kobj,
659  				&disk->queue->backing_dev_info->dev->kobj,
660  				"bdi");
661  	WARN_ON(err);
662  }
663  
664  /**
665   * __device_add_disk - add disk information to kernel list
666   * @parent: parent device for the disk
667   * @disk: per-device partitioning information
668   * @groups: Additional per-device sysfs groups
669   * @register_queue: register the queue if set to true
670   *
671   * This function registers the partitioning information in @disk
672   * with the kernel.
673   *
674   * FIXME: error handling
675   */
676  static void __device_add_disk(struct device *parent, struct gendisk *disk,
677  			      const struct attribute_group **groups,
678  			      bool register_queue)
679  {
680  	dev_t devt;
681  	int retval;
682  
683  	/* minors == 0 indicates to use ext devt from part0 and should
684  	 * be accompanied with EXT_DEVT flag.  Make sure all
685  	 * parameters make sense.
686  	 */
687  	WARN_ON(disk->minors && !(disk->major || disk->first_minor));
688  	WARN_ON(!disk->minors &&
689  		!(disk->flags & (GENHD_FL_EXT_DEVT | GENHD_FL_HIDDEN)));
690  
691  	disk->flags |= GENHD_FL_UP;
692  
693  	retval = blk_alloc_devt(&disk->part0, &devt);
694  	if (retval) {
695  		WARN_ON(1);
696  		return;
697  	}
698  	disk->major = MAJOR(devt);
699  	disk->first_minor = MINOR(devt);
700  
701  	disk_alloc_events(disk);
702  
703  	if (disk->flags & GENHD_FL_HIDDEN) {
704  		/*
705  		 * Don't let hidden disks show up in /proc/partitions,
706  		 * and don't bother scanning for partitions either.
707  		 */
708  		disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
709  		disk->flags |= GENHD_FL_NO_PART_SCAN;
710  	} else {
711  		int ret;
712  
713  		/* Register BDI before referencing it from bdev */
714  		disk_to_dev(disk)->devt = devt;
715  		ret = bdi_register_owner(disk->queue->backing_dev_info,
716  						disk_to_dev(disk));
717  		WARN_ON(ret);
718  		blk_register_region(disk_devt(disk), disk->minors, NULL,
719  				    exact_match, exact_lock, disk);
720  	}
721  	register_disk(parent, disk, groups);
722  	if (register_queue)
723  		blk_register_queue(disk);
724  
725  	/*
726  	 * Take an extra ref on queue which will be put on disk_release()
727  	 * so that it sticks around as long as @disk is there.
728  	 */
729  	WARN_ON_ONCE(!blk_get_queue(disk->queue));
730  
731  	disk_add_events(disk);
732  	blk_integrity_add(disk);
733  }
734  
735  void device_add_disk(struct device *parent, struct gendisk *disk,
736  		     const struct attribute_group **groups)
737  
738  {
739  	__device_add_disk(parent, disk, groups, true);
740  }
741  EXPORT_SYMBOL(device_add_disk);
742  
743  void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk)
744  {
745  	__device_add_disk(parent, disk, NULL, false);
746  }
747  EXPORT_SYMBOL(device_add_disk_no_queue_reg);
748  
749  void del_gendisk(struct gendisk *disk)
750  {
751  	struct disk_part_iter piter;
752  	struct hd_struct *part;
753  
754  	blk_integrity_del(disk);
755  	disk_del_events(disk);
756  
757  	/*
758  	 * Block lookups of the disk until all bdevs are unhashed and the
759  	 * disk is marked as dead (GENHD_FL_UP cleared).
760  	 */
761  	down_write(&disk->lookup_sem);
762  	/* invalidate stuff */
763  	disk_part_iter_init(&piter, disk,
764  			     DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
765  	while ((part = disk_part_iter_next(&piter))) {
766  		invalidate_partition(disk, part->partno);
767  		bdev_unhash_inode(part_devt(part));
768  		delete_partition(disk, part->partno);
769  	}
770  	disk_part_iter_exit(&piter);
771  
772  	invalidate_partition(disk, 0);
773  	bdev_unhash_inode(disk_devt(disk));
774  	set_capacity(disk, 0);
775  	disk->flags &= ~GENHD_FL_UP;
776  	up_write(&disk->lookup_sem);
777  
778  	if (!(disk->flags & GENHD_FL_HIDDEN))
779  		sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
780  	if (disk->queue) {
781  		/*
782  		 * Unregister bdi before releasing device numbers (as they can
783  		 * get reused and we'd get clashes in sysfs).
784  		 */
785  		if (!(disk->flags & GENHD_FL_HIDDEN))
786  			bdi_unregister(disk->queue->backing_dev_info);
787  		blk_unregister_queue(disk);
788  	} else {
789  		WARN_ON(1);
790  	}
791  
792  	if (!(disk->flags & GENHD_FL_HIDDEN))
793  		blk_unregister_region(disk_devt(disk), disk->minors);
794  
795  	kobject_put(disk->part0.holder_dir);
796  	kobject_put(disk->slave_dir);
797  
798  	part_stat_set_all(&disk->part0, 0);
799  	disk->part0.stamp = 0;
800  	if (!sysfs_deprecated)
801  		sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
802  	pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
803  	device_del(disk_to_dev(disk));
804  }
805  EXPORT_SYMBOL(del_gendisk);
806  
807  /* sysfs access to bad-blocks list. */
808  static ssize_t disk_badblocks_show(struct device *dev,
809  					struct device_attribute *attr,
810  					char *page)
811  {
812  	struct gendisk *disk = dev_to_disk(dev);
813  
814  	if (!disk->bb)
815  		return sprintf(page, "\n");
816  
817  	return badblocks_show(disk->bb, page, 0);
818  }
819  
820  static ssize_t disk_badblocks_store(struct device *dev,
821  					struct device_attribute *attr,
822  					const char *page, size_t len)
823  {
824  	struct gendisk *disk = dev_to_disk(dev);
825  
826  	if (!disk->bb)
827  		return -ENXIO;
828  
829  	return badblocks_store(disk->bb, page, len, 0);
830  }
831  
832  /**
833   * get_gendisk - get partitioning information for a given device
834   * @devt: device to get partitioning information for
835   * @partno: returned partition index
836   *
837   * This function gets the structure containing partitioning
838   * information for the given device @devt.
839   */
840  struct gendisk *get_gendisk(dev_t devt, int *partno)
841  {
842  	struct gendisk *disk = NULL;
843  
844  	if (MAJOR(devt) != BLOCK_EXT_MAJOR) {
845  		struct kobject *kobj;
846  
847  		kobj = kobj_lookup(bdev_map, devt, partno);
848  		if (kobj)
849  			disk = dev_to_disk(kobj_to_dev(kobj));
850  	} else {
851  		struct hd_struct *part;
852  
853  		spin_lock_bh(&ext_devt_lock);
854  		part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt)));
855  		if (part && get_disk_and_module(part_to_disk(part))) {
856  			*partno = part->partno;
857  			disk = part_to_disk(part);
858  		}
859  		spin_unlock_bh(&ext_devt_lock);
860  	}
861  
862  	if (!disk)
863  		return NULL;
864  
865  	/*
866  	 * Synchronize with del_gendisk() to not return disk that is being
867  	 * destroyed.
868  	 */
869  	down_read(&disk->lookup_sem);
870  	if (unlikely((disk->flags & GENHD_FL_HIDDEN) ||
871  		     !(disk->flags & GENHD_FL_UP))) {
872  		up_read(&disk->lookup_sem);
873  		put_disk_and_module(disk);
874  		disk = NULL;
875  	} else {
876  		up_read(&disk->lookup_sem);
877  	}
878  	return disk;
879  }
880  EXPORT_SYMBOL(get_gendisk);
881  
882  /**
883   * bdget_disk - do bdget() by gendisk and partition number
884   * @disk: gendisk of interest
885   * @partno: partition number
886   *
887   * Find partition @partno from @disk, do bdget() on it.
888   *
889   * CONTEXT:
890   * Don't care.
891   *
892   * RETURNS:
893   * Resulting block_device on success, NULL on failure.
894   */
895  struct block_device *bdget_disk(struct gendisk *disk, int partno)
896  {
897  	struct hd_struct *part;
898  	struct block_device *bdev = NULL;
899  
900  	part = disk_get_part(disk, partno);
901  	if (part)
902  		bdev = bdget(part_devt(part));
903  	disk_put_part(part);
904  
905  	return bdev;
906  }
907  EXPORT_SYMBOL(bdget_disk);
908  
909  /*
910   * print a full list of all partitions - intended for places where the root
911   * filesystem can't be mounted and thus to give the victim some idea of what
912   * went wrong
913   */
914  void __init printk_all_partitions(void)
915  {
916  	struct class_dev_iter iter;
917  	struct device *dev;
918  
919  	class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
920  	while ((dev = class_dev_iter_next(&iter))) {
921  		struct gendisk *disk = dev_to_disk(dev);
922  		struct disk_part_iter piter;
923  		struct hd_struct *part;
924  		char name_buf[BDEVNAME_SIZE];
925  		char devt_buf[BDEVT_SIZE];
926  
927  		/*
928  		 * Don't show empty devices or things that have been
929  		 * suppressed
930  		 */
931  		if (get_capacity(disk) == 0 ||
932  		    (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO))
933  			continue;
934  
935  		/*
936  		 * Note, unlike /proc/partitions, I am showing the
937  		 * numbers in hex - the same format as the root=
938  		 * option takes.
939  		 */
940  		disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
941  		while ((part = disk_part_iter_next(&piter))) {
942  			bool is_part0 = part == &disk->part0;
943  
944  			printk("%s%s %10llu %s %s", is_part0 ? "" : "  ",
945  			       bdevt_str(part_devt(part), devt_buf),
946  			       (unsigned long long)part_nr_sects_read(part) >> 1
947  			       , disk_name(disk, part->partno, name_buf),
948  			       part->info ? part->info->uuid : "");
949  			if (is_part0) {
950  				if (dev->parent && dev->parent->driver)
951  					printk(" driver: %s\n",
952  					      dev->parent->driver->name);
953  				else
954  					printk(" (driver?)\n");
955  			} else
956  				printk("\n");
957  		}
958  		disk_part_iter_exit(&piter);
959  	}
960  	class_dev_iter_exit(&iter);
961  }
962  
963  #ifdef CONFIG_PROC_FS
964  /* iterator */
965  static void *disk_seqf_start(struct seq_file *seqf, loff_t *pos)
966  {
967  	loff_t skip = *pos;
968  	struct class_dev_iter *iter;
969  	struct device *dev;
970  
971  	iter = kmalloc(sizeof(*iter), GFP_KERNEL);
972  	if (!iter)
973  		return ERR_PTR(-ENOMEM);
974  
975  	seqf->private = iter;
976  	class_dev_iter_init(iter, &block_class, NULL, &disk_type);
977  	do {
978  		dev = class_dev_iter_next(iter);
979  		if (!dev)
980  			return NULL;
981  	} while (skip--);
982  
983  	return dev_to_disk(dev);
984  }
985  
986  static void *disk_seqf_next(struct seq_file *seqf, void *v, loff_t *pos)
987  {
988  	struct device *dev;
989  
990  	(*pos)++;
991  	dev = class_dev_iter_next(seqf->private);
992  	if (dev)
993  		return dev_to_disk(dev);
994  
995  	return NULL;
996  }
997  
998  static void disk_seqf_stop(struct seq_file *seqf, void *v)
999  {
1000  	struct class_dev_iter *iter = seqf->private;
1001  
1002  	/* stop is called even after start failed :-( */
1003  	if (iter) {
1004  		class_dev_iter_exit(iter);
1005  		kfree(iter);
1006  		seqf->private = NULL;
1007  	}
1008  }
1009  
1010  static void *show_partition_start(struct seq_file *seqf, loff_t *pos)
1011  {
1012  	void *p;
1013  
1014  	p = disk_seqf_start(seqf, pos);
1015  	if (!IS_ERR_OR_NULL(p) && !*pos)
1016  		seq_puts(seqf, "major minor  #blocks  name\n\n");
1017  	return p;
1018  }
1019  
1020  static int show_partition(struct seq_file *seqf, void *v)
1021  {
1022  	struct gendisk *sgp = v;
1023  	struct disk_part_iter piter;
1024  	struct hd_struct *part;
1025  	char buf[BDEVNAME_SIZE];
1026  
1027  	/* Don't show non-partitionable removeable devices or empty devices */
1028  	if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
1029  				   (sgp->flags & GENHD_FL_REMOVABLE)))
1030  		return 0;
1031  	if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
1032  		return 0;
1033  
1034  	/* show the full disk and all non-0 size partitions of it */
1035  	disk_part_iter_init(&piter, sgp, DISK_PITER_INCL_PART0);
1036  	while ((part = disk_part_iter_next(&piter)))
1037  		seq_printf(seqf, "%4d  %7d %10llu %s\n",
1038  			   MAJOR(part_devt(part)), MINOR(part_devt(part)),
1039  			   (unsigned long long)part_nr_sects_read(part) >> 1,
1040  			   disk_name(sgp, part->partno, buf));
1041  	disk_part_iter_exit(&piter);
1042  
1043  	return 0;
1044  }
1045  
1046  static const struct seq_operations partitions_op = {
1047  	.start	= show_partition_start,
1048  	.next	= disk_seqf_next,
1049  	.stop	= disk_seqf_stop,
1050  	.show	= show_partition
1051  };
1052  #endif
1053  
1054  
1055  static struct kobject *base_probe(dev_t devt, int *partno, void *data)
1056  {
1057  	if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0)
1058  		/* Make old-style 2.4 aliases work */
1059  		request_module("block-major-%d", MAJOR(devt));
1060  	return NULL;
1061  }
1062  
1063  static int __init genhd_device_init(void)
1064  {
1065  	int error;
1066  
1067  	block_class.dev_kobj = sysfs_dev_block_kobj;
1068  	error = class_register(&block_class);
1069  	if (unlikely(error))
1070  		return error;
1071  	bdev_map = kobj_map_init(base_probe, &block_class_lock);
1072  	blk_dev_init();
1073  
1074  	register_blkdev(BLOCK_EXT_MAJOR, "blkext");
1075  
1076  	/* create top-level block dir */
1077  	if (!sysfs_deprecated)
1078  		block_depr = kobject_create_and_add("block", NULL);
1079  	return 0;
1080  }
1081  
1082  subsys_initcall(genhd_device_init);
1083  
1084  static ssize_t disk_range_show(struct device *dev,
1085  			       struct device_attribute *attr, char *buf)
1086  {
1087  	struct gendisk *disk = dev_to_disk(dev);
1088  
1089  	return sprintf(buf, "%d\n", disk->minors);
1090  }
1091  
1092  static ssize_t disk_ext_range_show(struct device *dev,
1093  				   struct device_attribute *attr, char *buf)
1094  {
1095  	struct gendisk *disk = dev_to_disk(dev);
1096  
1097  	return sprintf(buf, "%d\n", disk_max_parts(disk));
1098  }
1099  
1100  static ssize_t disk_removable_show(struct device *dev,
1101  				   struct device_attribute *attr, char *buf)
1102  {
1103  	struct gendisk *disk = dev_to_disk(dev);
1104  
1105  	return sprintf(buf, "%d\n",
1106  		       (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0));
1107  }
1108  
1109  static ssize_t disk_hidden_show(struct device *dev,
1110  				   struct device_attribute *attr, char *buf)
1111  {
1112  	struct gendisk *disk = dev_to_disk(dev);
1113  
1114  	return sprintf(buf, "%d\n",
1115  		       (disk->flags & GENHD_FL_HIDDEN ? 1 : 0));
1116  }
1117  
1118  static ssize_t disk_ro_show(struct device *dev,
1119  				   struct device_attribute *attr, char *buf)
1120  {
1121  	struct gendisk *disk = dev_to_disk(dev);
1122  
1123  	return sprintf(buf, "%d\n", get_disk_ro(disk) ? 1 : 0);
1124  }
1125  
1126  static ssize_t disk_capability_show(struct device *dev,
1127  				    struct device_attribute *attr, char *buf)
1128  {
1129  	struct gendisk *disk = dev_to_disk(dev);
1130  
1131  	return sprintf(buf, "%x\n", disk->flags);
1132  }
1133  
1134  static ssize_t disk_alignment_offset_show(struct device *dev,
1135  					  struct device_attribute *attr,
1136  					  char *buf)
1137  {
1138  	struct gendisk *disk = dev_to_disk(dev);
1139  
1140  	return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue));
1141  }
1142  
1143  static ssize_t disk_discard_alignment_show(struct device *dev,
1144  					   struct device_attribute *attr,
1145  					   char *buf)
1146  {
1147  	struct gendisk *disk = dev_to_disk(dev);
1148  
1149  	return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue));
1150  }
1151  
1152  static DEVICE_ATTR(range, 0444, disk_range_show, NULL);
1153  static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL);
1154  static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL);
1155  static DEVICE_ATTR(hidden, 0444, disk_hidden_show, NULL);
1156  static DEVICE_ATTR(ro, 0444, disk_ro_show, NULL);
1157  static DEVICE_ATTR(size, 0444, part_size_show, NULL);
1158  static DEVICE_ATTR(alignment_offset, 0444, disk_alignment_offset_show, NULL);
1159  static DEVICE_ATTR(discard_alignment, 0444, disk_discard_alignment_show, NULL);
1160  static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL);
1161  static DEVICE_ATTR(stat, 0444, part_stat_show, NULL);
1162  static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
1163  static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store);
1164  #ifdef CONFIG_FAIL_MAKE_REQUEST
1165  static struct device_attribute dev_attr_fail =
1166  	__ATTR(make-it-fail, 0644, part_fail_show, part_fail_store);
1167  #endif
1168  #ifdef CONFIG_FAIL_IO_TIMEOUT
1169  static struct device_attribute dev_attr_fail_timeout =
1170  	__ATTR(io-timeout-fail, 0644, part_timeout_show, part_timeout_store);
1171  #endif
1172  
1173  static struct attribute *disk_attrs[] = {
1174  	&dev_attr_range.attr,
1175  	&dev_attr_ext_range.attr,
1176  	&dev_attr_removable.attr,
1177  	&dev_attr_hidden.attr,
1178  	&dev_attr_ro.attr,
1179  	&dev_attr_size.attr,
1180  	&dev_attr_alignment_offset.attr,
1181  	&dev_attr_discard_alignment.attr,
1182  	&dev_attr_capability.attr,
1183  	&dev_attr_stat.attr,
1184  	&dev_attr_inflight.attr,
1185  	&dev_attr_badblocks.attr,
1186  #ifdef CONFIG_FAIL_MAKE_REQUEST
1187  	&dev_attr_fail.attr,
1188  #endif
1189  #ifdef CONFIG_FAIL_IO_TIMEOUT
1190  	&dev_attr_fail_timeout.attr,
1191  #endif
1192  	NULL
1193  };
1194  
1195  static umode_t disk_visible(struct kobject *kobj, struct attribute *a, int n)
1196  {
1197  	struct device *dev = container_of(kobj, typeof(*dev), kobj);
1198  	struct gendisk *disk = dev_to_disk(dev);
1199  
1200  	if (a == &dev_attr_badblocks.attr && !disk->bb)
1201  		return 0;
1202  	return a->mode;
1203  }
1204  
1205  static struct attribute_group disk_attr_group = {
1206  	.attrs = disk_attrs,
1207  	.is_visible = disk_visible,
1208  };
1209  
1210  static const struct attribute_group *disk_attr_groups[] = {
1211  	&disk_attr_group,
1212  	NULL
1213  };
1214  
1215  /**
1216   * disk_replace_part_tbl - replace disk->part_tbl in RCU-safe way
1217   * @disk: disk to replace part_tbl for
1218   * @new_ptbl: new part_tbl to install
1219   *
1220   * Replace disk->part_tbl with @new_ptbl in RCU-safe way.  The
1221   * original ptbl is freed using RCU callback.
1222   *
1223   * LOCKING:
1224   * Matching bd_mutex locked or the caller is the only user of @disk.
1225   */
1226  static void disk_replace_part_tbl(struct gendisk *disk,
1227  				  struct disk_part_tbl *new_ptbl)
1228  {
1229  	struct disk_part_tbl *old_ptbl =
1230  		rcu_dereference_protected(disk->part_tbl, 1);
1231  
1232  	rcu_assign_pointer(disk->part_tbl, new_ptbl);
1233  
1234  	if (old_ptbl) {
1235  		rcu_assign_pointer(old_ptbl->last_lookup, NULL);
1236  		kfree_rcu(old_ptbl, rcu_head);
1237  	}
1238  }
1239  
1240  /**
1241   * disk_expand_part_tbl - expand disk->part_tbl
1242   * @disk: disk to expand part_tbl for
1243   * @partno: expand such that this partno can fit in
1244   *
1245   * Expand disk->part_tbl such that @partno can fit in.  disk->part_tbl
1246   * uses RCU to allow unlocked dereferencing for stats and other stuff.
1247   *
1248   * LOCKING:
1249   * Matching bd_mutex locked or the caller is the only user of @disk.
1250   * Might sleep.
1251   *
1252   * RETURNS:
1253   * 0 on success, -errno on failure.
1254   */
1255  int disk_expand_part_tbl(struct gendisk *disk, int partno)
1256  {
1257  	struct disk_part_tbl *old_ptbl =
1258  		rcu_dereference_protected(disk->part_tbl, 1);
1259  	struct disk_part_tbl *new_ptbl;
1260  	int len = old_ptbl ? old_ptbl->len : 0;
1261  	int i, target;
1262  	size_t size;
1263  
1264  	/*
1265  	 * check for int overflow, since we can get here from blkpg_ioctl()
1266  	 * with a user passed 'partno'.
1267  	 */
1268  	target = partno + 1;
1269  	if (target < 0)
1270  		return -EINVAL;
1271  
1272  	/* disk_max_parts() is zero during initialization, ignore if so */
1273  	if (disk_max_parts(disk) && target > disk_max_parts(disk))
1274  		return -EINVAL;
1275  
1276  	if (target <= len)
1277  		return 0;
1278  
1279  	size = sizeof(*new_ptbl) + target * sizeof(new_ptbl->part[0]);
1280  	new_ptbl = kzalloc_node(size, GFP_KERNEL, disk->node_id);
1281  	if (!new_ptbl)
1282  		return -ENOMEM;
1283  
1284  	new_ptbl->len = target;
1285  
1286  	for (i = 0; i < len; i++)
1287  		rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]);
1288  
1289  	disk_replace_part_tbl(disk, new_ptbl);
1290  	return 0;
1291  }
1292  
1293  static void disk_release(struct device *dev)
1294  {
1295  	struct gendisk *disk = dev_to_disk(dev);
1296  
1297  	blk_free_devt(dev->devt);
1298  	disk_release_events(disk);
1299  	kfree(disk->random);
1300  	disk_replace_part_tbl(disk, NULL);
1301  	hd_free_part(&disk->part0);
1302  	if (disk->queue)
1303  		blk_put_queue(disk->queue);
1304  	kfree(disk);
1305  }
1306  struct class block_class = {
1307  	.name		= "block",
1308  };
1309  
1310  static char *block_devnode(struct device *dev, umode_t *mode,
1311  			   kuid_t *uid, kgid_t *gid)
1312  {
1313  	struct gendisk *disk = dev_to_disk(dev);
1314  
1315  	if (disk->devnode)
1316  		return disk->devnode(disk, mode);
1317  	return NULL;
1318  }
1319  
1320  static const struct device_type disk_type = {
1321  	.name		= "disk",
1322  	.groups		= disk_attr_groups,
1323  	.release	= disk_release,
1324  	.devnode	= block_devnode,
1325  };
1326  
1327  #ifdef CONFIG_PROC_FS
1328  /*
1329   * aggregate disk stat collector.  Uses the same stats that the sysfs
1330   * entries do, above, but makes them available through one seq_file.
1331   *
1332   * The output looks suspiciously like /proc/partitions with a bunch of
1333   * extra fields.
1334   */
1335  static int diskstats_show(struct seq_file *seqf, void *v)
1336  {
1337  	struct gendisk *gp = v;
1338  	struct disk_part_iter piter;
1339  	struct hd_struct *hd;
1340  	char buf[BDEVNAME_SIZE];
1341  	unsigned int inflight;
1342  
1343  	/*
1344  	if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next)
1345  		seq_puts(seqf,	"major minor name"
1346  				"     rio rmerge rsect ruse wio wmerge "
1347  				"wsect wuse running use aveq"
1348  				"\n\n");
1349  	*/
1350  
1351  	disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0);
1352  	while ((hd = disk_part_iter_next(&piter))) {
1353  		inflight = part_in_flight(gp->queue, hd);
1354  		seq_printf(seqf, "%4d %7d %s "
1355  			   "%lu %lu %lu %u "
1356  			   "%lu %lu %lu %u "
1357  			   "%u %u %u "
1358  			   "%lu %lu %lu %u\n",
1359  			   MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
1360  			   disk_name(gp, hd->partno, buf),
1361  			   part_stat_read(hd, ios[STAT_READ]),
1362  			   part_stat_read(hd, merges[STAT_READ]),
1363  			   part_stat_read(hd, sectors[STAT_READ]),
1364  			   (unsigned int)part_stat_read_msecs(hd, STAT_READ),
1365  			   part_stat_read(hd, ios[STAT_WRITE]),
1366  			   part_stat_read(hd, merges[STAT_WRITE]),
1367  			   part_stat_read(hd, sectors[STAT_WRITE]),
1368  			   (unsigned int)part_stat_read_msecs(hd, STAT_WRITE),
1369  			   inflight,
1370  			   jiffies_to_msecs(part_stat_read(hd, io_ticks)),
1371  			   jiffies_to_msecs(part_stat_read(hd, time_in_queue)),
1372  			   part_stat_read(hd, ios[STAT_DISCARD]),
1373  			   part_stat_read(hd, merges[STAT_DISCARD]),
1374  			   part_stat_read(hd, sectors[STAT_DISCARD]),
1375  			   (unsigned int)part_stat_read_msecs(hd, STAT_DISCARD)
1376  			);
1377  	}
1378  	disk_part_iter_exit(&piter);
1379  
1380  	return 0;
1381  }
1382  
1383  static const struct seq_operations diskstats_op = {
1384  	.start	= disk_seqf_start,
1385  	.next	= disk_seqf_next,
1386  	.stop	= disk_seqf_stop,
1387  	.show	= diskstats_show
1388  };
1389  
1390  static int __init proc_genhd_init(void)
1391  {
1392  	proc_create_seq("diskstats", 0, NULL, &diskstats_op);
1393  	proc_create_seq("partitions", 0, NULL, &partitions_op);
1394  	return 0;
1395  }
1396  module_init(proc_genhd_init);
1397  #endif /* CONFIG_PROC_FS */
1398  
1399  dev_t blk_lookup_devt(const char *name, int partno)
1400  {
1401  	dev_t devt = MKDEV(0, 0);
1402  	struct class_dev_iter iter;
1403  	struct device *dev;
1404  
1405  	class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
1406  	while ((dev = class_dev_iter_next(&iter))) {
1407  		struct gendisk *disk = dev_to_disk(dev);
1408  		struct hd_struct *part;
1409  
1410  		if (strcmp(dev_name(dev), name))
1411  			continue;
1412  
1413  		if (partno < disk->minors) {
1414  			/* We need to return the right devno, even
1415  			 * if the partition doesn't exist yet.
1416  			 */
1417  			devt = MKDEV(MAJOR(dev->devt),
1418  				     MINOR(dev->devt) + partno);
1419  			break;
1420  		}
1421  		part = disk_get_part(disk, partno);
1422  		if (part) {
1423  			devt = part_devt(part);
1424  			disk_put_part(part);
1425  			break;
1426  		}
1427  		disk_put_part(part);
1428  	}
1429  	class_dev_iter_exit(&iter);
1430  	return devt;
1431  }
1432  EXPORT_SYMBOL(blk_lookup_devt);
1433  
1434  struct gendisk *__alloc_disk_node(int minors, int node_id)
1435  {
1436  	struct gendisk *disk;
1437  	struct disk_part_tbl *ptbl;
1438  
1439  	if (minors > DISK_MAX_PARTS) {
1440  		printk(KERN_ERR
1441  			"block: can't allocate more than %d partitions\n",
1442  			DISK_MAX_PARTS);
1443  		minors = DISK_MAX_PARTS;
1444  	}
1445  
1446  	disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id);
1447  	if (disk) {
1448  		if (!init_part_stats(&disk->part0)) {
1449  			kfree(disk);
1450  			return NULL;
1451  		}
1452  		init_rwsem(&disk->lookup_sem);
1453  		disk->node_id = node_id;
1454  		if (disk_expand_part_tbl(disk, 0)) {
1455  			free_part_stats(&disk->part0);
1456  			kfree(disk);
1457  			return NULL;
1458  		}
1459  		ptbl = rcu_dereference_protected(disk->part_tbl, 1);
1460  		rcu_assign_pointer(ptbl->part[0], &disk->part0);
1461  
1462  		/*
1463  		 * set_capacity() and get_capacity() currently don't use
1464  		 * seqcounter to read/update the part0->nr_sects. Still init
1465  		 * the counter as we can read the sectors in IO submission
1466  		 * patch using seqence counters.
1467  		 *
1468  		 * TODO: Ideally set_capacity() and get_capacity() should be
1469  		 * converted to make use of bd_mutex and sequence counters.
1470  		 */
1471  		seqcount_init(&disk->part0.nr_sects_seq);
1472  		if (hd_ref_init(&disk->part0)) {
1473  			hd_free_part(&disk->part0);
1474  			kfree(disk);
1475  			return NULL;
1476  		}
1477  
1478  		disk->minors = minors;
1479  		rand_initialize_disk(disk);
1480  		disk_to_dev(disk)->class = &block_class;
1481  		disk_to_dev(disk)->type = &disk_type;
1482  		device_initialize(disk_to_dev(disk));
1483  	}
1484  	return disk;
1485  }
1486  EXPORT_SYMBOL(__alloc_disk_node);
1487  
1488  struct kobject *get_disk_and_module(struct gendisk *disk)
1489  {
1490  	struct module *owner;
1491  	struct kobject *kobj;
1492  
1493  	if (!disk->fops)
1494  		return NULL;
1495  	owner = disk->fops->owner;
1496  	if (owner && !try_module_get(owner))
1497  		return NULL;
1498  	kobj = kobject_get_unless_zero(&disk_to_dev(disk)->kobj);
1499  	if (kobj == NULL) {
1500  		module_put(owner);
1501  		return NULL;
1502  	}
1503  	return kobj;
1504  
1505  }
1506  EXPORT_SYMBOL(get_disk_and_module);
1507  
1508  void put_disk(struct gendisk *disk)
1509  {
1510  	if (disk)
1511  		kobject_put(&disk_to_dev(disk)->kobj);
1512  }
1513  EXPORT_SYMBOL(put_disk);
1514  
1515  /*
1516   * This is a counterpart of get_disk_and_module() and thus also of
1517   * get_gendisk().
1518   */
1519  void put_disk_and_module(struct gendisk *disk)
1520  {
1521  	if (disk) {
1522  		struct module *owner = disk->fops->owner;
1523  
1524  		put_disk(disk);
1525  		module_put(owner);
1526  	}
1527  }
1528  EXPORT_SYMBOL(put_disk_and_module);
1529  
1530  static void set_disk_ro_uevent(struct gendisk *gd, int ro)
1531  {
1532  	char event[] = "DISK_RO=1";
1533  	char *envp[] = { event, NULL };
1534  
1535  	if (!ro)
1536  		event[8] = '0';
1537  	kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp);
1538  }
1539  
1540  void set_device_ro(struct block_device *bdev, int flag)
1541  {
1542  	bdev->bd_part->policy = flag;
1543  }
1544  
1545  EXPORT_SYMBOL(set_device_ro);
1546  
1547  void set_disk_ro(struct gendisk *disk, int flag)
1548  {
1549  	struct disk_part_iter piter;
1550  	struct hd_struct *part;
1551  
1552  	if (disk->part0.policy != flag) {
1553  		set_disk_ro_uevent(disk, flag);
1554  		disk->part0.policy = flag;
1555  	}
1556  
1557  	disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
1558  	while ((part = disk_part_iter_next(&piter)))
1559  		part->policy = flag;
1560  	disk_part_iter_exit(&piter);
1561  }
1562  
1563  EXPORT_SYMBOL(set_disk_ro);
1564  
1565  int bdev_read_only(struct block_device *bdev)
1566  {
1567  	if (!bdev)
1568  		return 0;
1569  	return bdev->bd_part->policy;
1570  }
1571  
1572  EXPORT_SYMBOL(bdev_read_only);
1573  
1574  int invalidate_partition(struct gendisk *disk, int partno)
1575  {
1576  	int res = 0;
1577  	struct block_device *bdev = bdget_disk(disk, partno);
1578  	if (bdev) {
1579  		fsync_bdev(bdev);
1580  		res = __invalidate_device(bdev, true);
1581  		bdput(bdev);
1582  	}
1583  	return res;
1584  }
1585  
1586  EXPORT_SYMBOL(invalidate_partition);
1587  
1588  /*
1589   * Disk events - monitor disk events like media change and eject request.
1590   */
1591  struct disk_events {
1592  	struct list_head	node;		/* all disk_event's */
1593  	struct gendisk		*disk;		/* the associated disk */
1594  	spinlock_t		lock;
1595  
1596  	struct mutex		block_mutex;	/* protects blocking */
1597  	int			block;		/* event blocking depth */
1598  	unsigned int		pending;	/* events already sent out */
1599  	unsigned int		clearing;	/* events being cleared */
1600  
1601  	long			poll_msecs;	/* interval, -1 for default */
1602  	struct delayed_work	dwork;
1603  };
1604  
1605  static const char *disk_events_strs[] = {
1606  	[ilog2(DISK_EVENT_MEDIA_CHANGE)]	= "media_change",
1607  	[ilog2(DISK_EVENT_EJECT_REQUEST)]	= "eject_request",
1608  };
1609  
1610  static char *disk_uevents[] = {
1611  	[ilog2(DISK_EVENT_MEDIA_CHANGE)]	= "DISK_MEDIA_CHANGE=1",
1612  	[ilog2(DISK_EVENT_EJECT_REQUEST)]	= "DISK_EJECT_REQUEST=1",
1613  };
1614  
/*
 * List of all disk_events.  disk_events_mutex is held while the list
 * is modified or walked.
 */
static DEFINE_MUTEX(disk_events_mutex);
static LIST_HEAD(disk_events);

/* default poll interval in msecs; 0 disables in-kernel polling */
static unsigned long disk_events_dfl_poll_msecs;
1621  
1622  static unsigned long disk_events_poll_jiffies(struct gendisk *disk)
1623  {
1624  	struct disk_events *ev = disk->ev;
1625  	long intv_msecs = 0;
1626  
1627  	/*
1628  	 * If device-specific poll interval is set, always use it.  If
1629  	 * the default is being used, poll iff there are events which
1630  	 * can't be monitored asynchronously.
1631  	 */
1632  	if (ev->poll_msecs >= 0)
1633  		intv_msecs = ev->poll_msecs;
1634  	else if (disk->events & ~disk->async_events)
1635  		intv_msecs = disk_events_dfl_poll_msecs;
1636  
1637  	return msecs_to_jiffies(intv_msecs);
1638  }
1639  
1640  /**
1641   * disk_block_events - block and flush disk event checking
1642   * @disk: disk to block events for
1643   *
1644   * On return from this function, it is guaranteed that event checking
1645   * isn't in progress and won't happen until unblocked by
1646   * disk_unblock_events().  Events blocking is counted and the actual
1647   * unblocking happens after the matching number of unblocks are done.
1648   *
1649   * Note that this intentionally does not block event checking from
1650   * disk_clear_events().
1651   *
1652   * CONTEXT:
1653   * Might sleep.
1654   */
1655  void disk_block_events(struct gendisk *disk)
1656  {
1657  	struct disk_events *ev = disk->ev;
1658  	unsigned long flags;
1659  	bool cancel;
1660  
1661  	if (!ev)
1662  		return;
1663  
1664  	/*
1665  	 * Outer mutex ensures that the first blocker completes canceling
1666  	 * the event work before further blockers are allowed to finish.
1667  	 */
1668  	mutex_lock(&ev->block_mutex);
1669  
1670  	spin_lock_irqsave(&ev->lock, flags);
1671  	cancel = !ev->block++;
1672  	spin_unlock_irqrestore(&ev->lock, flags);
1673  
1674  	if (cancel)
1675  		cancel_delayed_work_sync(&disk->ev->dwork);
1676  
1677  	mutex_unlock(&ev->block_mutex);
1678  }
1679  
1680  static void __disk_unblock_events(struct gendisk *disk, bool check_now)
1681  {
1682  	struct disk_events *ev = disk->ev;
1683  	unsigned long intv;
1684  	unsigned long flags;
1685  
1686  	spin_lock_irqsave(&ev->lock, flags);
1687  
1688  	if (WARN_ON_ONCE(ev->block <= 0))
1689  		goto out_unlock;
1690  
1691  	if (--ev->block)
1692  		goto out_unlock;
1693  
1694  	intv = disk_events_poll_jiffies(disk);
1695  	if (check_now)
1696  		queue_delayed_work(system_freezable_power_efficient_wq,
1697  				&ev->dwork, 0);
1698  	else if (intv)
1699  		queue_delayed_work(system_freezable_power_efficient_wq,
1700  				&ev->dwork, intv);
1701  out_unlock:
1702  	spin_unlock_irqrestore(&ev->lock, flags);
1703  }
1704  
1705  /**
1706   * disk_unblock_events - unblock disk event checking
1707   * @disk: disk to unblock events for
1708   *
1709   * Undo disk_block_events().  When the block count reaches zero, it
1710   * starts events polling if configured.
1711   *
1712   * CONTEXT:
1713   * Don't care.  Safe to call from irq context.
1714   */
1715  void disk_unblock_events(struct gendisk *disk)
1716  {
1717  	if (disk->ev)
1718  		__disk_unblock_events(disk, false);
1719  }
1720  
1721  /**
1722   * disk_flush_events - schedule immediate event checking and flushing
1723   * @disk: disk to check and flush events for
1724   * @mask: events to flush
1725   *
1726   * Schedule immediate event checking on @disk if not blocked.  Events in
1727   * @mask are scheduled to be cleared from the driver.  Note that this
1728   * doesn't clear the events from @disk->ev.
1729   *
1730   * CONTEXT:
1731   * If @mask is non-zero must be called with bdev->bd_mutex held.
1732   */
1733  void disk_flush_events(struct gendisk *disk, unsigned int mask)
1734  {
1735  	struct disk_events *ev = disk->ev;
1736  
1737  	if (!ev)
1738  		return;
1739  
1740  	spin_lock_irq(&ev->lock);
1741  	ev->clearing |= mask;
1742  	if (!ev->block)
1743  		mod_delayed_work(system_freezable_power_efficient_wq,
1744  				&ev->dwork, 0);
1745  	spin_unlock_irq(&ev->lock);
1746  }
1747  
1748  /**
1749   * disk_clear_events - synchronously check, clear and return pending events
1750   * @disk: disk to fetch and clear events from
1751   * @mask: mask of events to be fetched and cleared
1752   *
1753   * Disk events are synchronously checked and pending events in @mask
1754   * are cleared and returned.  This ignores the block count.
1755   *
1756   * CONTEXT:
1757   * Might sleep.
1758   */
1759  unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
1760  {
1761  	const struct block_device_operations *bdops = disk->fops;
1762  	struct disk_events *ev = disk->ev;
1763  	unsigned int pending;
1764  	unsigned int clearing = mask;
1765  
1766  	if (!ev) {
1767  		/* for drivers still using the old ->media_changed method */
1768  		if ((mask & DISK_EVENT_MEDIA_CHANGE) &&
1769  		    bdops->media_changed && bdops->media_changed(disk))
1770  			return DISK_EVENT_MEDIA_CHANGE;
1771  		return 0;
1772  	}
1773  
1774  	disk_block_events(disk);
1775  
1776  	/*
1777  	 * store the union of mask and ev->clearing on the stack so that the
1778  	 * race with disk_flush_events does not cause ambiguity (ev->clearing
1779  	 * can still be modified even if events are blocked).
1780  	 */
1781  	spin_lock_irq(&ev->lock);
1782  	clearing |= ev->clearing;
1783  	ev->clearing = 0;
1784  	spin_unlock_irq(&ev->lock);
1785  
1786  	disk_check_events(ev, &clearing);
1787  	/*
1788  	 * if ev->clearing is not 0, the disk_flush_events got called in the
1789  	 * middle of this function, so we want to run the workfn without delay.
1790  	 */
1791  	__disk_unblock_events(disk, ev->clearing ? true : false);
1792  
1793  	/* then, fetch and clear pending events */
1794  	spin_lock_irq(&ev->lock);
1795  	pending = ev->pending & mask;
1796  	ev->pending &= ~mask;
1797  	spin_unlock_irq(&ev->lock);
1798  	WARN_ON_ONCE(clearing & mask);
1799  
1800  	return pending;
1801  }
1802  
1803  /*
1804   * Separate this part out so that a different pointer for clearing_ptr can be
1805   * passed in for disk_clear_events.
1806   */
1807  static void disk_events_workfn(struct work_struct *work)
1808  {
1809  	struct delayed_work *dwork = to_delayed_work(work);
1810  	struct disk_events *ev = container_of(dwork, struct disk_events, dwork);
1811  
1812  	disk_check_events(ev, &ev->clearing);
1813  }
1814  
1815  static void disk_check_events(struct disk_events *ev,
1816  			      unsigned int *clearing_ptr)
1817  {
1818  	struct gendisk *disk = ev->disk;
1819  	char *envp[ARRAY_SIZE(disk_uevents) + 1] = { };
1820  	unsigned int clearing = *clearing_ptr;
1821  	unsigned int events;
1822  	unsigned long intv;
1823  	int nr_events = 0, i;
1824  
1825  	/* check events */
1826  	events = disk->fops->check_events(disk, clearing);
1827  
1828  	/* accumulate pending events and schedule next poll if necessary */
1829  	spin_lock_irq(&ev->lock);
1830  
1831  	events &= ~ev->pending;
1832  	ev->pending |= events;
1833  	*clearing_ptr &= ~clearing;
1834  
1835  	intv = disk_events_poll_jiffies(disk);
1836  	if (!ev->block && intv)
1837  		queue_delayed_work(system_freezable_power_efficient_wq,
1838  				&ev->dwork, intv);
1839  
1840  	spin_unlock_irq(&ev->lock);
1841  
1842  	/*
1843  	 * Tell userland about new events.  Only the events listed in
1844  	 * @disk->events are reported.  Unlisted events are processed the
1845  	 * same internally but never get reported to userland.
1846  	 */
1847  	for (i = 0; i < ARRAY_SIZE(disk_uevents); i++)
1848  		if (events & disk->events & (1 << i))
1849  			envp[nr_events++] = disk_uevents[i];
1850  
1851  	if (nr_events)
1852  		kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
1853  }
1854  
1855  /*
1856   * A disk events enabled device has the following sysfs nodes under
1857   * its /sys/block/X/ directory.
1858   *
1859   * events		: list of all supported events
1860   * events_async		: list of events which can be detected w/o polling
1861   * events_poll_msecs	: polling interval, 0: disable, -1: system default
1862   */
1863  static ssize_t __disk_events_show(unsigned int events, char *buf)
1864  {
1865  	const char *delim = "";
1866  	ssize_t pos = 0;
1867  	int i;
1868  
1869  	for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++)
1870  		if (events & (1 << i)) {
1871  			pos += sprintf(buf + pos, "%s%s",
1872  				       delim, disk_events_strs[i]);
1873  			delim = " ";
1874  		}
1875  	if (pos)
1876  		pos += sprintf(buf + pos, "\n");
1877  	return pos;
1878  }
1879  
1880  static ssize_t disk_events_show(struct device *dev,
1881  				struct device_attribute *attr, char *buf)
1882  {
1883  	struct gendisk *disk = dev_to_disk(dev);
1884  
1885  	return __disk_events_show(disk->events, buf);
1886  }
1887  
1888  static ssize_t disk_events_async_show(struct device *dev,
1889  				      struct device_attribute *attr, char *buf)
1890  {
1891  	struct gendisk *disk = dev_to_disk(dev);
1892  
1893  	return __disk_events_show(disk->async_events, buf);
1894  }
1895  
1896  static ssize_t disk_events_poll_msecs_show(struct device *dev,
1897  					   struct device_attribute *attr,
1898  					   char *buf)
1899  {
1900  	struct gendisk *disk = dev_to_disk(dev);
1901  
1902  	return sprintf(buf, "%ld\n", disk->ev->poll_msecs);
1903  }
1904  
1905  static ssize_t disk_events_poll_msecs_store(struct device *dev,
1906  					    struct device_attribute *attr,
1907  					    const char *buf, size_t count)
1908  {
1909  	struct gendisk *disk = dev_to_disk(dev);
1910  	long intv;
1911  
1912  	if (!count || !sscanf(buf, "%ld", &intv))
1913  		return -EINVAL;
1914  
1915  	if (intv < 0 && intv != -1)
1916  		return -EINVAL;
1917  
1918  	disk_block_events(disk);
1919  	disk->ev->poll_msecs = intv;
1920  	__disk_unblock_events(disk, true);
1921  
1922  	return count;
1923  }
1924  
1925  static const DEVICE_ATTR(events, 0444, disk_events_show, NULL);
1926  static const DEVICE_ATTR(events_async, 0444, disk_events_async_show, NULL);
1927  static const DEVICE_ATTR(events_poll_msecs, 0644,
1928  			 disk_events_poll_msecs_show,
1929  			 disk_events_poll_msecs_store);
1930  
1931  static const struct attribute *disk_events_attrs[] = {
1932  	&dev_attr_events.attr,
1933  	&dev_attr_events_async.attr,
1934  	&dev_attr_events_poll_msecs.attr,
1935  	NULL,
1936  };
1937  
1938  /*
1939   * The default polling interval can be specified by the kernel
1940   * parameter block.events_dfl_poll_msecs which defaults to 0
1941   * (disable).  This can also be modified runtime by writing to
1942   * /sys/module/block/events_dfl_poll_msecs.
1943   */
1944  static int disk_events_set_dfl_poll_msecs(const char *val,
1945  					  const struct kernel_param *kp)
1946  {
1947  	struct disk_events *ev;
1948  	int ret;
1949  
1950  	ret = param_set_ulong(val, kp);
1951  	if (ret < 0)
1952  		return ret;
1953  
1954  	mutex_lock(&disk_events_mutex);
1955  
1956  	list_for_each_entry(ev, &disk_events, node)
1957  		disk_flush_events(ev->disk, 0);
1958  
1959  	mutex_unlock(&disk_events_mutex);
1960  
1961  	return 0;
1962  }
1963  
1964  static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = {
1965  	.set	= disk_events_set_dfl_poll_msecs,
1966  	.get	= param_get_ulong,
1967  };
1968  
1969  #undef MODULE_PARAM_PREFIX
1970  #define MODULE_PARAM_PREFIX	"block."
1971  
1972  module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops,
1973  		&disk_events_dfl_poll_msecs, 0644);
1974  
1975  /*
1976   * disk_{alloc|add|del|release}_events - initialize and destroy disk_events.
1977   */
1978  static void disk_alloc_events(struct gendisk *disk)
1979  {
1980  	struct disk_events *ev;
1981  
1982  	if (!disk->fops->check_events)
1983  		return;
1984  
1985  	ev = kzalloc(sizeof(*ev), GFP_KERNEL);
1986  	if (!ev) {
1987  		pr_warn("%s: failed to initialize events\n", disk->disk_name);
1988  		return;
1989  	}
1990  
1991  	INIT_LIST_HEAD(&ev->node);
1992  	ev->disk = disk;
1993  	spin_lock_init(&ev->lock);
1994  	mutex_init(&ev->block_mutex);
1995  	ev->block = 1;
1996  	ev->poll_msecs = -1;
1997  	INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn);
1998  
1999  	disk->ev = ev;
2000  }
2001  
2002  static void disk_add_events(struct gendisk *disk)
2003  {
2004  	if (!disk->ev)
2005  		return;
2006  
2007  	/* FIXME: error handling */
2008  	if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_events_attrs) < 0)
2009  		pr_warn("%s: failed to create sysfs files for events\n",
2010  			disk->disk_name);
2011  
2012  	mutex_lock(&disk_events_mutex);
2013  	list_add_tail(&disk->ev->node, &disk_events);
2014  	mutex_unlock(&disk_events_mutex);
2015  
2016  	/*
2017  	 * Block count is initialized to 1 and the following initial
2018  	 * unblock kicks it into action.
2019  	 */
2020  	__disk_unblock_events(disk, true);
2021  }
2022  
2023  static void disk_del_events(struct gendisk *disk)
2024  {
2025  	if (!disk->ev)
2026  		return;
2027  
2028  	disk_block_events(disk);
2029  
2030  	mutex_lock(&disk_events_mutex);
2031  	list_del_init(&disk->ev->node);
2032  	mutex_unlock(&disk_events_mutex);
2033  
2034  	sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs);
2035  }
2036  
2037  static void disk_release_events(struct gendisk *disk)
2038  {
2039  	/* the block count should be 1 from disk_del_events() */
2040  	WARN_ON_ONCE(disk->ev && disk->ev->block != 1);
2041  	kfree(disk->ev);
2042  }
2043