xref: /openbmc/linux/block/bdev.c (revision 1b39e7607144337d752f36c2068ed79447462f99)
1  // SPDX-License-Identifier: GPL-2.0-only
2  /*
3   *  Copyright (C) 1991, 1992  Linus Torvalds
4   *  Copyright (C) 2001  Andrea Arcangeli <andrea@suse.de> SuSE
5   *  Copyright (C) 2016 - 2020 Christoph Hellwig
6   */
7  
8  #include <linux/init.h>
9  #include <linux/mm.h>
10  #include <linux/slab.h>
11  #include <linux/kmod.h>
12  #include <linux/major.h>
13  #include <linux/device_cgroup.h>
14  #include <linux/blkdev.h>
15  #include <linux/blk-integrity.h>
16  #include <linux/backing-dev.h>
17  #include <linux/module.h>
18  #include <linux/blkpg.h>
19  #include <linux/magic.h>
20  #include <linux/buffer_head.h>
21  #include <linux/swap.h>
22  #include <linux/writeback.h>
23  #include <linux/mount.h>
24  #include <linux/pseudo_fs.h>
25  #include <linux/uio.h>
26  #include <linux/namei.h>
27  #include <linux/part_stat.h>
28  #include <linux/uaccess.h>
29  #include <linux/stat.h>
30  #include "../fs/internal.h"
31  #include "blk.h"
32  
33  struct bdev_inode {
34  	struct block_device bdev;
35  	struct inode vfs_inode;
36  };
37  
38  static inline struct bdev_inode *BDEV_I(struct inode *inode)
39  {
40  	return container_of(inode, struct bdev_inode, vfs_inode);
41  }
42  
43  struct block_device *I_BDEV(struct inode *inode)
44  {
45  	return &BDEV_I(inode)->bdev;
46  }
47  EXPORT_SYMBOL(I_BDEV);
48  
49  static void bdev_write_inode(struct block_device *bdev)
50  {
51  	struct inode *inode = bdev->bd_inode;
52  	int ret;
53  
54  	spin_lock(&inode->i_lock);
55  	while (inode->i_state & I_DIRTY) {
56  		spin_unlock(&inode->i_lock);
57  		ret = write_inode_now(inode, true);
58  		if (ret)
59  			pr_warn_ratelimited(
60  	"VFS: Dirty inode writeback failed for block device %pg (err=%d).\n",
61  				bdev, ret);
62  		spin_lock(&inode->i_lock);
63  	}
64  	spin_unlock(&inode->i_lock);
65  }
66  
67  /* Kill _all_ buffers and pagecache, dirty or not. */
68  static void kill_bdev(struct block_device *bdev)
69  {
70  	struct address_space *mapping = bdev->bd_inode->i_mapping;
71  
72  	if (mapping_empty(mapping))
73  		return;
74  
75  	invalidate_bh_lrus();
76  	truncate_inode_pages(mapping, 0);
77  }
78  
79  /* Invalidate clean unused buffers and pagecache. */
80  void invalidate_bdev(struct block_device *bdev)
81  {
82  	struct address_space *mapping = bdev->bd_inode->i_mapping;
83  
84  	if (mapping->nrpages) {
85  		invalidate_bh_lrus();
86  		lru_add_drain_all();	/* make sure all lru add caches are flushed */
87  		invalidate_mapping_pages(mapping, 0, -1);
88  	}
89  }
90  EXPORT_SYMBOL(invalidate_bdev);
91  
92  /*
93   * Drop all buffers & page cache for given bdev range. This function bails
94   * with error if bdev has other exclusive owner (such as filesystem).
95   */
96  int truncate_bdev_range(struct block_device *bdev, fmode_t mode,
97  			loff_t lstart, loff_t lend)
98  {
99  	/*
100  	 * If we don't hold an exclusive handle for the device, upgrade to one
101  	 * while we discard the buffer cache, to avoid discarding buffers
102  	 * under a live filesystem.
103  	 */
104  	if (!(mode & FMODE_EXCL)) {
105  		int err = bd_prepare_to_claim(bdev, truncate_bdev_range);
106  		if (err)
107  			goto invalidate;
108  	}
109  
110  	truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend);
111  	if (!(mode & FMODE_EXCL))
112  		bd_abort_claiming(bdev, truncate_bdev_range);
113  	return 0;
114  
115  invalidate:
116  	/*
117  	 * Someone else already holds the device exclusively open. Try
118  	 * invalidating instead.
118  	 * The 'end' argument is inclusive so the rounding is safe.
119  	 */
120  	return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping,
121  					     lstart >> PAGE_SHIFT,
122  					     lend >> PAGE_SHIFT);
123  }
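
/*
 * Usage sketch (hypothetical helper, loosely modelled on the BLKDISCARD ioctl
 * path): drop the page cache over a byte range before punching it out of the
 * device.  The @lend passed to truncate_bdev_range() is inclusive, so the
 * caller converts a length into an inclusive end offset.
 */
static int __maybe_unused example_discard_prepare(struct block_device *bdev,
		fmode_t mode, loff_t start, loff_t len)
{
	loff_t end = start + len - 1;
	int err;

	filemap_invalidate_lock(bdev->bd_inode->i_mapping);
	err = truncate_bdev_range(bdev, mode, start, end);
	if (err)
		goto out;

	/* ... issue the actual discard/zeroout for [start, end] here ... */
out:
	filemap_invalidate_unlock(bdev->bd_inode->i_mapping);
	return err;
}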
124  
125  static void set_init_blocksize(struct block_device *bdev)
126  {
127  	unsigned int bsize = bdev_logical_block_size(bdev);
128  	loff_t size = i_size_read(bdev->bd_inode);
129  
130  	while (bsize < PAGE_SIZE) {
131  		if (size & bsize)
132  			break;
133  		bsize <<= 1;
134  	}
135  	bdev->bd_inode->i_blkbits = blksize_bits(bsize);
136  }
137  
138  int set_blocksize(struct block_device *bdev, int size)
139  {
140  	/* Size must be a power of two, and between 512 and PAGE_SIZE */
141  	if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
142  		return -EINVAL;
143  
144  	/* Size cannot be smaller than the size supported by the device */
145  	if (size < bdev_logical_block_size(bdev))
146  		return -EINVAL;
147  
148  	/* Don't change the size if it is same as current */
149  	if (bdev->bd_inode->i_blkbits != blksize_bits(size)) {
150  		sync_blockdev(bdev);
151  		bdev->bd_inode->i_blkbits = blksize_bits(size);
152  		kill_bdev(bdev);
153  	}
154  	return 0;
155  }
156  
157  EXPORT_SYMBOL(set_blocksize);
158  
159  int sb_set_blocksize(struct super_block *sb, int size)
160  {
161  	if (set_blocksize(sb->s_bdev, size))
162  		return 0;
163  	/* If we get here, we know size is a power of two
164  	 * and its value is between 512 and PAGE_SIZE */
165  	sb->s_blocksize = size;
166  	sb->s_blocksize_bits = blksize_bits(size);
167  	return sb->s_blocksize;
168  }
169  
170  EXPORT_SYMBOL(sb_set_blocksize);
171  
172  int sb_min_blocksize(struct super_block *sb, int size)
173  {
174  	int minsize = bdev_logical_block_size(sb->s_bdev);
175  	if (size < minsize)
176  		size = minsize;
177  	return sb_set_blocksize(sb, size);
178  }
179  
180  EXPORT_SYMBOL(sb_min_blocksize);
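
/*
 * Usage sketch (hypothetical filesystem ->fill_super): a filesystem normally
 * picks its block size during mount.  sb_min_blocksize() rounds the request
 * up to the device's logical block size; both helpers return 0 if the size
 * cannot be used.
 */
static int __maybe_unused example_fill_super(struct super_block *sb)
{
	/* Start with the smallest size the device can address. */
	if (!sb_min_blocksize(sb, 512))
		return -EINVAL;

	/* Once the on-disk superblock has been read, switch to its size. */
	if (!sb_set_blocksize(sb, 4096))
		return -EINVAL;

	return 0;
}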
181  
182  int sync_blockdev_nowait(struct block_device *bdev)
183  {
184  	if (!bdev)
185  		return 0;
186  	return filemap_flush(bdev->bd_inode->i_mapping);
187  }
188  EXPORT_SYMBOL_GPL(sync_blockdev_nowait);
189  
190  /*
191   * Write out and wait upon all the dirty data associated with a block
192   * device via its mapping.  Does not take the superblock lock.
193   */
194  int sync_blockdev(struct block_device *bdev)
195  {
196  	if (!bdev)
197  		return 0;
198  	return filemap_write_and_wait(bdev->bd_inode->i_mapping);
199  }
200  EXPORT_SYMBOL(sync_blockdev);
201  
202  int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend)
203  {
204  	return filemap_write_and_wait_range(bdev->bd_inode->i_mapping,
205  			lstart, lend);
206  }
207  EXPORT_SYMBOL(sync_blockdev_range);
208  
209  /*
210   * Write out and wait upon all dirty data associated with this
211   * device: filesystem data as well as the underlying block
212   * device.  Takes the superblock lock.
213   */
214  int fsync_bdev(struct block_device *bdev)
215  {
216  	struct super_block *sb = get_super(bdev);
217  	if (sb) {
218  		int res = sync_filesystem(sb);
219  		drop_super(sb);
220  		return res;
221  	}
222  	return sync_blockdev(bdev);
223  }
224  EXPORT_SYMBOL(fsync_bdev);
225  
226  /**
227   * freeze_bdev - lock a filesystem and force it into a consistent state
228   * @bdev:	blockdevice to lock
229   *
230   * If a superblock is found on this device, we take the s_umount semaphore
231   * on it to make sure nobody unmounts until the snapshot creation is done.
232   * The reference counter (bd_fsfreeze_count) guarantees that only the last
233   * unfreeze process actually unfreezes the frozen filesystem when multiple
234   * freeze requests arrive simultaneously. It is incremented in freeze_bdev()
235   * and decremented in thaw_bdev(); only when it drops to 0 does thaw_bdev()
236   * actually unfreeze the filesystem.
237   */
238  int freeze_bdev(struct block_device *bdev)
239  {
240  	struct super_block *sb;
241  	int error = 0;
242  
243  	mutex_lock(&bdev->bd_fsfreeze_mutex);
244  	if (++bdev->bd_fsfreeze_count > 1)
245  		goto done;
246  
247  	sb = get_active_super(bdev);
248  	if (!sb)
249  		goto sync;
250  	if (sb->s_op->freeze_super)
251  		error = sb->s_op->freeze_super(sb);
252  	else
253  		error = freeze_super(sb);
254  	deactivate_super(sb);
255  
256  	if (error) {
257  		bdev->bd_fsfreeze_count--;
258  		goto done;
259  	}
260  	bdev->bd_fsfreeze_sb = sb;
261  
262  sync:
263  	sync_blockdev(bdev);
264  done:
265  	mutex_unlock(&bdev->bd_fsfreeze_mutex);
266  	return error;
267  }
268  EXPORT_SYMBOL(freeze_bdev);
269  
270  /**
271   * thaw_bdev - unlock filesystem
272   * @bdev:	blockdevice to unlock
273   *
274   * Unlocks the filesystem and marks it writeable again after freeze_bdev().
275   */
276  int thaw_bdev(struct block_device *bdev)
277  {
278  	struct super_block *sb;
279  	int error = -EINVAL;
280  
281  	mutex_lock(&bdev->bd_fsfreeze_mutex);
282  	if (!bdev->bd_fsfreeze_count)
283  		goto out;
284  
285  	error = 0;
286  	if (--bdev->bd_fsfreeze_count > 0)
287  		goto out;
288  
289  	sb = bdev->bd_fsfreeze_sb;
290  	if (!sb)
291  		goto out;
292  
293  	if (sb->s_op->thaw_super)
294  		error = sb->s_op->thaw_super(sb);
295  	else
296  		error = thaw_super(sb);
297  	if (error)
298  		bdev->bd_fsfreeze_count++;
299  	else
300  		bdev->bd_fsfreeze_sb = NULL;
301  out:
302  	mutex_unlock(&bdev->bd_fsfreeze_mutex);
303  	return error;
304  }
305  EXPORT_SYMBOL(thaw_bdev);
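
/*
 * Usage sketch (hypothetical snapshot helper): freeze_bdev() and thaw_bdev()
 * nest via bd_fsfreeze_count, so paired calls are safe even if another caller
 * froze the device first; only the final thaw actually unfreezes it.
 */
static int __maybe_unused example_snapshot(struct block_device *bdev)
{
	int error;

	error = freeze_bdev(bdev);
	if (error)
		return error;

	/* ... create the snapshot while the filesystem is quiesced ... */

	return thaw_bdev(bdev);
}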
306  
307  /*
308   * pseudo-fs
309   */
310  
311  static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock);
312  static struct kmem_cache *bdev_cachep __read_mostly;
313  
314  static struct inode *bdev_alloc_inode(struct super_block *sb)
315  {
316  	struct bdev_inode *ei = alloc_inode_sb(sb, bdev_cachep, GFP_KERNEL);
317  
318  	if (!ei)
319  		return NULL;
320  	memset(&ei->bdev, 0, sizeof(ei->bdev));
321  	return &ei->vfs_inode;
322  }
323  
324  static void bdev_free_inode(struct inode *inode)
325  {
326  	struct block_device *bdev = I_BDEV(inode);
327  
328  	free_percpu(bdev->bd_stats);
329  	kfree(bdev->bd_meta_info);
330  
331  	if (!bdev_is_partition(bdev)) {
332  		if (bdev->bd_disk && bdev->bd_disk->bdi)
333  			bdi_put(bdev->bd_disk->bdi);
334  		kfree(bdev->bd_disk);
335  	}
336  
337  	if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR)
338  		blk_free_ext_minor(MINOR(bdev->bd_dev));
339  
340  	kmem_cache_free(bdev_cachep, BDEV_I(inode));
341  }
342  
343  static void init_once(void *data)
344  {
345  	struct bdev_inode *ei = data;
346  
347  	inode_init_once(&ei->vfs_inode);
348  }
349  
350  static void bdev_evict_inode(struct inode *inode)
351  {
352  	truncate_inode_pages_final(&inode->i_data);
353  	invalidate_inode_buffers(inode); /* is it needed here? */
354  	clear_inode(inode);
355  }
356  
357  static const struct super_operations bdev_sops = {
358  	.statfs = simple_statfs,
359  	.alloc_inode = bdev_alloc_inode,
360  	.free_inode = bdev_free_inode,
361  	.drop_inode = generic_delete_inode,
362  	.evict_inode = bdev_evict_inode,
363  };
364  
365  static int bd_init_fs_context(struct fs_context *fc)
366  {
367  	struct pseudo_fs_context *ctx = init_pseudo(fc, BDEVFS_MAGIC);
368  	if (!ctx)
369  		return -ENOMEM;
370  	fc->s_iflags |= SB_I_CGROUPWB;
371  	ctx->ops = &bdev_sops;
372  	return 0;
373  }
374  
375  static struct file_system_type bd_type = {
376  	.name		= "bdev",
377  	.init_fs_context = bd_init_fs_context,
378  	.kill_sb	= kill_anon_super,
379  };
380  
381  struct super_block *blockdev_superblock __read_mostly;
382  EXPORT_SYMBOL_GPL(blockdev_superblock);
383  
384  void __init bdev_cache_init(void)
385  {
386  	int err;
387  	static struct vfsmount *bd_mnt;
388  
389  	bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
390  			0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
391  				SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC),
392  			init_once);
393  	err = register_filesystem(&bd_type);
394  	if (err)
395  		panic("Cannot register bdev pseudo-fs");
396  	bd_mnt = kern_mount(&bd_type);
397  	if (IS_ERR(bd_mnt))
398  		panic("Cannot create bdev pseudo-fs");
399  	blockdev_superblock = bd_mnt->mnt_sb;   /* For writeback */
400  }
401  
402  struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
403  {
404  	struct block_device *bdev;
405  	struct inode *inode;
406  
407  	inode = new_inode(blockdev_superblock);
408  	if (!inode)
409  		return NULL;
410  	inode->i_mode = S_IFBLK;
411  	inode->i_rdev = 0;
412  	inode->i_data.a_ops = &def_blk_aops;
413  	mapping_set_gfp_mask(&inode->i_data, GFP_USER);
414  
415  	bdev = I_BDEV(inode);
416  	mutex_init(&bdev->bd_fsfreeze_mutex);
417  	spin_lock_init(&bdev->bd_size_lock);
418  	bdev->bd_partno = partno;
419  	bdev->bd_inode = inode;
420  	bdev->bd_queue = disk->queue;
421  	if (partno)
422  		bdev->bd_has_submit_bio = disk->part0->bd_has_submit_bio;
423  	else
424  		bdev->bd_has_submit_bio = false;
425  	bdev->bd_stats = alloc_percpu(struct disk_stats);
426  	if (!bdev->bd_stats) {
427  		iput(inode);
428  		return NULL;
429  	}
430  	bdev->bd_disk = disk;
431  	return bdev;
432  }
433  
434  void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors)
435  {
436  	spin_lock(&bdev->bd_size_lock);
437  	i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT);
438  	bdev->bd_nr_sectors = sectors;
439  	spin_unlock(&bdev->bd_size_lock);
440  }
441  
442  void bdev_add(struct block_device *bdev, dev_t dev)
443  {
444  	bdev->bd_dev = dev;
445  	bdev->bd_inode->i_rdev = dev;
446  	bdev->bd_inode->i_ino = dev;
447  	insert_inode_hash(bdev->bd_inode);
448  }
449  
450  long nr_blockdev_pages(void)
451  {
452  	struct inode *inode;
453  	long ret = 0;
454  
455  	spin_lock(&blockdev_superblock->s_inode_list_lock);
456  	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list)
457  		ret += inode->i_mapping->nrpages;
458  	spin_unlock(&blockdev_superblock->s_inode_list_lock);
459  
460  	return ret;
461  }
462  
463  /**
464   * bd_may_claim - test whether a block device can be claimed
465   * @bdev: block device of interest
466   * @whole: whole block device containing @bdev, may equal @bdev
467   * @holder: holder trying to claim @bdev
468   *
469   * Test whether @bdev can be claimed by @holder.
470   *
471   * CONTEXT:
472   * spin_lock(&bdev_lock).
473   *
474   * RETURNS:
475   * %true if @bdev can be claimed, %false otherwise.
476   */
477  static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
478  			 void *holder)
479  {
480  	if (bdev->bd_holder == holder)
481  		return true;	 /* already a holder */
482  	else if (bdev->bd_holder != NULL)
483  		return false; 	 /* held by someone else */
484  	else if (whole == bdev)
485  		return true;  	 /* is a whole device which isn't held */
486  
487  	else if (whole->bd_holder == bd_may_claim)
488  		return true; 	 /* is a partition of a device that is being partitioned */
489  	else if (whole->bd_holder != NULL)
490  		return false;	 /* is a partition of a held device */
491  	else
492  		return true;	 /* is a partition of an un-held device */
493  }
494  
495  /**
496   * bd_prepare_to_claim - claim a block device
497   * @bdev: block device of interest
498   * @holder: holder trying to claim @bdev
499   *
500   * Claim @bdev.  This function fails if @bdev is already claimed by another
501   * holder and waits if another claiming is in progress.  On successful
502   * return, the caller has ownership of bd_claiming and bd_holder[s].
503   *
504   * RETURNS:
505   * 0 if @bdev can be claimed, -EBUSY otherwise.
506   */
507  int bd_prepare_to_claim(struct block_device *bdev, void *holder)
508  {
509  	struct block_device *whole = bdev_whole(bdev);
510  
511  	if (WARN_ON_ONCE(!holder))
512  		return -EINVAL;
513  retry:
514  	spin_lock(&bdev_lock);
515  	/* if someone else claimed, fail */
516  	if (!bd_may_claim(bdev, whole, holder)) {
517  		spin_unlock(&bdev_lock);
518  		return -EBUSY;
519  	}
520  
521  	/* if claiming is already in progress, wait for it to finish */
522  	if (whole->bd_claiming) {
523  		wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
524  		DEFINE_WAIT(wait);
525  
526  		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
527  		spin_unlock(&bdev_lock);
528  		schedule();
529  		finish_wait(wq, &wait);
530  		goto retry;
531  	}
532  
533  	/* yay, all mine */
534  	whole->bd_claiming = holder;
535  	spin_unlock(&bdev_lock);
536  	return 0;
537  }
538  EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */
539  
540  static void bd_clear_claiming(struct block_device *whole, void *holder)
541  {
542  	lockdep_assert_held(&bdev_lock);
543  	/* tell others that we're done */
544  	BUG_ON(whole->bd_claiming != holder);
545  	whole->bd_claiming = NULL;
546  	wake_up_bit(&whole->bd_claiming, 0);
547  }
548  
549  /**
550   * bd_finish_claiming - finish claiming of a block device
551   * @bdev: block device of interest
552   * @holder: holder that has claimed @bdev
553   *
554   * Finish exclusive open of a block device. Mark the device as exclusively
555   * open by the holder and wake up all waiters for exclusive open to finish.
556   */
557  static void bd_finish_claiming(struct block_device *bdev, void *holder)
558  {
559  	struct block_device *whole = bdev_whole(bdev);
560  
561  	spin_lock(&bdev_lock);
562  	BUG_ON(!bd_may_claim(bdev, whole, holder));
563  	/*
564  	 * Note that for a whole device bd_holders will be incremented twice,
565  	 * and bd_holder will be set to bd_may_claim before being set to holder
566  	 */
567  	whole->bd_holders++;
568  	whole->bd_holder = bd_may_claim;
569  	bdev->bd_holders++;
570  	bdev->bd_holder = holder;
571  	bd_clear_claiming(whole, holder);
572  	spin_unlock(&bdev_lock);
573  }
574  
575  /**
576   * bd_abort_claiming - abort claiming of a block device
577   * @bdev: block device of interest
578   * @holder: holder that has claimed @bdev
579   *
580   * Abort claiming of a block device when the exclusive open failed. This can be
581   * also used when exclusive open is not actually desired and we just needed
582   * to block other exclusive openers for a while.
583   */
584  void bd_abort_claiming(struct block_device *bdev, void *holder)
585  {
586  	spin_lock(&bdev_lock);
587  	bd_clear_claiming(bdev_whole(bdev), holder);
588  	spin_unlock(&bdev_lock);
589  }
590  EXPORT_SYMBOL(bd_abort_claiming);
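
/*
 * Usage sketch mirroring truncate_bdev_range() above: temporarily claim the
 * device to hold off concurrent exclusive openers, do the work, then abort
 * the claim since no real exclusive open is wanted.  The holder just has to
 * be a unique cookie; using the function itself is a common convention.
 */
static int __maybe_unused example_block_exclusive_openers(struct block_device *bdev)
{
	int err = bd_prepare_to_claim(bdev, example_block_exclusive_openers);

	if (err)
		return err;	/* already exclusively held by someone else */

	/* ... work that must not race with a new exclusive open ... */

	bd_abort_claiming(bdev, example_block_exclusive_openers);
	return 0;
}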
591  
592  static void blkdev_flush_mapping(struct block_device *bdev)
593  {
594  	WARN_ON_ONCE(bdev->bd_holders);
595  	sync_blockdev(bdev);
596  	kill_bdev(bdev);
597  	bdev_write_inode(bdev);
598  }
599  
600  static int blkdev_get_whole(struct block_device *bdev, fmode_t mode)
601  {
602  	struct gendisk *disk = bdev->bd_disk;
603  	int ret;
604  
605  	if (disk->fops->open) {
606  		ret = disk->fops->open(bdev, mode);
607  		if (ret) {
608  			/* avoid ghost partitions on a removed medium */
609  			if (ret == -ENOMEDIUM &&
610  			     test_bit(GD_NEED_PART_SCAN, &disk->state))
611  				bdev_disk_changed(disk, true);
612  			return ret;
613  		}
614  	}
615  
616  	if (!atomic_read(&bdev->bd_openers))
617  		set_init_blocksize(bdev);
618  	if (test_bit(GD_NEED_PART_SCAN, &disk->state))
619  		bdev_disk_changed(disk, false);
620  	atomic_inc(&bdev->bd_openers);
621  	return 0;
622  }
623  
624  static void blkdev_put_whole(struct block_device *bdev, fmode_t mode)
625  {
626  	if (atomic_dec_and_test(&bdev->bd_openers))
627  		blkdev_flush_mapping(bdev);
628  	if (bdev->bd_disk->fops->release)
629  		bdev->bd_disk->fops->release(bdev->bd_disk, mode);
630  }
631  
632  static int blkdev_get_part(struct block_device *part, fmode_t mode)
633  {
634  	struct gendisk *disk = part->bd_disk;
635  	int ret;
636  
637  	if (atomic_read(&part->bd_openers))
638  		goto done;
639  
640  	ret = blkdev_get_whole(bdev_whole(part), mode);
641  	if (ret)
642  		return ret;
643  
644  	ret = -ENXIO;
645  	if (!bdev_nr_sectors(part))
646  		goto out_blkdev_put;
647  
648  	disk->open_partitions++;
649  	set_init_blocksize(part);
650  done:
651  	atomic_inc(&part->bd_openers);
652  	return 0;
653  
654  out_blkdev_put:
655  	blkdev_put_whole(bdev_whole(part), mode);
656  	return ret;
657  }
658  
659  static void blkdev_put_part(struct block_device *part, fmode_t mode)
660  {
661  	struct block_device *whole = bdev_whole(part);
662  
663  	if (!atomic_dec_and_test(&part->bd_openers))
664  		return;
665  	blkdev_flush_mapping(part);
666  	whole->bd_disk->open_partitions--;
667  	blkdev_put_whole(whole, mode);
668  }
669  
670  struct block_device *blkdev_get_no_open(dev_t dev)
671  {
672  	struct block_device *bdev;
673  	struct inode *inode;
674  
675  	inode = ilookup(blockdev_superblock, dev);
676  	if (!inode && IS_ENABLED(CONFIG_BLOCK_LEGACY_AUTOLOAD)) {
677  		blk_request_module(dev);
678  		inode = ilookup(blockdev_superblock, dev);
679  		if (inode)
680  			pr_warn_ratelimited(
681  "block device autoloading is deprecated and will be removed.\n");
682  	}
683  	if (!inode)
684  		return NULL;
685  
686  	/* switch from the inode reference to a device model one: */
687  	bdev = &BDEV_I(inode)->bdev;
688  	if (!kobject_get_unless_zero(&bdev->bd_device.kobj))
689  		bdev = NULL;
690  	iput(inode);
691  	return bdev;
692  }
693  
694  void blkdev_put_no_open(struct block_device *bdev)
695  {
696  	put_device(&bdev->bd_device);
697  }
698  
699  /**
700   * blkdev_get_by_dev - open a block device by device number
701   * @dev: device number of block device to open
702   * @mode: FMODE_* mask
703   * @holder: exclusive holder identifier
704   *
705   * Open the block device described by device number @dev. If @mode includes
706   * %FMODE_EXCL, the block device is opened with exclusive access.  Specifying
707   * %FMODE_EXCL with a %NULL @holder is invalid.  Exclusive opens may nest for
708   * the same @holder.
709   *
710   * Use this interface ONLY if you really do not have anything better - i.e. when
711   * you are behind a truly sucky interface and all you are given is a device
712   * number.  Everything else should use blkdev_get_by_path().
713   *
714   * CONTEXT:
715   * Might sleep.
716   *
717   * RETURNS:
718   * Reference to the block_device on success, ERR_PTR(-errno) on failure.
719   */
720  struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
721  {
722  	bool unblock_events = true;
723  	struct block_device *bdev;
724  	struct gendisk *disk;
725  	int ret;
726  
727  	ret = devcgroup_check_permission(DEVCG_DEV_BLOCK,
728  			MAJOR(dev), MINOR(dev),
729  			((mode & FMODE_READ) ? DEVCG_ACC_READ : 0) |
730  			((mode & FMODE_WRITE) ? DEVCG_ACC_WRITE : 0));
731  	if (ret)
732  		return ERR_PTR(ret);
733  
734  	bdev = blkdev_get_no_open(dev);
735  	if (!bdev)
736  		return ERR_PTR(-ENXIO);
737  	disk = bdev->bd_disk;
738  
739  	if (mode & FMODE_EXCL) {
740  		ret = bd_prepare_to_claim(bdev, holder);
741  		if (ret)
742  			goto put_blkdev;
743  	}
744  
745  	disk_block_events(disk);
746  
747  	mutex_lock(&disk->open_mutex);
748  	ret = -ENXIO;
749  	if (!disk_live(disk))
750  		goto abort_claiming;
751  	if (!try_module_get(disk->fops->owner))
752  		goto abort_claiming;
753  	if (bdev_is_partition(bdev))
754  		ret = blkdev_get_part(bdev, mode);
755  	else
756  		ret = blkdev_get_whole(bdev, mode);
757  	if (ret)
758  		goto put_module;
759  	if (mode & FMODE_EXCL) {
760  		bd_finish_claiming(bdev, holder);
761  
762  		/*
763  		 * Block event polling for write claims if requested.  Any write
764  		 * holder makes the write_holder state stick until all are
765  		 * released.  This is good enough and tracking individual
766  		 * writeable references is too fragile given the way @mode is
767  		 * used in blkdev_get/put().
768  		 */
769  		if ((mode & FMODE_WRITE) && !bdev->bd_write_holder &&
770  		    (disk->event_flags & DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE)) {
771  			bdev->bd_write_holder = true;
772  			unblock_events = false;
773  		}
774  	}
775  	mutex_unlock(&disk->open_mutex);
776  
777  	if (unblock_events)
778  		disk_unblock_events(disk);
779  	return bdev;
780  put_module:
781  	module_put(disk->fops->owner);
782  abort_claiming:
783  	if (mode & FMODE_EXCL)
784  		bd_abort_claiming(bdev, holder);
785  	mutex_unlock(&disk->open_mutex);
786  	disk_unblock_events(disk);
787  put_blkdev:
788  	blkdev_put_no_open(bdev);
789  	return ERR_PTR(ret);
790  }
791  EXPORT_SYMBOL(blkdev_get_by_dev);
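
/*
 * Usage sketch (hypothetical caller that only has a device number): open the
 * device read-only, use it, then drop it again with the same @mode.  A holder
 * is only needed together with FMODE_EXCL, so NULL is fine here.
 */
static int __maybe_unused example_open_by_dev(dev_t devt)
{
	struct block_device *bdev;

	bdev = blkdev_get_by_dev(devt, FMODE_READ, NULL);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	pr_info("opened %pg: %llu sectors\n", bdev,
		(unsigned long long)bdev_nr_sectors(bdev));

	blkdev_put(bdev, FMODE_READ);
	return 0;
}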
792  
793  /**
794   * blkdev_get_by_path - open a block device by name
795   * @path: path to the block device to open
796   * @mode: FMODE_* mask
797   * @holder: exclusive holder identifier
798   *
799   * Open the block device described by the device file at @path.  If @mode
800   * includes %FMODE_EXCL, the block device is opened with exclusive access.
801   * Specifying %FMODE_EXCL with a %NULL @holder is invalid.  Exclusive opens may
802   * nest for the same @holder.
803   *
804   * CONTEXT:
805   * Might sleep.
806   *
807   * RETURNS:
808   * Reference to the block_device on success, ERR_PTR(-errno) on failure.
809   */
810  struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
811  					void *holder)
812  {
813  	struct block_device *bdev;
814  	dev_t dev;
815  	int error;
816  
817  	error = lookup_bdev(path, &dev);
818  	if (error)
819  		return ERR_PTR(error);
820  
821  	bdev = blkdev_get_by_dev(dev, mode, holder);
822  	if (!IS_ERR(bdev) && (mode & FMODE_WRITE) && bdev_read_only(bdev)) {
823  		blkdev_put(bdev, mode);
824  		return ERR_PTR(-EACCES);
825  	}
826  
827  	return bdev;
828  }
829  EXPORT_SYMBOL(blkdev_get_by_path);
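
/*
 * Usage sketch (loosely modelled on how filesystems and stacking drivers open
 * their backing devices): an exclusive read/write open keyed by a holder
 * cookie, typically a pointer to the caller's private structure.  blkdev_put()
 * must be called with the same @mode that was used for the open.
 */
static int __maybe_unused example_open_by_path(const char *path, void *holder)
{
	const fmode_t mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
	struct block_device *bdev;

	bdev = blkdev_get_by_path(path, mode, holder);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	/* ... submit I/O against bdev ... */

	blkdev_put(bdev, mode);
	return 0;
}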
830  
831  void blkdev_put(struct block_device *bdev, fmode_t mode)
832  {
833  	struct gendisk *disk = bdev->bd_disk;
834  
835  	/*
836  	 * Sync early if it looks like we're the last one.  If someone else
837  	 * opens the block device between now and the decrement of bd_openers
838  	 * then we did a sync that we didn't need to, but that's not the end
839  	 * of the world and we want to avoid long (could be several minute)
840  	 * syncs while holding the mutex.
841  	 */
842  	if (atomic_read(&bdev->bd_openers) == 1)
843  		sync_blockdev(bdev);
844  
845  	mutex_lock(&disk->open_mutex);
846  	if (mode & FMODE_EXCL) {
847  		struct block_device *whole = bdev_whole(bdev);
848  		bool bdev_free;
849  
850  		/*
851  		 * Release a claim on the device.  The holder fields
852  		 * are protected with bdev_lock.  open_mutex is to
853  		 * synchronize disk_holder unlinking.
854  		 */
855  		spin_lock(&bdev_lock);
856  
857  		WARN_ON_ONCE(--bdev->bd_holders < 0);
858  		WARN_ON_ONCE(--whole->bd_holders < 0);
859  
860  		if ((bdev_free = !bdev->bd_holders))
861  			bdev->bd_holder = NULL;
862  		if (!whole->bd_holders)
863  			whole->bd_holder = NULL;
864  
865  		spin_unlock(&bdev_lock);
866  
867  		/*
868  		 * If this was the last claim, remove holder link and
869  		 * unblock event polling if it was a write holder.
870  		 */
871  		if (bdev_free && bdev->bd_write_holder) {
872  			disk_unblock_events(disk);
873  			bdev->bd_write_holder = false;
874  		}
875  	}
876  
877  	/*
878  	 * Trigger event checking and tell drivers to flush MEDIA_CHANGE
879  	 * event.  This is to ensure detection of media removal commanded
880  	 * from userland - e.g. eject(1).
881  	 */
882  	disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE);
883  
884  	if (bdev_is_partition(bdev))
885  		blkdev_put_part(bdev, mode);
886  	else
887  		blkdev_put_whole(bdev, mode);
888  	mutex_unlock(&disk->open_mutex);
889  
890  	module_put(disk->fops->owner);
891  	blkdev_put_no_open(bdev);
892  }
893  EXPORT_SYMBOL(blkdev_put);
894  
895  /**
896   * lookup_bdev() - Look up a struct block_device by name.
897   * @pathname: Name of the block device in the filesystem.
898   * @dev: Pointer to the block device's dev_t, if found.
899   *
900   * Lookup the block device's dev_t at @pathname in the current
901   * namespace if possible and return it in @dev.
902   *
903   * Context: May sleep.
904   * Return: 0 if succeeded, negative errno otherwise.
905   */
906  int lookup_bdev(const char *pathname, dev_t *dev)
907  {
908  	struct inode *inode;
909  	struct path path;
910  	int error;
911  
912  	if (!pathname || !*pathname)
913  		return -EINVAL;
914  
915  	error = kern_path(pathname, LOOKUP_FOLLOW, &path);
916  	if (error)
917  		return error;
918  
919  	inode = d_backing_inode(path.dentry);
920  	error = -ENOTBLK;
921  	if (!S_ISBLK(inode->i_mode))
922  		goto out_path_put;
923  	error = -EACCES;
924  	if (!may_open_dev(&path))
925  		goto out_path_put;
926  
927  	*dev = inode->i_rdev;
928  	error = 0;
929  out_path_put:
930  	path_put(&path);
931  	return error;
932  }
933  EXPORT_SYMBOL(lookup_bdev);
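
/*
 * Usage sketch (hypothetical): resolve a path to a dev_t without opening the
 * device, e.g. to check whether it refers to an already-open block device.
 */
static bool __maybe_unused example_path_matches_bdev(const char *path,
		struct block_device *bdev)
{
	dev_t devt;

	if (lookup_bdev(path, &devt))
		return false;
	return devt == bdev->bd_dev;
}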
934  
935  int __invalidate_device(struct block_device *bdev, bool kill_dirty)
936  {
937  	struct super_block *sb = get_super(bdev);
938  	int res = 0;
939  
940  	if (sb) {
941  		/*
942  		 * No need to lock the super; get_super() holds the
943  		 * read mutex so the filesystem cannot go away
944  		 * under us (->put_super runs with the write lock
945  		 * held).
946  		 */
947  		shrink_dcache_sb(sb);
948  		res = invalidate_inodes(sb, kill_dirty);
949  		drop_super(sb);
950  	}
951  	invalidate_bdev(bdev);
952  	return res;
953  }
954  EXPORT_SYMBOL(__invalidate_device);
955  
956  void sync_bdevs(bool wait)
957  {
958  	struct inode *inode, *old_inode = NULL;
959  
960  	spin_lock(&blockdev_superblock->s_inode_list_lock);
961  	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
962  		struct address_space *mapping = inode->i_mapping;
963  		struct block_device *bdev;
964  
965  		spin_lock(&inode->i_lock);
966  		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
967  		    mapping->nrpages == 0) {
968  			spin_unlock(&inode->i_lock);
969  			continue;
970  		}
971  		__iget(inode);
972  		spin_unlock(&inode->i_lock);
973  		spin_unlock(&blockdev_superblock->s_inode_list_lock);
974  		/*
975  		 * We hold a reference to 'inode' so it couldn't have been
976  		 * removed from s_inodes list while we dropped the
977  		 * s_inode_list_lock. We cannot iput the inode now as we can
978  		 * be holding the last reference and we cannot iput it under
979  		 * s_inode_list_lock. So we keep the reference and iput it
980  		 * later.
981  		 */
982  		iput(old_inode);
983  		old_inode = inode;
984  		bdev = I_BDEV(inode);
985  
986  		mutex_lock(&bdev->bd_disk->open_mutex);
987  		if (!atomic_read(&bdev->bd_openers)) {
988  			; /* skip */
989  		} else if (wait) {
990  			/*
991  			 * We keep the error status of individual mapping so
992  			 * that applications can catch the writeback error using
993  			 * fsync(2). See filemap_fdatawait_keep_errors() for
994  			 * details.
995  			 */
996  			filemap_fdatawait_keep_errors(inode->i_mapping);
997  		} else {
998  			filemap_fdatawrite(inode->i_mapping);
999  		}
1000  		mutex_unlock(&bdev->bd_disk->open_mutex);
1001  
1002  		spin_lock(&blockdev_superblock->s_inode_list_lock);
1003  	}
1004  	spin_unlock(&blockdev_superblock->s_inode_list_lock);
1005  	iput(old_inode);
1006  }
1007  
1008  /*
1009   * Handle STATX_DIOALIGN for block devices.
1010   *
1011   * Note that the inode passed to this is the inode of a block device node file,
1012   * not the block device's internal inode.  Therefore it is *not* valid to use
1013   * I_BDEV() here; the block device has to be looked up by i_rdev instead.
1014   */
1015  void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)
1016  {
1017  	struct block_device *bdev;
1018  
1019  	bdev = blkdev_get_no_open(inode->i_rdev);
1020  	if (!bdev)
1021  		return;
1022  
1023  	stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
1024  	stat->dio_offset_align = bdev_logical_block_size(bdev);
1025  	stat->result_mask |= STATX_DIOALIGN;
1026  
1027  	blkdev_put_no_open(bdev);
1028  }
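
/*
 * Userspace view (reference sketch, not kernel code, hence the #if 0 guard):
 * assuming glibc's statx() wrapper and a v6.1+ kernel, this is how a program
 * would read back the alignment values that bdev_statx_dioalign() fills in.
 */
#if 0
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
	struct statx stx;

	if (argc < 2 || statx(AT_FDCWD, argv[1], 0, STATX_DIOALIGN, &stx))
		return 1;
	if (stx.stx_mask & STATX_DIOALIGN)
		printf("DIO memory alignment %u, offset alignment %u\n",
		       stx.stx_dio_mem_align, stx.stx_dio_offset_align);
	return 0;
}
#endif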
1029