/* xref: /openbmc/linux/mm/backing-dev.c (revision d0b73b48) */

#include <linux/wait.h>
#include <linux/backing-dev.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/writeback.h>
#include <linux/device.h>
#include <trace/events/writeback.h>

static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);

struct backing_dev_info default_backing_dev_info = {
	.name		= "default",
	.ra_pages	= VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
	.state		= 0,
	.capabilities	= BDI_CAP_MAP_COPY,
};
EXPORT_SYMBOL_GPL(default_backing_dev_info);

struct backing_dev_info noop_backing_dev_info = {
	.name		= "noop",
	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
EXPORT_SYMBOL_GPL(noop_backing_dev_info);

static struct class *bdi_class;

/*
 * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as
 * reader side protection for bdi_pending_list. bdi_list has RCU reader side
 * locking.
 */
DEFINE_SPINLOCK(bdi_lock);
LIST_HEAD(bdi_list);
LIST_HEAD(bdi_pending_list);

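/*
 * Take the list_locks of two wb's in a fixed order (lowest address first)
 * so that callers taking both can never deadlock against each other.
 */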
void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
{
	if (wb1 < wb2) {
		spin_lock(&wb1->list_lock);
		spin_lock_nested(&wb2->list_lock, 1);
	} else {
		spin_lock(&wb2->list_lock);
		spin_lock_nested(&wb1->list_lock, 1);
	}
}

#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#include <linux/seq_file.h>

static struct dentry *bdi_debug_root;

static void bdi_debug_init(void)
{
	bdi_debug_root = debugfs_create_dir("bdi", NULL);
}

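/* Dump per-bdi writeback counters and dirty thresholds to the debugfs "stats" file. */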
static int bdi_debug_stats_show(struct seq_file *m, void *v)
{
	struct backing_dev_info *bdi = m->private;
	struct bdi_writeback *wb = &bdi->wb;
	unsigned long background_thresh;
	unsigned long dirty_thresh;
	unsigned long bdi_thresh;
	unsigned long nr_dirty, nr_io, nr_more_io;
	struct inode *inode;

	nr_dirty = nr_io = nr_more_io = 0;
	spin_lock(&wb->list_lock);
	list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
		nr_dirty++;
	list_for_each_entry(inode, &wb->b_io, i_wb_list)
		nr_io++;
	list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
		nr_more_io++;
	spin_unlock(&wb->list_lock);

	global_dirty_limits(&background_thresh, &dirty_thresh);
	bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);

#define K(x) ((x) << (PAGE_SHIFT - 10))
	seq_printf(m,
		   "BdiWriteback:       %10lu kB\n"
		   "BdiReclaimable:     %10lu kB\n"
		   "BdiDirtyThresh:     %10lu kB\n"
		   "DirtyThresh:        %10lu kB\n"
		   "BackgroundThresh:   %10lu kB\n"
		   "BdiDirtied:         %10lu kB\n"
		   "BdiWritten:         %10lu kB\n"
		   "BdiWriteBandwidth:  %10lu kBps\n"
		   "b_dirty:            %10lu\n"
		   "b_io:               %10lu\n"
		   "b_more_io:          %10lu\n"
		   "bdi_list:           %10u\n"
		   "state:              %10lx\n",
		   (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
		   (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
		   K(bdi_thresh),
		   K(dirty_thresh),
		   K(background_thresh),
		   (unsigned long) K(bdi_stat(bdi, BDI_DIRTIED)),
		   (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
		   (unsigned long) K(bdi->write_bandwidth),
		   nr_dirty,
		   nr_io,
		   nr_more_io,
		   !list_empty(&bdi->bdi_list), bdi->state);
#undef K

	return 0;
}

static int bdi_debug_stats_open(struct inode *inode, struct file *file)
{
	return single_open(file, bdi_debug_stats_show, inode->i_private);
}

static const struct file_operations bdi_debug_stats_fops = {
	.open		= bdi_debug_stats_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
{
	bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);
	bdi->debug_stats = debugfs_create_file("stats", 0444, bdi->debug_dir,
					       bdi, &bdi_debug_stats_fops);
}

static void bdi_debug_unregister(struct backing_dev_info *bdi)
{
	debugfs_remove(bdi->debug_stats);
	debugfs_remove(bdi->debug_dir);
}
#else
static inline void bdi_debug_init(void)
{
}
static inline void bdi_debug_register(struct backing_dev_info *bdi,
				      const char *name)
{
}
static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
{
}
#endif

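/* sysfs store: set the bdi's readahead window; the value is given in kbytes. */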
static ssize_t read_ahead_kb_store(struct device *dev,
				  struct device_attribute *attr,
				  const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned long read_ahead_kb;
	ssize_t ret;

	ret = kstrtoul(buf, 10, &read_ahead_kb);
	if (ret < 0)
		return ret;

	bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);

	return count;
}

#define K(pages) ((pages) << (PAGE_SHIFT - 10))

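/*
 * Generate a sysfs _show handler that prints a single expression derived
 * from the bdi, e.g. BDI_SHOW(read_ahead_kb, ...) defines read_ahead_kb_show().
 */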
#define BDI_SHOW(name, expr)						\
static ssize_t name##_show(struct device *dev,				\
			   struct device_attribute *attr, char *page)	\
{									\
	struct backing_dev_info *bdi = dev_get_drvdata(dev);		\
									\
	return snprintf(page, PAGE_SIZE-1, "%lld\n", (long long)expr);	\
}

BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))

static ssize_t min_ratio_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned int ratio;
	ssize_t ret;

	ret = kstrtouint(buf, 10, &ratio);
	if (ret < 0)
		return ret;

	ret = bdi_set_min_ratio(bdi, ratio);
	if (!ret)
		ret = count;

	return ret;
}
BDI_SHOW(min_ratio, bdi->min_ratio)

static ssize_t max_ratio_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned int ratio;
	ssize_t ret;

	ret = kstrtouint(buf, 10, &ratio);
	if (ret < 0)
		return ret;

	ret = bdi_set_max_ratio(bdi, ratio);
	if (!ret)
		ret = count;

	return ret;
}
BDI_SHOW(max_ratio, bdi->max_ratio)

#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)

static struct device_attribute bdi_dev_attrs[] = {
	__ATTR_RW(read_ahead_kb),
	__ATTR_RW(min_ratio),
	__ATTR_RW(max_ratio),
	__ATTR_NULL,
};

static __init int bdi_class_init(void)
{
	bdi_class = class_create(THIS_MODULE, "bdi");
	if (IS_ERR(bdi_class))
		return PTR_ERR(bdi_class);

	bdi_class->dev_attrs = bdi_dev_attrs;
	bdi_debug_init();
	return 0;
}
postcore_initcall(bdi_class_init);

static int __init default_bdi_init(void)
{
	int err;

	err = bdi_init(&default_backing_dev_info);
	if (!err)
		bdi_register(&default_backing_dev_info, NULL, "default");
	err = bdi_init(&noop_backing_dev_info);

	return err;
}
subsys_initcall(default_bdi_init);

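/* True if this bdi's embedded writeback structure has inodes queued for writeback. */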
int bdi_has_dirty_io(struct backing_dev_info *bdi)
{
	return wb_has_dirty_io(&bdi->wb);
}

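/*
 * Delayed-wakeup timer callback: wake this bdi's flusher thread, or, if that
 * thread has been killed for inactivity, wake the forker thread on the
 * default bdi so it can recreate it.
 */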
static void wakeup_timer_fn(unsigned long data)
{
	struct backing_dev_info *bdi = (struct backing_dev_info *)data;

	spin_lock_bh(&bdi->wb_lock);
	if (bdi->wb.task) {
		trace_writeback_wake_thread(bdi);
		wake_up_process(bdi->wb.task);
	} else if (bdi->dev) {
		/*
		 * When bdi tasks are inactive for a long time, they are
		 * killed. In this case we have to wake up the forker thread,
		 * which should create and run the bdi thread.
		 */
		trace_writeback_wake_forker_thread(bdi);
		wake_up_process(default_backing_dev_info.wb.task);
	}
	spin_unlock_bh(&bdi->wb_lock);
}

/*
 * This function is used when the first inode for this bdi is marked dirty. It
 * wakes up the corresponding bdi thread, which should then take care of the
 * periodic background write-out of dirty inodes. Since the write-out would
 * start only 'dirty_writeback_interval' centisecs from now anyway, we just
 * set up a timer which wakes the bdi thread up later.
 *
 * Note, we wouldn't bother setting up the timer, but this function is on the
 * fast path (used by '__mark_inode_dirty()'), so we save a few context
 * switches by delaying the wake-up.
 */
void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
{
	unsigned long timeout;

	timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
	mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout);
}

/*
 * Calculate the longest interval (jiffies) bdi threads are allowed to be
 * inactive.
 */
static unsigned long bdi_longest_inactive(void)
{
	unsigned long interval;

	interval = msecs_to_jiffies(dirty_writeback_interval * 10);
	return max(5UL * 60 * HZ, interval);
}

/*
 * Clear the pending bit and wake up anybody waiting for flusher thread
 * creation or shutdown.
 */
static void bdi_clear_pending(struct backing_dev_info *bdi)
{
	clear_bit(BDI_pending, &bdi->state);
	smp_mb__after_clear_bit();
	wake_up_bit(&bdi->state, BDI_pending);
}

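/*
 * The forker thread runs on behalf of the default bdi. It forks per-bdi
 * flusher ("flush-<name>") threads on demand when a bdi has dirty IO or
 * queued work, kills flusher threads that have been idle for too long, and
 * performs writeback for the default bdi itself.
 */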
static int bdi_forker_thread(void *ptr)
{
	struct bdi_writeback *me = ptr;

	current->flags |= PF_SWAPWRITE;
	set_freezable();

	/*
	 * Our parent may run at a different priority, just set us to normal
	 */
	set_user_nice(current, 0);

	for (;;) {
		struct task_struct *task = NULL;
		struct backing_dev_info *bdi;
		enum {
			NO_ACTION,   /* Nothing to do */
			FORK_THREAD, /* Fork bdi thread */
			KILL_THREAD, /* Kill inactive bdi thread */
		} action = NO_ACTION;

		/*
		 * Temporary measure, we want to make sure we don't see
		 * dirty data on the default backing_dev_info
		 */
		if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {
			del_timer(&me->wakeup_timer);
			wb_do_writeback(me, 0);
		}

		spin_lock_bh(&bdi_lock);
		/*
		 * In the following loop we are going to check whether we have
		 * some work to do without any synchronization with tasks
		 * waking us up to do work for them. Set the task state here
		 * so that we don't miss wakeups after verifying conditions.
		 */
		set_current_state(TASK_INTERRUPTIBLE);

		list_for_each_entry(bdi, &bdi_list, bdi_list) {
			bool have_dirty_io;

			if (!bdi_cap_writeback_dirty(bdi) ||
			     bdi_cap_flush_forker(bdi))
				continue;

			WARN(!test_bit(BDI_registered, &bdi->state),
			     "bdi %p/%s is not registered!\n", bdi, bdi->name);

			have_dirty_io = !list_empty(&bdi->work_list) ||
					wb_has_dirty_io(&bdi->wb);

			/*
			 * If the bdi has work to do, but the thread does not
			 * exist - create it.
			 */
			if (!bdi->wb.task && have_dirty_io) {
				/*
				 * Set the pending bit - if someone tries to
				 * unregister this bdi, it'll wait on this bit.
				 */
				set_bit(BDI_pending, &bdi->state);
				action = FORK_THREAD;
				break;
			}

			spin_lock(&bdi->wb_lock);

			/*
			 * If there is no work to do and the bdi thread was
			 * inactive long enough - kill it. The wb_lock is taken
			 * to make sure no-one adds more work to this bdi and
			 * wakes the bdi thread up.
			 */
			if (bdi->wb.task && !have_dirty_io &&
			    time_after(jiffies, bdi->wb.last_active +
						bdi_longest_inactive())) {
				task = bdi->wb.task;
				bdi->wb.task = NULL;
				spin_unlock(&bdi->wb_lock);
				set_bit(BDI_pending, &bdi->state);
				action = KILL_THREAD;
				break;
			}
			spin_unlock(&bdi->wb_lock);
		}
		spin_unlock_bh(&bdi_lock);

		/* Keep working if default bdi still has things to do */
		if (!list_empty(&me->bdi->work_list))
			__set_current_state(TASK_RUNNING);

		switch (action) {
		case FORK_THREAD:
			__set_current_state(TASK_RUNNING);
			task = kthread_create(bdi_writeback_thread, &bdi->wb,
					      "flush-%s", dev_name(bdi->dev));
			if (IS_ERR(task)) {
				/*
				 * If thread creation fails, force writeout of
				 * the bdi from the forker thread. Hopefully
				 * 1024 is large enough for efficient IO.
				 */
				writeback_inodes_wb(&bdi->wb, 1024,
						    WB_REASON_FORKER_THREAD);
			} else {
				/*
				 * The spinlock makes sure we do not lose
				 * wake-ups when racing with 'bdi_queue_work()'.
				 * And as soon as the bdi thread is visible, we
				 * can start it.
				 */
				spin_lock_bh(&bdi->wb_lock);
				bdi->wb.task = task;
				spin_unlock_bh(&bdi->wb_lock);
				wake_up_process(task);
			}
			bdi_clear_pending(bdi);
			break;

		case KILL_THREAD:
			__set_current_state(TASK_RUNNING);
			kthread_stop(task);
			bdi_clear_pending(bdi);
			break;

		case NO_ACTION:
			if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
				/*
				 * There is no dirty data. The only thing we
				 * should now care about is checking for
				 * inactive bdi threads and killing them. Thus,
				 * let's sleep for a longer time, save energy
				 * and be friendly to battery-driven devices.
				 */
				schedule_timeout(bdi_longest_inactive());
			else
				schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
			try_to_freeze();
			break;
		}
	}

	return 0;
}

/*
 * Remove bdi from bdi_list, and ensure that it is no longer visible
 */
static void bdi_remove_from_list(struct backing_dev_info *bdi)
{
	spin_lock_bh(&bdi_lock);
	list_del_rcu(&bdi->bdi_list);
	spin_unlock_bh(&bdi_lock);

	synchronize_rcu_expedited();
}

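/*
 * Register a bdi: create its device in the "bdi" class, start the forker
 * thread if this is the flush-forker (default) bdi, hook up debugfs and add
 * the bdi to the global bdi_list. Registering an already-registered bdi is
 * a no-op.
 */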
int bdi_register(struct backing_dev_info *bdi, struct device *parent,
		const char *fmt, ...)
{
	va_list args;
	struct device *dev;

	if (bdi->dev)	/* The driver needs to use separate queues per device */
		return 0;

	va_start(args, fmt);
	dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
	va_end(args);
	if (IS_ERR(dev))
		return PTR_ERR(dev);

	bdi->dev = dev;

	/*
	 * Just start the forker thread for our default backing_dev_info,
	 * and add other bdi's to the list. They will get a thread created
	 * on-demand when they need it.
	 */
	if (bdi_cap_flush_forker(bdi)) {
		struct bdi_writeback *wb = &bdi->wb;

		wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s",
						dev_name(dev));
		if (IS_ERR(wb->task))
			return PTR_ERR(wb->task);
	}

	bdi_debug_register(bdi, dev_name(dev));
	set_bit(BDI_registered, &bdi->state);

	spin_lock_bh(&bdi_lock);
	list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
	spin_unlock_bh(&bdi_lock);

	trace_writeback_bdi_register(bdi);
	return 0;
}
EXPORT_SYMBOL(bdi_register);

int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
{
	return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
}
EXPORT_SYMBOL(bdi_register_dev);

/*
 * Remove bdi from the global list and shutdown any threads we have running
 */
static void bdi_wb_shutdown(struct backing_dev_info *bdi)
{
	struct task_struct *task;

	if (!bdi_cap_writeback_dirty(bdi))
		return;

	/*
	 * Make sure nobody finds us on the bdi_list anymore
	 */
	bdi_remove_from_list(bdi);

	/*
	 * If setup is pending, wait for that to complete first
	 */
	wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
			TASK_UNINTERRUPTIBLE);

	/*
	 * Finally, kill the kernel thread. We don't need to be RCU
	 * safe anymore, since the bdi is gone from visibility.
	 */
	spin_lock_bh(&bdi->wb_lock);
	task = bdi->wb.task;
	bdi->wb.task = NULL;
	spin_unlock_bh(&bdi->wb_lock);

	if (task)
		kthread_stop(task);
}

/*
 * This bdi is going away now, make sure that no super_blocks point to it
 */
static void bdi_prune_sb(struct backing_dev_info *bdi)
{
	struct super_block *sb;

	spin_lock(&sb_lock);
	list_for_each_entry(sb, &super_blocks, s_list) {
		if (sb->s_bdi == bdi)
			sb->s_bdi = &default_backing_dev_info;
	}
	spin_unlock(&sb_lock);
}

void bdi_unregister(struct backing_dev_info *bdi)
{
	struct device *dev = bdi->dev;

	if (dev) {
		bdi_set_min_ratio(bdi, 0);
		trace_writeback_bdi_unregister(bdi);
		bdi_prune_sb(bdi);
		del_timer_sync(&bdi->wb.wakeup_timer);

		if (!bdi_cap_flush_forker(bdi))
			bdi_wb_shutdown(bdi);
		bdi_debug_unregister(bdi);

		spin_lock_bh(&bdi->wb_lock);
		bdi->dev = NULL;
		spin_unlock_bh(&bdi->wb_lock);

		device_unregister(dev);
	}
}
EXPORT_SYMBOL(bdi_unregister);

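/* Initialize the embedded bdi_writeback: inode lists, list lock and wakeup timer. */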
static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
{
	memset(wb, 0, sizeof(*wb));

	wb->bdi = bdi;
	wb->last_old_flush = jiffies;
	INIT_LIST_HEAD(&wb->b_dirty);
	INIT_LIST_HEAD(&wb->b_io);
	INIT_LIST_HEAD(&wb->b_more_io);
	spin_lock_init(&wb->list_lock);
	setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
}

/*
 * Initial write bandwidth: 100 MB/s
 */
#define INIT_BW		(100 << (20 - PAGE_SHIFT))

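/*
 * Initialize a bdi: ratios, locks, the embedded writeback structure, the
 * per-cpu statistics counters and the initial bandwidth estimates.
 */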
int bdi_init(struct backing_dev_info *bdi)
{
	int i, err;

	bdi->dev = NULL;

	bdi->min_ratio = 0;
	bdi->max_ratio = 100;
	bdi->max_prop_frac = FPROP_FRAC_BASE;
	spin_lock_init(&bdi->wb_lock);
	INIT_LIST_HEAD(&bdi->bdi_list);
	INIT_LIST_HEAD(&bdi->work_list);

	bdi_wb_init(&bdi->wb, bdi);

	for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
		err = percpu_counter_init(&bdi->bdi_stat[i], 0);
		if (err)
			goto err;
	}

	bdi->dirty_exceeded = 0;

	bdi->bw_time_stamp = jiffies;
	bdi->written_stamp = 0;

	bdi->balanced_dirty_ratelimit = INIT_BW;
	bdi->dirty_ratelimit = INIT_BW;
	bdi->write_bandwidth = INIT_BW;
	bdi->avg_write_bandwidth = INIT_BW;

	err = fprop_local_init_percpu(&bdi->completions);

	if (err) {
err:
		while (i--)
			percpu_counter_destroy(&bdi->bdi_stat[i]);
	}

	return err;
}
EXPORT_SYMBOL(bdi_init);

void bdi_destroy(struct backing_dev_info *bdi)
{
	int i;

	/*
	 * Splice our entries to the default_backing_dev_info, if this
	 * bdi disappears
	 */
	if (bdi_has_dirty_io(bdi)) {
		struct bdi_writeback *dst = &default_backing_dev_info.wb;

		bdi_lock_two(&bdi->wb, dst);
		list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
		list_splice(&bdi->wb.b_io, &dst->b_io);
		list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
		spin_unlock(&bdi->wb.list_lock);
		spin_unlock(&dst->list_lock);
	}

	bdi_unregister(bdi);

	/*
	 * If bdi_unregister() had already been called earlier, the
	 * wakeup_timer could still be armed because bdi_prune_sb()
	 * can race with the bdi_wakeup_thread_delayed() calls from
	 * __mark_inode_dirty().
	 */
	del_timer_sync(&bdi->wb.wakeup_timer);

	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
		percpu_counter_destroy(&bdi->bdi_stat[i]);

	fprop_local_destroy_percpu(&bdi->completions);
}
EXPORT_SYMBOL(bdi_destroy);

/*
 * For use from filesystems to quickly init and register a bdi associated
 * with dirty writeback
 */
int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
			   unsigned int cap)
{
	char tmp[32];
	int err;

	bdi->name = name;
	bdi->capabilities = cap;
	err = bdi_init(bdi);
	if (err)
		return err;

	sprintf(tmp, "%.28s%s", name, "-%d");
	err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&bdi_seq));
	if (err) {
		bdi_destroy(bdi);
		return err;
	}

	return 0;
}
EXPORT_SYMBOL(bdi_setup_and_register);
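
/*
 * Usage sketch (hypothetical caller, for illustration only; 'sbi' stands in
 * for a filesystem's private superblock info with an embedded bdi):
 *
 *	err = bdi_setup_and_register(&sbi->bdi, "myfs", BDI_CAP_MAP_COPY);
 *	if (err)
 *		return err;
 *	sb->s_bdi = &sbi->bdi;
 *	...
 *	bdi_destroy(&sbi->bdi);		(paired teardown on unmount or error)
 */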

static wait_queue_head_t congestion_wqh[2] = {
		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
	};
static atomic_t nr_bdi_congested[2];
void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
{
	enum bdi_state bit;
	wait_queue_head_t *wqh = &congestion_wqh[sync];

	bit = sync ? BDI_sync_congested : BDI_async_congested;
	if (test_and_clear_bit(bit, &bdi->state))
		atomic_dec(&nr_bdi_congested[sync]);
	smp_mb__after_clear_bit();
	if (waitqueue_active(wqh))
		wake_up(wqh);
}
EXPORT_SYMBOL(clear_bdi_congested);

void set_bdi_congested(struct backing_dev_info *bdi, int sync)
{
	enum bdi_state bit;

	bit = sync ? BDI_sync_congested : BDI_async_congested;
	if (!test_and_set_bit(bit, &bdi->state))
		atomic_inc(&nr_bdi_congested[sync]);
}
EXPORT_SYMBOL(set_bdi_congested);

/**
 * congestion_wait - wait for a backing_dev to become uncongested
 * @sync: SYNC or ASYNC IO
 * @timeout: timeout in jiffies
 *
 * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
 * write congestion.  If no backing_devs are congested then just wait for the
 * next write to be completed.
 */
long congestion_wait(int sync, long timeout)
{
	long ret;
	unsigned long start = jiffies;
	DEFINE_WAIT(wait);
	wait_queue_head_t *wqh = &congestion_wqh[sync];

	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
	ret = io_schedule_timeout(timeout);
	finish_wait(wqh, &wait);

	trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
					jiffies_to_usecs(jiffies - start));

	return ret;
}
EXPORT_SYMBOL(congestion_wait);

/**
 * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
 * @zone: A zone to check if it is heavily congested
 * @sync: SYNC or ASYNC IO
 * @timeout: timeout in jiffies
 *
 * If a backing_dev (any backing_dev) is congested and the given @zone has
 * experienced recent congestion, this waits for up to @timeout jiffies for
 * either a BDI to exit congestion of the given @sync queue or a write to
 * complete.
 *
 * In the absence of zone congestion, cond_resched() is called to yield
 * the processor if necessary, but this function otherwise does not sleep.
 *
 * The return value is 0 if the sleep is for the full timeout. Otherwise,
 * it is the number of jiffies that were still remaining when the function
 * returned. return_value == timeout implies the function did not sleep.
 */
long wait_iff_congested(struct zone *zone, int sync, long timeout)
{
	long ret;
	unsigned long start = jiffies;
	DEFINE_WAIT(wait);
	wait_queue_head_t *wqh = &congestion_wqh[sync];

	/*
	 * If there is no congestion, or heavy congestion is not being
	 * encountered in the current zone, yield if necessary instead
	 * of sleeping on the congestion queue
	 */
	if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
			!zone_is_reclaim_congested(zone)) {
		cond_resched();

		/* In case we scheduled, work out time remaining */
		ret = timeout - (jiffies - start);
		if (ret < 0)
			ret = 0;

		goto out;
	}

	/* Sleep until uncongested or a write happens */
	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
	ret = io_schedule_timeout(timeout);
	finish_wait(wqh, &wait);

out:
	trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
					jiffies_to_usecs(jiffies - start));

	return ret;
}
EXPORT_SYMBOL(wait_iff_congested);

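/*
 * Handler for the obsolete pdflush sysctls: reads always return "0", and a
 * one-time warning is logged that the /proc entry is scheduled for removal.
 */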
int pdflush_proc_obsolete(struct ctl_table *table, int write,
			void __user *buffer, size_t *lenp, loff_t *ppos)
{
	char kbuf[] = "0\n";

	if (*ppos) {
		*lenp = 0;
		return 0;
	}

	if (copy_to_user(buffer, kbuf, sizeof(kbuf)))
		return -EFAULT;
	printk_once(KERN_WARNING "%s exported in /proc is scheduled for removal\n",
			table->procname);

	*lenp = 2;
	*ppos += *lenp;
	return 2;
}