extents_status.c (eeca9fad52fc4bfdf42c38bfcf383e932eb3e9d6) extents_status.c (1ab6c4997e04a00c50c6d786c2f046adc0d1f5de)
1/*
2 * fs/ext4/extents_status.c
3 *
4 * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
5 * Modified by
6 * Allison Henderson <achender@linux.vnet.ibm.com>
7 * Hugh Dickins <hughd@google.com>
8 * Zheng Liu <wenqing.lz@taobao.com>
9 *
10 * Ext4 extents status tree core functions.
11 */
12#include <linux/rbtree.h>
13#include <linux/list_sort.h>
14#include "ext4.h"
15#include "extents_status.h"
1/*
2 * fs/ext4/extents_status.c
3 *
4 * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
5 * Modified by
6 * Allison Henderson <achender@linux.vnet.ibm.com>
7 * Hugh Dickins <hughd@google.com>
8 * Zheng Liu <wenqing.lz@taobao.com>
9 *
10 * Ext4 extents status tree core functions.
11 */
12#include <linux/rbtree.h>
13#include <linux/list_sort.h>
14#include "ext4.h"
15#include "extents_status.h"
16#include "ext4_extents.h"
17
18#include <trace/events/ext4.h>
19
20/*
21 * According to previous discussion in Ext4 Developer Workshop, we
22 * will introduce a new structure called io tree to track all extent
23 * status in order to solve some problems that we have met
24 * (e.g. Reservation space warning), and provide extent-level locking.

--- 233 unchanged lines hidden (view full) ---

258 read_lock(&EXT4_I(inode)->i_es_lock);
259 tree = &EXT4_I(inode)->i_es_tree;
260
261 /* find extent in cache firstly */
262 es->es_lblk = es->es_len = es->es_pblk = 0;
263 if (tree->cache_es) {
264 es1 = tree->cache_es;
265 if (in_range(lblk, es1->es_lblk, es1->es_len)) {
16
17#include <trace/events/ext4.h>
18
19/*
20 * According to previous discussion in Ext4 Developer Workshop, we
21 * will introduce a new structure called io tree to track all extent
22 * status in order to solve some problems that we have met
23 * (e.g. Reservation space warning), and provide extent-level locking.

--- 233 unchanged lines hidden (view full) ---

257 read_lock(&EXT4_I(inode)->i_es_lock);
258 tree = &EXT4_I(inode)->i_es_tree;
259
260 /* find extent in cache firstly */
261 es->es_lblk = es->es_len = es->es_pblk = 0;
262 if (tree->cache_es) {
263 es1 = tree->cache_es;
264 if (in_range(lblk, es1->es_lblk, es1->es_len)) {
266 es_debug("%u cached by [%u/%u) %llu %llx\n",
265 es_debug("%u cached by [%u/%u) %llu %x\n",
267 lblk, es1->es_lblk, es1->es_len,
268 ext4_es_pblock(es1), ext4_es_status(es1));
269 goto out;
270 }
271 }
272
273 es1 = __es_tree_search(&tree->root, lblk);
274

--- 129 unchanged lines hidden (view full) ---

404 rb_erase(node, &tree->root);
405 ext4_es_free_extent(inode, es1);
406 }
407
408 return es;
409}
410
411#ifdef ES_AGGRESSIVE_TEST
266 lblk, es1->es_lblk, es1->es_len,
267 ext4_es_pblock(es1), ext4_es_status(es1));
268 goto out;
269 }
270 }
271
272 es1 = __es_tree_search(&tree->root, lblk);
273

--- 129 unchanged lines hidden (view full) ---

403 rb_erase(node, &tree->root);
404 ext4_es_free_extent(inode, es1);
405 }
406
407 return es;
408}
409
410#ifdef ES_AGGRESSIVE_TEST
411#include "ext4_extents.h" /* Needed when ES_AGGRESSIVE_TEST is defined */
412
412static void ext4_es_insert_extent_ext_check(struct inode *inode,
413 struct extent_status *es)
414{
415 struct ext4_ext_path *path = NULL;
416 struct ext4_extent *ex;
417 ext4_lblk_t ee_block;
418 ext4_fsblk_t ee_start;
419 unsigned short ee_len;
420 int depth, ee_status, es_status;
421
413static void ext4_es_insert_extent_ext_check(struct inode *inode,
414 struct extent_status *es)
415{
416 struct ext4_ext_path *path = NULL;
417 struct ext4_extent *ex;
418 ext4_lblk_t ee_block;
419 ext4_fsblk_t ee_start;
420 unsigned short ee_len;
421 int depth, ee_status, es_status;
422
422 path = ext4_ext_find_extent(inode, es->es_lblk, NULL);
423 path = ext4_ext_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE);
423 if (IS_ERR(path))
424 return;
425
426 depth = ext_depth(inode);
427 ex = path[depth].p_ext;
428
429 if (ex) {
430

--- 205 unchanged lines hidden (view full) ---

636/*
637 * ext4_es_insert_extent() adds information to an inode's extent
638 * status tree.
639 *
640 * Return 0 on success, error code on failure.
641 */
642int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
643 ext4_lblk_t len, ext4_fsblk_t pblk,
424 if (IS_ERR(path))
425 return;
426
427 depth = ext_depth(inode);
428 ex = path[depth].p_ext;
429
430 if (ex) {
431

--- 205 unchanged lines hidden (view full) ---

637/*
638 * ext4_es_insert_extent() adds information to an inode's extent
639 * status tree.
640 *
641 * Return 0 on success, error code on failure.
642 */
643int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
644 ext4_lblk_t len, ext4_fsblk_t pblk,
644 unsigned long long status)
645 unsigned int status)
645{
646 struct extent_status newes;
647 ext4_lblk_t end = lblk + len - 1;
648 int err = 0;
649
646{
647 struct extent_status newes;
648 ext4_lblk_t end = lblk + len - 1;
649 int err = 0;
650
650 es_debug("add [%u/%u) %llu %llx to extent status tree of inode %lu\n",
651 es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
651 lblk, len, pblk, status, inode->i_ino);
652
653 if (!len)
654 return 0;
655
656 BUG_ON(end < lblk);
657
658 newes.es_lblk = lblk;

--- 20 unchanged lines hidden (view full) ---

679 write_unlock(&EXT4_I(inode)->i_es_lock);
680
681 ext4_es_print_tree(inode);
682
683 return err;
684}
685
686/*
652 lblk, len, pblk, status, inode->i_ino);
653
654 if (!len)
655 return 0;
656
657 BUG_ON(end < lblk);
658
659 newes.es_lblk = lblk;

--- 20 unchanged lines hidden (view full) ---

680 write_unlock(&EXT4_I(inode)->i_es_lock);
681
682 ext4_es_print_tree(inode);
683
684 return err;
685}
686
687/*
688 * ext4_es_cache_extent() inserts information into the extent status
689 * tree if and only if there isn't information about the range in
690 * question already.
691 */
692void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
693 ext4_lblk_t len, ext4_fsblk_t pblk,
694 unsigned int status)
695{
696 struct extent_status *es;
697 struct extent_status newes;
698 ext4_lblk_t end = lblk + len - 1;
699
700 newes.es_lblk = lblk;
701 newes.es_len = len;
702 ext4_es_store_pblock(&newes, pblk);
703 ext4_es_store_status(&newes, status);
704 trace_ext4_es_cache_extent(inode, &newes);
705
706 if (!len)
707 return;
708
709 BUG_ON(end < lblk);
710
711 write_lock(&EXT4_I(inode)->i_es_lock);
712
713 es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk);
714 if (!es || es->es_lblk > end)
715 __es_insert_extent(inode, &newes);
716 write_unlock(&EXT4_I(inode)->i_es_lock);
717}
718
719/*
687 * ext4_es_lookup_extent() looks up an extent in extent status tree.
688 *
689 * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks.
690 *
691 * Return: 1 on found, 0 on not
692 */
693int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
694 struct extent_status *es)

--- 171 unchanged lines hidden (view full) ---

866
867 write_lock(&EXT4_I(inode)->i_es_lock);
868 err = __es_remove_extent(inode, lblk, end);
869 write_unlock(&EXT4_I(inode)->i_es_lock);
870 ext4_es_print_tree(inode);
871 return err;
872}
873
720 * ext4_es_lookup_extent() looks up an extent in extent status tree.
721 *
722 * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks.
723 *
724 * Return: 1 on found, 0 on not
725 */
726int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
727 struct extent_status *es)

--- 171 unchanged lines hidden (view full) ---

899
900 write_lock(&EXT4_I(inode)->i_es_lock);
901 err = __es_remove_extent(inode, lblk, end);
902 write_unlock(&EXT4_I(inode)->i_es_lock);
903 ext4_es_print_tree(inode);
904 return err;
905}
906
874int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex)
875{
876 ext4_lblk_t ee_block;
877 ext4_fsblk_t ee_pblock;
878 unsigned int ee_len;
879
880 ee_block = le32_to_cpu(ex->ee_block);
881 ee_len = ext4_ext_get_actual_len(ex);
882 ee_pblock = ext4_ext_pblock(ex);
883
884 if (ee_len == 0)
885 return 0;
886
887 return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
888 EXTENT_STATUS_WRITTEN);
889}
890
891static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
892 struct list_head *b)
893{
894 struct ext4_inode_info *eia, *eib;
895 eia = list_entry(a, struct ext4_inode_info, i_es_lru);
896 eib = list_entry(b, struct ext4_inode_info, i_es_lru);
897
907static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
908 struct list_head *b)
909{
910 struct ext4_inode_info *eia, *eib;
911 eia = list_entry(a, struct ext4_inode_info, i_es_lru);
912 eib = list_entry(b, struct ext4_inode_info, i_es_lru);
913
914 if (ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
915 !ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
916 return 1;
917 if (!ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
918 ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
919 return -1;
898 if (eia->i_touch_when == eib->i_touch_when)
899 return 0;
900 if (time_after(eia->i_touch_when, eib->i_touch_when))
901 return 1;
902 else
903 return -1;
904}
905
906static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
907 struct ext4_inode_info *locked_ei)
908{
909 struct ext4_inode_info *ei;
910 struct list_head *cur, *tmp;
920 if (eia->i_touch_when == eib->i_touch_when)
921 return 0;
922 if (time_after(eia->i_touch_when, eib->i_touch_when))
923 return 1;
924 else
925 return -1;
926}
927
928static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
929 struct ext4_inode_info *locked_ei)
930{
931 struct ext4_inode_info *ei;
932 struct list_head *cur, *tmp;
911 LIST_HEAD(skiped);
912 int ret, nr_shrunk = 0;
933 LIST_HEAD(skipped);
934 int nr_shrunk = 0;
935 int retried = 0, skip_precached = 1, nr_skipped = 0;
913
914 spin_lock(&sbi->s_es_lru_lock);
915
936
937 spin_lock(&sbi->s_es_lru_lock);
938
916 /*
917 * If the inode that is at the head of LRU list is newer than
918 * last_sorted time, that means that we need to sort this list.
919 */
920 ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru);
921 if (sbi->s_es_last_sorted < ei->i_touch_when) {
922 list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
923 sbi->s_es_last_sorted = jiffies;
924 }
925
939retry:
926 list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
940 list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
941 int shrunk;
942
927 /*
928 * If we have already reclaimed all extents from extent
929 * status tree, just stop the loop immediately.
930 */
931 if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0)
932 break;
933
934 ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
935
943 /*
944 * If we have already reclaimed all extents from extent
945 * status tree, just stop the loop immediately.
946 */
947 if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0)
948 break;
949
950 ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
951
936 /* Skip the inode that is newer than the last_sorted time */
937 if (sbi->s_es_last_sorted < ei->i_touch_when) {
938 list_move_tail(cur, &skiped);
952 /*
953 * Skip the inode that is newer than the last_sorted
954 * time. Normally we try hard to avoid shrinking
955 * precached inodes, but we will as a last resort.
956 */
957 if ((sbi->s_es_last_sorted < ei->i_touch_when) ||
958 (skip_precached && ext4_test_inode_state(&ei->vfs_inode,
959 EXT4_STATE_EXT_PRECACHED))) {
960 nr_skipped++;
961 list_move_tail(cur, &skipped);
939 continue;
940 }
941
942 if (ei->i_es_lru_nr == 0 || ei == locked_ei)
943 continue;
944
945 write_lock(&ei->i_es_lock);
962 continue;
963 }
964
965 if (ei->i_es_lru_nr == 0 || ei == locked_ei)
966 continue;
967
968 write_lock(&ei->i_es_lock);
946 ret = __es_try_to_reclaim_extents(ei, nr_to_scan);
969 shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan);
947 if (ei->i_es_lru_nr == 0)
948 list_del_init(&ei->i_es_lru);
949 write_unlock(&ei->i_es_lock);
950
970 if (ei->i_es_lru_nr == 0)
971 list_del_init(&ei->i_es_lru);
972 write_unlock(&ei->i_es_lock);
973
951 nr_shrunk += ret;
952 nr_to_scan -= ret;
974 nr_shrunk += shrunk;
975 nr_to_scan -= shrunk;
953 if (nr_to_scan == 0)
954 break;
955 }
956
957 /* Move the newer inodes into the tail of the LRU list. */
976 if (nr_to_scan == 0)
977 break;
978 }
979
980 /* Move the newer inodes into the tail of the LRU list. */
958 list_splice_tail(&skiped, &sbi->s_es_lru);
981 list_splice_tail(&skipped, &sbi->s_es_lru);
982 INIT_LIST_HEAD(&skipped);
983
984 /*
985 * If we skipped any inodes, and we weren't able to make any
986 * forward progress, sort the list and try again.
987 */
988 if ((nr_shrunk == 0) && nr_skipped && !retried) {
989 retried++;
990 list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
991 sbi->s_es_last_sorted = jiffies;
992 ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info,
993 i_es_lru);
994 /*
995 * If there are no non-precached inodes left on the
996 * list, start releasing precached extents.
997 */
998 if (ext4_test_inode_state(&ei->vfs_inode,
999 EXT4_STATE_EXT_PRECACHED))
1000 skip_precached = 0;
1001 goto retry;
1002 }
1003
959 spin_unlock(&sbi->s_es_lru_lock);
960
961 if (locked_ei && nr_shrunk == 0)
1004 spin_unlock(&sbi->s_es_lru_lock);
1005
1006 if (locked_ei && nr_shrunk == 0)
962 nr_shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan);
1007 nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);
963
964 return nr_shrunk;
965}
966
1008
1009 return nr_shrunk;
1010}
1011
967static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
1012static unsigned long ext4_es_count(struct shrinker *shrink,
1013 struct shrink_control *sc)
968{
1014{
1015 unsigned long nr;
1016 struct ext4_sb_info *sbi;
1017
1018 sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
1019 nr = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
1020 trace_ext4_es_shrink_enter(sbi->s_sb, sc->nr_to_scan, nr);
1021 return nr;
1022}
1023
1024static unsigned long ext4_es_scan(struct shrinker *shrink,
1025 struct shrink_control *sc)
1026{
969 struct ext4_sb_info *sbi = container_of(shrink,
970 struct ext4_sb_info, s_es_shrinker);
971 int nr_to_scan = sc->nr_to_scan;
972 int ret, nr_shrunk;
973
974 ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
975 trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret);
976
977 if (!nr_to_scan)
978 return ret;
979
980 nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL);
981
1027 struct ext4_sb_info *sbi = container_of(shrink,
1028 struct ext4_sb_info, s_es_shrinker);
1029 int nr_to_scan = sc->nr_to_scan;
1030 int ret, nr_shrunk;
1031
1032 ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
1033 trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret);
1034
1035 if (!nr_to_scan)
1036 return ret;
1037
1038 nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL);
1039
982 ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
983 trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret);
1040 trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret);
984 return ret;
1041 return nr_shrunk;
985}
986
987void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
988{
989 INIT_LIST_HEAD(&sbi->s_es_lru);
990 spin_lock_init(&sbi->s_es_lru_lock);
991 sbi->s_es_last_sorted = 0;
1042}
1043
1044void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
1045{
1046 INIT_LIST_HEAD(&sbi->s_es_lru);
1047 spin_lock_init(&sbi->s_es_lru_lock);
1048 sbi->s_es_last_sorted = 0;
992 sbi->s_es_shrinker.shrink = ext4_es_shrink;
1049 sbi->s_es_shrinker.scan_objects = ext4_es_scan;
1050 sbi->s_es_shrinker.count_objects = ext4_es_count;
993 sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
994 register_shrinker(&sbi->s_es_shrinker);
995}
996
997void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
998{
999 unregister_shrinker(&sbi->s_es_shrinker);
1000}

--- 27 unchanged lines hidden (view full) ---

1028
1029static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
1030 int nr_to_scan)
1031{
1032 struct inode *inode = &ei->vfs_inode;
1033 struct ext4_es_tree *tree = &ei->i_es_tree;
1034 struct rb_node *node;
1035 struct extent_status *es;
1051 sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
1052 register_shrinker(&sbi->s_es_shrinker);
1053}
1054
1055void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
1056{
1057 unregister_shrinker(&sbi->s_es_shrinker);
1058}

--- 27 unchanged lines hidden (view full) ---

1086
1087static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
1088 int nr_to_scan)
1089{
1090 struct inode *inode = &ei->vfs_inode;
1091 struct ext4_es_tree *tree = &ei->i_es_tree;
1092 struct rb_node *node;
1093 struct extent_status *es;
1036 int nr_shrunk = 0;
1094 unsigned long nr_shrunk = 0;
1095 static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
1096 DEFAULT_RATELIMIT_BURST);
1037
1038 if (ei->i_es_lru_nr == 0)
1039 return 0;
1040
1097
1098 if (ei->i_es_lru_nr == 0)
1099 return 0;
1100
1101 if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
1102 __ratelimit(&_rs))
1103 ext4_warning(inode->i_sb, "forced shrink of precached extents");
1104
1041 node = rb_first(&tree->root);
1042 while (node != NULL) {
1043 es = rb_entry(node, struct extent_status, rb_node);
1044 node = rb_next(&es->rb_node);
1045 /*
1046 * We can't reclaim delayed extent from status tree because
 1047		 * fiemap, bigalloc, and seek_data/hole need to use it.
1048 */
1049 if (!ext4_es_is_delayed(es)) {
1050 rb_erase(&es->rb_node, &tree->root);
1051 ext4_es_free_extent(inode, es);
1052 nr_shrunk++;
1053 if (--nr_to_scan == 0)
1054 break;
1055 }
1056 }
1057 tree->cache_es = NULL;
1058 return nr_shrunk;
1059}
1105 node = rb_first(&tree->root);
1106 while (node != NULL) {
1107 es = rb_entry(node, struct extent_status, rb_node);
1108 node = rb_next(&es->rb_node);
1109 /*
1110 * We can't reclaim delayed extent from status tree because
 1111		 * fiemap, bigalloc, and seek_data/hole need to use it.
1112 */
1113 if (!ext4_es_is_delayed(es)) {
1114 rb_erase(&es->rb_node, &tree->root);
1115 ext4_es_free_extent(inode, es);
1116 nr_shrunk++;
1117 if (--nr_to_scan == 0)
1118 break;
1119 }
1120 }
1121 tree->cache_es = NULL;
1122 return nr_shrunk;
1123}