xref: /openbmc/linux/mm/memory-tiers.c (revision 8d3a7d79)
1992bf775SAneesh Kumar K.V // SPDX-License-Identifier: GPL-2.0
2992bf775SAneesh Kumar K.V #include <linux/slab.h>
3992bf775SAneesh Kumar K.V #include <linux/lockdep.h>
491952440SAneesh Kumar K.V #include <linux/sysfs.h>
591952440SAneesh Kumar K.V #include <linux/kobject.h>
6c6123a19SAneesh Kumar K.V #include <linux/memory.h>
7992bf775SAneesh Kumar K.V #include <linux/memory-tiers.h>
8992bf775SAneesh Kumar K.V 
96c542ab7SAneesh Kumar K.V #include "internal.h"
106c542ab7SAneesh Kumar K.V 
struct memory_tier {
	/* hierarchy of memory tiers */
	struct list_head list;
	/* list of all memory types part of this tier */
	struct list_head memory_types;
	/*
	 * start value of abstract distance. memory tier maps
	 * an abstract distance  range,
	 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
	 */
	int adistance_start;
	/* device used to expose this tier via the memory_tiering sysfs bus */
	struct device dev;
	/* All the nodes that are part of all the lower memory tiers. */
	nodemask_t lower_tier_mask;
};
26992bf775SAneesh Kumar K.V 
/* Per-node set of preferred demotion targets (see node_demotion[] below). */
struct demotion_nodes {
	nodemask_t preferred;
};
306c542ab7SAneesh Kumar K.V 
struct node_memory_type_map {
	/* memory type currently registered for this NUMA node */
	struct memory_dev_type *memtype;
	/* number of devices on this node mapped to this memtype */
	int map_count;
};
35992bf775SAneesh Kumar K.V 
/* Serializes memory_tiers, node_memory_types and pgdat->memtier updates. */
static DEFINE_MUTEX(memory_tier_lock);
/* All memory tiers, kept sorted by increasing adistance_start. */
static LIST_HEAD(memory_tiers);
static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
static struct memory_dev_type *default_dram_type;

/* sysfs bus under which each memory tier is registered as a device. */
static struct bus_type memory_tier_subsys = {
	.name = "memory_tiering",
	.dev_name = "memory_tier",
};
459832fb87SAneesh Kumar K.V 
#ifdef CONFIG_MIGRATION
/*
 * Largest abstract distance still considered top tier; set by
 * establish_demotion_targets() and consumed by node_is_toptier().
 */
static int top_tier_adistance;
486c542ab7SAneesh Kumar K.V /*
496c542ab7SAneesh Kumar K.V  * node_demotion[] examples:
506c542ab7SAneesh Kumar K.V  *
516c542ab7SAneesh Kumar K.V  * Example 1:
526c542ab7SAneesh Kumar K.V  *
536c542ab7SAneesh Kumar K.V  * Node 0 & 1 are CPU + DRAM nodes, node 2 & 3 are PMEM nodes.
546c542ab7SAneesh Kumar K.V  *
556c542ab7SAneesh Kumar K.V  * node distances:
566c542ab7SAneesh Kumar K.V  * node   0    1    2    3
576c542ab7SAneesh Kumar K.V  *    0  10   20   30   40
586c542ab7SAneesh Kumar K.V  *    1  20   10   40   30
596c542ab7SAneesh Kumar K.V  *    2  30   40   10   40
606c542ab7SAneesh Kumar K.V  *    3  40   30   40   10
616c542ab7SAneesh Kumar K.V  *
626c542ab7SAneesh Kumar K.V  * memory_tiers0 = 0-1
636c542ab7SAneesh Kumar K.V  * memory_tiers1 = 2-3
646c542ab7SAneesh Kumar K.V  *
656c542ab7SAneesh Kumar K.V  * node_demotion[0].preferred = 2
666c542ab7SAneesh Kumar K.V  * node_demotion[1].preferred = 3
676c542ab7SAneesh Kumar K.V  * node_demotion[2].preferred = <empty>
686c542ab7SAneesh Kumar K.V  * node_demotion[3].preferred = <empty>
696c542ab7SAneesh Kumar K.V  *
706c542ab7SAneesh Kumar K.V  * Example 2:
716c542ab7SAneesh Kumar K.V  *
726c542ab7SAneesh Kumar K.V  * Node 0 & 1 are CPU + DRAM nodes, node 2 is memory-only DRAM node.
736c542ab7SAneesh Kumar K.V  *
746c542ab7SAneesh Kumar K.V  * node distances:
756c542ab7SAneesh Kumar K.V  * node   0    1    2
766c542ab7SAneesh Kumar K.V  *    0  10   20   30
776c542ab7SAneesh Kumar K.V  *    1  20   10   30
786c542ab7SAneesh Kumar K.V  *    2  30   30   10
796c542ab7SAneesh Kumar K.V  *
806c542ab7SAneesh Kumar K.V  * memory_tiers0 = 0-2
816c542ab7SAneesh Kumar K.V  *
826c542ab7SAneesh Kumar K.V  * node_demotion[0].preferred = <empty>
836c542ab7SAneesh Kumar K.V  * node_demotion[1].preferred = <empty>
846c542ab7SAneesh Kumar K.V  * node_demotion[2].preferred = <empty>
856c542ab7SAneesh Kumar K.V  *
866c542ab7SAneesh Kumar K.V  * Example 3:
876c542ab7SAneesh Kumar K.V  *
886c542ab7SAneesh Kumar K.V  * Node 0 is CPU + DRAM nodes, Node 1 is HBM node, node 2 is PMEM node.
896c542ab7SAneesh Kumar K.V  *
906c542ab7SAneesh Kumar K.V  * node distances:
916c542ab7SAneesh Kumar K.V  * node   0    1    2
926c542ab7SAneesh Kumar K.V  *    0  10   20   30
936c542ab7SAneesh Kumar K.V  *    1  20   10   40
946c542ab7SAneesh Kumar K.V  *    2  30   40   10
956c542ab7SAneesh Kumar K.V  *
966c542ab7SAneesh Kumar K.V  * memory_tiers0 = 1
976c542ab7SAneesh Kumar K.V  * memory_tiers1 = 0
986c542ab7SAneesh Kumar K.V  * memory_tiers2 = 2
996c542ab7SAneesh Kumar K.V  *
1006c542ab7SAneesh Kumar K.V  * node_demotion[0].preferred = 2
1016c542ab7SAneesh Kumar K.V  * node_demotion[1].preferred = 0
1026c542ab7SAneesh Kumar K.V  * node_demotion[2].preferred = <empty>
1036c542ab7SAneesh Kumar K.V  *
1046c542ab7SAneesh Kumar K.V  */
/* Per-node demotion targets; may be NULL — readers must check before use. */
static struct demotion_nodes *node_demotion __read_mostly;
#endif /* CONFIG_MIGRATION */
107992bf775SAneesh Kumar K.V 
/* Map the embedded sysfs device back to its containing memory_tier. */
static inline struct memory_tier *to_memory_tier(struct device *device)
{
	return container_of(device, struct memory_tier, dev);
}
/*
 * Return the union of the node masks of every memory type attached to
 * @memtier, i.e. all NUMA nodes that belong to this tier.
 */
static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
{
	nodemask_t nodes = NODE_MASK_NONE;
	struct memory_dev_type *memtype;

	list_for_each_entry(memtype, &memtier->memory_types, tier_sibiling)
		nodes_or(nodes, nodes, memtype->nodes);

	return nodes;
}
1239832fb87SAneesh Kumar K.V 
/* Device release callback: frees the tier once its last reference drops. */
static void memory_tier_device_release(struct device *dev)
{
	struct memory_tier *tier = to_memory_tier(dev);
	/*
	 * synchronize_rcu in clear_node_memory_tier makes sure
	 * we don't have rcu access to this memory tier.
	 */
	kfree(tier);
}
1339832fb87SAneesh Kumar K.V 
/*
 * sysfs "nodelist" attribute: print the nodes of this tier in nodelist
 * format (e.g. "0-1,3"). memory_tier_lock keeps the mask stable while
 * it is collected and formatted.
 */
static ssize_t nodelist_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	int ret;
	nodemask_t nmask;

	mutex_lock(&memory_tier_lock);
	nmask = get_memtier_nodemask(to_memory_tier(dev));
	ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask));
	mutex_unlock(&memory_tier_lock);
	return ret;
}
static DEVICE_ATTR_RO(nodelist);
1479832fb87SAneesh Kumar K.V 
/* Attributes installed on every memory tier device (currently: nodelist). */
static struct attribute *memtier_dev_attrs[] = {
	&dev_attr_nodelist.attr,
	NULL
};

static const struct attribute_group memtier_dev_group = {
	.attrs = memtier_dev_attrs,
};

static const struct attribute_group *memtier_dev_groups[] = {
	&memtier_dev_group,
	NULL
};
1619832fb87SAneesh Kumar K.V 
/*
 * Find the memory tier covering @memtype's abstract distance, creating
 * and registering a new tier if none exists yet, and link @memtype into
 * that tier. Returns the tier or an ERR_PTR on failure.
 *
 * Caller must hold memory_tier_lock.
 */
static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype)
{
	int ret;
	bool found_slot = false;
	struct memory_tier *memtier, *new_memtier;
	int adistance = memtype->adistance;
	unsigned int memtier_adistance_chunk_size = MEMTIER_CHUNK_SIZE;

	lockdep_assert_held_once(&memory_tier_lock);

	/* A tier covers one MEMTIER_CHUNK_SIZE-aligned adistance range. */
	adistance = round_down(adistance, memtier_adistance_chunk_size);
	/*
	 * If the memtype is already part of a memory tier,
	 * just return that.
	 */
	if (!list_empty(&memtype->tier_sibiling)) {
		list_for_each_entry(memtier, &memory_tiers, list) {
			if (adistance == memtier->adistance_start)
				return memtier;
		}
		/* Linked but no matching tier: internal inconsistency. */
		WARN_ON(1);
		return ERR_PTR(-EINVAL);
	}

	/* memory_tiers is sorted by adistance_start; find match or slot. */
	list_for_each_entry(memtier, &memory_tiers, list) {
		if (adistance == memtier->adistance_start) {
			goto link_memtype;
		} else if (adistance < memtier->adistance_start) {
			found_slot = true;
			break;
		}
	}

	new_memtier = kzalloc(sizeof(struct memory_tier), GFP_KERNEL);
	if (!new_memtier)
		return ERR_PTR(-ENOMEM);

	new_memtier->adistance_start = adistance;
	INIT_LIST_HEAD(&new_memtier->list);
	INIT_LIST_HEAD(&new_memtier->memory_types);
	/* Insert before the first tier with a larger adistance, else at tail. */
	if (found_slot)
		list_add_tail(&new_memtier->list, &memtier->list);
	else
		list_add_tail(&new_memtier->list, &memory_tiers);

	new_memtier->dev.id = adistance >> MEMTIER_CHUNK_BITS;
	new_memtier->dev.bus = &memory_tier_subsys;
	new_memtier->dev.release = memory_tier_device_release;
	new_memtier->dev.groups = memtier_dev_groups;

	ret = device_register(&new_memtier->dev);
	if (ret) {
		/* Unlink, then let the release callback free the tier. */
		list_del(&new_memtier->list);
		put_device(&new_memtier->dev);
		return ERR_PTR(ret);
	}
	memtier = new_memtier;

link_memtype:
	list_add(&memtype->tier_sibiling, &memtier->memory_types);
	return memtier;
}
224992bf775SAneesh Kumar K.V 
/*
 * Return the memory tier of @node, or NULL if the node has no pgdat or
 * no tier assigned. Caller must hold memory_tier_lock.
 */
static struct memory_tier *__node_get_memory_tier(int node)
{
	pg_data_t *pgdat;

	pgdat = NODE_DATA(node);
	if (!pgdat)
		return NULL;
	/*
	 * Since we hold memory_tier_lock, we can avoid
	 * RCU read locks when accessing the details. No
	 * parallel updates are possible here.
	 */
	return rcu_dereference_check(pgdat->memtier,
				     lockdep_is_held(&memory_tier_lock));
}
2406c542ab7SAneesh Kumar K.V 
#ifdef CONFIG_MIGRATION
/*
 * Return true if @node belongs to a top memory tier, i.e. one from
 * which pages are not promoted. A node without an assigned tier is
 * treated as top tier.
 */
bool node_is_toptier(int node)
{
	bool toptier;
	pg_data_t *pgdat;
	struct memory_tier *memtier;

	pgdat = NODE_DATA(node);
	if (!pgdat)
		return false;

	rcu_read_lock();
	memtier = rcu_dereference(pgdat->memtier);
	if (!memtier) {
		toptier = true;
		goto out;
	}
	/* Tiers at or below top_tier_adistance are considered top tier. */
	if (memtier->adistance_start <= top_tier_adistance)
		toptier = true;
	else
		toptier = false;
out:
	rcu_read_unlock();
	return toptier;
}
266467b171aSAneesh Kumar K.V 
/*
 * Fill *@targets with all nodes in tiers below @pgdat's tier — the set
 * of allowed demotion-fallback targets. Empty mask if no tier is set.
 */
void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
{
	struct memory_tier *memtier;

	/*
	 * pg_data_t.memtier updates includes a synchronize_rcu()
	 * which ensures that we either find NULL or a valid memtier
	 * in NODE_DATA. protect the access via rcu_read_lock();
	 */
	rcu_read_lock();
	memtier = rcu_dereference(pgdat->memtier);
	if (memtier)
		*targets = memtier->lower_tier_mask;
	else
		*targets = NODE_MASK_NONE;
	rcu_read_unlock();
}
28432008027SJagdish Gediya 
/**
 * next_demotion_node() - Get the next node in the demotion path
 * @node: The starting node to lookup the next node
 *
 * Return: node id for next memory node in the demotion path hierarchy
 * from @node; NUMA_NO_NODE if @node is terminal.  This does not keep
 * @node online or guarantee that it *continues* to be the next demotion
 * target.
 */
int next_demotion_node(int node)
{
	struct demotion_nodes *nd;
	int target;

	/* Demotion not set up (yet); nothing to demote to. */
	if (!node_demotion)
		return NUMA_NO_NODE;

	nd = &node_demotion[node];

	/*
	 * node_demotion[] is updated without excluding this
	 * function from running.
	 *
	 * Make sure to use RCU over entire code blocks if
	 * node_demotion[] reads need to be consistent.
	 */
	rcu_read_lock();
	/*
	 * If there are multiple target nodes, just select one
	 * target node randomly.
	 *
	 * In addition, we can also use round-robin to select
	 * target node, but we should introduce another variable
	 * for node_demotion[] to record last selected target node,
	 * that may cause cache ping-pong due to the changing of
	 * last target node. Or introducing per-cpu data to avoid
	 * caching issue, which seems more complicated. So selecting
	 * target node randomly seems better until now.
	 */
	target = node_random(&nd->preferred);
	rcu_read_unlock();

	return target;
}
3296c542ab7SAneesh Kumar K.V 
/*
 * Clear every node's preferred demotion targets and every tier's
 * lower_tier_mask, then wait for in-flight RCU readers so the cleared
 * state is globally visible. Caller must hold memory_tier_lock.
 */
static void disable_all_demotion_targets(void)
{
	struct memory_tier *memtier;
	int node;

	for_each_node_state(node, N_MEMORY) {
		node_demotion[node].preferred = NODE_MASK_NONE;
		/*
		 * We are holding memory_tier_lock, it is safe
		 * to access pgdat->memtier.
		 */
		memtier = __node_get_memory_tier(node);
		if (memtier)
			memtier->lower_tier_mask = NODE_MASK_NONE;
	}
	/*
	 * Ensure that the "disable" is visible across the system.
	 * Readers will see either a combination of before+disable
	 * state or disable+after.  They will never see before and
	 * after state together.
	 */
	synchronize_rcu();
}
3536c542ab7SAneesh Kumar K.V 
/*
 * Find an automatic demotion target for all memory
 * nodes. Failing here is OK.  It might just indicate
 * being at the end of a chain.
 */
static void establish_demotion_targets(void)
{
	struct memory_tier *memtier;
	struct demotion_nodes *nd;
	int target = NUMA_NO_NODE, node;
	int distance, best_distance;
	nodemask_t tier_nodes, lower_tier;

	lockdep_assert_held_once(&memory_tier_lock);

	if (!node_demotion)
		return;

	/* Start from a clean slate; also syncs RCU readers. */
	disable_all_demotion_targets();

	for_each_node_state(node, N_MEMORY) {
		best_distance = -1;
		nd = &node_demotion[node];

		memtier = __node_get_memory_tier(node);
		if (!memtier || list_is_last(&memtier->list, &memory_tiers))
			continue;
		/*
		 * Get the lower memtier to find the  demotion node list.
		 */
		memtier = list_next_entry(memtier, list);
		tier_nodes = get_memtier_nodemask(memtier);
		/*
		 * find_next_best_node, use 'used' nodemask as a skip list.
		 * Add all memory nodes except the selected memory tier
		 * nodelist to skip list so that we find the best node from the
		 * memtier nodelist.
		 */
		nodes_andnot(tier_nodes, node_states[N_MEMORY], tier_nodes);

		/*
		 * Find all the nodes in the memory tier node list of same best distance.
		 * add them to the preferred mask. We randomly select between nodes
		 * in the preferred mask when allocating pages during demotion.
		 */
		do {
			target = find_next_best_node(node, &tier_nodes);
			if (target == NUMA_NO_NODE)
				break;

			distance = node_distance(node, target);
			if (distance == best_distance || best_distance == -1) {
				best_distance = distance;
				node_set(target, nd->preferred);
			} else {
				break;
			}
		} while (1);
	}
	/*
	 * Promotion is allowed from a memory tier to higher
	 * memory tier only if the memory tier doesn't include
	 * compute. We want to skip promotion from a memory tier,
	 * if any node that is part of the memory tier have CPUs.
	 * Once we detect such a memory tier, we consider that tier
	 * as top tier from which promotion is not allowed.
	 */
	list_for_each_entry_reverse(memtier, &memory_tiers, list) {
		tier_nodes = get_memtier_nodemask(memtier);
		nodes_and(tier_nodes, node_states[N_CPU], tier_nodes);
		if (!nodes_empty(tier_nodes)) {
			/*
			 * abstract distance below the max value of this memtier
			 * is considered toptier.
			 */
			top_tier_adistance = memtier->adistance_start +
						MEMTIER_CHUNK_SIZE - 1;
			break;
		}
	}
	/*
	 * Now build the lower_tier mask for each node collecting node mask from
	 * all memory tier below it. This allows us to fallback demotion page
	 * allocation to a set of nodes that is closer to the above selected
	 * preferred node.
	 */
	lower_tier = node_states[N_MEMORY];
	list_for_each_entry(memtier, &memory_tiers, list) {
		/*
		 * Keep removing current tier from lower_tier nodes,
		 * This will remove all nodes in current and above
		 * memory tier from the lower_tier mask.
		 */
		tier_nodes = get_memtier_nodemask(memtier);
		nodes_andnot(lower_tier, lower_tier, tier_nodes);
		memtier->lower_tier_mask = lower_tier;
	}
}
4526c542ab7SAneesh Kumar K.V 
#else
/* No migration support: demotion targets are never established. */
static inline void establish_demotion_targets(void) {}
#endif /* CONFIG_MIGRATION */
4566c542ab7SAneesh Kumar K.V 
/*
 * Register @memtype as @node's memory type if the node has none yet,
 * and account one more device mapping. Caller must hold
 * memory_tier_lock.
 */
static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype)
{
	if (!node_memory_types[node].memtype)
		node_memory_types[node].memtype = memtype;
	/*
	 * for each device getting added in the same NUMA node
	 * with this specific memtype, bump the map count. We
	 * only take memtype device reference once, so that
	 * changing a node memtype can be done by dropping the
	 * only reference count taken here.
	 */

	if (node_memory_types[node].memtype == memtype) {
		if (!node_memory_types[node].map_count++)
			kref_get(&memtype->kref);
	}
}
4747b88bda3SAneesh Kumar K.V 
/*
 * Assign @node (which must be in N_MEMORY) to a memory tier based on
 * its memory type, defaulting to default_dram_type, and publish the
 * tier in pgdat->memtier. Returns the tier or an ERR_PTR.
 *
 * Caller must hold memory_tier_lock.
 */
static struct memory_tier *set_node_memory_tier(int node)
{
	struct memory_tier *memtier;
	struct memory_dev_type *memtype;
	pg_data_t *pgdat = NODE_DATA(node);


	lockdep_assert_held_once(&memory_tier_lock);

	if (!node_state(node, N_MEMORY))
		return ERR_PTR(-EINVAL);

	/* Falls back to DRAM type if no driver registered a type earlier. */
	__init_node_memory_type(node, default_dram_type);

	memtype = node_memory_types[node].memtype;
	node_set(node, memtype->nodes);
	memtier = find_create_memory_tier(memtype);
	if (!IS_ERR(memtier))
		rcu_assign_pointer(pgdat->memtier, memtier);
	return memtier;
}
496992bf775SAneesh Kumar K.V 
/* Unlink an empty tier and drop its device; release callback frees it. */
static void destroy_memory_tier(struct memory_tier *memtier)
{
	list_del(&memtier->list);
	device_unregister(&memtier->dev);
}
502c6123a19SAneesh Kumar K.V 
/*
 * Detach @node from its memory tier: clear pgdat->memtier, remove the
 * node from its memory type's mask, and tear down the type/tier if
 * they become empty. Returns true if the node had a tier to clear.
 *
 * Caller must hold memory_tier_lock.
 */
static bool clear_node_memory_tier(int node)
{
	bool cleared = false;
	pg_data_t *pgdat;
	struct memory_tier *memtier;

	pgdat = NODE_DATA(node);
	if (!pgdat)
		return false;

	/*
	 * Make sure that anybody looking at NODE_DATA who finds
	 * a valid memtier finds memory_dev_types with nodes still
	 * linked to the memtier. We achieve this by waiting for
	 * rcu read section to finish using synchronize_rcu.
	 * This also enables us to free the destroyed memory tier
	 * with kfree instead of kfree_rcu
	 */
	memtier = __node_get_memory_tier(node);
	if (memtier) {
		rcu_assign_pointer(pgdat->memtier, NULL);
		synchronize_rcu();
		memtype = node_memory_types[node].memtype;
		node_clear(node, memtype->nodes);
		if (nodes_empty(memtype->nodes)) {
			list_del_init(&memtype->tier_sibiling);
			if (list_empty(&memtier->memory_types))
				destroy_memory_tier(memtier);
		}
		cleared = true;
	}
	return cleared;
}
538c6123a19SAneesh Kumar K.V 
/* kref release: frees the memory type once the last reference is put. */
static void release_memtype(struct kref *kref)
{
	struct memory_dev_type *memtype;

	memtype = container_of(kref, struct memory_dev_type, kref);
	kfree(memtype);
}
5467b88bda3SAneesh Kumar K.V 
/*
 * Allocate a new memory_dev_type with abstract distance @adistance,
 * an empty node mask and a reference count of one. Returns the type
 * or ERR_PTR(-ENOMEM). Release the reference with put_memory_type().
 */
struct memory_dev_type *alloc_memory_type(int adistance)
{
	struct memory_dev_type *memtype;

	memtype = kmalloc(sizeof(*memtype), GFP_KERNEL);
	if (!memtype)
		return ERR_PTR(-ENOMEM);

	memtype->adistance = adistance;
	INIT_LIST_HEAD(&memtype->tier_sibiling);
	memtype->nodes  = NODE_MASK_NONE;
	kref_init(&memtype->kref);
	return memtype;
}
EXPORT_SYMBOL_GPL(alloc_memory_type);
5627b88bda3SAneesh Kumar K.V 
/* Drop one reference on @memtype; frees it when the count hits zero. */
void put_memory_type(struct memory_dev_type *memtype)
{
	kref_put(&memtype->kref, release_memtype);
}
EXPORT_SYMBOL_GPL(put_memory_type);
5687b88bda3SAneesh Kumar K.V 
/* Locked wrapper around __init_node_memory_type() for driver use. */
void init_node_memory_type(int node, struct memory_dev_type *memtype)
{

	mutex_lock(&memory_tier_lock);
	__init_node_memory_type(node, memtype);
	mutex_unlock(&memory_tier_lock);
}
EXPORT_SYMBOL_GPL(init_node_memory_type);
5777b88bda3SAneesh Kumar K.V 
clear_node_memory_type(int node,struct memory_dev_type * memtype)5787b88bda3SAneesh Kumar K.V void clear_node_memory_type(int node, struct memory_dev_type *memtype)
5797b88bda3SAneesh Kumar K.V {
5807b88bda3SAneesh Kumar K.V 	mutex_lock(&memory_tier_lock);
5817b88bda3SAneesh Kumar K.V 	if (node_memory_types[node].memtype == memtype)
5827b88bda3SAneesh Kumar K.V 		node_memory_types[node].map_count--;
5837b88bda3SAneesh Kumar K.V 	/*
5847b88bda3SAneesh Kumar K.V 	 * If we umapped all the attached devices to this node,
5857b88bda3SAneesh Kumar K.V 	 * clear the node memory type.
5867b88bda3SAneesh Kumar K.V 	 */
5877b88bda3SAneesh Kumar K.V 	if (!node_memory_types[node].map_count) {
5887b88bda3SAneesh Kumar K.V 		node_memory_types[node].memtype = NULL;
589bded67f8SMiaohe Lin 		put_memory_type(memtype);
5907b88bda3SAneesh Kumar K.V 	}
5917b88bda3SAneesh Kumar K.V 	mutex_unlock(&memory_tier_lock);
5927b88bda3SAneesh Kumar K.V }
5937b88bda3SAneesh Kumar K.V EXPORT_SYMBOL_GPL(clear_node_memory_type);
5947b88bda3SAneesh Kumar K.V 
memtier_hotplug_callback(struct notifier_block * self,unsigned long action,void * _arg)595c6123a19SAneesh Kumar K.V static int __meminit memtier_hotplug_callback(struct notifier_block *self,
596c6123a19SAneesh Kumar K.V 					      unsigned long action, void *_arg)
597c6123a19SAneesh Kumar K.V {
5986c542ab7SAneesh Kumar K.V 	struct memory_tier *memtier;
599c6123a19SAneesh Kumar K.V 	struct memory_notify *arg = _arg;
600c6123a19SAneesh Kumar K.V 
601c6123a19SAneesh Kumar K.V 	/*
602c6123a19SAneesh Kumar K.V 	 * Only update the node migration order when a node is
603c6123a19SAneesh Kumar K.V 	 * changing status, like online->offline.
604c6123a19SAneesh Kumar K.V 	 */
605c6123a19SAneesh Kumar K.V 	if (arg->status_change_nid < 0)
606c6123a19SAneesh Kumar K.V 		return notifier_from_errno(0);
607c6123a19SAneesh Kumar K.V 
608c6123a19SAneesh Kumar K.V 	switch (action) {
609c6123a19SAneesh Kumar K.V 	case MEM_OFFLINE:
610c6123a19SAneesh Kumar K.V 		mutex_lock(&memory_tier_lock);
6116c542ab7SAneesh Kumar K.V 		if (clear_node_memory_tier(arg->status_change_nid))
6126c542ab7SAneesh Kumar K.V 			establish_demotion_targets();
613c6123a19SAneesh Kumar K.V 		mutex_unlock(&memory_tier_lock);
614c6123a19SAneesh Kumar K.V 		break;
615c6123a19SAneesh Kumar K.V 	case MEM_ONLINE:
616c6123a19SAneesh Kumar K.V 		mutex_lock(&memory_tier_lock);
6176c542ab7SAneesh Kumar K.V 		memtier = set_node_memory_tier(arg->status_change_nid);
6186c542ab7SAneesh Kumar K.V 		if (!IS_ERR(memtier))
6196c542ab7SAneesh Kumar K.V 			establish_demotion_targets();
620c6123a19SAneesh Kumar K.V 		mutex_unlock(&memory_tier_lock);
621c6123a19SAneesh Kumar K.V 		break;
622c6123a19SAneesh Kumar K.V 	}
623c6123a19SAneesh Kumar K.V 
624c6123a19SAneesh Kumar K.V 	return notifier_from_errno(0);
625c6123a19SAneesh Kumar K.V }
626c6123a19SAneesh Kumar K.V 
/*
 * memory_tier_init() - boot-time setup of the memory tiering machinery.
 *
 * Registers the memory_tier device subsystem, allocates the per-node
 * demotion-target table (CONFIG_MIGRATION), creates the default DRAM
 * memory type, assigns every currently-present N_MEMORY node to a tier,
 * and installs the hotplug notifier that keeps tiers current afterwards.
 * Failures in the subsystem registration or default-type allocation are
 * fatal (panic) since the rest of the file assumes both exist.
 */
static int __init memory_tier_init(void)
{
	int ret, node;
	struct memory_tier *memtier;

	ret = subsys_virtual_register(&memory_tier_subsys, NULL);
	if (ret)
		panic("%s() failed to register memory tier subsystem\n", __func__);

#ifdef CONFIG_MIGRATION
	/* Best-effort: demotion is simply unavailable if this allocation fails. */
	node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes),
				GFP_KERNEL);
	WARN_ON(!node_demotion);
#endif
	mutex_lock(&memory_tier_lock);
	/*
	 * For now we can have 4 faster memory tiers with smaller adistance
	 * than default DRAM tier.
	 */
	default_dram_type = alloc_memory_type(MEMTIER_ADISTANCE_DRAM);
	if (IS_ERR(default_dram_type))
		panic("%s() failed to allocate default DRAM tier\n", __func__);

	/*
	 * Look at all the existing N_MEMORY nodes and add them to
	 * default memory tier or to a tier if we already have memory
	 * types assigned.
	 */
	for_each_node_state(node, N_MEMORY) {
		memtier = set_node_memory_tier(node);
		if (IS_ERR(memtier))
			/*
			 * Continue with memtiers we are able to setup
			 */
			break;
	}
	establish_demotion_targets();
	mutex_unlock(&memory_tier_lock);

	/* From here on, tier updates are driven by memory hotplug events. */
	hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI);
	return 0;
}
subsys_initcall(memory_tier_init);
67091952440SAneesh Kumar K.V 
/*
 * Global demotion on/off switch, toggled through
 * /sys/kernel/mm/numa/demotion_enabled (see sysfs code below).
 * Read by reclaim/migration code elsewhere — presumably gates page
 * demotion during reclaim; confirm against the users of this symbol.
 */
bool numa_demotion_enabled = false;
67291952440SAneesh Kumar K.V 
67391952440SAneesh Kumar K.V #ifdef CONFIG_MIGRATION
67491952440SAneesh Kumar K.V #ifdef CONFIG_SYSFS
demotion_enabled_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)675*8d3a7d79SMiaohe Lin static ssize_t demotion_enabled_show(struct kobject *kobj,
67691952440SAneesh Kumar K.V 				     struct kobj_attribute *attr, char *buf)
67791952440SAneesh Kumar K.V {
67891952440SAneesh Kumar K.V 	return sysfs_emit(buf, "%s\n",
67991952440SAneesh Kumar K.V 			  numa_demotion_enabled ? "true" : "false");
68091952440SAneesh Kumar K.V }
68191952440SAneesh Kumar K.V 
demotion_enabled_store(struct kobject * kobj,struct kobj_attribute * attr,const char * buf,size_t count)682*8d3a7d79SMiaohe Lin static ssize_t demotion_enabled_store(struct kobject *kobj,
68391952440SAneesh Kumar K.V 				      struct kobj_attribute *attr,
68491952440SAneesh Kumar K.V 				      const char *buf, size_t count)
68591952440SAneesh Kumar K.V {
68691952440SAneesh Kumar K.V 	ssize_t ret;
68791952440SAneesh Kumar K.V 
68891952440SAneesh Kumar K.V 	ret = kstrtobool(buf, &numa_demotion_enabled);
68991952440SAneesh Kumar K.V 	if (ret)
69091952440SAneesh Kumar K.V 		return ret;
69191952440SAneesh Kumar K.V 
69291952440SAneesh Kumar K.V 	return count;
69391952440SAneesh Kumar K.V }
69491952440SAneesh Kumar K.V 
/* Read/write sysfs attribute backed by the show/store pair above. */
static struct kobj_attribute numa_demotion_enabled_attr =
	__ATTR_RW(demotion_enabled);

/* NULL-terminated list of attributes published under the "numa" kobject. */
static struct attribute *numa_attrs[] = {
	&numa_demotion_enabled_attr.attr,
	NULL,
};

static const struct attribute_group numa_attr_group = {
	.attrs = numa_attrs,
};
70691952440SAneesh Kumar K.V 
numa_init_sysfs(void)70791952440SAneesh Kumar K.V static int __init numa_init_sysfs(void)
70891952440SAneesh Kumar K.V {
70991952440SAneesh Kumar K.V 	int err;
71091952440SAneesh Kumar K.V 	struct kobject *numa_kobj;
71191952440SAneesh Kumar K.V 
71291952440SAneesh Kumar K.V 	numa_kobj = kobject_create_and_add("numa", mm_kobj);
71391952440SAneesh Kumar K.V 	if (!numa_kobj) {
71491952440SAneesh Kumar K.V 		pr_err("failed to create numa kobject\n");
71591952440SAneesh Kumar K.V 		return -ENOMEM;
71691952440SAneesh Kumar K.V 	}
71791952440SAneesh Kumar K.V 	err = sysfs_create_group(numa_kobj, &numa_attr_group);
71891952440SAneesh Kumar K.V 	if (err) {
71991952440SAneesh Kumar K.V 		pr_err("failed to register numa group\n");
72091952440SAneesh Kumar K.V 		goto delete_obj;
72191952440SAneesh Kumar K.V 	}
72291952440SAneesh Kumar K.V 	return 0;
72391952440SAneesh Kumar K.V 
72491952440SAneesh Kumar K.V delete_obj:
72591952440SAneesh Kumar K.V 	kobject_put(numa_kobj);
72691952440SAneesh Kumar K.V 	return err;
72791952440SAneesh Kumar K.V }
72891952440SAneesh Kumar K.V subsys_initcall(numa_init_sysfs);
72991952440SAneesh Kumar K.V #endif /* CONFIG_SYSFS */
73091952440SAneesh Kumar K.V #endif
731