fair.c: b3bbcc5d1da1b654091dad15980b3d58fdae0fc6 (old) vs. 467b171af881282fc627328e6c164f044a6df888 (new)
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
4 *
5 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
6 *
7 * Interactivity improvements by Mike Galbraith
8 * (C) 2007 Mike Galbraith <efault@gmx.de>

--- 26 unchanged lines hidden ---

35#include <linux/sched/clock.h>
36#include <linux/sched/cond_resched.h>
37#include <linux/sched/cputime.h>
38#include <linux/sched/isolation.h>
39#include <linux/sched/nohz.h>
40
41#include <linux/cpuidle.h>
42#include <linux/interrupt.h>
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
4 *
5 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
6 *
7 * Interactivity improvements by Mike Galbraith
8 * (C) 2007 Mike Galbraith <efault@gmx.de>

--- 26 unchanged lines hidden ---

35#include <linux/sched/clock.h>
36#include <linux/sched/cond_resched.h>
37#include <linux/sched/cputime.h>
38#include <linux/sched/isolation.h>
39#include <linux/sched/nohz.h>
40
41#include <linux/cpuidle.h>
42#include <linux/interrupt.h>
43#include <linux/memory-tiers.h>
43#include <linux/mempolicy.h>
44#include <linux/mutex_api.h>
45#include <linux/profile.h>
46#include <linux/psi.h>
47#include <linux/ratelimit.h>
48#include <linux/task_work.h>
49
50#include <asm/switch_to.h>

--- 1038 unchanged lines hidden ---

1089unsigned int sysctl_numa_balancing_scan_period_max = 60000;
1090
1091/* Portion of address space to scan in MB */
1092unsigned int sysctl_numa_balancing_scan_size = 256;
1093
1094/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
1095unsigned int sysctl_numa_balancing_scan_delay = 1000;
1096
44#include <linux/mempolicy.h>
45#include <linux/mutex_api.h>
46#include <linux/profile.h>
47#include <linux/psi.h>
48#include <linux/ratelimit.h>
49#include <linux/task_work.h>
50
51#include <asm/switch_to.h>

--- 1038 unchanged lines hidden ---

1090unsigned int sysctl_numa_balancing_scan_period_max = 60000;
1091
1092/* Portion of address space to scan in MB */
1093unsigned int sysctl_numa_balancing_scan_size = 256;
1094
1095/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
1096unsigned int sysctl_numa_balancing_scan_delay = 1000;
1097
1098/* A page whose hint page fault latency (in ms) is below this threshold is considered hot */
1099unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC;
1100
1101/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
1102unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
1103
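The promote rate limit above is specified in MB/s; should_numa_migrate_memory() below converts it to pages per second with rate_limit << (20 - PAGE_SHIFT). A minimal userspace sketch of that conversion, assuming a 4 KiB page size (SKETCH_PAGE_SHIFT stands in for the kernel's PAGE_SHIFT):

#include <stdio.h>

#define SKETCH_PAGE_SHIFT 12	/* assume 4 KiB pages for illustration */

int main(void)
{
	unsigned long mbps = 65536;	/* sysctl default, MB/s */
	unsigned long pages_per_sec = mbps << (20 - SKETCH_PAGE_SHIFT);

	/* 65536 MB/s at 4 KiB per page is 16,777,216 pages/s */
	printf("%lu MB/s ~= %lu pages/s\n", mbps, pages_per_sec);
	return 0;
}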
1097struct numa_group {
1098 refcount_t refcount;
1099
1100 spinlock_t lock; /* nr_tasks, tasks */
1101 int nr_tasks;
1102 pid_t gid;
1103 int active_nodes;
1104

--- 326 unchanged lines hidden ---

1431 return 0;
1432
1433 faults = group_faults(p, nid);
1434 faults += score_nearby_nodes(p, nid, dist, false);
1435
1436 return 1000 * faults / total_faults;
1437}
1438
1104struct numa_group {
1105 refcount_t refcount;
1106
1107 spinlock_t lock; /* nr_tasks, tasks */
1108 int nr_tasks;
1109 pid_t gid;
1110 int active_nodes;
1111

--- 326 unchanged lines hidden ---

1438 return 0;
1439
1440 faults = group_faults(p, nid);
1441 faults += score_nearby_nodes(p, nid, dist, false);
1442
1443 return 1000 * faults / total_faults;
1444}
1445
1446/*
1447 * If memory tiering mode is enabled, the cpupid of a slow memory page
1448 * is used to record the scan time instead of a CPU and PID. When
1449 * tiering mode is disabled at run time, the scan time (in cpupid) will
1450 * be interpreted as a CPU and PID, so the CPU needs to be checked to
1451 * avoid out-of-bounds array access.
1452 */
1453static inline bool cpupid_valid(int cpupid)
1454{
1455 return cpupid_to_cpu(cpupid) < nr_cpu_ids;
1456}
1457
1458/*
1459 * For memory tiering mode, if the fast memory node has enough free
1460 * pages (more than the watermark defined here), all recently accessed
1461 * slow memory pages will be migrated to the fast memory node without
1462 * considering the hot threshold, to take full advantage of the fast
1463 * memory capacity.
1464 */
1465static bool pgdat_free_space_enough(struct pglist_data *pgdat)
1466{
1467 int z;
1468 unsigned long enough_wmark;
1469
1470 enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT,
1471 pgdat->node_present_pages >> 4);
1472 for (z = pgdat->nr_zones - 1; z >= 0; z--) {
1473 struct zone *zone = pgdat->node_zones + z;
1474
1475 if (!populated_zone(zone))
1476 continue;
1477
1478 if (zone_watermark_ok(zone, 0,
1479 wmark_pages(zone, WMARK_PROMO) + enough_wmark,
1480 ZONE_MOVABLE, 0))
1481 return true;
1482 }
1483 return false;
1484}
1485
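As a worked example of the watermark computed above: enough_wmark is the larger of 1 GiB and 1/16 of the node's present memory, both expressed in pages. A userspace sketch with hypothetical node sizes, assuming 4 KiB pages (enough_wmark_pages and SKETCH_PAGE_SHIFT are illustrative names, not kernel symbols):

#include <stdio.h>

#define SKETCH_PAGE_SHIFT 12	/* assume 4 KiB pages for illustration */

static unsigned long enough_wmark_pages(unsigned long node_present_pages)
{
	unsigned long one_gb = (1UL * 1024 * 1024 * 1024) >> SKETCH_PAGE_SHIFT;
	unsigned long sixteenth = node_present_pages >> 4;

	return one_gb > sixteenth ? one_gb : sixteenth;
}

int main(void)
{
	/* hypothetical nodes with 8 GiB and 64 GiB of present memory */
	unsigned long small_node = 8UL << (30 - SKETCH_PAGE_SHIFT);
	unsigned long large_node = 64UL << (30 - SKETCH_PAGE_SHIFT);

	/* 8 GiB node: 1/16 is 512 MiB, so the 1 GiB floor wins (262144 pages) */
	printf("8 GiB node:  %lu pages\n", enough_wmark_pages(small_node));
	/* 64 GiB node: 1/16 is 4 GiB, which exceeds the 1 GiB floor (1048576 pages) */
	printf("64 GiB node: %lu pages\n", enough_wmark_pages(large_node));
	return 0;
}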
1486/*
1487 * For memory tiering mode, when page tables are scanned, the scan
1488 * time is recorded in struct page in addition to making the page
1489 * PROT_NONE for a slow memory page. So when the page is accessed, the
1490 * hint page fault handler calculates the hint page fault latency
1491 * via,
1492 *
1493 * hint page fault latency = hint page fault time - scan time
1494 *
1495 * The smaller the hint page fault latency, the more likely the page
1496 * is to be hot.
1497 */
1498static int numa_hint_fault_latency(struct page *page)
1499{
1500 int last_time, time;
1501
1502 time = jiffies_to_msecs(jiffies);
1503 last_time = xchg_page_access_time(page, time);
1504
1505 return (time - last_time) & PAGE_ACCESS_TIME_MASK;
1506}
1507
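The subtraction above is masked with PAGE_ACCESS_TIME_MASK because only a truncated timestamp fits in the page's cpupid field, so the stored time wraps around; the masked difference still yields the correct latency as long as the real interval fits in the field. A userspace sketch of the wrap-safe arithmetic, assuming a hypothetical 14-bit timestamp (SKETCH_TIME_MASK and sketch_latency are illustrative names):

#include <stdio.h>

#define SKETCH_TIME_BITS 14
#define SKETCH_TIME_MASK ((1 << SKETCH_TIME_BITS) - 1)

/* Difference of two truncated millisecond timestamps, tolerant of wraparound. */
static int sketch_latency(int scan_time, int fault_time)
{
	return (fault_time - scan_time) & SKETCH_TIME_MASK;
}

int main(void)
{
	/* no wrap: scanned at 1000 ms, fault at 1250 ms -> 250 ms */
	printf("%d ms\n", sketch_latency(1000 & SKETCH_TIME_MASK,
					 1250 & SKETCH_TIME_MASK));

	/* wrap: scanned at 16380 ms (near the 14-bit limit), fault at 16500 ms,
	 * which truncates to 116 -> the masked difference is still 120 ms */
	printf("%d ms\n", sketch_latency(16380 & SKETCH_TIME_MASK,
					 16500 & SKETCH_TIME_MASK));
	return 0;
}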
1508/*
1509 * For memory tiering mode, too high a promotion/demotion throughput
1510 * may hurt application latency. So we provide a mechanism to rate
1511 * limit the number of pages that we try to promote.
1512 */
1513static bool numa_promotion_rate_limit(struct pglist_data *pgdat,
1514 unsigned long rate_limit, int nr)
1515{
1516 unsigned long nr_cand;
1517 unsigned int now, start;
1518
1519 now = jiffies_to_msecs(jiffies);
1520 mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr);
1521 nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
1522 start = pgdat->nbp_rl_start;
1523 if (now - start > MSEC_PER_SEC &&
1524 cmpxchg(&pgdat->nbp_rl_start, start, now) == start)
1525 pgdat->nbp_rl_nr_cand = nr_cand;
1526 if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit)
1527 return true;
1528 return false;
1529}
1530
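A simplified, single-threaded userspace sketch of the one-second window used above (struct sketch_node, sketch_rate_limited and the field names are illustrative; the kernel tracks the counters per pgdat and uses cmpxchg() so that only one CPU resets the window):

#include <stdbool.h>
#include <stdio.h>

struct sketch_node {
	unsigned long nr_candidate;	/* total promotion candidates seen */
	unsigned long window_start_ms;	/* start of the current 1 s window */
	unsigned long window_cand;	/* nr_candidate at the window start */
};

/* Returns true when the promotion should be skipped (rate limited). */
static bool sketch_rate_limited(struct sketch_node *n, unsigned long now_ms,
				unsigned long rate_limit, int nr_pages)
{
	n->nr_candidate += nr_pages;
	if (now_ms - n->window_start_ms > 1000) {
		/* start a new one-second window */
		n->window_start_ms = now_ms;
		n->window_cand = n->nr_candidate;
	}
	return n->nr_candidate - n->window_cand >= rate_limit;
}

int main(void)
{
	struct sketch_node node = { 0, 0, 0 };
	unsigned long limit = 1000;	/* hypothetical limit: pages per second */

	printf("%d\n", sketch_rate_limited(&node, 100, limit, 600));	/* 0 */
	printf("%d\n", sketch_rate_limited(&node, 200, limit, 600));	/* 1 */
	printf("%d\n", sketch_rate_limited(&node, 1300, limit, 600));	/* 0 */
	return 0;
}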
1531#define NUMA_MIGRATION_ADJUST_STEPS 16
1532
1533static void numa_promotion_adjust_threshold(struct pglist_data *pgdat,
1534 unsigned long rate_limit,
1535 unsigned int ref_th)
1536{
1537 unsigned int now, start, th_period, unit_th, th;
1538 unsigned long nr_cand, ref_cand, diff_cand;
1539
1540 now = jiffies_to_msecs(jiffies);
1541 th_period = sysctl_numa_balancing_scan_period_max;
1542 start = pgdat->nbp_th_start;
1543 if (now - start > th_period &&
1544 cmpxchg(&pgdat->nbp_th_start, start, now) == start) {
1545 ref_cand = rate_limit *
1546 sysctl_numa_balancing_scan_period_max / MSEC_PER_SEC;
1547 nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
1548 diff_cand = nr_cand - pgdat->nbp_th_nr_cand;
1549 unit_th = ref_th * 2 / NUMA_MIGRATION_ADJUST_STEPS;
1550 th = pgdat->nbp_threshold ? : ref_th;
1551 if (diff_cand > ref_cand * 11 / 10)
1552 th = max(th - unit_th, unit_th);
1553 else if (diff_cand < ref_cand * 9 / 10)
1554 th = min(th + unit_th, ref_th * 2);
1555 pgdat->nbp_th_nr_cand = nr_cand;
1556 pgdat->nbp_threshold = th;
1557 }
1558}
1559
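The loop above nudges the hot threshold in NUMA_MIGRATION_ADJUST_STEPS increments so that the number of promotion candidates per period tracks the rate limit: more than 110% of the target lowers the threshold by one step, less than 90% raises it, and the threshold is clamped between one step and twice the default. With the default hot threshold of 1000 ms the step is 2 * 1000 / 16 = 125 ms, so the threshold stays within [125, 2000] ms. A userspace sketch of one adjustment decision (sketch_adjust and the inputs are illustrative):

#include <stdio.h>

#define SKETCH_ADJUST_STEPS 16

/* One adjustment: shrink the threshold if the last period produced too many
 * promotion candidates, grow it if it produced too few. */
static unsigned int sketch_adjust(unsigned int th, unsigned int ref_th,
				  unsigned long diff_cand, unsigned long ref_cand)
{
	unsigned int unit_th = ref_th * 2 / SKETCH_ADJUST_STEPS;

	if (diff_cand > ref_cand * 11 / 10)
		th = th > 2 * unit_th ? th - unit_th : unit_th;
	else if (diff_cand < ref_cand * 9 / 10)
		th = th + unit_th < ref_th * 2 ? th + unit_th : ref_th * 2;
	return th;
}

int main(void)
{
	unsigned int ref_th = 1000;	 /* default hot threshold in ms */
	unsigned long ref_cand = 100000; /* target candidates per period */

	/* 20% over target: threshold drops from 1000 ms to 875 ms */
	printf("%u\n", sketch_adjust(1000, ref_th, 120000, ref_cand));
	/* 20% under target: threshold grows from 1000 ms to 1125 ms */
	printf("%u\n", sketch_adjust(1000, ref_th, 80000, ref_cand));
	/* within +/-10% of target: threshold is unchanged */
	printf("%u\n", sketch_adjust(1000, ref_th, 100000, ref_cand));
	return 0;
}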
1439bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1440 int src_nid, int dst_cpu)
1441{
1442 struct numa_group *ng = deref_curr_numa_group(p);
1443 int dst_nid = cpu_to_node(dst_cpu);
1444 int last_cpupid, this_cpupid;
1445
1560bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1561 int src_nid, int dst_cpu)
1562{
1563 struct numa_group *ng = deref_curr_numa_group(p);
1564 int dst_nid = cpu_to_node(dst_cpu);
1565 int last_cpupid, this_cpupid;
1566
1567 /*
1568	 * Pages in the slow memory node should be migrated according
1569	 * to hot/cold rather than private/shared.
1570 */
1571 if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
1572 !node_is_toptier(src_nid)) {
1573 struct pglist_data *pgdat;
1574 unsigned long rate_limit;
1575 unsigned int latency, th, def_th;
1576
1577 pgdat = NODE_DATA(dst_nid);
1578 if (pgdat_free_space_enough(pgdat)) {
1579 /* workload changed, reset hot threshold */
1580 pgdat->nbp_threshold = 0;
1581 return true;
1582 }
1583
1584 def_th = sysctl_numa_balancing_hot_threshold;
1585 rate_limit = sysctl_numa_balancing_promote_rate_limit << \
1586 (20 - PAGE_SHIFT);
1587 numa_promotion_adjust_threshold(pgdat, rate_limit, def_th);
1588
1589 th = pgdat->nbp_threshold ? : def_th;
1590 latency = numa_hint_fault_latency(page);
1591 if (latency >= th)
1592 return false;
1593
1594 return !numa_promotion_rate_limit(pgdat, rate_limit,
1595 thp_nr_pages(page));
1596 }
1597
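Condensing the branch above: a slow-memory page is promoted unconditionally when the fast node has plenty of free space; otherwise it must be hot (hint fault latency below the adaptive threshold) and the promotion rate limit must not be exceeded. A compact sketch of that decision flow (sketch_should_promote and its parameters are illustrative, not kernel code):

#include <stdbool.h>
#include <stdio.h>

static bool sketch_should_promote(bool free_space_enough, unsigned int latency_ms,
				  unsigned int threshold_ms, bool rate_limited)
{
	if (free_space_enough)
		return true;		/* plenty of fast memory: always promote */
	if (latency_ms >= threshold_ms)
		return false;		/* page is not hot enough */
	return !rate_limited;		/* hot, but still respect the rate limit */
}

int main(void)
{
	printf("%d\n", sketch_should_promote(true, 5000, 1000, true));	 /* 1 */
	printf("%d\n", sketch_should_promote(false, 200, 1000, false)); /* 1 */
	printf("%d\n", sketch_should_promote(false, 200, 1000, true));	 /* 0 */
	printf("%d\n", sketch_should_promote(false, 3000, 1000, false));/* 0 */
	return 0;
}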
1446 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1447 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1448
1598 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1599 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1600
1601 if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
1602 !node_is_toptier(src_nid) && !cpupid_valid(last_cpupid))
1603 return false;
1604
1449 /*
1450 * Allow first faults or private faults to migrate immediately early in
1451 * the lifetime of a task. The magic number 4 is based on waiting for
1452 * two full passes of the "multi-stage node selection" test that is
1453 * executed below.
1454 */
1455 if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
1456 (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))

--- 1223 unchanged lines hidden ---

2680
2681 if (!static_branch_likely(&sched_numa_balancing))
2682 return;
2683
2684 /* for example, ksmd faulting in a user's mm */
2685 if (!p->mm)
2686 return;
2687
1605 /*
1606 * Allow first faults or private faults to migrate immediately early in
1607 * the lifetime of a task. The magic number 4 is based on waiting for
1608 * two full passes of the "multi-stage node selection" test that is
1609 * executed below.
1610 */
1611 if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
1612 (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))

--- 1223 unchanged lines hidden ---

2836
2837 if (!static_branch_likely(&sched_numa_balancing))
2838 return;
2839
2840 /* for example, ksmd faulting in a user's mm */
2841 if (!p->mm)
2842 return;
2843
2844 /*
2845	 * NUMA fault statistics are unnecessary for the slow memory
2846	 * node in memory tiering mode.
2847 */
2848 if (!node_is_toptier(mem_node) &&
2849 (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING ||
2850 !cpupid_valid(last_cpupid)))
2851 return;
2852
2688 /* Allocate buffer to track faults on a per-node basis */
2689 if (unlikely(!p->numa_faults)) {
2690 int size = sizeof(*p->numa_faults) *
2691 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
2692
2693 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2694 if (!p->numa_faults)
2695 return;

--- 9392 unchanged lines hidden ---
2853 /* Allocate buffer to track faults on a per-node basis */
2854 if (unlikely(!p->numa_faults)) {
2855 int size = sizeof(*p->numa_faults) *
2856 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
2857
2858 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2859 if (!p->numa_faults)
2860 return;

--- 9392 unchanged lines hidden ---