fair.c: b3bbcc5d1da1b654091dad15980b3d58fdae0fc6 -> 467b171af881282fc627328e6c164f044a6df888
(lines added in 467b171af881282fc627328e6c164f044a6df888 are marked with a leading '+'; unchanged context is unmarked)
// SPDX-License-Identifier: GPL-2.0
/*
 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
 *
 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *
 * Interactivity improvements by Mike Galbraith
 * (C) 2007 Mike Galbraith <efault@gmx.de>
--- 26 unchanged lines hidden (view full) ---
#include <linux/sched/clock.h>
#include <linux/sched/cond_resched.h>
#include <linux/sched/cputime.h>
#include <linux/sched/isolation.h>
#include <linux/sched/nohz.h>

#include <linux/cpuidle.h>
#include <linux/interrupt.h>
+#include <linux/memory-tiers.h>
#include <linux/mempolicy.h>
#include <linux/mutex_api.h>
#include <linux/profile.h>
#include <linux/psi.h>
#include <linux/ratelimit.h>
#include <linux/task_work.h>

#include <asm/switch_to.h>
--- 1038 unchanged lines hidden (view full) ---
unsigned int sysctl_numa_balancing_scan_period_max = 60000;

/* Portion of address space to scan in MB */
unsigned int sysctl_numa_balancing_scan_size = 256;

/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;

+/* The page with hint page fault latency < threshold in ms is considered hot */
+unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC;
+
+/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
+unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
+
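sysctl_numa_balancing_promote_rate_limit is configured in MB/s but consumed in pages per second; the tiering branch added to should_numa_migrate_memory() further down converts it with rate_limit << (20 - PAGE_SHIFT). A minimal standalone sketch of that conversion, assuming 4 KiB pages (PAGE_SHIFT == 12) and the 65536 MB/s default above:

```c
#include <stdio.h>

#define PAGE_SHIFT 12   /* assumption: 4 KiB pages, only for this sketch */

int main(void)
{
        unsigned long mbps = 65536;     /* sysctl_numa_balancing_promote_rate_limit default */

        /* 1 MB = 1 << 20 bytes, so MB/s -> pages/s is a shift by (20 - PAGE_SHIFT). */
        unsigned long pages_per_sec = mbps << (20 - PAGE_SHIFT);

        printf("%lu pages/s\n", pages_per_sec);  /* 16777216 with 4 KiB pages */
        return 0;
}
```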
struct numa_group {
        refcount_t refcount;

        spinlock_t lock; /* nr_tasks, tasks */
        int nr_tasks;
        pid_t gid;
        int active_nodes;

--- 326 unchanged lines hidden (view full) ---
                return 0;

        faults = group_faults(p, nid);
        faults += score_nearby_nodes(p, nid, dist, false);

        return 1000 * faults / total_faults;
}

+/*
+ * If memory tiering mode is enabled, cpupid of slow memory page is
+ * used to record scan time instead of CPU and PID. When tiering mode
+ * is disabled at run time, the scan time (in cpupid) will be
+ * interpreted as CPU and PID. So CPU needs to be checked to avoid to
+ * access out of array bound.
+ */
+static inline bool cpupid_valid(int cpupid)
+{
+        return cpupid_to_cpu(cpupid) < nr_cpu_ids;
+}
+
+/*
+ * For memory tiering mode, if there are enough free pages (more than
+ * enough watermark defined here) in fast memory node, to take full
+ * advantage of fast memory capacity, all recently accessed slow
+ * memory pages will be migrated to fast memory node without
+ * considering hot threshold.
+ */
+static bool pgdat_free_space_enough(struct pglist_data *pgdat)
+{
+        int z;
+        unsigned long enough_wmark;
+
+        enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT,
+                           pgdat->node_present_pages >> 4);
+        for (z = pgdat->nr_zones - 1; z >= 0; z--) {
+                struct zone *zone = pgdat->node_zones + z;
+
+                if (!populated_zone(zone))
+                        continue;
+
+                if (zone_watermark_ok(zone, 0,
+                                      wmark_pages(zone, WMARK_PROMO) + enough_wmark,
+                                      ZONE_MOVABLE, 0))
+                        return true;
+        }
+        return false;
+}
+
+/*
+ * For memory tiering mode, when page tables are scanned, the scan
+ * time will be recorded in struct page in addition to make page
+ * PROT_NONE for slow memory page. So when the page is accessed, in
+ * hint page fault handler, the hint page fault latency is calculated
+ * via,
+ *
+ *        hint page fault latency = hint page fault time - scan time
+ *
+ * The smaller the hint page fault latency, the higher the possibility
+ * for the page to be hot.
+ */
+static int numa_hint_fault_latency(struct page *page)
+{
+        int last_time, time;
+
+        time = jiffies_to_msecs(jiffies);
+        last_time = xchg_page_access_time(page, time);
+
+        return (time - last_time) & PAGE_ACCESS_TIME_MASK;
+}
+
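numa_hint_fault_latency() subtracts a stored, truncated scan timestamp from the current millisecond counter and masks the result, so the latency comes out right even when the truncated counter wraps between scan and fault, as long as the true latency fits under the mask. A small standalone sketch of that masked arithmetic, using a made-up 22-bit mask purely for illustration (the real PAGE_ACCESS_TIME_MASK width is defined by the page-flags layout elsewhere in the kernel):

```c
#include <stdio.h>

/* Assumed mask width, for illustration only. */
#define ACCESS_TIME_MASK ((1u << 22) - 1)

/* Mirror of the latency computation: a masked difference of ms timestamps. */
static unsigned int fault_latency(unsigned int scan_time, unsigned int fault_time)
{
        return (fault_time - scan_time) & ACCESS_TIME_MASK;
}

int main(void)
{
        /* Plain case: page scanned at t = 1000 ms, fault at t = 1600 ms. */
        printf("%u\n", fault_latency(1000, 1600));                     /* 600 */

        /* Wraparound case: the truncated counter wrapped between scan
         * and fault; the masked subtraction still yields 600 ms. */
        printf("%u\n", fault_latency(ACCESS_TIME_MASK - 99, 500));     /* 600 */
        return 0;
}
```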
+/*
+ * For memory tiering mode, too high promotion/demotion throughput may
+ * hurt application latency. So we provide a mechanism to rate limit
+ * the number of pages that are tried to be promoted.
+ */
+static bool numa_promotion_rate_limit(struct pglist_data *pgdat,
+                                      unsigned long rate_limit, int nr)
+{
+        unsigned long nr_cand;
+        unsigned int now, start;
+
+        now = jiffies_to_msecs(jiffies);
+        mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr);
+        nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+        start = pgdat->nbp_rl_start;
+        if (now - start > MSEC_PER_SEC &&
+            cmpxchg(&pgdat->nbp_rl_start, start, now) == start)
+                pgdat->nbp_rl_nr_cand = nr_cand;
+        if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit)
+                return true;
+        return false;
+}
+
+#define NUMA_MIGRATION_ADJUST_STEPS        16
+
+static void numa_promotion_adjust_threshold(struct pglist_data *pgdat,
+                                            unsigned long rate_limit,
+                                            unsigned int ref_th)
+{
+        unsigned int now, start, th_period, unit_th, th;
+        unsigned long nr_cand, ref_cand, diff_cand;
+
+        now = jiffies_to_msecs(jiffies);
+        th_period = sysctl_numa_balancing_scan_period_max;
+        start = pgdat->nbp_th_start;
+        if (now - start > th_period &&
+            cmpxchg(&pgdat->nbp_th_start, start, now) == start) {
+                ref_cand = rate_limit *
+                        sysctl_numa_balancing_scan_period_max / MSEC_PER_SEC;
+                nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+                diff_cand = nr_cand - pgdat->nbp_th_nr_cand;
+                unit_th = ref_th * 2 / NUMA_MIGRATION_ADJUST_STEPS;
+                th = pgdat->nbp_threshold ? : ref_th;
+                if (diff_cand > ref_cand * 11 / 10)
+                        th = max(th - unit_th, unit_th);
+                else if (diff_cand < ref_cand * 9 / 10)
+                        th = min(th + unit_th, ref_th * 2);
+                pgdat->nbp_th_nr_cand = nr_cand;
+                pgdat->nbp_threshold = th;
+        }
+}
+
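numa_promotion_adjust_threshold() nudges the per-node hot threshold once per scan period so that the number of promotion candidates tracks the configured rate limit: with the defaults above (ref_th = 1000 ms, NUMA_MIGRATION_ADJUST_STEPS = 16) each step is 125 ms, clamped between 125 ms and 2000 ms. A minimal userspace sketch of just that step arithmetic, with the candidate counts invented for illustration:

```c
#include <stdio.h>

#define NUMA_MIGRATION_ADJUST_STEPS        16

/* Same step logic as numa_promotion_adjust_threshold(), without the
 * per-node state and timing; th/ref_th in ms, counts in pages. */
static unsigned int adjust_th(unsigned int th, unsigned int ref_th,
                              unsigned long diff_cand, unsigned long ref_cand)
{
        unsigned int unit_th = ref_th * 2 / NUMA_MIGRATION_ADJUST_STEPS;

        if (diff_cand > ref_cand * 11 / 10)             /* >110% of budget: tighten */
                th = th > 2 * unit_th ? th - unit_th : unit_th;
        else if (diff_cand < ref_cand * 9 / 10)         /* <90% of budget: relax */
                th = th + unit_th < ref_th * 2 ? th + unit_th : ref_th * 2;
        return th;
}

int main(void)
{
        unsigned int th = 1000;         /* start at ref_th */

        /* Hypothetical numbers: budget of 1M candidate pages per period. */
        th = adjust_th(th, 1000, 1200000, 1000000);     /* 20% over   -> 875 ms */
        printf("%u\n", th);
        th = adjust_th(th, 1000, 1150000, 1000000);     /* still over -> 750 ms */
        printf("%u\n", th);
        th = adjust_th(th, 1000,  800000, 1000000);     /* under 90%  -> 875 ms */
        printf("%u\n", th);
        return 0;
}
```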
bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
                                int src_nid, int dst_cpu)
{
        struct numa_group *ng = deref_curr_numa_group(p);
        int dst_nid = cpu_to_node(dst_cpu);
        int last_cpupid, this_cpupid;

+        /*
+         * The pages in slow memory node should be migrated according
+         * to hot/cold instead of private/shared.
+         */
+        if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
+            !node_is_toptier(src_nid)) {
+                struct pglist_data *pgdat;
+                unsigned long rate_limit;
+                unsigned int latency, th, def_th;
+
+                pgdat = NODE_DATA(dst_nid);
+                if (pgdat_free_space_enough(pgdat)) {
+                        /* workload changed, reset hot threshold */
+                        pgdat->nbp_threshold = 0;
+                        return true;
+                }
+
+                def_th = sysctl_numa_balancing_hot_threshold;
+                rate_limit = sysctl_numa_balancing_promote_rate_limit << \
+                        (20 - PAGE_SHIFT);
+                numa_promotion_adjust_threshold(pgdat, rate_limit, def_th);
+
+                th = pgdat->nbp_threshold ? : def_th;
+                latency = numa_hint_fault_latency(page);
+                if (latency >= th)
+                        return false;
+
+                return !numa_promotion_rate_limit(pgdat, rate_limit,
+                                                  thp_nr_pages(page));
+        }
+
        this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
        last_cpupid = page_cpupid_xchg_last(page, this_cpupid);

+        if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
+            !node_is_toptier(src_nid) && !cpupid_valid(last_cpupid))
+                return false;
+
        /*
         * Allow first faults or private faults to migrate immediately early in
         * the lifetime of a task. The magic number 4 is based on waiting for
         * two full passes of the "multi-stage node selection" test that is
         * executed below.
         */
        if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
            (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
--- 1223 unchanged lines hidden (view full) ---

        if (!static_branch_likely(&sched_numa_balancing))
                return;

        /* for example, ksmd faulting in a user's mm */
        if (!p->mm)
                return;

+        /*
+         * NUMA faults statistics are unnecessary for the slow memory
+         * node for memory tiering mode.
+         */
+        if (!node_is_toptier(mem_node) &&
+            (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING ||
+             !cpupid_valid(last_cpupid)))
+                return;
+
        /* Allocate buffer to track faults on a per-node basis */
        if (unlikely(!p->numa_faults)) {
                int size = sizeof(*p->numa_faults) *
                        NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;

                p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
                if (!p->numa_faults)
                        return;
--- 9392 unchanged lines hidden ---
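Taken together, the hunks above give should_numa_migrate_memory() a separate decision path for pages on slow (non-toptier) nodes when NUMA_BALANCING_MEMORY_TIERING is enabled: promote freely while fast memory has plenty of headroom, otherwise promote only pages whose hint-fault latency beats the adaptively tuned hot threshold, and only within the promotion rate limit. A condensed sketch of that control flow, with the kernel helpers replaced by hypothetical stubs so the shape of the decision is visible in one place:

```c
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the kernel helpers added in the diff;
 * the bodies here are placeholders, not the kernel implementations. */
static bool fast_node_has_enough_free_space(void) { return false; }
static void adjust_hot_threshold(void)            { }
static unsigned int hint_fault_latency_ms(void)   { return 300; }
static bool promotion_rate_limited(void)          { return false; }

/* Decision path for a page on a slow-tier node, mirroring the order of
 * checks wired into should_numa_migrate_memory() above. */
static bool promote_slow_tier_page(unsigned int hot_threshold_ms)
{
        /* 1. Plenty of free fast memory: promote unconditionally. */
        if (fast_node_has_enough_free_space())
                return true;

        /* 2. Re-tune the hot threshold so candidate volume tracks the
         *    configured promotion rate limit. */
        adjust_hot_threshold();

        /* 3. Cold page (latency at or above the threshold): do not promote. */
        if (hint_fault_latency_ms() >= hot_threshold_ms)
                return false;

        /* 4. Hot page: promote only while under the rate limit. */
        return !promotion_rate_limited();
}

int main(void)
{
        printf("promote: %d\n", promote_slow_tier_page(1000));  /* 1 */
        return 0;
}
```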