fair.c (1e7f32f776089af32b6ec9b801fe976778c8448b) | fair.c (0fb3978b0aac3a5c08637aed03cc2d65f793508f) |
---|---|
1// SPDX-License-Identifier: GPL-2.0 2/* 3 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH) 4 * 5 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 6 * 7 * Interactivity improvements by Mike Galbraith 8 * (C) 2007 Mike Galbraith <efault@gmx.de> --- 1245 unchanged lines hidden (view full) --- 1254 1255static bool numa_is_active_node(int nid, struct numa_group *ng) 1256{ 1257 return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu; 1258} 1259 1260/* Handle placement on systems where not all nodes are directly connected. */ 1261static unsigned long score_nearby_nodes(struct task_struct *p, int nid, | 1// SPDX-License-Identifier: GPL-2.0 2/* 3 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH) 4 * 5 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 6 * 7 * Interactivity improvements by Mike Galbraith 8 * (C) 2007 Mike Galbraith <efault@gmx.de> --- 1245 unchanged lines hidden (view full) --- 1254 1255static bool numa_is_active_node(int nid, struct numa_group *ng) 1256{ 1257 return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu; 1258} 1259 1260/* Handle placement on systems where not all nodes are directly connected. */ 1261static unsigned long score_nearby_nodes(struct task_struct *p, int nid, |
1262 int maxdist, bool task) | 1262 int lim_dist, bool task) |
1263{ 1264 unsigned long score = 0; | 1263{ 1264 unsigned long score = 0; |
1265 int node; | 1265 int node, max_dist; |
1266 1267 /* 1268 * All nodes are directly connected, and the same distance 1269 * from each other. No need for fancy placement algorithms. 1270 */ 1271 if (sched_numa_topology_type == NUMA_DIRECT) 1272 return 0; 1273 | 1266 1267 /* 1268 * All nodes are directly connected, and the same distance 1269 * from each other. No need for fancy placement algorithms. 1270 */ 1271 if (sched_numa_topology_type == NUMA_DIRECT) 1272 return 0; 1273 |
1274 /* sched_max_numa_distance may be changed in parallel. */ 1275 max_dist = READ_ONCE(sched_max_numa_distance); |
|
1274 /* 1275 * This code is called for each node, introducing N^2 complexity, 1276 * which should be ok given the number of nodes rarely exceeds 8. 1277 */ 1278 for_each_online_node(node) { 1279 unsigned long faults; 1280 int dist = node_distance(nid, node); 1281 1282 /* 1283 * The furthest away nodes in the system are not interesting 1284 * for placement; nid was already counted. 1285 */ | 1276 /* 1277 * This code is called for each node, introducing N^2 complexity, 1278 * which should be ok given the number of nodes rarely exceeds 8. 1279 */ 1280 for_each_online_node(node) { 1281 unsigned long faults; 1282 int dist = node_distance(nid, node); 1283 1284 /* 1285 * The furthest away nodes in the system are not interesting 1286 * for placement; nid was already counted. 1287 */ |
1286 if (dist == sched_max_numa_distance || node == nid) | 1288 if (dist >= max_dist || node == nid) |
1287 continue; 1288 1289 /* 1290 * On systems with a backplane NUMA topology, compare groups 1291 * of nodes, and move tasks towards the group with the most 1292 * memory accesses. When comparing two nodes at distance 1293 * "hoplimit", only nodes closer by than "hoplimit" are part 1294 * of each group. Skip other nodes. 1295 */ | 1289 continue; 1290 1291 /* 1292 * On systems with a backplane NUMA topology, compare groups 1293 * of nodes, and move tasks towards the group with the most 1294 * memory accesses. When comparing two nodes at distance 1295 * "hoplimit", only nodes closer by than "hoplimit" are part 1296 * of each group. Skip other nodes. 1297 */ |
1296 if (sched_numa_topology_type == NUMA_BACKPLANE && 1297 dist >= maxdist) | 1298 if (sched_numa_topology_type == NUMA_BACKPLANE && dist >= lim_dist) |
1298 continue; 1299 1300 /* Add up the faults from nearby nodes. */ 1301 if (task) 1302 faults = task_faults(p, node); 1303 else 1304 faults = group_faults(p, node); 1305 1306 /* 1307 * On systems with a glueless mesh NUMA topology, there are 1308 * no fixed "groups of nodes". Instead, nodes that are not 1309 * directly connected bounce traffic through intermediate 1310 * nodes; a numa_group can occupy any set of nodes. 1311 * The further away a node is, the less the faults count. 1312 * This seems to result in good task placement. 1313 */ 1314 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { | 1299 continue; 1300 1301 /* Add up the faults from nearby nodes. */ 1302 if (task) 1303 faults = task_faults(p, node); 1304 else 1305 faults = group_faults(p, node); 1306 1307 /* 1308 * On systems with a glueless mesh NUMA topology, there are 1309 * no fixed "groups of nodes". Instead, nodes that are not 1310 * directly connected bounce traffic through intermediate 1311 * nodes; a numa_group can occupy any set of nodes. 1312 * The further away a node is, the less the faults count. 1313 * This seems to result in good task placement. 1314 */ 1315 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { |
1315 faults *= (sched_max_numa_distance - dist); 1316 faults /= (sched_max_numa_distance - LOCAL_DISTANCE); | 1316 faults *= (max_dist - dist); 1317 faults /= (max_dist - LOCAL_DISTANCE); |
1317 } 1318 1319 score += faults; 1320 } 1321 1322 return score; 1323} 1324 --- 159 unchanged lines hidden (view full) --- 1484 return true; 1485} 1486 1487struct task_numa_env { 1488 struct task_struct *p; 1489 1490 int src_cpu, src_nid; 1491 int dst_cpu, dst_nid; | 1318 } 1319 1320 score += faults; 1321 } 1322 1323 return score; 1324} 1325 --- 159 unchanged lines hidden (view full) --- 1485 return true; 1486} 1487 1488struct task_numa_env { 1489 struct task_struct *p; 1490 1491 int src_cpu, src_nid; 1492 int dst_cpu, dst_nid; |
1493 int imb_numa_nr; |
|
1492 1493 struct numa_stats src_stats, dst_stats; 1494 1495 int imbalance_pct; 1496 int dist; 1497 1498 struct task_struct *best_task; 1499 long best_imp; 1500 int best_cpu; 1501}; 1502 1503static unsigned long cpu_load(struct rq *rq); 1504static unsigned long cpu_runnable(struct rq *rq); 1505static inline long adjust_numa_imbalance(int imbalance, | 1494 1495 struct numa_stats src_stats, dst_stats; 1496 1497 int imbalance_pct; 1498 int dist; 1499 1500 struct task_struct *best_task; 1501 long best_imp; 1502 int best_cpu; 1503}; 1504 1505static unsigned long cpu_load(struct rq *rq); 1506static unsigned long cpu_runnable(struct rq *rq); 1507static inline long adjust_numa_imbalance(int imbalance, |
1506 int dst_running, int dst_weight); | 1508 int dst_running, int imb_numa_nr); |
1507 1508static inline enum 1509numa_type numa_classify(unsigned int imbalance_pct, 1510 struct numa_stats *ns) 1511{ 1512 if ((ns->nr_running > ns->weight) && 1513 (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) || 1514 ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100)))) --- 364 unchanged lines hidden (view full) --- 1879 * more running tasks that the imbalance is ignored as the 1880 * move improves the imbalance from the perspective of the 1881 * CPU load balancer. 1882 * */ 1883 src_running = env->src_stats.nr_running - 1; 1884 dst_running = env->dst_stats.nr_running + 1; 1885 imbalance = max(0, dst_running - src_running); 1886 imbalance = adjust_numa_imbalance(imbalance, dst_running, | 1509 1510static inline enum 1511numa_type numa_classify(unsigned int imbalance_pct, 1512 struct numa_stats *ns) 1513{ 1514 if ((ns->nr_running > ns->weight) && 1515 (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) || 1516 ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100)))) --- 364 unchanged lines hidden (view full) --- 1881 * more running tasks that the imbalance is ignored as the 1882 * move improves the imbalance from the perspective of the 1883 * CPU load balancer. 1884 * */ 1885 src_running = env->src_stats.nr_running - 1; 1886 dst_running = env->dst_stats.nr_running + 1; 1887 imbalance = max(0, dst_running - src_running); 1888 imbalance = adjust_numa_imbalance(imbalance, dst_running, |
1887 env->dst_stats.weight); | 1889 env->imb_numa_nr); |
1888 1889 /* Use idle CPU if there is no imbalance */ 1890 if (!imbalance) { 1891 maymove = true; 1892 if (env->dst_stats.idle_cpu >= 0) { 1893 env->dst_cpu = env->dst_stats.idle_cpu; 1894 task_numa_assign(env, NULL, 0); 1895 return; --- 48 unchanged lines hidden (view full) --- 1944 * imbalance and would be the first to start moving tasks about. 1945 * 1946 * And we want to avoid any moving of tasks about, as that would create 1947 * random movement of tasks -- counter the numa conditions we're trying 1948 * to satisfy here. 1949 */ 1950 rcu_read_lock(); 1951 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu)); | 1890 1891 /* Use idle CPU if there is no imbalance */ 1892 if (!imbalance) { 1893 maymove = true; 1894 if (env->dst_stats.idle_cpu >= 0) { 1895 env->dst_cpu = env->dst_stats.idle_cpu; 1896 task_numa_assign(env, NULL, 0); 1897 return; --- 48 unchanged lines hidden (view full) --- 1946 * imbalance and would be the first to start moving tasks about. 1947 * 1948 * And we want to avoid any moving of tasks about, as that would create 1949 * random movement of tasks -- counter the numa conditions we're trying 1950 * to satisfy here. 1951 */ 1952 rcu_read_lock(); 1953 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu)); |
1952 if (sd) | 1954 if (sd) { |
1953 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; | 1955 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; |
1956 env.imb_numa_nr = sd->imb_numa_nr; 1957 } |
|
1954 rcu_read_unlock(); 1955 1956 /* 1957 * Cpusets can break the scheduler domain tree into smaller 1958 * balance domains, some of which do not cross NUMA boundaries. 1959 * Tasks that are "trapped" in such domains cannot be migrated 1960 * elsewhere, so there is no point in (re)trying. 1961 */ --- 858 unchanged lines hidden (view full) --- 2820 } 2821 } 2822 p->node_stamp = 0; 2823 p->numa_scan_seq = mm ? mm->numa_scan_seq : 0; 2824 p->numa_scan_period = sysctl_numa_balancing_scan_delay; 2825 /* Protect against double add, see task_tick_numa and task_numa_work */ 2826 p->numa_work.next = &p->numa_work; 2827 p->numa_faults = NULL; | 1958 rcu_read_unlock(); 1959 1960 /* 1961 * Cpusets can break the scheduler domain tree into smaller 1962 * balance domains, some of which do not cross NUMA boundaries. 1963 * Tasks that are "trapped" in such domains cannot be migrated 1964 * elsewhere, so there is no point in (re)trying. 1965 */ --- 858 unchanged lines hidden (view full) --- 2824 } 2825 } 2826 p->node_stamp = 0; 2827 p->numa_scan_seq = mm ? mm->numa_scan_seq : 0; 2828 p->numa_scan_period = sysctl_numa_balancing_scan_delay; 2829 /* Protect against double add, see task_tick_numa and task_numa_work */ 2830 p->numa_work.next = &p->numa_work; 2831 p->numa_faults = NULL; |
2832 p->numa_pages_migrated = 0; 2833 p->total_numa_faults = 0; |
|
2828 RCU_INIT_POINTER(p->numa_group, NULL); 2829 p->last_task_numa_placement = 0; 2830 p->last_sum_exec_runtime = 0; 2831 2832 init_task_work(&p->numa_work, task_numa_work); 2833 2834 /* New address space, reset the preferred nid */ 2835 if (!(clone_flags & CLONE_VM)) { --- 187 unchanged lines hidden (view full) --- 3023{ 3024 cfs_rq->avg.load_avg += se->avg.load_avg; 3025 cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum; 3026} 3027 3028static inline void 3029dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 3030{ | 2834 RCU_INIT_POINTER(p->numa_group, NULL); 2835 p->last_task_numa_placement = 0; 2836 p->last_sum_exec_runtime = 0; 2837 2838 init_task_work(&p->numa_work, task_numa_work); 2839 2840 /* New address space, reset the preferred nid */ 2841 if (!(clone_flags & CLONE_VM)) { --- 187 unchanged lines hidden (view full) --- 3029{ 3030 cfs_rq->avg.load_avg += se->avg.load_avg; 3031 cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum; 3032} 3033 3034static inline void 3035dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 3036{ |
3037 u32 divider = get_pelt_divider(&se->avg); |
|
3031 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); | 3038 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); |
3032 sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum); 3033 /* See update_cfs_rq_load_avg() */ 3034 cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, 3035 cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); | 3039 cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider; |
3036} 3037#else 3038static inline void 3039enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } 3040static inline void 3041dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } 3042#endif 3043 --- 334 unchanged lines hidden (view full) --- 3378#else 3379 p_last_update_time = prev->avg.last_update_time; 3380 n_last_update_time = next->avg.last_update_time; 3381#endif 3382 __update_load_avg_blocked_se(p_last_update_time, se); 3383 se->avg.last_update_time = n_last_update_time; 3384} 3385 | 3040} 3041#else 3042static inline void 3043enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } 3044static inline void 3045dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } 3046#endif 3047 --- 334 unchanged lines hidden (view full) --- 3382#else 3383 p_last_update_time = prev->avg.last_update_time; 3384 n_last_update_time = next->avg.last_update_time; 3385#endif 3386 __update_load_avg_blocked_se(p_last_update_time, se); 3387 se->avg.last_update_time = n_last_update_time; 3388} 3389 |
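The dequeue_load_avg() hunk just above shows the two columns keeping *_avg and *_sum consistent in different ways: one side resyncs the sum as avg * divider after the subtraction, the other subtracts the weighted sum as well and only enforces the lower bound avg * PELT_MIN_DIVIDER. The sketch below is a standalone model of that arithmetic, not kernel code; the sample values and the divider are invented, and PELT_MIN_DIVIDER is assumed to be LOAD_AVG_MAX - 1024 as defined in pelt.h.

```c
#include <stdint.h>
#include <stdio.h>

/*
 * Standalone model of the two dequeue strategies shown above.
 * Assumed constants: LOAD_AVG_MAX = 47742, PELT_MIN_DIVIDER = LOAD_AVG_MAX - 1024.
 */
#define LOAD_AVG_MAX		47742
#define PELT_MIN_DIVIDER	(LOAD_AVG_MAX - 1024)

struct sa { unsigned long load_avg; uint64_t load_sum; };

/* variant A: resync the sum from the new avg using the current divider */
static void dequeue_resync(struct sa *cfs, unsigned long se_avg, uint32_t divider)
{
	cfs->load_avg -= (se_avg < cfs->load_avg) ? se_avg : cfs->load_avg;
	cfs->load_sum = (uint64_t)cfs->load_avg * divider;
}

/* variant B: subtract the weighted sum too, then only clamp the sum so it
 * never drops below avg * PELT_MIN_DIVIDER */
static void dequeue_clamp(struct sa *cfs, unsigned long se_avg, uint64_t se_wsum)
{
	uint64_t floor;

	cfs->load_avg -= (se_avg < cfs->load_avg) ? se_avg : cfs->load_avg;
	cfs->load_sum -= (se_wsum < cfs->load_sum) ? se_wsum : cfs->load_sum;
	floor = (uint64_t)cfs->load_avg * PELT_MIN_DIVIDER;
	if (cfs->load_sum < floor)
		cfs->load_sum = floor;
}

int main(void)
{
	struct sa a = { .load_avg = 900, .load_sum = 42000000 };
	struct sa b = a;

	dequeue_resync(&a, 300, 46000);		/* divider value is made up */
	dequeue_clamp(&b, 300, 14000000);	/* weighted se sum, made up */

	printf("resync: avg=%lu sum=%llu\n", a.load_avg, (unsigned long long)a.load_sum);
	printf("clamp : avg=%lu sum=%llu\n", b.load_avg, (unsigned long long)b.load_sum);
	return 0;
}
```

The clamped variant keeps more of the decayed history in the sum and only guards the avg/sum ratio; the resync variant keeps the pair in exact lockstep.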
3390 |
|
3386/* 3387 * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to 3388 * propagate its contribution. The key to this propagation is the invariant 3389 * that for each group: 3390 * 3391 * ge->avg == grq->avg (1) 3392 * 3393 * _IFF_ we look at the pure running and runnable sums. Because they --- 51 unchanged lines hidden (view full) --- 3445 * 3446 * On removal, we'll assume each task is equally runnable; which yields: 3447 * 3448 * grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight 3449 * 3450 * XXX: only do this for the part of runnable > running ? 3451 * 3452 */ | 3391/* 3392 * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to 3393 * propagate its contribution. The key to this propagation is the invariant 3394 * that for each group: 3395 * 3396 * ge->avg == grq->avg (1) 3397 * 3398 * _IFF_ we look at the pure running and runnable sums. Because they --- 51 unchanged lines hidden (view full) --- 3450 * 3451 * On removal, we'll assume each task is equally runnable; which yields: 3452 * 3453 * grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight 3454 * 3455 * XXX: only do this for the part of runnable > running ? 3456 * 3457 */ |
3458 |
|
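As a companion to the propagation comment above, here is a minimal arithmetic sketch of the util branch that the update_tg_cfs_util() variants below implement: the group entity copies the group runqueue's util_avg, and the parent runqueue absorbs the delta. It follows the simpler resync form from the right-hand column; the left-hand column additionally tracks the sum delta and clamps it. The numbers and the fixed divider are invented; the kernel derives the divider from cfs_rq->avg.period_contrib via get_pelt_divider().

```c
#include <stdio.h>

/*
 * Minimal arithmetic sketch of the util propagation described above:
 * the group entity (se) copies the group runqueue's util_avg and the
 * parent runqueue absorbs the delta.  Not kernel code; values are made up.
 */
struct avg { unsigned long util_avg; unsigned int util_sum; };

static void propagate_util(struct avg *cfs, struct avg *se,
			   const struct avg *gcfs, unsigned int divider)
{
	long delta = (long)gcfs->util_avg - (long)se->util_avg;

	if (!delta)
		return;

	/* ge->avg tracks grq->avg: copy util_avg, resync util_sum */
	se->util_avg = gcfs->util_avg;
	se->util_sum = se->util_avg * divider;

	/* emulate the kernel's add_positive(): never wrap below zero */
	if (delta < 0 && (unsigned long)(-delta) > cfs->util_avg)
		cfs->util_avg = 0;
	else
		cfs->util_avg += delta;
	cfs->util_sum = cfs->util_avg * divider;
}

int main(void)
{
	struct avg parent   = { .util_avg = 700, .util_sum = 700 * 46000 };
	struct avg group_se = { .util_avg = 250, .util_sum = 250 * 46000 };
	struct avg group_rq = { .util_avg = 400 };	/* a task was attached below */

	propagate_util(&parent, &group_se, &group_rq, 46000);
	printf("se util_avg=%lu parent util_avg=%lu\n",
	       group_se.util_avg, parent.util_avg);	/* 400 and 850 */
	return 0;
}
```

In the real code the same pattern repeats for runnable and, with weighting plus the runnable_sum estimate described in the comment, for load.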
3453static inline void 3454update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) 3455{ | 3459static inline void 3460update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) 3461{ |
3456 long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg; 3457 u32 new_sum, divider; | 3462 long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; 3463 u32 divider; |
3458 3459 /* Nothing to update */ | 3464 3465 /* Nothing to update */ |
3460 if (!delta_avg) | 3466 if (!delta) |
3461 return; 3462 3463 /* 3464 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. 3465 * See ___update_load_avg() for details. 3466 */ 3467 divider = get_pelt_divider(&cfs_rq->avg); 3468 | 3467 return; 3468 3469 /* 3470 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. 3471 * See ___update_load_avg() for details. 3472 */ 3473 divider = get_pelt_divider(&cfs_rq->avg); 3474 |
3469 | |
3470 /* Set new sched_entity's utilization */ 3471 se->avg.util_avg = gcfs_rq->avg.util_avg; | 3475 /* Set new sched_entity's utilization */ 3476 se->avg.util_avg = gcfs_rq->avg.util_avg; |
3472 new_sum = se->avg.util_avg * divider; 3473 delta_sum = (long)new_sum - (long)se->avg.util_sum; 3474 se->avg.util_sum = new_sum; | 3477 se->avg.util_sum = se->avg.util_avg * divider; |
3475 3476 /* Update parent cfs_rq utilization */ | 3478 3479 /* Update parent cfs_rq utilization */ |
3477 add_positive(&cfs_rq->avg.util_avg, delta_avg); 3478 add_positive(&cfs_rq->avg.util_sum, delta_sum); 3479 3480 /* See update_cfs_rq_load_avg() */ 3481 cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum, 3482 cfs_rq->avg.util_avg * PELT_MIN_DIVIDER); | 3480 add_positive(&cfs_rq->avg.util_avg, delta); 3481 cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider; |
3483} 3484 3485static inline void 3486update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) 3487{ | 3482} 3483 3484static inline void 3485update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) 3486{ |
3488 long delta_sum, delta_avg = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg; 3489 u32 new_sum, divider; | 3487 long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg; 3488 u32 divider; |
3490 3491 /* Nothing to update */ | 3489 3490 /* Nothing to update */ |
3492 if (!delta_avg) | 3491 if (!delta) |
3493 return; 3494 3495 /* 3496 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. 3497 * See ___update_load_avg() for details. 3498 */ 3499 divider = get_pelt_divider(&cfs_rq->avg); 3500 3501 /* Set new sched_entity's runnable */ 3502 se->avg.runnable_avg = gcfs_rq->avg.runnable_avg; | 3492 return; 3493 3494 /* 3495 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. 3496 * See ___update_load_avg() for details. 3497 */ 3498 divider = get_pelt_divider(&cfs_rq->avg); 3499 3500 /* Set new sched_entity's runnable */ 3501 se->avg.runnable_avg = gcfs_rq->avg.runnable_avg; |
3503 new_sum = se->avg.runnable_avg * divider; 3504 delta_sum = (long)new_sum - (long)se->avg.runnable_sum; 3505 se->avg.runnable_sum = new_sum; | 3502 se->avg.runnable_sum = se->avg.runnable_avg * divider; |
3506 3507 /* Update parent cfs_rq runnable */ | 3503 3504 /* Update parent cfs_rq runnable */ |
3508 add_positive(&cfs_rq->avg.runnable_avg, delta_avg); 3509 add_positive(&cfs_rq->avg.runnable_sum, delta_sum); 3510 /* See update_cfs_rq_load_avg() */ 3511 cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum, 3512 cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER); | 3505 add_positive(&cfs_rq->avg.runnable_avg, delta); 3506 cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider; |
3513} 3514 3515static inline void 3516update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) 3517{ | 3507} 3508 3509static inline void 3510update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) 3511{ |
3518 long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum; | 3512 long delta, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum; |
3519 unsigned long load_avg; 3520 u64 load_sum = 0; | 3513 unsigned long load_avg; 3514 u64 load_sum = 0; |
3521 s64 delta_sum; | |
3522 u32 divider; 3523 3524 if (!runnable_sum) 3525 return; 3526 3527 gcfs_rq->prop_runnable_sum = 0; 3528 3529 /* --- 10 unchanged lines hidden (view full) --- 3540 runnable_sum += se->avg.load_sum; 3541 runnable_sum = min_t(long, runnable_sum, divider); 3542 } else { 3543 /* 3544 * Estimate the new unweighted runnable_sum of the gcfs_rq by 3545 * assuming all tasks are equally runnable. 3546 */ 3547 if (scale_load_down(gcfs_rq->load.weight)) { | 3515 u32 divider; 3516 3517 if (!runnable_sum) 3518 return; 3519 3520 gcfs_rq->prop_runnable_sum = 0; 3521 3522 /* --- 10 unchanged lines hidden (view full) --- 3533 runnable_sum += se->avg.load_sum; 3534 runnable_sum = min_t(long, runnable_sum, divider); 3535 } else { 3536 /* 3537 * Estimate the new unweighted runnable_sum of the gcfs_rq by 3538 * assuming all tasks are equally runnable. 3539 */ 3540 if (scale_load_down(gcfs_rq->load.weight)) { |
3548 load_sum = div_u64(gcfs_rq->avg.load_sum, | 3541 load_sum = div_s64(gcfs_rq->avg.load_sum, |
3549 scale_load_down(gcfs_rq->load.weight)); 3550 } 3551 3552 /* But make sure to not inflate se's runnable */ 3553 runnable_sum = min(se->avg.load_sum, load_sum); 3554 } 3555 3556 /* 3557 * runnable_sum can't be lower than running_sum 3558 * Rescale running sum to be in the same range as runnable sum 3559 * running_sum is in [0 : LOAD_AVG_MAX << SCHED_CAPACITY_SHIFT] 3560 * runnable_sum is in [0 : LOAD_AVG_MAX] 3561 */ 3562 running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT; 3563 runnable_sum = max(runnable_sum, running_sum); 3564 | 3542 scale_load_down(gcfs_rq->load.weight)); 3543 } 3544 3545 /* But make sure to not inflate se's runnable */ 3546 runnable_sum = min(se->avg.load_sum, load_sum); 3547 } 3548 3549 /* 3550 * runnable_sum can't be lower than running_sum 3551 * Rescale running sum to be in the same range as runnable sum 3552 * running_sum is in [0 : LOAD_AVG_MAX << SCHED_CAPACITY_SHIFT] 3553 * runnable_sum is in [0 : LOAD_AVG_MAX] 3554 */ 3555 running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT; 3556 runnable_sum = max(runnable_sum, running_sum); 3557 |
3565 load_sum = se_weight(se) * runnable_sum; 3566 load_avg = div_u64(load_sum, divider); | 3558 load_sum = (s64)se_weight(se) * runnable_sum; 3559 load_avg = div_s64(load_sum, divider); |
3567 | 3560 |
3568 delta_avg = load_avg - se->avg.load_avg; 3569 if (!delta_avg) | 3561 se->avg.load_sum = runnable_sum; 3562 3563 delta = load_avg - se->avg.load_avg; 3564 if (!delta) |
3570 return; 3571 | 3565 return; 3566 |
3572 delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum; 3573 3574 se->avg.load_sum = runnable_sum; | |
3575 se->avg.load_avg = load_avg; | 3567 se->avg.load_avg = load_avg; |
3576 add_positive(&cfs_rq->avg.load_avg, delta_avg); 3577 add_positive(&cfs_rq->avg.load_sum, delta_sum); 3578 /* See update_cfs_rq_load_avg() */ 3579 cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, 3580 cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); | 3568 3569 add_positive(&cfs_rq->avg.load_avg, delta); 3570 cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider; |
3581} 3582 3583static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) 3584{ 3585 cfs_rq->propagate = 1; 3586 cfs_rq->prop_runnable_sum += runnable_sum; 3587} 3588 --- 74 unchanged lines hidden (view full) --- 3663 * @cfs_rq: cfs_rq to update 3664 * 3665 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) 3666 * avg. The immediate corollary is that all (fair) tasks must be attached, see 3667 * post_init_entity_util_avg(). 3668 * 3669 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. 3670 * | 3571} 3572 3573static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) 3574{ 3575 cfs_rq->propagate = 1; 3576 cfs_rq->prop_runnable_sum += runnable_sum; 3577} 3578 --- 74 unchanged lines hidden (view full) --- 3653 * @cfs_rq: cfs_rq to update 3654 * 3655 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) 3656 * avg. The immediate corollary is that all (fair) tasks must be attached, see 3657 * post_init_entity_util_avg(). 3658 * 3659 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. 3660 * |
3671 * Return: true if the load decayed or we removed load. | 3661 * Returns true if the load decayed or we removed load. |
3672 * 3673 * Since both these conditions indicate a changed cfs_rq->avg.load we should 3674 * call update_tg_load_avg() when this function returns true. 3675 */ 3676static inline int 3677update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) 3678{ 3679 unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0; --- 8 unchanged lines hidden (view full) --- 3688 swap(cfs_rq->removed.util_avg, removed_util); 3689 swap(cfs_rq->removed.load_avg, removed_load); 3690 swap(cfs_rq->removed.runnable_avg, removed_runnable); 3691 cfs_rq->removed.nr = 0; 3692 raw_spin_unlock(&cfs_rq->removed.lock); 3693 3694 r = removed_load; 3695 sub_positive(&sa->load_avg, r); | 3662 * 3663 * Since both these conditions indicate a changed cfs_rq->avg.load we should 3664 * call update_tg_load_avg() when this function returns true. 3665 */ 3666static inline int 3667update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) 3668{ 3669 unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0; --- 8 unchanged lines hidden (view full) --- 3678 swap(cfs_rq->removed.util_avg, removed_util); 3679 swap(cfs_rq->removed.load_avg, removed_load); 3680 swap(cfs_rq->removed.runnable_avg, removed_runnable); 3681 cfs_rq->removed.nr = 0; 3682 raw_spin_unlock(&cfs_rq->removed.lock); 3683 3684 r = removed_load; 3685 sub_positive(&sa->load_avg, r); |
3696 sub_positive(&sa->load_sum, r * divider); 3697 /* See sa->util_sum below */ 3698 sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER); | 3686 sa->load_sum = sa->load_avg * divider; |
3699 3700 r = removed_util; 3701 sub_positive(&sa->util_avg, r); | 3687 3688 r = removed_util; 3689 sub_positive(&sa->util_avg, r); |
3702 sub_positive(&sa->util_sum, r * divider); 3703 /* 3704 * Because of rounding, se->util_sum might ends up being +1 more than 3705 * cfs->util_sum. Although this is not a problem by itself, detaching 3706 * a lot of tasks with the rounding problem between 2 updates of 3707 * util_avg (~1ms) can make cfs->util_sum becoming null whereas 3708 * cfs_util_avg is not. 3709 * Check that util_sum is still above its lower bound for the new 3710 * util_avg. Given that period_contrib might have moved since the last 3711 * sync, we are only sure that util_sum must be above or equal to 3712 * util_avg * minimum possible divider 3713 */ 3714 sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER); | 3690 sa->util_sum = sa->util_avg * divider; |
3715 3716 r = removed_runnable; 3717 sub_positive(&sa->runnable_avg, r); | 3691 3692 r = removed_runnable; 3693 sub_positive(&sa->runnable_avg, r); |
3718 sub_positive(&sa->runnable_sum, r * divider); 3719 /* See sa->util_sum above */ 3720 sa->runnable_sum = max_t(u32, sa->runnable_sum, 3721 sa->runnable_avg * PELT_MIN_DIVIDER); | 3694 sa->runnable_sum = sa->runnable_avg * divider; |
3722 3723 /* 3724 * removed_runnable is the unweighted version of removed_load so we 3725 * can use it to estimate removed_load_sum. 3726 */ 3727 add_tg_cfs_propagate(cfs_rq, 3728 -(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT); 3729 --- 70 unchanged lines hidden (view full) --- 3800 * @cfs_rq: cfs_rq to detach from 3801 * @se: sched_entity to detach 3802 * 3803 * Must call update_cfs_rq_load_avg() before this, since we rely on 3804 * cfs_rq->avg.last_update_time being current. 3805 */ 3806static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 3807{ | 3695 3696 /* 3697 * removed_runnable is the unweighted version of removed_load so we 3698 * can use it to estimate removed_load_sum. 3699 */ 3700 add_tg_cfs_propagate(cfs_rq, 3701 -(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT); 3702 --- 70 unchanged lines hidden (view full) --- 3773 * @cfs_rq: cfs_rq to detach from 3774 * @se: sched_entity to detach 3775 * 3776 * Must call update_cfs_rq_load_avg() before this, since we rely on 3777 * cfs_rq->avg.last_update_time being current. 3778 */ 3779static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 3780{ |
3781 /* 3782 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. 3783 * See ___update_load_avg() for details. 3784 */ 3785 u32 divider = get_pelt_divider(&cfs_rq->avg); 3786 |
|
3808 dequeue_load_avg(cfs_rq, se); 3809 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); | 3787 dequeue_load_avg(cfs_rq, se); 3788 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); |
3810 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); 3811 /* See update_cfs_rq_load_avg() */ 3812 cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum, 3813 cfs_rq->avg.util_avg * PELT_MIN_DIVIDER); 3814 | 3789 cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider; |
3815 sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg); | 3790 sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg); |
3816 sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum); 3817 /* See update_cfs_rq_load_avg() */ 3818 cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum, 3819 cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER); | 3791 cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider; |
3820 3821 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); 3822 3823 cfs_rq_util_change(cfs_rq, 0); 3824 3825 trace_pelt_cfs_tp(cfs_rq); 3826} 3827 --- 4740 unchanged lines hidden (view full) --- 8568 * 8569 * If both @dst_cpu and @sg have SMT siblings, and @sg has exactly one more 8570 * busy CPU than @sds::local, let @dst_cpu pull tasks if it has higher priority. 8571 * Bigger imbalances in the number of busy CPUs will be dealt with in 8572 * update_sd_pick_busiest(). 8573 * 8574 * If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings 8575 * of @dst_cpu are idle and @sg has lower priority. | 3792 3793 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); 3794 3795 cfs_rq_util_change(cfs_rq, 0); 3796 3797 trace_pelt_cfs_tp(cfs_rq); 3798} 3799 --- 4740 unchanged lines hidden (view full) --- 8540 * 8541 * If both @dst_cpu and @sg have SMT siblings, and @sg has exactly one more 8542 * busy CPU than @sds::local, let @dst_cpu pull tasks if it has higher priority. 8543 * Bigger imbalances in the number of busy CPUs will be dealt with in 8544 * update_sd_pick_busiest(). 8545 * 8546 * If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings 8547 * of @dst_cpu are idle and @sg has lower priority. |
8576 * 8577 * Return: true if @dst_cpu can pull tasks, false otherwise. | |
8578 */ 8579static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds, 8580 struct sg_lb_stats *sgs, 8581 struct sched_group *sg) 8582{ 8583#ifdef CONFIG_SCHED_SMT 8584 bool local_is_smt, sg_is_smt; 8585 int sg_busy_cpus; --- 59 unchanged lines hidden (view full) --- 8645 return asym_smt_can_pull_tasks(env->dst_cpu, sds, sgs, group); 8646 8647 return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu); 8648} 8649 8650/** 8651 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 8652 * @env: The load balancing environment. | 8548 */ 8549static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds, 8550 struct sg_lb_stats *sgs, 8551 struct sched_group *sg) 8552{ 8553#ifdef CONFIG_SCHED_SMT 8554 bool local_is_smt, sg_is_smt; 8555 int sg_busy_cpus; --- 59 unchanged lines hidden (view full) --- 8615 return asym_smt_can_pull_tasks(env->dst_cpu, sds, sgs, group); 8616 8617 return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu); 8618} 8619 8620/** 8621 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 8622 * @env: The load balancing environment. |
8653 * @sds: Load-balancing data with statistics of the local group. | |
8654 * @group: sched_group whose statistics are to be updated. 8655 * @sgs: variable to hold the statistics for this group. 8656 * @sg_status: Holds flag indicating the status of the sched_group 8657 */ 8658static inline void update_sg_lb_stats(struct lb_env *env, 8659 struct sd_lb_stats *sds, 8660 struct sched_group *group, 8661 struct sg_lb_stats *sgs, --- 373 unchanged lines hidden (view full) --- 9035 return true; 9036} 9037 9038/* 9039 * Allow a NUMA imbalance if busy CPUs is less than 25% of the domain. 9040 * This is an approximation as the number of running tasks may not be 9041 * related to the number of busy CPUs due to sched_setaffinity. 9042 */ | 8623 * @group: sched_group whose statistics are to be updated. 8624 * @sgs: variable to hold the statistics for this group. 8625 * @sg_status: Holds flag indicating the status of the sched_group 8626 */ 8627static inline void update_sg_lb_stats(struct lb_env *env, 8628 struct sd_lb_stats *sds, 8629 struct sched_group *group, 8630 struct sg_lb_stats *sgs, --- 373 unchanged lines hidden (view full) --- 9004 return true; 9005} 9006 9007/* 9008 * Allow a NUMA imbalance if busy CPUs is less than 25% of the domain. 9009 * This is an approximation as the number of running tasks may not be 9010 * related to the number of busy CPUs due to sched_setaffinity. 9011 */ |
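The comment above describes the old 25%-of-domain rule; the predicate in the next hunk replaces it with a comparison against a per-domain imb_numa_nr. The contrast below is a standalone sketch under assumed values (domain weight 128, imb_numa_nr = 4), not kernel code; in the kernel the threshold comes from the sched_domain.

```c
#include <stdbool.h>
#include <stdio.h>

/*
 * Side-by-side sketch of the two gating rules above (standalone model,
 * not kernel code).  imb_numa_nr stands in for the per-domain value the
 * right-hand column reads from the sched_domain; here it is just a knob.
 */

/* old rule: tolerate imbalance while busy CPUs stay under 25% of the domain */
static bool allow_imbalance_quarter(int dst_running, int dst_weight)
{
	return dst_running < (dst_weight >> 2);
}

/* new rule: tolerate imbalance while the running count stays at or below
 * a precomputed per-domain threshold */
static bool allow_imbalance_nr(int running, int imb_numa_nr)
{
	return running <= imb_numa_nr;
}

int main(void)
{
	int weight = 128;	/* e.g. a wide NUMA domain */
	int imb_numa_nr = 4;	/* hypothetical per-domain threshold */

	for (int running = 0; running <= 40; running += 8)
		printf("running=%2d  quarter=%d  nr=%d\n", running,
		       allow_imbalance_quarter(running, weight),
		       allow_imbalance_nr(running, imb_numa_nr));
	return 0;
}
```

For wide domains the quarter rule tolerated large imbalances (here, up to 31 running tasks), while the per-domain threshold cuts the allowance off much earlier.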
9043static inline bool allow_numa_imbalance(int dst_running, int dst_weight) | 9012static inline bool allow_numa_imbalance(int running, int imb_numa_nr) |
9044{ | 9013{ |
9045 return (dst_running < (dst_weight >> 2)); | 9014 return running <= imb_numa_nr; |
9046} 9047 9048/* 9049 * find_idlest_group() finds and returns the least busy CPU group within the 9050 * domain. 9051 * 9052 * Assumes p is allowed on at least one CPU in sd. 9053 */ --- 117 unchanged lines hidden (view full) --- 9171 if (cpu_to_node(this_cpu) == p->numa_preferred_nid) 9172 return NULL; 9173 9174 idlest_cpu = cpumask_first(sched_group_span(idlest)); 9175 if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid) 9176 return idlest; 9177#endif 9178 /* | 9015} 9016 9017/* 9018 * find_idlest_group() finds and returns the least busy CPU group within the 9019 * domain. 9020 * 9021 * Assumes p is allowed on at least one CPU in sd. 9022 */ --- 117 unchanged lines hidden (view full) --- 9140 if (cpu_to_node(this_cpu) == p->numa_preferred_nid) 9141 return NULL; 9142 9143 idlest_cpu = cpumask_first(sched_group_span(idlest)); 9144 if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid) 9145 return idlest; 9146#endif 9147 /* |
9179 * Otherwise, keep the task on this node to stay close 9180 * its wakeup source and improve locality. If there is 9181 * a real need of migration, periodic load balance will 9182 * take care of it. | 9148 * Otherwise, keep the task close to the wakeup source 9149 * and improve locality if the number of running tasks 9150 * would remain below threshold where an imbalance is 9151 * allowed. If there is a real need of migration, 9152 * periodic load balance will take care of it. |
9183 */ | 9153 */ |
9184 if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->span_weight)) | 9154 if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, sd->imb_numa_nr)) |
9185 return NULL; 9186 } 9187 9188 /* 9189 * Select group with highest number of idle CPUs. We could also 9190 * compare the utilization which is more stable but it can end 9191 * up that the group has less spare capacity but finally more 9192 * idle CPUs which means more opportunity to run task. --- 75 unchanged lines hidden (view full) --- 9268 WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED); 9269 trace_sched_overutilized_tp(rd, SG_OVERUTILIZED); 9270 } 9271} 9272 9273#define NUMA_IMBALANCE_MIN 2 9274 9275static inline long adjust_numa_imbalance(int imbalance, | 9155 return NULL; 9156 } 9157 9158 /* 9159 * Select group with highest number of idle CPUs. We could also 9160 * compare the utilization which is more stable but it can end 9161 * up that the group has less spare capacity but finally more 9162 * idle CPUs which means more opportunity to run task. --- 75 unchanged lines hidden (view full) --- 9238 WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED); 9239 trace_sched_overutilized_tp(rd, SG_OVERUTILIZED); 9240 } 9241} 9242 9243#define NUMA_IMBALANCE_MIN 2 9244 9245static inline long adjust_numa_imbalance(int imbalance, |
9276 int dst_running, int dst_weight) | 9246 int dst_running, int imb_numa_nr) |
9277{ | 9247{ |
9278 if (!allow_numa_imbalance(dst_running, dst_weight)) | 9248 if (!allow_numa_imbalance(dst_running, imb_numa_nr)) |
9279 return imbalance; 9280 9281 /* 9282 * Allow a small imbalance based on a simple pair of communicating 9283 * tasks that remain local when the destination is lightly loaded. 9284 */ 9285 if (imbalance <= NUMA_IMBALANCE_MIN) 9286 return 0; --- 95 unchanged lines hidden (view full) --- 9382 env->migration_type = migrate_task; 9383 env->imbalance = max_t(long, 0, (local->idle_cpus - 9384 busiest->idle_cpus) >> 1); 9385 } 9386 9387 /* Consider allowing a small imbalance between NUMA groups */ 9388 if (env->sd->flags & SD_NUMA) { 9389 env->imbalance = adjust_numa_imbalance(env->imbalance, | 9249 return imbalance; 9250 9251 /* 9252 * Allow a small imbalance based on a simple pair of communicating 9253 * tasks that remain local when the destination is lightly loaded. 9254 */ 9255 if (imbalance <= NUMA_IMBALANCE_MIN) 9256 return 0; --- 95 unchanged lines hidden (view full) --- 9352 env->migration_type = migrate_task; 9353 env->imbalance = max_t(long, 0, (local->idle_cpus - 9354 busiest->idle_cpus) >> 1); 9355 } 9356 9357 /* Consider allowing a small imbalance between NUMA groups */ 9358 if (env->sd->flags & SD_NUMA) { 9359 env->imbalance = adjust_numa_imbalance(env->imbalance, |
9390 busiest->sum_nr_running, busiest->group_weight); | 9360 local->sum_nr_running + 1, env->sd->imb_numa_nr); |
9391 } 9392 9393 return; 9394 } 9395 9396 /* 9397 * Local is fully busy but has to take more load to relieve the 9398 * busiest group --- 54 unchanged lines hidden (view full) --- 9453 * avg_load : Only if imbalance is significant enough. 9454 * nr_idle : dst_cpu is not busy and the number of idle CPUs is quite 9455 * different in groups. 9456 */ 9457 9458/** 9459 * find_busiest_group - Returns the busiest group within the sched_domain 9460 * if there is an imbalance. | 9361 } 9362 9363 return; 9364 } 9365 9366 /* 9367 * Local is fully busy but has to take more load to relieve the 9368 * busiest group --- 54 unchanged lines hidden (view full) --- 9423 * avg_load : Only if imbalance is significant enough. 9424 * nr_idle : dst_cpu is not busy and the number of idle CPUs is quite 9425 * different in groups. 9426 */ 9427 9428/** 9429 * find_busiest_group - Returns the busiest group within the sched_domain 9430 * if there is an imbalance. |
9461 * @env: The load balancing environment. | |
9462 * 9463 * Also calculates the amount of runnable load which should be moved 9464 * to restore balance. 9465 * | 9431 * 9432 * Also calculates the amount of runnable load which should be moved 9433 * to restore balance. 9434 * |
9435 * @env: The load balancing environment. 9436 * |
|
9466 * Return: - The busiest group if imbalance exists. 9467 */ 9468static struct sched_group *find_busiest_group(struct lb_env *env) 9469{ 9470 struct sg_lb_stats *local, *busiest; 9471 struct sd_lb_stats sds; 9472 9473 init_sd_lb_stats(&sds); --- 2454 unchanged lines hidden --- | 9437 * Return: - The busiest group if imbalance exists. 9438 */ 9439static struct sched_group *find_busiest_group(struct lb_env *env) 9440{ 9441 struct sg_lb_stats *local, *busiest; 9442 struct sd_lb_stats sds; 9443 9444 init_sd_lb_stats(&sds); --- 2454 unchanged lines hidden --- |
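To close, a standalone sketch of how the adjust_numa_imbalance() pieces from the hunks above compose: when the destination is considered lightly loaded, imbalances up to NUMA_IMBALANCE_MIN are reported as zero so a communicating pair of tasks can stay local. The +1 in the right-hand column's call sites appears to account for the task about to be placed; the threshold value used here is invented, not taken from the source.

```c
#include <stdio.h>

#define NUMA_IMBALANCE_MIN 2

/*
 * Standalone sketch of the composition above: a small imbalance is ignored
 * only while the destination stays at or below the (assumed) per-domain
 * threshold.  Not kernel code.
 */
static long adjust_numa_imbalance_sketch(long imbalance, int running, int imb_numa_nr)
{
	if (running > imb_numa_nr)		/* destination already busy */
		return imbalance;
	if (imbalance <= NUMA_IMBALANCE_MIN)	/* tolerate a communicating pair */
		return 0;
	return imbalance;
}

int main(void)
{
	/* one more task is about to land on a node currently running two */
	printf("%ld\n", adjust_numa_imbalance_sketch(2, 2 + 1, 4));	/* 0: ignored */
	printf("%ld\n", adjust_numa_imbalance_sketch(3, 2 + 1, 4));	/* 3: acted on */
	printf("%ld\n", adjust_numa_imbalance_sketch(2, 6 + 1, 4));	/* 2: acted on */
	return 0;
}
```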