fair.c: 1e7f32f776089af32b6ec9b801fe976778c8448b (left) vs. 0fb3978b0aac3a5c08637aed03cc2d65f793508f (right)
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
4 *
5 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
6 *
7 * Interactivity improvements by Mike Galbraith
8 * (C) 2007 Mike Galbraith <efault@gmx.de>

--- 1245 unchanged lines hidden (view full) ---

1254
1255static bool numa_is_active_node(int nid, struct numa_group *ng)
1256{
1257 return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
1258}
1259
1260/* Handle placement on systems where not all nodes are directly connected. */
1261static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
1262 int maxdist, bool task)
1262 int lim_dist, bool task)
1263{
1264 unsigned long score = 0;
1265 int node;
1265 int node, max_dist;
1266
1267 /*
1268 * All nodes are directly connected, and the same distance
1269 * from each other. No need for fancy placement algorithms.
1270 */
1271 if (sched_numa_topology_type == NUMA_DIRECT)
1272 return 0;
1273
1274 /* sched_max_numa_distance may be changed in parallel. */
1275 max_dist = READ_ONCE(sched_max_numa_distance);
1274 /*
1275 * This code is called for each node, introducing N^2 complexity,
1276 * which should be ok given the number of nodes rarely exceeds 8.
1277 */
1278 for_each_online_node(node) {
1279 unsigned long faults;
1280 int dist = node_distance(nid, node);
1281
1282 /*
1283 * The furthest away nodes in the system are not interesting
1284 * for placement; nid was already counted.
1285 */
1286 if (dist == sched_max_numa_distance || node == nid)
1288 if (dist >= max_dist || node == nid)
1287 continue;
1288
1289 /*
1290 * On systems with a backplane NUMA topology, compare groups
1291 * of nodes, and move tasks towards the group with the most
1292 * memory accesses. When comparing two nodes at distance
1293 * "hoplimit", only nodes closer by than "hoplimit" are part
1294 * of each group. Skip other nodes.
1295 */
1296 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1297 dist >= maxdist)
1298 if (sched_numa_topology_type == NUMA_BACKPLANE && dist >= lim_dist)
1298 continue;
1299
1300 /* Add up the faults from nearby nodes. */
1301 if (task)
1302 faults = task_faults(p, node);
1303 else
1304 faults = group_faults(p, node);
1305
1306 /*
1307 * On systems with a glueless mesh NUMA topology, there are
1308 * no fixed "groups of nodes". Instead, nodes that are not
1309 * directly connected bounce traffic through intermediate
1310 * nodes; a numa_group can occupy any set of nodes.
1311 * The further away a node is, the less the faults count.
1312 * This seems to result in good task placement.
1313 */
1314 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1315 faults *= (sched_max_numa_distance - dist);
1316 faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
1316 faults *= (max_dist - dist);
1317 faults /= (max_dist - LOCAL_DISTANCE);
1317 }
1318
1319 score += faults;
1320 }
1321
1322 return score;
1323}
1324
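
For illustration only (not code from either revision): the right-hand version snapshots sched_max_numa_distance once, via READ_ONCE(), into a local max_dist and reuses that snapshot for both the far-node cutoff and the glueless-mesh scaling, so a concurrent update of the global cannot be seen with two different values inside one call. The standalone sketch below shows just that scaling arithmetic; the node distances, fault counts, and the max_dist value are invented.

#include <stdio.h>

#define LOCAL_DISTANCE 10   /* distance of a node to itself, SLIT convention */

/* Weight faults on a node by how close it sits to the node being scored. */
static unsigned long scale_faults(unsigned long faults, int dist, int max_dist)
{
	if (dist >= max_dist)           /* farthest nodes do not contribute */
		return 0;
	faults *= (unsigned long)(max_dist - dist);
	faults /= (unsigned long)(max_dist - LOCAL_DISTANCE);
	return faults;
}

int main(void)
{
	/* Hypothetical 4-node mesh: distances from node 0 and per-node faults. */
	int dist[4] = { 10, 20, 30, 40 };
	unsigned long faults[4] = { 0, 600, 600, 600 };
	int max_dist = 40;      /* stands in for READ_ONCE(sched_max_numa_distance) */
	unsigned long score = 0;

	for (int node = 1; node < 4; node++)
		score += scale_faults(faults[node], dist[node], max_dist);

	/* node 1 adds 400, node 2 adds 200, node 3 is cut off: total 600 */
	printf("score = %lu\n", score);
	return 0;
}
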

--- 159 unchanged lines hidden (view full) ---

1484 return true;
1485}
1486
1487struct task_numa_env {
1488 struct task_struct *p;
1489
1490 int src_cpu, src_nid;
1491 int dst_cpu, dst_nid;
1493 int imb_numa_nr;
1492
1493 struct numa_stats src_stats, dst_stats;
1494
1495 int imbalance_pct;
1496 int dist;
1497
1498 struct task_struct *best_task;
1499 long best_imp;
1500 int best_cpu;
1501};
1502
1503static unsigned long cpu_load(struct rq *rq);
1504static unsigned long cpu_runnable(struct rq *rq);
1505static inline long adjust_numa_imbalance(int imbalance,
1506 int dst_running, int dst_weight);
1508 int dst_running, int imb_numa_nr);
1507
1508static inline enum
1509numa_type numa_classify(unsigned int imbalance_pct,
1510 struct numa_stats *ns)
1511{
1512 if ((ns->nr_running > ns->weight) &&
1513 (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) ||
1514 ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100))))

--- 364 unchanged lines hidden (view full) ---

1879 * more running tasks that the imbalance is ignored as the
1880 * move improves the imbalance from the perspective of the
1881 * CPU load balancer.
1882 * */
1883 src_running = env->src_stats.nr_running - 1;
1884 dst_running = env->dst_stats.nr_running + 1;
1885 imbalance = max(0, dst_running - src_running);
1886 imbalance = adjust_numa_imbalance(imbalance, dst_running,
1887 env->dst_stats.weight);
1889 env->imb_numa_nr);
1888
1889 /* Use idle CPU if there is no imbalance */
1890 if (!imbalance) {
1891 maymove = true;
1892 if (env->dst_stats.idle_cpu >= 0) {
1893 env->dst_cpu = env->dst_stats.idle_cpu;
1894 task_numa_assign(env, NULL, 0);
1895 return;
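
For illustration only: both revisions first compute the task-count imbalance that a swap-free move would create and then let adjust_numa_imbalance() decide whether to ignore it; they differ only in the last argument (destination group weight versus the domain's imb_numa_nr). The self-contained sketch below reproduces that arithmetic with a toy stand-in for the helper; the threshold of 4 is invented.

#include <stdio.h>

#define NUMA_IMBALANCE_MIN 2   /* same constant as defined later in this file */

/* Toy stand-in for adjust_numa_imbalance(): forgive an imbalance of one or two
 * tasks while the destination is at or below an allowed running-task threshold. */
static long toy_adjust_numa_imbalance(long imbalance, int dst_running, int allowed)
{
	if (dst_running > allowed)
		return imbalance;
	return (imbalance <= NUMA_IMBALANCE_MIN) ? 0 : imbalance;
}

int main(void)
{
	int src_nr_running = 3, dst_nr_running = 2;

	/* Pretend the task moved: source loses one task, destination gains one. */
	int src_running = src_nr_running - 1;        /* 2 */
	int dst_running = dst_nr_running + 1;        /* 3 */
	long imbalance = dst_running - src_running;  /* 1 */
	if (imbalance < 0)
		imbalance = 0;

	/* With an allowed threshold of 4, the single-task imbalance is ignored. */
	imbalance = toy_adjust_numa_imbalance(imbalance, dst_running, 4);
	printf("imbalance after adjustment = %ld\n", imbalance);   /* prints 0 */
	return 0;
}
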

--- 48 unchanged lines hidden (view full) ---

1944 * imbalance and would be the first to start moving tasks about.
1945 *
1946 * And we want to avoid any moving of tasks about, as that would create
1947 * random movement of tasks -- counter the numa conditions we're trying
1948 * to satisfy here.
1949 */
1950 rcu_read_lock();
1951 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1952 if (sd)
1954 if (sd) {
1953 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1955 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1956 env.imb_numa_nr = sd->imb_numa_nr;
1957 }
1954 rcu_read_unlock();
1955
1956 /*
1957 * Cpusets can break the scheduler domain tree into smaller
1958 * balance domains, some of which do not cross NUMA boundaries.
1959 * Tasks that are "trapped" in such domains cannot be migrated
1960 * elsewhere, so there is no point in (re)trying.
1961 */

--- 858 unchanged lines hidden (view full) ---

2820 }
2821 }
2822 p->node_stamp = 0;
2823 p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
2824 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
2825 /* Protect against double add, see task_tick_numa and task_numa_work */
2826 p->numa_work.next = &p->numa_work;
2827 p->numa_faults = NULL;
2832 p->numa_pages_migrated = 0;
2833 p->total_numa_faults = 0;
2828 RCU_INIT_POINTER(p->numa_group, NULL);
2829 p->last_task_numa_placement = 0;
2830 p->last_sum_exec_runtime = 0;
2831
2832 init_task_work(&p->numa_work, task_numa_work);
2833
2834 /* New address space, reset the preferred nid */
2835 if (!(clone_flags & CLONE_VM)) {

--- 187 unchanged lines hidden (view full) ---

3023{
3024 cfs_rq->avg.load_avg += se->avg.load_avg;
3025 cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
3026}
3027
3028static inline void
3029dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3030{
3037 u32 divider = get_pelt_divider(&se->avg);
3031 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
3038 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
3032 sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
3033 /* See update_cfs_rq_load_avg() */
3034 cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
3035 cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
3039 cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
3036}
3037#else
3038static inline void
3039enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
3040static inline void
3041dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
3042#endif
3043
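
For illustration only: the two revisions keep cfs_rq->avg.load_sum coherent with load_avg in different ways after a subtraction. One subtracts the entity's contribution and then floors the sum at load_avg * PELT_MIN_DIVIDER; the other simply recomputes the sum as load_avg * divider. The toy model below, with stand-in constants (in the kernel PELT_MIN_DIVIDER is derived from LOAD_AVG_MAX), only shows why some reconciliation is needed at all: independently rounded sums and averages can otherwise leave the sum at zero while the average is still positive.

#include <stdio.h>

#define PELT_MIN_DIVIDER 46341   /* stand-in value for illustration only */

static unsigned long max_ul(unsigned long a, unsigned long b)
{
	return a > b ? a : b;
}

int main(void)
{
	/* A runqueue average and its slightly stale, rounded-down sum. */
	unsigned long load_avg = 5;
	unsigned long load_sum = 4UL * PELT_MIN_DIVIDER;   /* lags avg by rounding */
	unsigned long divider  = PELT_MIN_DIVIDER + 512;   /* current period_contrib */

	/* Detach an entity worth avg = 4 whose own sum happened to round up. */
	unsigned long se_avg = 4, se_sum = 4UL * PELT_MIN_DIVIDER + 900;

	load_avg -= se_avg;                                      /* 1 */
	load_sum = (se_sum > load_sum) ? 0 : load_sum - se_sum;  /* clamped underflow */

	/* Strategy 1: floor the sum at the smallest value coherent with avg. */
	unsigned long floored = max_ul(load_sum, load_avg * PELT_MIN_DIVIDER);
	/* Strategy 2: forget the old sum and recompute it from avg. */
	unsigned long recomputed = load_avg * divider;

	printf("raw=%lu floored=%lu recomputed=%lu\n", load_sum, floored, recomputed);
	return 0;
}
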

--- 334 unchanged lines hidden (view full) ---

3378#else
3379 p_last_update_time = prev->avg.last_update_time;
3380 n_last_update_time = next->avg.last_update_time;
3381#endif
3382 __update_load_avg_blocked_se(p_last_update_time, se);
3383 se->avg.last_update_time = n_last_update_time;
3384}
3385
3390
3386/*
3387 * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
3388 * propagate its contribution. The key to this propagation is the invariant
3389 * that for each group:
3390 *
3391 * ge->avg == grq->avg (1)
3392 *
3393 * _IFF_ we look at the pure running and runnable sums. Because they

--- 51 unchanged lines hidden (view full) ---

3445 *
3446 * On removal, we'll assume each task is equally runnable; which yields:
3447 *
3448 * grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight
3449 *
3450 * XXX: only do this for the part of runnable > running ?
3451 *
3452 */
3458
3453static inline void
3454update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3455{
3459static inline void
3460update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3461{
3456 long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg;
3457 u32 new_sum, divider;
3462 long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
3463 u32 divider;
3458
3459 /* Nothing to update */
3464
3465 /* Nothing to update */
3460 if (!delta_avg)
3466 if (!delta)
3461 return;
3462
3463 /*
3464 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3465 * See ___update_load_avg() for details.
3466 */
3467 divider = get_pelt_divider(&cfs_rq->avg);
3468
3467 return;
3468
3469 /*
3470 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3471 * See ___update_load_avg() for details.
3472 */
3473 divider = get_pelt_divider(&cfs_rq->avg);
3474
3469
3470 /* Set new sched_entity's utilization */
3471 se->avg.util_avg = gcfs_rq->avg.util_avg;
3475 /* Set new sched_entity's utilization */
3476 se->avg.util_avg = gcfs_rq->avg.util_avg;
3472 new_sum = se->avg.util_avg * divider;
3473 delta_sum = (long)new_sum - (long)se->avg.util_sum;
3474 se->avg.util_sum = new_sum;
3477 se->avg.util_sum = se->avg.util_avg * divider;
3475
3476 /* Update parent cfs_rq utilization */
3478
3479 /* Update parent cfs_rq utilization */
3477 add_positive(&cfs_rq->avg.util_avg, delta_avg);
3478 add_positive(&cfs_rq->avg.util_sum, delta_sum);
3479
3480 /* See update_cfs_rq_load_avg() */
3481 cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
3482 cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
3480 add_positive(&cfs_rq->avg.util_avg, delta);
3481 cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
3483}
3484
3485static inline void
3486update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3487{
3482}
3483
3484static inline void
3485update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3486{
3488 long delta_sum, delta_avg = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
3489 u32 new_sum, divider;
3487 long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
3488 u32 divider;
3490
3491 /* Nothing to update */
3489
3490 /* Nothing to update */
3492 if (!delta_avg)
3491 if (!delta)
3493 return;
3494
3495 /*
3496 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3497 * See ___update_load_avg() for details.
3498 */
3499 divider = get_pelt_divider(&cfs_rq->avg);
3500
3501 /* Set new sched_entity's runnable */
3502 se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
3492 return;
3493
3494 /*
3495 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3496 * See ___update_load_avg() for details.
3497 */
3498 divider = get_pelt_divider(&cfs_rq->avg);
3499
3500 /* Set new sched_entity's runnable */
3501 se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
3503 new_sum = se->avg.runnable_avg * divider;
3504 delta_sum = (long)new_sum - (long)se->avg.runnable_sum;
3505 se->avg.runnable_sum = new_sum;
3502 se->avg.runnable_sum = se->avg.runnable_avg * divider;
3506
3507 /* Update parent cfs_rq runnable */
3503
3504 /* Update parent cfs_rq runnable */
3508 add_positive(&cfs_rq->avg.runnable_avg, delta_avg);
3509 add_positive(&cfs_rq->avg.runnable_sum, delta_sum);
3510 /* See update_cfs_rq_load_avg() */
3511 cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
3512 cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
3505 add_positive(&cfs_rq->avg.runnable_avg, delta);
3506 cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
3513}
3514
3515static inline void
3516update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3517{
3507}
3508
3509static inline void
3510update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3511{
3518 long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
3512 long delta, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
3519 unsigned long load_avg;
3520 u64 load_sum = 0;
3513 unsigned long load_avg;
3514 u64 load_sum = 0;
3521 s64 delta_sum;
3522 u32 divider;
3523
3524 if (!runnable_sum)
3525 return;
3526
3527 gcfs_rq->prop_runnable_sum = 0;
3528
3529 /*

--- 10 unchanged lines hidden (view full) ---

3540 runnable_sum += se->avg.load_sum;
3541 runnable_sum = min_t(long, runnable_sum, divider);
3542 } else {
3543 /*
3544 * Estimate the new unweighted runnable_sum of the gcfs_rq by
3545 * assuming all tasks are equally runnable.
3546 */
3547 if (scale_load_down(gcfs_rq->load.weight)) {
3515 u32 divider;
3516
3517 if (!runnable_sum)
3518 return;
3519
3520 gcfs_rq->prop_runnable_sum = 0;
3521
3522 /*

--- 10 unchanged lines hidden (view full) ---

3533 runnable_sum += se->avg.load_sum;
3534 runnable_sum = min_t(long, runnable_sum, divider);
3535 } else {
3536 /*
3537 * Estimate the new unweighted runnable_sum of the gcfs_rq by
3538 * assuming all tasks are equally runnable.
3539 */
3540 if (scale_load_down(gcfs_rq->load.weight)) {
3548 load_sum = div_u64(gcfs_rq->avg.load_sum,
3541 load_sum = div_s64(gcfs_rq->avg.load_sum,
3549 scale_load_down(gcfs_rq->load.weight));
3550 }
3551
3552 /* But make sure to not inflate se's runnable */
3553 runnable_sum = min(se->avg.load_sum, load_sum);
3554 }
3555
3556 /*
3557 * runnable_sum can't be lower than running_sum
3558 * Rescale running sum to be in the same range as runnable sum
3559 * running_sum is in [0 : LOAD_AVG_MAX << SCHED_CAPACITY_SHIFT]
3560 * runnable_sum is in [0 : LOAD_AVG_MAX]
3561 */
3562 running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
3563 runnable_sum = max(runnable_sum, running_sum);
3564
3542 scale_load_down(gcfs_rq->load.weight));
3543 }
3544
3545 /* But make sure to not inflate se's runnable */
3546 runnable_sum = min(se->avg.load_sum, load_sum);
3547 }
3548
3549 /*
3550 * runnable_sum can't be lower than running_sum
3551 * Rescale running sum to be in the same range as runnable sum
3552 * running_sum is in [0 : LOAD_AVG_MAX << SCHED_CAPACITY_SHIFT]
3553 * runnable_sum is in [0 : LOAD_AVG_MAX]
3554 */
3555 running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
3556 runnable_sum = max(runnable_sum, running_sum);
3557
3565 load_sum = se_weight(se) * runnable_sum;
3566 load_avg = div_u64(load_sum, divider);
3558 load_sum = (s64)se_weight(se) * runnable_sum;
3559 load_avg = div_s64(load_sum, divider);
3567
3560
3568 delta_avg = load_avg - se->avg.load_avg;
3569 if (!delta_avg)
3561 se->avg.load_sum = runnable_sum;
3562
3563 delta = load_avg - se->avg.load_avg;
3564 if (!delta)
3570 return;
3571
3565 return;
3566
3572 delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
3573
3574 se->avg.load_sum = runnable_sum;
3575 se->avg.load_avg = load_avg;
3567 se->avg.load_avg = load_avg;
3576 add_positive(&cfs_rq->avg.load_avg, delta_avg);
3577 add_positive(&cfs_rq->avg.load_sum, delta_sum);
3578 /* See update_cfs_rq_load_avg() */
3579 cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
3580 cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
3568
3569 add_positive(&cfs_rq->avg.load_avg, delta);
3570 cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
3581}
3582
3583static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
3584{
3585 cfs_rq->propagate = 1;
3586 cfs_rq->prop_runnable_sum += runnable_sum;
3587}
3588

--- 74 unchanged lines hidden (view full) ---

3663 * @cfs_rq: cfs_rq to update
3664 *
3665 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
3666 * avg. The immediate corollary is that all (fair) tasks must be attached, see
3667 * post_init_entity_util_avg().
3668 *
3669 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
3670 *
3571}
3572
3573static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
3574{
3575 cfs_rq->propagate = 1;
3576 cfs_rq->prop_runnable_sum += runnable_sum;
3577}
3578

--- 74 unchanged lines hidden (view full) ---

3653 * @cfs_rq: cfs_rq to update
3654 *
3655 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
3656 * avg. The immediate corollary is that all (fair) tasks must be attached, see
3657 * post_init_entity_util_avg().
3658 *
3659 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
3660 *
3671 * Return: true if the load decayed or we removed load.
3661 * Returns true if the load decayed or we removed load.
3672 *
3673 * Since both these conditions indicate a changed cfs_rq->avg.load we should
3674 * call update_tg_load_avg() when this function returns true.
3675 */
3676static inline int
3677update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
3678{
3679 unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0;

--- 8 unchanged lines hidden (view full) ---

3688 swap(cfs_rq->removed.util_avg, removed_util);
3689 swap(cfs_rq->removed.load_avg, removed_load);
3690 swap(cfs_rq->removed.runnable_avg, removed_runnable);
3691 cfs_rq->removed.nr = 0;
3692 raw_spin_unlock(&cfs_rq->removed.lock);
3693
3694 r = removed_load;
3695 sub_positive(&sa->load_avg, r);
3662 *
3663 * Since both these conditions indicate a changed cfs_rq->avg.load we should
3664 * call update_tg_load_avg() when this function returns true.
3665 */
3666static inline int
3667update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
3668{
3669 unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0;

--- 8 unchanged lines hidden (view full) ---

3678 swap(cfs_rq->removed.util_avg, removed_util);
3679 swap(cfs_rq->removed.load_avg, removed_load);
3680 swap(cfs_rq->removed.runnable_avg, removed_runnable);
3681 cfs_rq->removed.nr = 0;
3682 raw_spin_unlock(&cfs_rq->removed.lock);
3683
3684 r = removed_load;
3685 sub_positive(&sa->load_avg, r);
3696 sub_positive(&sa->load_sum, r * divider);
3697 /* See sa->util_sum below */
3698 sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER);
3686 sa->load_sum = sa->load_avg * divider;
3699
3700 r = removed_util;
3701 sub_positive(&sa->util_avg, r);
3687
3688 r = removed_util;
3689 sub_positive(&sa->util_avg, r);
3702 sub_positive(&sa->util_sum, r * divider);
3703 /*
3704 * Because of rounding, se->util_sum might ends up being +1 more than
3705 * cfs->util_sum. Although this is not a problem by itself, detaching
3706 * a lot of tasks with the rounding problem between 2 updates of
3707 * util_avg (~1ms) can make cfs->util_sum becoming null whereas
3708 * cfs_util_avg is not.
3709 * Check that util_sum is still above its lower bound for the new
3710 * util_avg. Given that period_contrib might have moved since the last
3711 * sync, we are only sure that util_sum must be above or equal to
3712 * util_avg * minimum possible divider
3713 */
3714 sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);
3690 sa->util_sum = sa->util_avg * divider;
3715
3716 r = removed_runnable;
3717 sub_positive(&sa->runnable_avg, r);
3691
3692 r = removed_runnable;
3693 sub_positive(&sa->runnable_avg, r);
3718 sub_positive(&sa->runnable_sum, r * divider);
3719 /* See sa->util_sum above */
3720 sa->runnable_sum = max_t(u32, sa->runnable_sum,
3721 sa->runnable_avg * PELT_MIN_DIVIDER);
3694 sa->runnable_sum = sa->runnable_avg * divider;
3722
3723 /*
3724 * removed_runnable is the unweighted version of removed_load so we
3725 * can use it to estimate removed_load_sum.
3726 */
3727 add_tg_cfs_propagate(cfs_rq,
3728 -(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT);
3729

--- 70 unchanged lines hidden (view full) ---

3800 * @cfs_rq: cfs_rq to detach from
3801 * @se: sched_entity to detach
3802 *
3803 * Must call update_cfs_rq_load_avg() before this, since we rely on
3804 * cfs_rq->avg.last_update_time being current.
3805 */
3806static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3807{
3781 /*
3782 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3783 * See ___update_load_avg() for details.
3784 */
3785 u32 divider = get_pelt_divider(&cfs_rq->avg);
3786
3808 dequeue_load_avg(cfs_rq, se);
3809 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
3787 dequeue_load_avg(cfs_rq, se);
3788 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
3810 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
3811 /* See update_cfs_rq_load_avg() */
3812 cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
3813 cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
3814
3789 cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
3815 sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
3790 sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
3816 sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
3817 /* See update_cfs_rq_load_avg() */
3818 cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
3819 cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
3791 cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
3820
3821 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
3822
3823 cfs_rq_util_change(cfs_rq, 0);
3824
3825 trace_pelt_cfs_tp(cfs_rq);
3826}
3827
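
For illustration only: both versions of detach_entity_load_avg() above rely on a sub_positive()-style subtraction that clamps at zero rather than wrapping, and then reconcile the *_sum counters with the freshly reduced *_avg (by flooring or by recomputing, as in the earlier hunks). A minimal model of the clamped subtraction follows; it assumes that behaviour and is not the kernel macro.

#include <stdio.h>

/* Clamped subtraction: never let an unsigned average wrap below zero. */
static void sub_positive_ul(unsigned long *val, unsigned long sub)
{
	*val = (*val > sub) ? *val - sub : 0;
}

int main(void)
{
	unsigned long cfs_util_avg = 300;

	sub_positive_ul(&cfs_util_avg, 120);   /* normal case: 180 left */
	printf("after detaching 120: %lu\n", cfs_util_avg);

	sub_positive_ul(&cfs_util_avg, 500);   /* stale value larger than remainder */
	printf("after detaching 500: %lu\n", cfs_util_avg);   /* clamped to 0 */
	return 0;
}
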

--- 4740 unchanged lines hidden (view full) ---

8568 *
8569 * If both @dst_cpu and @sg have SMT siblings, and @sg has exactly one more
8570 * busy CPU than @sds::local, let @dst_cpu pull tasks if it has higher priority.
8571 * Bigger imbalances in the number of busy CPUs will be dealt with in
8572 * update_sd_pick_busiest().
8573 *
8574 * If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings
8575 * of @dst_cpu are idle and @sg has lower priority.
8576 *
8577 * Return: true if @dst_cpu can pull tasks, false otherwise.
8578 */
8579static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds,
8580 struct sg_lb_stats *sgs,
8581 struct sched_group *sg)
8582{
8583#ifdef CONFIG_SCHED_SMT
8584 bool local_is_smt, sg_is_smt;
8585 int sg_busy_cpus;

--- 59 unchanged lines hidden (view full) ---

8645 return asym_smt_can_pull_tasks(env->dst_cpu, sds, sgs, group);
8646
8647 return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
8648}
8649
8650/**
8651 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
8652 * @env: The load balancing environment.
8653 * @sds: Load-balancing data with statistics of the local group.
8654 * @group: sched_group whose statistics are to be updated.
8655 * @sgs: variable to hold the statistics for this group.
8656 * @sg_status: Holds flag indicating the status of the sched_group
8657 */
8658static inline void update_sg_lb_stats(struct lb_env *env,
8659 struct sd_lb_stats *sds,
8660 struct sched_group *group,
8661 struct sg_lb_stats *sgs,

--- 373 unchanged lines hidden (view full) ---

9035 return true;
9036}
9037
9038/*
9039 * Allow a NUMA imbalance if busy CPUs is less than 25% of the domain.
9040 * This is an approximation as the number of running tasks may not be
9041 * related to the number of busy CPUs due to sched_setaffinity.
9042 */
9043static inline bool allow_numa_imbalance(int dst_running, int dst_weight)
9012static inline bool allow_numa_imbalance(int running, int imb_numa_nr)
9044{
9013{
9045 return (dst_running < (dst_weight >> 2));
9014 return running <= imb_numa_nr;
9046}
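
For illustration only: the two versions of allow_numa_imbalance() above differ in what counts as an acceptable NUMA imbalance — the left one allows it while fewer than a quarter of the destination's CPUs are busy (dst_running < dst_weight >> 2), the right one compares the running count against a per-domain imb_numa_nr threshold computed when the domain is built. The sketch below places the two predicates side by side; the span weight and threshold values are invented.

#include <stdbool.h>
#include <stdio.h>

/* Left-hand predicate: busy CPUs below 25% of the domain span. */
static bool allow_by_span(int dst_running, int dst_weight)
{
	return dst_running < (dst_weight >> 2);
}

/* Right-hand predicate: running tasks at or below a precomputed threshold. */
static bool allow_by_imb_nr(int running, int imb_numa_nr)
{
	return running <= imb_numa_nr;
}

int main(void)
{
	int span_weight = 128;   /* CPUs covered by the NUMA domain (invented) */
	int imb_numa_nr = 4;     /* assumed per-domain threshold (invented)    */

	for (int running = 0; running <= 8; running += 4)
		printf("running=%d span-rule=%d imb_nr-rule=%d\n", running,
		       allow_by_span(running, span_weight),
		       allow_by_imb_nr(running, imb_numa_nr));
	return 0;
}
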
9047
9048/*
9049 * find_idlest_group() finds and returns the least busy CPU group within the
9050 * domain.
9051 *
9052 * Assumes p is allowed on at least one CPU in sd.
9053 */

--- 117 unchanged lines hidden (view full) ---

9171 if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
9172 return NULL;
9173
9174 idlest_cpu = cpumask_first(sched_group_span(idlest));
9175 if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
9176 return idlest;
9177#endif
9178 /*
9179 * Otherwise, keep the task on this node to stay close
9180 * its wakeup source and improve locality. If there is
9181 * a real need of migration, periodic load balance will
9182 * take care of it.
9148 * Otherwise, keep the task close to the wakeup source
9149 * and improve locality if the number of running tasks
9150 * would remain below threshold where an imbalance is
9151 * allowed. If there is a real need of migration,
9152 * periodic load balance will take care of it.
9183 */
9153 */
9184 if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->span_weight))
9154 if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, sd->imb_numa_nr))
9185 return NULL;
9186 }
9187
9188 /*
9189 * Select group with highest number of idle CPUs. We could also
9190 * compare the utilization which is more stable but it can end
9191 * up that the group has less spare capacity but finally more
9192 * idle CPUs which means more opportunity to run task.

--- 75 unchanged lines hidden (view full) ---

9268 WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
9269 trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
9270 }
9271}
9272
9273#define NUMA_IMBALANCE_MIN 2
9274
9275static inline long adjust_numa_imbalance(int imbalance,
9276 int dst_running, int dst_weight)
9246 int dst_running, int imb_numa_nr)
9277{
9247{
9278 if (!allow_numa_imbalance(dst_running, dst_weight))
9248 if (!allow_numa_imbalance(dst_running, imb_numa_nr))
9279 return imbalance;
9280
9281 /*
9282 * Allow a small imbalance based on a simple pair of communicating
9283 * tasks that remain local when the destination is lightly loaded.
9284 */
9285 if (imbalance <= NUMA_IMBALANCE_MIN)
9286 return 0;

--- 95 unchanged lines hidden (view full) ---

9382 env->migration_type = migrate_task;
9383 env->imbalance = max_t(long, 0, (local->idle_cpus -
9384 busiest->idle_cpus) >> 1);
9385 }
9386
9387 /* Consider allowing a small imbalance between NUMA groups */
9388 if (env->sd->flags & SD_NUMA) {
9389 env->imbalance = adjust_numa_imbalance(env->imbalance,
9390 busiest->sum_nr_running, busiest->group_weight);
9360 local->sum_nr_running + 1, env->sd->imb_numa_nr);
9391 }
9392
9393 return;
9394 }
9395
9396 /*
9397 * Local is fully busy but has to take more load to relieve the
9398 * busiest group

--- 54 unchanged lines hidden (view full) ---

9453 * avg_load : Only if imbalance is significant enough.
9454 * nr_idle : dst_cpu is not busy and the number of idle CPUs is quite
9455 * different in groups.
9456 */
9457
9458/**
9459 * find_busiest_group - Returns the busiest group within the sched_domain
9460 * if there is an imbalance.
9461 * @env: The load balancing environment.
9462 *
9463 * Also calculates the amount of runnable load which should be moved
9464 * to restore balance.
9465 *
9431 *
9432 * Also calculates the amount of runnable load which should be moved
9433 * to restore balance.
9434 *
9435 * @env: The load balancing environment.
9436 *
9466 * Return: - The busiest group if imbalance exists.
9467 */
9468static struct sched_group *find_busiest_group(struct lb_env *env)
9469{
9470 struct sg_lb_stats *local, *busiest;
9471 struct sd_lb_stats sds;
9472
9473 init_sd_lb_stats(&sds);

--- 2454 unchanged lines hidden ---