// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * ip_vs_est.c: simple rate estimator for IPVS
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *
 * Changes:     Hans Schillstrom <hans.schillstrom@ericsson.com>
 *              Network name space (netns) aware.
 *              Global data moved to netns, i.e. struct netns_ipvs.
 *              Affected data: est_list and est_lock.
 *              estimation_timer() runs with a timer per netns.
 *              get_stats() does the per-CPU summing.
 */

#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/types.h>
#include <linux/interrupt.h>
#include <linux/sysctl.h>
#include <linux/list.h>

#include <net/ip_vs.h>

/*
  This code estimates the rate over a short interval (8 seconds) for
  virtual services and real servers. To measure the rate over a long
  interval, it is easy to implement a user-level daemon that
  periodically reads the statistical counters and computes the rate.

  The rate over the last 8 seconds is measured every 2 seconds:

    avgrate = avgrate*(1-W) + rate*W

    where W = 2^(-2)

  NOTES.

  * Average bps is scaled by 2^5, while average pps and cps are scaled by 2^10.

  * Netlink users can see 64-bit values but sockopt users are restricted
    to 32-bit values for conns, packets, bps, cps and pps.

  * A lot of code is taken from net/core/gen_estimator.c

  KEY POINTS:
  - cpustats counters are updated per-CPU in SoftIRQ context with BH disabled
  - kthreads read the cpustats to update the estimators (svcs, dests, total)
  - the states of estimators can be read (get stats) or modified (zero stats)
    from processes

  KTHREADS:
  - estimators are added initially to est_temp_list and later kthread 0
    distributes them to one or many kthreads for estimation
  - kthread contexts are created and attached to an array
  - the kthread tasks are started when the first service is added, before
    that the total stats are not estimated
  - when the configuration (cpulist/nice) is changed, the tasks are
    restarted by work (est_reload_work)
  - kthread tasks are stopped while the cpulist is empty
  - the kthread context holds lists with estimators (chains) which are
    processed every 2 seconds
  - as estimators can be added dynamically and in bursts, we try to spread
    them to multiple chains which are estimated at different times
  - on start, kthread 0 enters a calculation phase to determine the chain
    limits and the limit of estimators per kthread
  - est_add_ktid: ktid where to add new ests, can point to an empty slot
    where we should add kt data
 */
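
/* For illustration (hypothetical numbers, not part of the algorithm):
 * with W = 1/4 the moving average is kept in fixed point and updated as
 *
 *	avg += (rate - avg) >> 2;
 *
 * Rates stay scaled (2^10 for cps/pps, 2^5 for bps) and each delta covers
 * a 2-second window, so the per-second rate is delta/2 and the combined
 * scale-and-divide becomes a single shift (<< 9 and << 4 below). E.g. if
 * conns grew by 200 during the last 2 seconds, the instantaneous rate is
 * 100 cps and its scaled form is 200 << 9 = 102400 = 100 * 2^10; e->cps
 * then moves a quarter of the way toward that value.
 */
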
static struct lock_class_key __ipvs_est_key;

static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs);
static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs);

static void ip_vs_chain_estimation(struct hlist_head *chain)
{
	struct ip_vs_estimator *e;
	struct ip_vs_cpu_stats *c;
	struct ip_vs_stats *s;
	u64 rate;

	hlist_for_each_entry_rcu(e, chain, list) {
		u64 conns, inpkts, outpkts, inbytes, outbytes;
		u64 kconns = 0, kinpkts = 0, koutpkts = 0;
		u64 kinbytes = 0, koutbytes = 0;
		unsigned int start;
		int i;

		if (kthread_should_stop())
			break;

		s = container_of(e, struct ip_vs_stats, est);
		for_each_possible_cpu(i) {
			c = per_cpu_ptr(s->cpustats, i);
			do {
				start = u64_stats_fetch_begin(&c->syncp);
				conns = u64_stats_read(&c->cnt.conns);
				inpkts = u64_stats_read(&c->cnt.inpkts);
				outpkts = u64_stats_read(&c->cnt.outpkts);
				inbytes = u64_stats_read(&c->cnt.inbytes);
				outbytes = u64_stats_read(&c->cnt.outbytes);
			} while (u64_stats_fetch_retry(&c->syncp, start));
			kconns += conns;
			kinpkts += inpkts;
			koutpkts += outpkts;
			kinbytes += inbytes;
			koutbytes += outbytes;
		}

		spin_lock(&s->lock);

		s->kstats.conns = kconns;
		s->kstats.inpkts = kinpkts;
		s->kstats.outpkts = koutpkts;
		s->kstats.inbytes = kinbytes;
		s->kstats.outbytes = koutbytes;

		/* scaled by 2^10, but divided by 2 seconds */
		rate = (s->kstats.conns - e->last_conns) << 9;
		e->last_conns = s->kstats.conns;
		e->cps += ((s64)rate - (s64)e->cps) >> 2;

		rate = (s->kstats.inpkts - e->last_inpkts) << 9;
		e->last_inpkts = s->kstats.inpkts;
		e->inpps += ((s64)rate - (s64)e->inpps) >> 2;

		rate = (s->kstats.outpkts - e->last_outpkts) << 9;
		e->last_outpkts = s->kstats.outpkts;
		e->outpps += ((s64)rate - (s64)e->outpps) >> 2;

		/* scaled by 2^5, but divided by 2 seconds */
		rate = (s->kstats.inbytes - e->last_inbytes) << 4;
		e->last_inbytes = s->kstats.inbytes;
		e->inbps += ((s64)rate - (s64)e->inbps) >> 2;

		rate = (s->kstats.outbytes - e->last_outbytes) << 4;
		e->last_outbytes = s->kstats.outbytes;
		e->outbps += ((s64)rate - (s64)e->outbps) >> 2;
		spin_unlock(&s->lock);
	}
}

static void ip_vs_tick_estimation(struct ip_vs_est_kt_data *kd, int row)
{
	struct ip_vs_est_tick_data *td;
	int cid;

	rcu_read_lock();
	td = rcu_dereference(kd->ticks[row]);
	if (!td)
		goto out;
	for_each_set_bit(cid, td->present, IPVS_EST_TICK_CHAINS) {
		if (kthread_should_stop())
			break;
		ip_vs_chain_estimation(&td->chains[cid]);
		cond_resched_rcu();
		td = rcu_dereference(kd->ticks[row]);
		if (!td)
			break;
	}

out:
	rcu_read_unlock();
}

static int ip_vs_estimation_kthread(void *data)
{
	struct ip_vs_est_kt_data *kd = data;
	struct netns_ipvs *ipvs = kd->ipvs;
	int row = kd->est_row;
	unsigned long now;
	int id = kd->id;
	long gap;

	if (id > 0) {
		if (!ipvs->est_chain_max)
			return 0;
	} else {
		if (!ipvs->est_chain_max) {
			ipvs->est_calc_phase = 1;
			/* commit est_calc_phase before reading est_genid */
			smp_mb();
		}

		/* kthread 0 will handle the calc phase */
		if (ipvs->est_calc_phase)
			ip_vs_est_calc_phase(ipvs);
	}

	while (1) {
		if (!id && !hlist_empty(&ipvs->est_temp_list))
			ip_vs_est_drain_temp_list(ipvs);
		set_current_state(TASK_IDLE);
		if (kthread_should_stop())
			break;

		/* before estimation, check if we should sleep */
		now = jiffies;
		gap = kd->est_timer - now;
		if (gap > 0) {
			if (gap > IPVS_EST_TICK) {
				kd->est_timer = now + IPVS_EST_TICK;
				gap = IPVS_EST_TICK;
			}
			schedule_timeout(gap);
		} else {
			__set_current_state(TASK_RUNNING);
			if (gap < -8 * IPVS_EST_TICK)
				kd->est_timer = now;
		}

		if (kd->tick_len[row])
			ip_vs_tick_estimation(kd, row);

		row++;
		if (row >= IPVS_EST_NTICKS)
			row = 0;
		WRITE_ONCE(kd->est_row, row);
		kd->est_timer += IPVS_EST_TICK;
	}
	__set_current_state(TASK_RUNNING);

	return 0;
}
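
/* A sketch of the timing above, assuming the usual definitions in ip_vs.h
 * (IPVS_EST_NTICKS = 50 rows per 2-second period, so one tick is
 * IPVS_EST_TICK = 2*HZ/50 jiffies, about 40ms): each loop iteration
 * estimates one row, then advances est_timer by exactly one tick, so a
 * full pass over all rows takes 2 seconds and every estimator in a row
 * is updated once per period, matching W = 2^(-2) above. If the task
 * falls behind by more than 8 ticks, est_timer is resynced to "now"
 * instead of trying to catch up.
 */
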
/* Schedule stop/start for kthread tasks */
void ip_vs_est_reload_start(struct netns_ipvs *ipvs)
{
	/* Ignore reloads before first service is added */
	if (!ipvs->enable)
		return;
	ip_vs_est_stopped_recalc(ipvs);
	/* Bump the kthread configuration genid */
	atomic_inc(&ipvs->est_genid);
	queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 0);
}

/* Start kthread task with current configuration */
int ip_vs_est_kthread_start(struct netns_ipvs *ipvs,
			    struct ip_vs_est_kt_data *kd)
{
	unsigned long now;
	int ret = 0;
	long gap;

	lockdep_assert_held(&ipvs->est_mutex);

	if (kd->task)
		goto out;
	now = jiffies;
	gap = kd->est_timer - now;
	/* Sync est_timer if task is starting later */
	if (abs(gap) > 4 * IPVS_EST_TICK)
		kd->est_timer = now;
	kd->task = kthread_create(ip_vs_estimation_kthread, kd, "ipvs-e:%d:%d",
				  ipvs->gen, kd->id);
	if (IS_ERR(kd->task)) {
		ret = PTR_ERR(kd->task);
		kd->task = NULL;
		goto out;
	}

	set_user_nice(kd->task, sysctl_est_nice(ipvs));
	set_cpus_allowed_ptr(kd->task, sysctl_est_cpulist(ipvs));

	pr_info("starting estimator thread %d...\n", kd->id);
	wake_up_process(kd->task);

out:
	return ret;
}

void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd)
{
	if (kd->task) {
		pr_info("stopping estimator thread %d...\n", kd->id);
		kthread_stop(kd->task);
		kd->task = NULL;
	}
}

/* Apply parameters to kthread */
static void ip_vs_est_set_params(struct netns_ipvs *ipvs,
				 struct ip_vs_est_kt_data *kd)
{
	kd->chain_max = ipvs->est_chain_max;
	/* We are using a single chain on RCU preemption */
	if (IPVS_EST_TICK_CHAINS == 1)
		kd->chain_max *= IPVS_EST_CHAIN_FACTOR;
	kd->tick_max = IPVS_EST_TICK_CHAINS * kd->chain_max;
	kd->est_max_count = IPVS_EST_NTICKS * kd->tick_max;
}
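
/* A worked example with hypothetical values: if the calc phase settled on
 * est_chain_max = 20 and the build uses multiple chains per tick (say
 * IPVS_EST_TICK_CHAINS = 48, the non-preemptible RCU case), then
 * tick_max = 48 * 20 = 960 estimators per tick and est_max_count =
 * 50 * 960 = 48000 estimators per kthread; only above that does
 * est_add_ktid move on to another kthread slot.
 */
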
/* Create and start estimation kthread in a free or new array slot */
static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs)
{
	struct ip_vs_est_kt_data *kd = NULL;
	int id = ipvs->est_kt_count;
	int ret = -ENOMEM;
	void *arr = NULL;
	int i;

	if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads &&
	    ipvs->enable && ipvs->est_max_threads)
		return -EINVAL;

	mutex_lock(&ipvs->est_mutex);

	for (i = 0; i < id; i++) {
		if (!ipvs->est_kt_arr[i])
			break;
	}
	if (i >= id) {
		arr = krealloc_array(ipvs->est_kt_arr, id + 1,
				     sizeof(struct ip_vs_est_kt_data *),
				     GFP_KERNEL);
		if (!arr)
			goto out;
		ipvs->est_kt_arr = arr;
	} else {
		id = i;
	}

	kd = kzalloc(sizeof(*kd), GFP_KERNEL);
	if (!kd)
		goto out;
	kd->ipvs = ipvs;
	bitmap_fill(kd->avail, IPVS_EST_NTICKS);
	kd->est_timer = jiffies;
	kd->id = id;
	ip_vs_est_set_params(ipvs, kd);

	/* Pre-allocate stats used in calc phase */
	if (!id && !kd->calc_stats) {
		kd->calc_stats = ip_vs_stats_alloc();
		if (!kd->calc_stats)
			goto out;
	}

	/* Start kthread tasks only when services are present */
	if (ipvs->enable && !ip_vs_est_stopped(ipvs)) {
		ret = ip_vs_est_kthread_start(ipvs, kd);
		if (ret < 0)
			goto out;
	}

	if (arr)
		ipvs->est_kt_count++;
	ipvs->est_kt_arr[id] = kd;
	kd = NULL;
	/* Use most recent kthread for new ests */
	ipvs->est_add_ktid = id;
	ret = 0;

out:
	mutex_unlock(&ipvs->est_mutex);
	if (kd) {
		ip_vs_stats_free(kd->calc_stats);
		kfree(kd);
	}

	return ret;
}

/* Select ktid where to add new ests: available, unused or new slot */
static void ip_vs_est_update_ktid(struct netns_ipvs *ipvs)
{
	int ktid, best = ipvs->est_kt_count;
	struct ip_vs_est_kt_data *kd;

	for (ktid = 0; ktid < ipvs->est_kt_count; ktid++) {
		kd = ipvs->est_kt_arr[ktid];
		if (kd) {
			if (kd->est_count < kd->est_max_count) {
				best = ktid;
				break;
			}
		} else if (ktid < best) {
			best = ktid;
		}
	}
	ipvs->est_add_ktid = best;
}

/* Add estimator to current kthread (est_add_ktid) */
static int ip_vs_enqueue_estimator(struct netns_ipvs *ipvs,
				   struct ip_vs_estimator *est)
{
	struct ip_vs_est_kt_data *kd = NULL;
	struct ip_vs_est_tick_data *td;
	int ktid, row, crow, cid, ret;
	int delay = est->ktrow;

	BUILD_BUG_ON_MSG(IPVS_EST_TICK_CHAINS > 127,
			 "Too many chains for ktcid");

	if (ipvs->est_add_ktid < ipvs->est_kt_count) {
		kd = ipvs->est_kt_arr[ipvs->est_add_ktid];
		if (kd)
			goto add_est;
	}

	ret = ip_vs_est_add_kthread(ipvs);
	if (ret < 0)
		goto out;
	kd = ipvs->est_kt_arr[ipvs->est_add_ktid];

add_est:
	ktid = kd->id;
	/* For a small number of estimators prefer to use few ticks,
	 * otherwise try to add into the last estimated row.
	 * est_row and add_row point after the row we should use.
	 */
	if (kd->est_count >= 2 * kd->tick_max || delay < IPVS_EST_NTICKS - 1)
		crow = READ_ONCE(kd->est_row);
	else
		crow = kd->add_row;
	crow += delay;
	if (crow >= IPVS_EST_NTICKS)
		crow -= IPVS_EST_NTICKS;
	/* Assume initial delay ? */
	if (delay >= IPVS_EST_NTICKS - 1) {
		/* Preserve initial delay or decrease it if no space in tick */
		row = crow;
		if (crow < IPVS_EST_NTICKS - 1) {
			crow++;
			row = find_last_bit(kd->avail, crow);
		}
		if (row >= crow)
			row = find_last_bit(kd->avail, IPVS_EST_NTICKS);
	} else {
		/* Preserve delay or increase it if no space in tick */
		row = IPVS_EST_NTICKS;
		if (crow > 0)
			row = find_next_bit(kd->avail, IPVS_EST_NTICKS, crow);
		if (row >= IPVS_EST_NTICKS)
			row = find_first_bit(kd->avail, IPVS_EST_NTICKS);
	}

	td = rcu_dereference_protected(kd->ticks[row], 1);
	if (!td) {
		td = kzalloc(sizeof(*td), GFP_KERNEL);
		if (!td) {
			ret = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(kd->ticks[row], td);
	}

	cid = find_first_zero_bit(td->full, IPVS_EST_TICK_CHAINS);

	kd->est_count++;
	kd->tick_len[row]++;
	if (!td->chain_len[cid])
		__set_bit(cid, td->present);
	td->chain_len[cid]++;
	est->ktid = ktid;
	est->ktrow = row;
	est->ktcid = cid;
	hlist_add_head_rcu(&est->list, &td->chains[cid]);

	if (td->chain_len[cid] >= kd->chain_max) {
		__set_bit(cid, td->full);
		if (kd->tick_len[row] >= kd->tick_max)
			__clear_bit(row, kd->avail);
	}

	/* Update est_add_ktid to point to first available/empty kt slot */
	if (kd->est_count == kd->est_max_count)
		ip_vs_est_update_ktid(ipvs);

	ret = 0;

out:
	return ret;
}
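
/* To illustrate the row selection above with hypothetical numbers: a new
 * estimator arrives with the initial delay (delay = IPVS_EST_NTICKS - 1).
 * If est_row is 10, crow becomes (10 + 49) % 50 = 9, one position before
 * est_row, i.e. the most recently estimated row, which preserves the full
 * 2-second delay before its first update. If that row is full (cleared in
 * kd->avail), find_last_bit() walks backwards to the nearest available
 * row at or below it, trading a slightly shorter delay for a balanced
 * tick load.
 */
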
/* Start estimation for stats */
int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
{
	struct ip_vs_estimator *est = &stats->est;
	int ret;

	if (!ipvs->est_max_threads && ipvs->enable)
		ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);

	est->ktid = -1;
	est->ktrow = IPVS_EST_NTICKS - 1;	/* Initial delay */

	/* We prefer this code to be short, kthread 0 will requeue the
	 * estimator to an available chain. If tasks are disabled, we
	 * will not allocate much memory, just for kt 0.
	 */
	ret = 0;
	if (!ipvs->est_kt_count || !ipvs->est_kt_arr[0])
		ret = ip_vs_est_add_kthread(ipvs);
	if (ret >= 0)
		hlist_add_head(&est->list, &ipvs->est_temp_list);
	else
		INIT_HLIST_NODE(&est->list);
	return ret;
}

static void ip_vs_est_kthread_destroy(struct ip_vs_est_kt_data *kd)
{
	if (kd) {
		if (kd->task) {
			pr_info("stop unused estimator thread %d...\n",
				kd->id);
			kthread_stop(kd->task);
		}
		ip_vs_stats_free(kd->calc_stats);
		kfree(kd);
	}
}

/* Unlink estimator from chain */
void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
{
	struct ip_vs_estimator *est = &stats->est;
	struct ip_vs_est_tick_data *td;
	struct ip_vs_est_kt_data *kd;
	int ktid = est->ktid;
	int row = est->ktrow;
	int cid = est->ktcid;

	/* Failed to add to chain ? */
	if (hlist_unhashed(&est->list))
		return;

	/* On return, estimator can be freed, dequeue it now */

	/* In est_temp_list ? */
	if (ktid < 0) {
		hlist_del(&est->list);
		goto end_kt0;
	}

	hlist_del_rcu(&est->list);
	kd = ipvs->est_kt_arr[ktid];
	td = rcu_dereference_protected(kd->ticks[row], 1);
	__clear_bit(cid, td->full);
	td->chain_len[cid]--;
	if (!td->chain_len[cid])
		__clear_bit(cid, td->present);
	kd->tick_len[row]--;
	__set_bit(row, kd->avail);
	if (!kd->tick_len[row]) {
		RCU_INIT_POINTER(kd->ticks[row], NULL);
		kfree_rcu(td);
	}
	kd->est_count--;
	if (kd->est_count) {
		/* This kt slot can become available just now, prefer it */
		if (ktid < ipvs->est_add_ktid)
			ipvs->est_add_ktid = ktid;
		return;
	}

	if (ktid > 0) {
		mutex_lock(&ipvs->est_mutex);
		ip_vs_est_kthread_destroy(kd);
		ipvs->est_kt_arr[ktid] = NULL;
		if (ktid == ipvs->est_kt_count - 1) {
			ipvs->est_kt_count--;
			while (ipvs->est_kt_count > 1 &&
			       !ipvs->est_kt_arr[ipvs->est_kt_count - 1])
				ipvs->est_kt_count--;
		}
		mutex_unlock(&ipvs->est_mutex);

		/* This slot is now empty, prefer another available kt slot */
		if (ktid == ipvs->est_add_ktid)
			ip_vs_est_update_ktid(ipvs);
	}

end_kt0:
	/* kt 0 is freed after all other kthreads and chains are empty */
	if (ipvs->est_kt_count == 1 && hlist_empty(&ipvs->est_temp_list)) {
		kd = ipvs->est_kt_arr[0];
		if (!kd || !kd->est_count) {
			mutex_lock(&ipvs->est_mutex);
			if (kd) {
				ip_vs_est_kthread_destroy(kd);
				ipvs->est_kt_arr[0] = NULL;
			}
			ipvs->est_kt_count--;
			mutex_unlock(&ipvs->est_mutex);
			ipvs->est_add_ktid = 0;
		}
	}
}

/* Register all ests from est_temp_list to kthreads */
static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs)
{
	struct ip_vs_estimator *est;

	while (1) {
		int max = 16;

		mutex_lock(&__ip_vs_mutex);

		while (max-- > 0) {
			est = hlist_entry_safe(ipvs->est_temp_list.first,
					       struct ip_vs_estimator, list);
			if (est) {
				if (kthread_should_stop())
					goto unlock;
				hlist_del_init(&est->list);
				if (ip_vs_enqueue_estimator(ipvs, est) >= 0)
					continue;
				est->ktid = -1;
				hlist_add_head(&est->list,
					       &ipvs->est_temp_list);
				/* Abort, some entries will not be estimated
				 * until next attempt
				 */
			}
			goto unlock;
		}
		mutex_unlock(&__ip_vs_mutex);
		cond_resched();
	}

unlock:
	mutex_unlock(&__ip_vs_mutex);
}
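
/* The drain loop above deliberately moves at most 16 estimators per
 * acquisition of __ip_vs_mutex and then drops the lock and calls
 * cond_resched(). With, say, 100000 queued estimators this bounds the
 * time configuration requests have to wait for the mutex, at the price
 * of more lock round-trips for kthread 0 (the numbers here are only an
 * illustration of the trade-off).
 */
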
/* Calculate limits for all kthreads */
static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max)
{
	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
	struct ip_vs_est_kt_data *kd;
	struct hlist_head chain;
	struct ip_vs_stats *s;
	int cache_factor = 4;
	int i, loops, ntest;
	s32 min_est = 0;
	ktime_t t1, t2;
	int max = 8;
	int ret = 1;
	s64 diff;
	u64 val;

	INIT_HLIST_HEAD(&chain);
	mutex_lock(&__ip_vs_mutex);
	kd = ipvs->est_kt_arr[0];
	mutex_unlock(&__ip_vs_mutex);
	s = kd ? kd->calc_stats : NULL;
	if (!s)
		goto out;
	hlist_add_head(&s->est.list, &chain);

	loops = 1;
	/* Get best result from many tests */
	for (ntest = 0; ntest < 12; ntest++) {
		if (!(ntest & 3)) {
			/* Wait for cpufreq frequency transition */
			wait_event_idle_timeout(wq, kthread_should_stop(),
						HZ / 50);
			if (!ipvs->enable || kthread_should_stop())
				goto stop;
		}

		local_bh_disable();
		rcu_read_lock();

		/* Put stats in cache */
		ip_vs_chain_estimation(&chain);

		t1 = ktime_get();
		for (i = loops * cache_factor; i > 0; i--)
			ip_vs_chain_estimation(&chain);
		t2 = ktime_get();

		rcu_read_unlock();
		local_bh_enable();

		if (!ipvs->enable || kthread_should_stop())
			goto stop;
		cond_resched();

		diff = ktime_to_ns(ktime_sub(t2, t1));
		if (diff <= 1 * NSEC_PER_USEC) {
			/* Do more loops on low time resolution */
			loops *= 2;
			continue;
		}
		if (diff >= NSEC_PER_SEC)
			continue;
		val = diff;
		do_div(val, loops);
		if (!min_est || val < min_est) {
			min_est = val;
			/* goal: 95usec per chain */
			val = 95 * NSEC_PER_USEC;
			if (val >= min_est) {
				do_div(val, min_est);
				max = (int)val;
			} else {
				max = 1;
			}
		}
	}

out:
	if (s)
		hlist_del_init(&s->est.list);
	*chain_max = max;
	return ret;

stop:
	ret = 0;
	goto out;
}
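
/* A hypothetical run of the benchmark above: suppose the best (fastest)
 * timed loop costs min_est = 2000ns. The 95usec target then gives
 * chain_max = 95000 / 2000 = 47, so chains are capped at 47 estimators
 * each, and the tick and kthread limits follow from
 * ip_vs_est_set_params(). The repeated tests and the idle waits exist to
 * dodge cpufreq transitions and cache-cold outliers, keeping only the
 * minimum measurement.
 */
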
/* Calculate the parameters and apply them in context of kt #0
 * ECP: est_calc_phase
 * ECM: est_chain_max
 *	ECP	ECM	Insert Chain	enable	Description
 * ---------------------------------------------------------------------------
 *	0	0	est_temp_list	0	create kt #0 context
 *	0	0	est_temp_list	0->1	service added, start kthread #0
 *						task
 *	0->1	0	est_temp_list	1	kt task #0 started, enters calc
 *						phase
 *	1	0	est_temp_list	1	kt #0: determine est_chain_max,
 *						stop tasks, move ests to
 *						est_temp_list and free kd for
 *						kthreads 1..last
 *	1->0	0->N	kt chains	1	ests can go to kthreads
 *	0	N	kt chains	1	drain est_temp_list, create new
 *						kthread contexts, start tasks,
 *						estimate
 */
static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs)
{
	int genid = atomic_read(&ipvs->est_genid);
	struct ip_vs_est_tick_data *td;
	struct ip_vs_est_kt_data *kd;
	struct ip_vs_estimator *est;
	struct ip_vs_stats *stats;
	int id, row, cid, delay;
	bool last, last_td;
	int chain_max;
	int step;

	if (!ip_vs_est_calc_limits(ipvs, &chain_max))
		return;

	mutex_lock(&__ip_vs_mutex);

	/* Stop all other tasks, so that we can immediately move the
	 * estimators to est_temp_list without an RCU grace period
	 */
	mutex_lock(&ipvs->est_mutex);
	for (id = 1; id < ipvs->est_kt_count; id++) {
		/* netns cleanup started, abort */
		if (!ipvs->enable)
			goto unlock2;
		kd = ipvs->est_kt_arr[id];
		if (!kd)
			continue;
		ip_vs_est_kthread_stop(kd);
	}
	mutex_unlock(&ipvs->est_mutex);

	/* Move all estimators to est_temp_list, but carefully:
	 * all estimators and kthread data can be released while
	 * we reschedule. Even for kthread 0.
	 */
	step = 0;

	/* Order entries in est_temp_list in ascending delay, so now
	 * walk delay(desc), id(desc), cid(asc)
	 */
	delay = IPVS_EST_NTICKS;

next_delay:
	delay--;
	if (delay < 0)
		goto end_dequeue;

last_kt:
	/* Destroy contexts backwards */
	id = ipvs->est_kt_count;

next_kt:
	if (!ipvs->enable || kthread_should_stop())
		goto unlock;
	id--;
	if (id < 0)
		goto next_delay;
	kd = ipvs->est_kt_arr[id];
	if (!kd)
		goto next_kt;
	/* kt 0 can exist with empty chains */
	if (!id && kd->est_count <= 1)
		goto next_delay;

	row = kd->est_row + delay;
	if (row >= IPVS_EST_NTICKS)
		row -= IPVS_EST_NTICKS;
	td = rcu_dereference_protected(kd->ticks[row], 1);
	if (!td)
		goto next_kt;

	cid = 0;

walk_chain:
	if (kthread_should_stop())
		goto unlock;
	step++;
	if (!(step & 63)) {
		/* Give the estimators a chance to be added (to
		 * est_temp_list) and deleted (releasing kthread contexts)
		 */
		mutex_unlock(&__ip_vs_mutex);
		cond_resched();
		mutex_lock(&__ip_vs_mutex);

		/* Current kt released ? */
		if (id >= ipvs->est_kt_count)
			goto last_kt;
		if (kd != ipvs->est_kt_arr[id])
			goto next_kt;
		/* Current td released ? */
		if (td != rcu_dereference_protected(kd->ticks[row], 1))
			goto next_kt;
		/* No fatal changes on the current kd and td */
	}
	est = hlist_entry_safe(td->chains[cid].first, struct ip_vs_estimator,
			       list);
	if (!est) {
		cid++;
		if (cid >= IPVS_EST_TICK_CHAINS)
			goto next_kt;
		goto walk_chain;
	}
	/* We can cheat and increase est_count to protect kt 0 context
	 * from release but we prefer to keep the last estimator
	 */
	last = kd->est_count <= 1;
	/* Do not free kt #0 data */
	if (!id && last)
		goto next_delay;
	last_td = kd->tick_len[row] <= 1;
	stats = container_of(est, struct ip_vs_stats, est);
	ip_vs_stop_estimator(ipvs, stats);
	/* Tasks are stopped, move without RCU grace period */
	est->ktid = -1;
	est->ktrow = row - kd->est_row;
	if (est->ktrow < 0)
		est->ktrow += IPVS_EST_NTICKS;
	hlist_add_head(&est->list, &ipvs->est_temp_list);
	/* kd freed ? */
	if (last)
		goto next_kt;
	/* td freed ? */
	if (last_td)
		goto next_kt;
	goto walk_chain;
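
	/* Example of the resulting order (hypothetical contents): walking
	 * delay 49..0 and adding at the list head means an estimator moved
	 * with delay 0 ends up first in est_temp_list and one with delay 49
	 * ends up last, i.e. the list is sorted by ascending delay. When
	 * kthread 0 later drains the list front-to-back, the entries that
	 * are due to be estimated soonest are re-enqueued first and keep
	 * their relative position in the 2-second period.
	 */
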
end_dequeue:
	/* All estimators removed while calculating ? */
	if (!ipvs->est_kt_count)
		goto unlock;
	kd = ipvs->est_kt_arr[0];
	if (!kd)
		goto unlock;
	kd->add_row = kd->est_row;
	ipvs->est_chain_max = chain_max;
	ip_vs_est_set_params(ipvs, kd);

	pr_info("using max %d ests per chain, %d per kthread\n",
		kd->chain_max, kd->est_max_count);

	/* Try to keep tot_stats in kt0, enqueue it early */
	if (ipvs->tot_stats && !hlist_unhashed(&ipvs->tot_stats->s.est.list) &&
	    ipvs->tot_stats->s.est.ktid == -1) {
		hlist_del(&ipvs->tot_stats->s.est.list);
		hlist_add_head(&ipvs->tot_stats->s.est.list,
			       &ipvs->est_temp_list);
	}

	mutex_lock(&ipvs->est_mutex);

	/* We completed the calc phase, new calc phase not requested */
	if (genid == atomic_read(&ipvs->est_genid))
		ipvs->est_calc_phase = 0;

unlock2:
	mutex_unlock(&ipvs->est_mutex);

unlock:
	mutex_unlock(&__ip_vs_mutex);
}

void ip_vs_zero_estimator(struct ip_vs_stats *stats)
{
	struct ip_vs_estimator *est = &stats->est;
	struct ip_vs_kstats *k = &stats->kstats;

	/* reset counters, caller must hold stats->lock */
	est->last_inbytes = k->inbytes;
	est->last_outbytes = k->outbytes;
	est->last_conns = k->conns;
	est->last_inpkts = k->inpkts;
	est->last_outpkts = k->outpkts;
	est->cps = 0;
	est->inpps = 0;
	est->outpps = 0;
	est->inbps = 0;
	est->outbps = 0;
}

/* Get decoded rates */
void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats)
{
	struct ip_vs_estimator *e = &stats->est;

	/* Round to nearest while dropping the scaling factor */
	dst->cps = (e->cps + 0x1FF) >> 10;
	dst->inpps = (e->inpps + 0x1FF) >> 10;
	dst->outpps = (e->outpps + 0x1FF) >> 10;
	dst->inbps = (e->inbps + 0xF) >> 5;
	dst->outbps = (e->outbps + 0xF) >> 5;
}

int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs)
{
	INIT_HLIST_HEAD(&ipvs->est_temp_list);
	ipvs->est_kt_arr = NULL;
	ipvs->est_max_threads = 0;
	ipvs->est_calc_phase = 0;
	ipvs->est_chain_max = 0;
	ipvs->est_kt_count = 0;
	ipvs->est_add_ktid = 0;
	atomic_set(&ipvs->est_genid, 0);
	atomic_set(&ipvs->est_genid_done, 0);
	__mutex_init(&ipvs->est_mutex, "ipvs->est_mutex", &__ipvs_est_key);
	return 0;
}

void __net_exit ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs)
{
	int i;

	for (i = 0; i < ipvs->est_kt_count; i++)
		ip_vs_est_kthread_destroy(ipvs->est_kt_arr[i]);
	kfree(ipvs->est_kt_arr);
	mutex_destroy(&ipvs->est_mutex);
}