/*
 *  linux/kernel/acct.c
 *
 *  BSD Process Accounting for Linux
 *
 *  Author: Marco van Wieringen <mvw@planets.elm.net>
 *
 *  Some code based on ideas and code from:
 *  Thomas K. Dyas <tdyas@eden.rutgers.edu>
 *
 *  This file implements BSD-style process accounting. Whenever any
 *  process exits, an accounting record of type "struct acct" is
 *  written to the file specified with the acct() system call. It is
 *  up to user-level programs to do useful things with the accounting
 *  log. The kernel just provides the raw accounting information.
 *
 * (C) Copyright 1995 - 1997 Marco van Wieringen - ELM Consultancy B.V.
 *
 *  Plugged two leaks. 1) It didn't return acct_file into the free_filps if
 *  the file happened to be read-only. 2) If the accounting was suspended
 *  due to the lack of space it happily allowed to reopen it and completely
 *  lost the old acct_file. 3/10/98, Al Viro.
 *
 *  Now we silently close acct_file on attempt to reopen. Cleaned sys_acct().
 *  XTerms and EMACS are manifestations of pure evil. 21/10/98, AV.
 *
 *  Fixed a nasty interaction with sys_umount(). If the accounting
 *  was suspended we failed to stop it on umount(). Messy.
 *  Another one: remount to readonly didn't stop accounting.
 *	Question: what should we do if we have CAP_SYS_ADMIN but not
 *  CAP_SYS_PACCT? Current code does the following: umount returns -EBUSY
 *  unless we are messing with the root. In that case we are getting a
 *  real mess with do_remount_sb(). 9/11/98, AV.
 *
 *  Fixed a bunch of races (and pair of leaks). Probably not the best way,
 *  but this one obviously doesn't introduce deadlocks. Later. BTW, found
 *  one race (and leak) in BSD implementation.
 *  OK, that's better. ANOTHER race and leak in BSD variant. There always
 *  is one more bug... 10/11/98, AV.
 *
 *	Oh, fsck... Oopsable SMP race in do_process_acct() - we must hold
 * ->mmap_sem to walk the vma list of current->mm.
Nasty, since it leaks 43 * a struct file opened for write. Fixed. 2/6/2000, AV. 44 */ 45 46 #include <linux/config.h> 47 #include <linux/mm.h> 48 #include <linux/slab.h> 49 #include <linux/acct.h> 50 #include <linux/file.h> 51 #include <linux/tty.h> 52 #include <linux/security.h> 53 #include <linux/vfs.h> 54 #include <linux/jiffies.h> 55 #include <linux/times.h> 56 #include <linux/syscalls.h> 57 #include <asm/uaccess.h> 58 #include <asm/div64.h> 59 #include <linux/blkdev.h> /* sector_div */ 60 61 /* 62 * These constants control the amount of freespace that suspend and 63 * resume the process accounting system, and the time delay between 64 * each check. 65 * Turned into sysctl-controllable parameters. AV, 12/11/98 66 */ 67 68 int acct_parm[3] = {4, 2, 30}; 69 #define RESUME (acct_parm[0]) /* >foo% free space - resume */ 70 #define SUSPEND (acct_parm[1]) /* <foo% free space - suspend */ 71 #define ACCT_TIMEOUT (acct_parm[2]) /* foo second timeout between checks */ 72 73 /* 74 * External references and all of the globals. 75 */ 76 static void do_acct_process(long, struct file *); 77 78 /* 79 * This structure is used so that all the data protected by lock 80 * can be placed in the same cache line as the lock. This primes 81 * the cache line to have the data after getting the lock. 82 */ 83 struct acct_glbs { 84 spinlock_t lock; 85 volatile int active; 86 volatile int needcheck; 87 struct file *file; 88 struct timer_list timer; 89 }; 90 91 static struct acct_glbs acct_globals __cacheline_aligned = {SPIN_LOCK_UNLOCKED}; 92 93 /* 94 * Called whenever the timer says to check the free space. 95 */ 96 static void acct_timeout(unsigned long unused) 97 { 98 acct_globals.needcheck = 1; 99 } 100 101 /* 102 * Check the amount of free space and suspend/resume accordingly. 
103 */ 104 static int check_free_space(struct file *file) 105 { 106 struct kstatfs sbuf; 107 int res; 108 int act; 109 sector_t resume; 110 sector_t suspend; 111 112 spin_lock(&acct_globals.lock); 113 res = acct_globals.active; 114 if (!file || !acct_globals.needcheck) 115 goto out; 116 spin_unlock(&acct_globals.lock); 117 118 /* May block */ 119 if (vfs_statfs(file->f_dentry->d_inode->i_sb, &sbuf)) 120 return res; 121 suspend = sbuf.f_blocks * SUSPEND; 122 resume = sbuf.f_blocks * RESUME; 123 124 sector_div(suspend, 100); 125 sector_div(resume, 100); 126 127 if (sbuf.f_bavail <= suspend) 128 act = -1; 129 else if (sbuf.f_bavail >= resume) 130 act = 1; 131 else 132 act = 0; 133 134 /* 135 * If some joker switched acct_globals.file under us we'ld better be 136 * silent and _not_ touch anything. 137 */ 138 spin_lock(&acct_globals.lock); 139 if (file != acct_globals.file) { 140 if (act) 141 res = act>0; 142 goto out; 143 } 144 145 if (acct_globals.active) { 146 if (act < 0) { 147 acct_globals.active = 0; 148 printk(KERN_INFO "Process accounting paused\n"); 149 } 150 } else { 151 if (act > 0) { 152 acct_globals.active = 1; 153 printk(KERN_INFO "Process accounting resumed\n"); 154 } 155 } 156 157 del_timer(&acct_globals.timer); 158 acct_globals.needcheck = 0; 159 acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ; 160 add_timer(&acct_globals.timer); 161 res = acct_globals.active; 162 out: 163 spin_unlock(&acct_globals.lock); 164 return res; 165 } 166 167 /* 168 * Close the old accounting file (if currently open) and then replace 169 * it with file (if non-NULL). 170 * 171 * NOTE: acct_globals.lock MUST be held on entry and exit. 
172 */ 173 static void acct_file_reopen(struct file *file) 174 { 175 struct file *old_acct = NULL; 176 177 if (acct_globals.file) { 178 old_acct = acct_globals.file; 179 del_timer(&acct_globals.timer); 180 acct_globals.active = 0; 181 acct_globals.needcheck = 0; 182 acct_globals.file = NULL; 183 } 184 if (file) { 185 acct_globals.file = file; 186 acct_globals.needcheck = 0; 187 acct_globals.active = 1; 188 /* It's been deleted if it was used before so this is safe */ 189 init_timer(&acct_globals.timer); 190 acct_globals.timer.function = acct_timeout; 191 acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ; 192 add_timer(&acct_globals.timer); 193 } 194 if (old_acct) { 195 spin_unlock(&acct_globals.lock); 196 do_acct_process(0, old_acct); 197 filp_close(old_acct, NULL); 198 spin_lock(&acct_globals.lock); 199 } 200 } 201 202 /** 203 * sys_acct - enable/disable process accounting 204 * @name: file name for accounting records or NULL to shutdown accounting 205 * 206 * Returns 0 for success or negative errno values for failure. 207 * 208 * sys_acct() is the only system call needed to implement process 209 * accounting. It takes the name of the file where accounting records 210 * should be written. If the filename is NULL, accounting will be 211 * shutdown. 
212 */ 213 asmlinkage long sys_acct(const char __user *name) 214 { 215 struct file *file = NULL; 216 char *tmp; 217 int error; 218 219 if (!capable(CAP_SYS_PACCT)) 220 return -EPERM; 221 222 if (name) { 223 tmp = getname(name); 224 if (IS_ERR(tmp)) { 225 return (PTR_ERR(tmp)); 226 } 227 /* Difference from BSD - they don't do O_APPEND */ 228 file = filp_open(tmp, O_WRONLY|O_APPEND|O_LARGEFILE, 0); 229 putname(tmp); 230 if (IS_ERR(file)) { 231 return (PTR_ERR(file)); 232 } 233 if (!S_ISREG(file->f_dentry->d_inode->i_mode)) { 234 filp_close(file, NULL); 235 return (-EACCES); 236 } 237 238 if (!file->f_op->write) { 239 filp_close(file, NULL); 240 return (-EIO); 241 } 242 } 243 244 error = security_acct(file); 245 if (error) { 246 if (file) 247 filp_close(file, NULL); 248 return error; 249 } 250 251 spin_lock(&acct_globals.lock); 252 acct_file_reopen(file); 253 spin_unlock(&acct_globals.lock); 254 255 return (0); 256 } 257 258 /** 259 * acct_auto_close - turn off a filesystem's accounting if it is on 260 * @sb: super block for the filesystem 261 * 262 * If the accounting is turned on for a file in the filesystem pointed 263 * to by sb, turn accounting off. 264 */ 265 void acct_auto_close(struct super_block *sb) 266 { 267 spin_lock(&acct_globals.lock); 268 if (acct_globals.file && 269 acct_globals.file->f_dentry->d_inode->i_sb == sb) { 270 acct_file_reopen((struct file *)NULL); 271 } 272 spin_unlock(&acct_globals.lock); 273 } 274 275 /* 276 * encode an unsigned long into a comp_t 277 * 278 * This routine has been adopted from the encode_comp_t() function in 279 * the kern_acct.c file of the FreeBSD operating system. The encoding 280 * is a 13-bit fraction with a 3-bit (base 8) exponent. 281 */ 282 283 #define MANTSIZE 13 /* 13 bit mantissa. */ 284 #define EXPSIZE 3 /* Base 8 (3 bit) exponent. */ 285 #define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. 
*/ 286 287 static comp_t encode_comp_t(unsigned long value) 288 { 289 int exp, rnd; 290 291 exp = rnd = 0; 292 while (value > MAXFRACT) { 293 rnd = value & (1 << (EXPSIZE - 1)); /* Round up? */ 294 value >>= EXPSIZE; /* Base 8 exponent == 3 bit shift. */ 295 exp++; 296 } 297 298 /* 299 * If we need to round up, do it (and handle overflow correctly). 300 */ 301 if (rnd && (++value > MAXFRACT)) { 302 value >>= EXPSIZE; 303 exp++; 304 } 305 306 /* 307 * Clean it up and polish it off. 308 */ 309 exp <<= MANTSIZE; /* Shift the exponent into place */ 310 exp += value; /* and add on the mantissa. */ 311 return exp; 312 } 313 314 #if ACCT_VERSION==1 || ACCT_VERSION==2 315 /* 316 * encode an u64 into a comp2_t (24 bits) 317 * 318 * Format: 5 bit base 2 exponent, 20 bits mantissa. 319 * The leading bit of the mantissa is not stored, but implied for 320 * non-zero exponents. 321 * Largest encodable value is 50 bits. 322 */ 323 324 #define MANTSIZE2 20 /* 20 bit mantissa. */ 325 #define EXPSIZE2 5 /* 5 bit base 2 exponent. */ 326 #define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */ 327 #define MAXEXP2 ((1 <<EXPSIZE2) - 1) /* Maximum exponent. */ 328 329 static comp2_t encode_comp2_t(u64 value) 330 { 331 int exp, rnd; 332 333 exp = (value > (MAXFRACT2>>1)); 334 rnd = 0; 335 while (value > MAXFRACT2) { 336 rnd = value & 1; 337 value >>= 1; 338 exp++; 339 } 340 341 /* 342 * If we need to round up, do it (and handle overflow correctly). 343 */ 344 if (rnd && (++value > MAXFRACT2)) { 345 value >>= 1; 346 exp++; 347 } 348 349 if (exp > MAXEXP2) { 350 /* Overflow. Return largest representable number instead. 
*/ 351 return (1ul << (MANTSIZE2+EXPSIZE2-1)) - 1; 352 } else { 353 return (value & (MAXFRACT2>>1)) | (exp << (MANTSIZE2-1)); 354 } 355 } 356 #endif 357 358 #if ACCT_VERSION==3 359 /* 360 * encode an u64 into a 32 bit IEEE float 361 */ 362 static u32 encode_float(u64 value) 363 { 364 unsigned exp = 190; 365 unsigned u; 366 367 if (value==0) return 0; 368 while ((s64)value > 0){ 369 value <<= 1; 370 exp--; 371 } 372 u = (u32)(value >> 40) & 0x7fffffu; 373 return u | (exp << 23); 374 } 375 #endif 376 377 /* 378 * Write an accounting entry for an exiting process 379 * 380 * The acct_process() call is the workhorse of the process 381 * accounting system. The struct acct is built here and then written 382 * into the accounting file. This function should only be called from 383 * do_exit(). 384 */ 385 386 /* 387 * do_acct_process does all actual work. Caller holds the reference to file. 388 */ 389 static void do_acct_process(long exitcode, struct file *file) 390 { 391 acct_t ac; 392 mm_segment_t fs; 393 unsigned long vsize; 394 unsigned long flim; 395 u64 elapsed; 396 u64 run_time; 397 struct timespec uptime; 398 399 /* 400 * First check to see if there is enough free_space to continue 401 * the process accounting system. 402 */ 403 if (!check_free_space(file)) 404 return; 405 406 /* 407 * Fill the accounting struct with the needed info as recorded 408 * by the different kernel functions. 
409 */ 410 memset((caddr_t)&ac, 0, sizeof(acct_t)); 411 412 ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER; 413 strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); 414 415 /* calculate run_time in nsec*/ 416 do_posix_clock_monotonic_gettime(&uptime); 417 run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec; 418 run_time -= (u64)current->start_time.tv_sec*NSEC_PER_SEC 419 + current->start_time.tv_nsec; 420 /* convert nsec -> AHZ */ 421 elapsed = nsec_to_AHZ(run_time); 422 #if ACCT_VERSION==3 423 ac.ac_etime = encode_float(elapsed); 424 #else 425 ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ? 426 (unsigned long) elapsed : (unsigned long) -1l); 427 #endif 428 #if ACCT_VERSION==1 || ACCT_VERSION==2 429 { 430 /* new enlarged etime field */ 431 comp2_t etime = encode_comp2_t(elapsed); 432 ac.ac_etime_hi = etime >> 16; 433 ac.ac_etime_lo = (u16) etime; 434 } 435 #endif 436 do_div(elapsed, AHZ); 437 ac.ac_btime = xtime.tv_sec - elapsed; 438 ac.ac_utime = encode_comp_t(jiffies_to_AHZ( 439 current->signal->utime + 440 current->group_leader->utime)); 441 ac.ac_stime = encode_comp_t(jiffies_to_AHZ( 442 current->signal->stime + 443 current->group_leader->stime)); 444 /* we really need to bite the bullet and change layout */ 445 ac.ac_uid = current->uid; 446 ac.ac_gid = current->gid; 447 #if ACCT_VERSION==2 448 ac.ac_ahz = AHZ; 449 #endif 450 #if ACCT_VERSION==1 || ACCT_VERSION==2 451 /* backward-compatible 16 bit fields */ 452 ac.ac_uid16 = current->uid; 453 ac.ac_gid16 = current->gid; 454 #endif 455 #if ACCT_VERSION==3 456 ac.ac_pid = current->tgid; 457 ac.ac_ppid = current->parent->tgid; 458 #endif 459 460 read_lock(&tasklist_lock); /* pin current->signal */ 461 ac.ac_tty = current->signal->tty ? 
462 old_encode_dev(tty_devnum(current->signal->tty)) : 0; 463 read_unlock(&tasklist_lock); 464 465 ac.ac_flag = 0; 466 if (current->flags & PF_FORKNOEXEC) 467 ac.ac_flag |= AFORK; 468 if (current->flags & PF_SUPERPRIV) 469 ac.ac_flag |= ASU; 470 if (current->flags & PF_DUMPCORE) 471 ac.ac_flag |= ACORE; 472 if (current->flags & PF_SIGNALED) 473 ac.ac_flag |= AXSIG; 474 475 vsize = 0; 476 if (current->mm) { 477 struct vm_area_struct *vma; 478 down_read(¤t->mm->mmap_sem); 479 vma = current->mm->mmap; 480 while (vma) { 481 vsize += vma->vm_end - vma->vm_start; 482 vma = vma->vm_next; 483 } 484 up_read(¤t->mm->mmap_sem); 485 } 486 vsize = vsize / 1024; 487 ac.ac_mem = encode_comp_t(vsize); 488 ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ 489 ac.ac_rw = encode_comp_t(ac.ac_io / 1024); 490 ac.ac_minflt = encode_comp_t(current->signal->min_flt + 491 current->group_leader->min_flt); 492 ac.ac_majflt = encode_comp_t(current->signal->maj_flt + 493 current->group_leader->maj_flt); 494 ac.ac_swaps = encode_comp_t(0); 495 ac.ac_exitcode = exitcode; 496 497 /* 498 * Kernel segment override to datasegment and write it 499 * to the accounting file. 500 */ 501 fs = get_fs(); 502 set_fs(KERNEL_DS); 503 /* 504 * Accounting records are not subject to resource limits. 
505 */ 506 flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; 507 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; 508 file->f_op->write(file, (char *)&ac, 509 sizeof(acct_t), &file->f_pos); 510 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; 511 set_fs(fs); 512 } 513 514 /** 515 * acct_process - now just a wrapper around do_acct_process 516 * @exitcode: task exit code 517 * 518 * handles process accounting for an exiting task 519 */ 520 void acct_process(long exitcode) 521 { 522 struct file *file = NULL; 523 524 /* 525 * accelerate the common fastpath: 526 */ 527 if (!acct_globals.file) 528 return; 529 530 spin_lock(&acct_globals.lock); 531 file = acct_globals.file; 532 if (unlikely(!file)) { 533 spin_unlock(&acct_globals.lock); 534 return; 535 } 536 get_file(file); 537 spin_unlock(&acct_globals.lock); 538 539 do_acct_process(exitcode, file); 540 fput(file); 541 } 542 543 544 /** 545 * acct_update_integrals - update mm integral fields in task_struct 546 * @tsk: task_struct for accounting 547 */ 548 void acct_update_integrals(struct task_struct *tsk) 549 { 550 if (likely(tsk->mm)) { 551 long delta = tsk->stime - tsk->acct_stimexpd; 552 553 if (delta == 0) 554 return; 555 tsk->acct_stimexpd = tsk->stime; 556 tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); 557 tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; 558 } 559 } 560 561 /** 562 * acct_clear_integrals - clear the mm integral fields in task_struct 563 * @tsk: task_struct whose accounting fields are cleared 564 */ 565 void acct_clear_integrals(struct task_struct *tsk) 566 { 567 if (tsk) { 568 tsk->acct_stimexpd = 0; 569 tsk->acct_rss_mem1 = 0; 570 tsk->acct_vm_mem1 = 0; 571 } 572 } 573