1 /* 2 * device_cgroup.c - device cgroup subsystem 3 * 4 * Copyright 2007 IBM Corp 5 */ 6 7 #include <linux/device_cgroup.h> 8 #include <linux/cgroup.h> 9 #include <linux/ctype.h> 10 #include <linux/list.h> 11 #include <linux/uaccess.h> 12 #include <linux/seq_file.h> 13 #include <linux/slab.h> 14 #include <linux/rcupdate.h> 15 #include <linux/mutex.h> 16 17 #define ACC_MKNOD 1 18 #define ACC_READ 2 19 #define ACC_WRITE 4 20 #define ACC_MASK (ACC_MKNOD | ACC_READ | ACC_WRITE) 21 22 #define DEV_BLOCK 1 23 #define DEV_CHAR 2 24 #define DEV_ALL 4 /* this represents all devices */ 25 26 static DEFINE_MUTEX(devcgroup_mutex); 27 28 enum devcg_behavior { 29 DEVCG_DEFAULT_NONE, 30 DEVCG_DEFAULT_ALLOW, 31 DEVCG_DEFAULT_DENY, 32 }; 33 34 /* 35 * exception list locking rules: 36 * hold devcgroup_mutex for update/read. 37 * hold rcu_read_lock() for read. 38 */ 39 40 struct dev_exception_item { 41 u32 major, minor; 42 short type; 43 short access; 44 struct list_head list; 45 struct rcu_head rcu; 46 }; 47 48 struct dev_cgroup { 49 struct cgroup_subsys_state css; 50 struct list_head exceptions; 51 enum devcg_behavior behavior; 52 }; 53 54 static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s) 55 { 56 return container_of(s, struct dev_cgroup, css); 57 } 58 59 static inline struct dev_cgroup *cgroup_to_devcgroup(struct cgroup *cgroup) 60 { 61 return css_to_devcgroup(cgroup_subsys_state(cgroup, devices_subsys_id)); 62 } 63 64 static inline struct dev_cgroup *task_devcgroup(struct task_struct *task) 65 { 66 return css_to_devcgroup(task_subsys_state(task, devices_subsys_id)); 67 } 68 69 struct cgroup_subsys devices_subsys; 70 71 static int devcgroup_can_attach(struct cgroup *new_cgrp, 72 struct cgroup_taskset *set) 73 { 74 struct task_struct *task = cgroup_taskset_first(set); 75 76 if (current != task && !capable(CAP_SYS_ADMIN)) 77 return -EPERM; 78 return 0; 79 } 80 81 /* 82 * called under devcgroup_mutex 83 */ 84 static int dev_exceptions_copy(struct list_head *dest, struct list_head *orig) 85 { 86 struct dev_exception_item *ex, *tmp, *new; 87 88 lockdep_assert_held(&devcgroup_mutex); 89 90 list_for_each_entry(ex, orig, list) { 91 new = kmemdup(ex, sizeof(*ex), GFP_KERNEL); 92 if (!new) 93 goto free_and_exit; 94 list_add_tail(&new->list, dest); 95 } 96 97 return 0; 98 99 free_and_exit: 100 list_for_each_entry_safe(ex, tmp, dest, list) { 101 list_del(&ex->list); 102 kfree(ex); 103 } 104 return -ENOMEM; 105 } 106 107 /* 108 * called under devcgroup_mutex 109 */ 110 static int dev_exception_add(struct dev_cgroup *dev_cgroup, 111 struct dev_exception_item *ex) 112 { 113 struct dev_exception_item *excopy, *walk; 114 115 lockdep_assert_held(&devcgroup_mutex); 116 117 excopy = kmemdup(ex, sizeof(*ex), GFP_KERNEL); 118 if (!excopy) 119 return -ENOMEM; 120 121 list_for_each_entry(walk, &dev_cgroup->exceptions, list) { 122 if (walk->type != ex->type) 123 continue; 124 if (walk->major != ex->major) 125 continue; 126 if (walk->minor != ex->minor) 127 continue; 128 129 walk->access |= ex->access; 130 kfree(excopy); 131 excopy = NULL; 132 } 133 134 if (excopy != NULL) 135 list_add_tail_rcu(&excopy->list, &dev_cgroup->exceptions); 136 return 0; 137 } 138 139 /* 140 * called under devcgroup_mutex 141 */ 142 static void dev_exception_rm(struct dev_cgroup *dev_cgroup, 143 struct dev_exception_item *ex) 144 { 145 struct dev_exception_item *walk, *tmp; 146 147 lockdep_assert_held(&devcgroup_mutex); 148 149 list_for_each_entry_safe(walk, tmp, &dev_cgroup->exceptions, list) { 150 if (walk->type != ex->type) 151 continue; 152 if (walk->major != ex->major) 153 continue; 154 if (walk->minor != ex->minor) 155 continue; 156 157 walk->access &= ~ex->access; 158 if (!walk->access) { 159 list_del_rcu(&walk->list); 160 kfree_rcu(walk, rcu); 161 } 162 } 163 } 164 165 static void __dev_exception_clean(struct dev_cgroup *dev_cgroup) 166 { 167 struct dev_exception_item *ex, *tmp; 168 169 list_for_each_entry_safe(ex, tmp, &dev_cgroup->exceptions, list) { 170 list_del_rcu(&ex->list); 171 kfree_rcu(ex, rcu); 172 } 173 } 174 175 /** 176 * dev_exception_clean - frees all entries of the exception list 177 * @dev_cgroup: dev_cgroup with the exception list to be cleaned 178 * 179 * called under devcgroup_mutex 180 */ 181 static void dev_exception_clean(struct dev_cgroup *dev_cgroup) 182 { 183 lockdep_assert_held(&devcgroup_mutex); 184 185 __dev_exception_clean(dev_cgroup); 186 } 187 188 /** 189 * devcgroup_online - initializes devcgroup's behavior and exceptions based on 190 * parent's 191 * @cgroup: cgroup getting online 192 * returns 0 in case of success, error code otherwise 193 */ 194 static int devcgroup_online(struct cgroup *cgroup) 195 { 196 struct dev_cgroup *dev_cgroup, *parent_dev_cgroup = NULL; 197 int ret = 0; 198 199 mutex_lock(&devcgroup_mutex); 200 dev_cgroup = cgroup_to_devcgroup(cgroup); 201 if (cgroup->parent) 202 parent_dev_cgroup = cgroup_to_devcgroup(cgroup->parent); 203 204 if (parent_dev_cgroup == NULL) 205 dev_cgroup->behavior = DEVCG_DEFAULT_ALLOW; 206 else { 207 ret = dev_exceptions_copy(&dev_cgroup->exceptions, 208 &parent_dev_cgroup->exceptions); 209 if (!ret) 210 dev_cgroup->behavior = parent_dev_cgroup->behavior; 211 } 212 mutex_unlock(&devcgroup_mutex); 213 214 return ret; 215 } 216 217 static void devcgroup_offline(struct cgroup *cgroup) 218 { 219 struct dev_cgroup *dev_cgroup = cgroup_to_devcgroup(cgroup); 220 221 mutex_lock(&devcgroup_mutex); 222 dev_cgroup->behavior = DEVCG_DEFAULT_NONE; 223 mutex_unlock(&devcgroup_mutex); 224 } 225 226 /* 227 * called from kernel/cgroup.c with cgroup_lock() held. 228 */ 229 static struct cgroup_subsys_state *devcgroup_css_alloc(struct cgroup *cgroup) 230 { 231 struct dev_cgroup *dev_cgroup; 232 struct cgroup *parent_cgroup; 233 234 dev_cgroup = kzalloc(sizeof(*dev_cgroup), GFP_KERNEL); 235 if (!dev_cgroup) 236 return ERR_PTR(-ENOMEM); 237 INIT_LIST_HEAD(&dev_cgroup->exceptions); 238 dev_cgroup->behavior = DEVCG_DEFAULT_NONE; 239 parent_cgroup = cgroup->parent; 240 241 return &dev_cgroup->css; 242 } 243 244 static void devcgroup_css_free(struct cgroup *cgroup) 245 { 246 struct dev_cgroup *dev_cgroup; 247 248 dev_cgroup = cgroup_to_devcgroup(cgroup); 249 __dev_exception_clean(dev_cgroup); 250 kfree(dev_cgroup); 251 } 252 253 #define DEVCG_ALLOW 1 254 #define DEVCG_DENY 2 255 #define DEVCG_LIST 3 256 257 #define MAJMINLEN 13 258 #define ACCLEN 4 259 260 static void set_access(char *acc, short access) 261 { 262 int idx = 0; 263 memset(acc, 0, ACCLEN); 264 if (access & ACC_READ) 265 acc[idx++] = 'r'; 266 if (access & ACC_WRITE) 267 acc[idx++] = 'w'; 268 if (access & ACC_MKNOD) 269 acc[idx++] = 'm'; 270 } 271 272 static char type_to_char(short type) 273 { 274 if (type == DEV_ALL) 275 return 'a'; 276 if (type == DEV_CHAR) 277 return 'c'; 278 if (type == DEV_BLOCK) 279 return 'b'; 280 return 'X'; 281 } 282 283 static void set_majmin(char *str, unsigned m) 284 { 285 if (m == ~0) 286 strcpy(str, "*"); 287 else 288 sprintf(str, "%u", m); 289 } 290 291 static int devcgroup_seq_read(struct cgroup *cgroup, struct cftype *cft, 292 struct seq_file *m) 293 { 294 struct dev_cgroup *devcgroup = cgroup_to_devcgroup(cgroup); 295 struct dev_exception_item *ex; 296 char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN]; 297 298 rcu_read_lock(); 299 /* 300 * To preserve the compatibility: 301 * - Only show the "all devices" when the default policy is to allow 302 * - List the exceptions in case the default policy is to deny 303 * This way, the file remains as a "whitelist of devices" 304 */ 305 if (devcgroup->behavior == DEVCG_DEFAULT_ALLOW) { 306 set_access(acc, ACC_MASK); 307 set_majmin(maj, ~0); 308 set_majmin(min, ~0); 309 seq_printf(m, "%c %s:%s %s\n", type_to_char(DEV_ALL), 310 maj, min, acc); 311 } else { 312 list_for_each_entry_rcu(ex, &devcgroup->exceptions, list) { 313 set_access(acc, ex->access); 314 set_majmin(maj, ex->major); 315 set_majmin(min, ex->minor); 316 seq_printf(m, "%c %s:%s %s\n", type_to_char(ex->type), 317 maj, min, acc); 318 } 319 } 320 rcu_read_unlock(); 321 322 return 0; 323 } 324 325 /** 326 * may_access - verifies if a new exception is part of what is allowed 327 * by a dev cgroup based on the default policy + 328 * exceptions. This is used to make sure a child cgroup 329 * won't have more privileges than its parent or to 330 * verify if a certain access is allowed. 331 * @dev_cgroup: dev cgroup to be tested against 332 * @refex: new exception 333 * @behavior: behavior of the exception 334 */ 335 static bool may_access(struct dev_cgroup *dev_cgroup, 336 struct dev_exception_item *refex, 337 enum devcg_behavior behavior) 338 { 339 struct dev_exception_item *ex; 340 bool match = false; 341 342 rcu_lockdep_assert(rcu_read_lock_held() || 343 lockdep_is_held(&devcgroup_mutex), 344 "device_cgroup::may_access() called without proper synchronization"); 345 346 list_for_each_entry_rcu(ex, &dev_cgroup->exceptions, list) { 347 if ((refex->type & DEV_BLOCK) && !(ex->type & DEV_BLOCK)) 348 continue; 349 if ((refex->type & DEV_CHAR) && !(ex->type & DEV_CHAR)) 350 continue; 351 if (ex->major != ~0 && ex->major != refex->major) 352 continue; 353 if (ex->minor != ~0 && ex->minor != refex->minor) 354 continue; 355 if (refex->access & (~ex->access)) 356 continue; 357 match = true; 358 break; 359 } 360 361 if (dev_cgroup->behavior == DEVCG_DEFAULT_ALLOW) { 362 if (behavior == DEVCG_DEFAULT_ALLOW) { 363 /* the exception will deny access to certain devices */ 364 return true; 365 } else { 366 /* the exception will allow access to certain devices */ 367 if (match) 368 /* 369 * a new exception allowing access shouldn't 370 * match an parent's exception 371 */ 372 return false; 373 return true; 374 } 375 } else { 376 /* only behavior == DEVCG_DEFAULT_DENY allowed here */ 377 if (match) 378 /* parent has an exception that matches the proposed */ 379 return true; 380 else 381 return false; 382 } 383 return false; 384 } 385 386 /* 387 * parent_has_perm: 388 * when adding a new allow rule to a device exception list, the rule 389 * must be allowed in the parent device 390 */ 391 static int parent_has_perm(struct dev_cgroup *childcg, 392 struct dev_exception_item *ex) 393 { 394 struct cgroup *pcg = childcg->css.cgroup->parent; 395 struct dev_cgroup *parent; 396 397 if (!pcg) 398 return 1; 399 parent = cgroup_to_devcgroup(pcg); 400 return may_access(parent, ex, childcg->behavior); 401 } 402 403 /** 404 * may_allow_all - checks if it's possible to change the behavior to 405 * allow based on parent's rules. 406 * @parent: device cgroup's parent 407 * returns: != 0 in case it's allowed, 0 otherwise 408 */ 409 static inline int may_allow_all(struct dev_cgroup *parent) 410 { 411 if (!parent) 412 return 1; 413 return parent->behavior == DEVCG_DEFAULT_ALLOW; 414 } 415 416 /* 417 * Modify the exception list using allow/deny rules. 418 * CAP_SYS_ADMIN is needed for this. It's at least separate from CAP_MKNOD 419 * so we can give a container CAP_MKNOD to let it create devices but not 420 * modify the exception list. 421 * It seems likely we'll want to add a CAP_CONTAINER capability to allow 422 * us to also grant CAP_SYS_ADMIN to containers without giving away the 423 * device exception list controls, but for now we'll stick with CAP_SYS_ADMIN 424 * 425 * Taking rules away is always allowed (given CAP_SYS_ADMIN). Granting 426 * new access is only allowed if you're in the top-level cgroup, or your 427 * parent cgroup has the access you're asking for. 428 */ 429 static int devcgroup_update_access(struct dev_cgroup *devcgroup, 430 int filetype, const char *buffer) 431 { 432 const char *b; 433 char temp[12]; /* 11 + 1 characters needed for a u32 */ 434 int count, rc = 0; 435 struct dev_exception_item ex; 436 struct cgroup *p = devcgroup->css.cgroup; 437 struct dev_cgroup *parent = NULL; 438 439 if (!capable(CAP_SYS_ADMIN)) 440 return -EPERM; 441 442 if (p->parent) 443 parent = cgroup_to_devcgroup(p->parent); 444 445 memset(&ex, 0, sizeof(ex)); 446 b = buffer; 447 448 switch (*b) { 449 case 'a': 450 switch (filetype) { 451 case DEVCG_ALLOW: 452 if (!may_allow_all(parent)) 453 return -EPERM; 454 dev_exception_clean(devcgroup); 455 devcgroup->behavior = DEVCG_DEFAULT_ALLOW; 456 if (!parent) 457 break; 458 459 rc = dev_exceptions_copy(&devcgroup->exceptions, 460 &parent->exceptions); 461 if (rc) 462 return rc; 463 break; 464 case DEVCG_DENY: 465 dev_exception_clean(devcgroup); 466 devcgroup->behavior = DEVCG_DEFAULT_DENY; 467 break; 468 default: 469 return -EINVAL; 470 } 471 return 0; 472 case 'b': 473 ex.type = DEV_BLOCK; 474 break; 475 case 'c': 476 ex.type = DEV_CHAR; 477 break; 478 default: 479 return -EINVAL; 480 } 481 b++; 482 if (!isspace(*b)) 483 return -EINVAL; 484 b++; 485 if (*b == '*') { 486 ex.major = ~0; 487 b++; 488 } else if (isdigit(*b)) { 489 memset(temp, 0, sizeof(temp)); 490 for (count = 0; count < sizeof(temp) - 1; count++) { 491 temp[count] = *b; 492 b++; 493 if (!isdigit(*b)) 494 break; 495 } 496 rc = kstrtou32(temp, 10, &ex.major); 497 if (rc) 498 return -EINVAL; 499 } else { 500 return -EINVAL; 501 } 502 if (*b != ':') 503 return -EINVAL; 504 b++; 505 506 /* read minor */ 507 if (*b == '*') { 508 ex.minor = ~0; 509 b++; 510 } else if (isdigit(*b)) { 511 memset(temp, 0, sizeof(temp)); 512 for (count = 0; count < sizeof(temp) - 1; count++) { 513 temp[count] = *b; 514 b++; 515 if (!isdigit(*b)) 516 break; 517 } 518 rc = kstrtou32(temp, 10, &ex.minor); 519 if (rc) 520 return -EINVAL; 521 } else { 522 return -EINVAL; 523 } 524 if (!isspace(*b)) 525 return -EINVAL; 526 for (b++, count = 0; count < 3; count++, b++) { 527 switch (*b) { 528 case 'r': 529 ex.access |= ACC_READ; 530 break; 531 case 'w': 532 ex.access |= ACC_WRITE; 533 break; 534 case 'm': 535 ex.access |= ACC_MKNOD; 536 break; 537 case '\n': 538 case '\0': 539 count = 3; 540 break; 541 default: 542 return -EINVAL; 543 } 544 } 545 546 switch (filetype) { 547 case DEVCG_ALLOW: 548 if (!parent_has_perm(devcgroup, &ex)) 549 return -EPERM; 550 /* 551 * If the default policy is to allow by default, try to remove 552 * an matching exception instead. And be silent about it: we 553 * don't want to break compatibility 554 */ 555 if (devcgroup->behavior == DEVCG_DEFAULT_ALLOW) { 556 dev_exception_rm(devcgroup, &ex); 557 return 0; 558 } 559 return dev_exception_add(devcgroup, &ex); 560 case DEVCG_DENY: 561 /* 562 * If the default policy is to deny by default, try to remove 563 * an matching exception instead. And be silent about it: we 564 * don't want to break compatibility 565 */ 566 if (devcgroup->behavior == DEVCG_DEFAULT_DENY) { 567 dev_exception_rm(devcgroup, &ex); 568 return 0; 569 } 570 return dev_exception_add(devcgroup, &ex); 571 default: 572 return -EINVAL; 573 } 574 return 0; 575 } 576 577 static int devcgroup_access_write(struct cgroup *cgrp, struct cftype *cft, 578 const char *buffer) 579 { 580 int retval; 581 582 mutex_lock(&devcgroup_mutex); 583 retval = devcgroup_update_access(cgroup_to_devcgroup(cgrp), 584 cft->private, buffer); 585 mutex_unlock(&devcgroup_mutex); 586 return retval; 587 } 588 589 static struct cftype dev_cgroup_files[] = { 590 { 591 .name = "allow", 592 .write_string = devcgroup_access_write, 593 .private = DEVCG_ALLOW, 594 }, 595 { 596 .name = "deny", 597 .write_string = devcgroup_access_write, 598 .private = DEVCG_DENY, 599 }, 600 { 601 .name = "list", 602 .read_seq_string = devcgroup_seq_read, 603 .private = DEVCG_LIST, 604 }, 605 { } /* terminate */ 606 }; 607 608 struct cgroup_subsys devices_subsys = { 609 .name = "devices", 610 .can_attach = devcgroup_can_attach, 611 .css_alloc = devcgroup_css_alloc, 612 .css_free = devcgroup_css_free, 613 .css_online = devcgroup_online, 614 .css_offline = devcgroup_offline, 615 .subsys_id = devices_subsys_id, 616 .base_cftypes = dev_cgroup_files, 617 618 /* 619 * While devices cgroup has the rudimentary hierarchy support which 620 * checks the parent's restriction, it doesn't properly propagates 621 * config changes in ancestors to their descendents. A child 622 * should only be allowed to add more restrictions to the parent's 623 * configuration. Fix it and remove the following. 624 */ 625 .broken_hierarchy = true, 626 }; 627 628 /** 629 * __devcgroup_check_permission - checks if an inode operation is permitted 630 * @dev_cgroup: the dev cgroup to be tested against 631 * @type: device type 632 * @major: device major number 633 * @minor: device minor number 634 * @access: combination of ACC_WRITE, ACC_READ and ACC_MKNOD 635 * 636 * returns 0 on success, -EPERM case the operation is not permitted 637 */ 638 static int __devcgroup_check_permission(short type, u32 major, u32 minor, 639 short access) 640 { 641 struct dev_cgroup *dev_cgroup; 642 struct dev_exception_item ex; 643 int rc; 644 645 memset(&ex, 0, sizeof(ex)); 646 ex.type = type; 647 ex.major = major; 648 ex.minor = minor; 649 ex.access = access; 650 651 rcu_read_lock(); 652 dev_cgroup = task_devcgroup(current); 653 rc = may_access(dev_cgroup, &ex, dev_cgroup->behavior); 654 rcu_read_unlock(); 655 656 if (!rc) 657 return -EPERM; 658 659 return 0; 660 } 661 662 int __devcgroup_inode_permission(struct inode *inode, int mask) 663 { 664 short type, access = 0; 665 666 if (S_ISBLK(inode->i_mode)) 667 type = DEV_BLOCK; 668 if (S_ISCHR(inode->i_mode)) 669 type = DEV_CHAR; 670 if (mask & MAY_WRITE) 671 access |= ACC_WRITE; 672 if (mask & MAY_READ) 673 access |= ACC_READ; 674 675 return __devcgroup_check_permission(type, imajor(inode), iminor(inode), 676 access); 677 } 678 679 int devcgroup_inode_mknod(int mode, dev_t dev) 680 { 681 short type; 682 683 if (!S_ISBLK(mode) && !S_ISCHR(mode)) 684 return 0; 685 686 if (S_ISBLK(mode)) 687 type = DEV_BLOCK; 688 else 689 type = DEV_CHAR; 690 691 return __devcgroup_check_permission(type, MAJOR(dev), MINOR(dev), 692 ACC_MKNOD); 693 694 } 695