1 /* 2 * net/core/netprio_cgroup.c Priority Control Group 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public License 6 * as published by the Free Software Foundation; either version 7 * 2 of the License, or (at your option) any later version. 8 * 9 * Authors: Neil Horman <nhorman@tuxdriver.com> 10 */ 11 12 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 13 14 #include <linux/module.h> 15 #include <linux/slab.h> 16 #include <linux/types.h> 17 #include <linux/string.h> 18 #include <linux/errno.h> 19 #include <linux/skbuff.h> 20 #include <linux/cgroup.h> 21 #include <linux/rcupdate.h> 22 #include <linux/atomic.h> 23 #include <net/rtnetlink.h> 24 #include <net/pkt_cls.h> 25 #include <net/sock.h> 26 #include <net/netprio_cgroup.h> 27 28 #include <linux/fdtable.h> 29 30 #define PRIOIDX_SZ 128 31 32 static unsigned long prioidx_map[PRIOIDX_SZ]; 33 static DEFINE_SPINLOCK(prioidx_map_lock); 34 static atomic_t max_prioidx = ATOMIC_INIT(0); 35 36 static inline struct cgroup_netprio_state *cgrp_netprio_state(struct cgroup *cgrp) 37 { 38 return container_of(cgroup_subsys_state(cgrp, net_prio_subsys_id), 39 struct cgroup_netprio_state, css); 40 } 41 42 static int get_prioidx(u32 *prio) 43 { 44 unsigned long flags; 45 u32 prioidx; 46 47 spin_lock_irqsave(&prioidx_map_lock, flags); 48 prioidx = find_first_zero_bit(prioidx_map, sizeof(unsigned long) * PRIOIDX_SZ); 49 if (prioidx == sizeof(unsigned long) * PRIOIDX_SZ) { 50 spin_unlock_irqrestore(&prioidx_map_lock, flags); 51 return -ENOSPC; 52 } 53 set_bit(prioidx, prioidx_map); 54 if (atomic_read(&max_prioidx) < prioidx) 55 atomic_set(&max_prioidx, prioidx); 56 spin_unlock_irqrestore(&prioidx_map_lock, flags); 57 *prio = prioidx; 58 return 0; 59 } 60 61 static void put_prioidx(u32 idx) 62 { 63 unsigned long flags; 64 65 spin_lock_irqsave(&prioidx_map_lock, flags); 66 clear_bit(idx, prioidx_map); 67 spin_unlock_irqrestore(&prioidx_map_lock, flags); 68 } 69 70 static int extend_netdev_table(struct net_device *dev, u32 new_len) 71 { 72 size_t new_size = sizeof(struct netprio_map) + 73 ((sizeof(u32) * new_len)); 74 struct netprio_map *new_priomap = kzalloc(new_size, GFP_KERNEL); 75 struct netprio_map *old_priomap; 76 int i; 77 78 old_priomap = rtnl_dereference(dev->priomap); 79 80 if (!new_priomap) { 81 pr_warn("Unable to alloc new priomap!\n"); 82 return -ENOMEM; 83 } 84 85 for (i = 0; 86 old_priomap && (i < old_priomap->priomap_len); 87 i++) 88 new_priomap->priomap[i] = old_priomap->priomap[i]; 89 90 new_priomap->priomap_len = new_len; 91 92 rcu_assign_pointer(dev->priomap, new_priomap); 93 if (old_priomap) 94 kfree_rcu(old_priomap, rcu); 95 return 0; 96 } 97 98 static int write_update_netdev_table(struct net_device *dev) 99 { 100 int ret = 0; 101 u32 max_len; 102 struct netprio_map *map; 103 104 rtnl_lock(); 105 max_len = atomic_read(&max_prioidx) + 1; 106 map = rtnl_dereference(dev->priomap); 107 if (!map || map->priomap_len < max_len) 108 ret = extend_netdev_table(dev, max_len); 109 rtnl_unlock(); 110 111 return ret; 112 } 113 114 static int update_netdev_tables(void) 115 { 116 int ret = 0; 117 struct net_device *dev; 118 u32 max_len; 119 struct netprio_map *map; 120 121 rtnl_lock(); 122 max_len = atomic_read(&max_prioidx) + 1; 123 for_each_netdev(&init_net, dev) { 124 map = rtnl_dereference(dev->priomap); 125 /* 126 * don't allocate priomap if we didn't 127 * change net_prio.ifpriomap (map == NULL), 128 * this will speed up skb_update_prio. 129 */ 130 if (map && map->priomap_len < max_len) { 131 ret = extend_netdev_table(dev, max_len); 132 if (ret < 0) 133 break; 134 } 135 } 136 rtnl_unlock(); 137 return ret; 138 } 139 140 static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp) 141 { 142 struct cgroup_netprio_state *cs; 143 int ret = -EINVAL; 144 145 cs = kzalloc(sizeof(*cs), GFP_KERNEL); 146 if (!cs) 147 return ERR_PTR(-ENOMEM); 148 149 if (cgrp->parent && cgrp_netprio_state(cgrp->parent)->prioidx) 150 goto out; 151 152 ret = get_prioidx(&cs->prioidx); 153 if (ret < 0) { 154 pr_warn("No space in priority index array\n"); 155 goto out; 156 } 157 158 ret = update_netdev_tables(); 159 if (ret < 0) { 160 put_prioidx(cs->prioidx); 161 goto out; 162 } 163 164 return &cs->css; 165 out: 166 kfree(cs); 167 return ERR_PTR(ret); 168 } 169 170 static void cgrp_destroy(struct cgroup *cgrp) 171 { 172 struct cgroup_netprio_state *cs; 173 struct net_device *dev; 174 struct netprio_map *map; 175 176 cs = cgrp_netprio_state(cgrp); 177 rtnl_lock(); 178 for_each_netdev(&init_net, dev) { 179 map = rtnl_dereference(dev->priomap); 180 if (map && cs->prioidx < map->priomap_len) 181 map->priomap[cs->prioidx] = 0; 182 } 183 rtnl_unlock(); 184 put_prioidx(cs->prioidx); 185 kfree(cs); 186 } 187 188 static u64 read_prioidx(struct cgroup *cgrp, struct cftype *cft) 189 { 190 return (u64)cgrp_netprio_state(cgrp)->prioidx; 191 } 192 193 static int read_priomap(struct cgroup *cont, struct cftype *cft, 194 struct cgroup_map_cb *cb) 195 { 196 struct net_device *dev; 197 u32 prioidx = cgrp_netprio_state(cont)->prioidx; 198 u32 priority; 199 struct netprio_map *map; 200 201 rcu_read_lock(); 202 for_each_netdev_rcu(&init_net, dev) { 203 map = rcu_dereference(dev->priomap); 204 priority = (map && prioidx < map->priomap_len) ? map->priomap[prioidx] : 0; 205 cb->fill(cb, dev->name, priority); 206 } 207 rcu_read_unlock(); 208 return 0; 209 } 210 211 static int write_priomap(struct cgroup *cgrp, struct cftype *cft, 212 const char *buffer) 213 { 214 char *devname = kstrdup(buffer, GFP_KERNEL); 215 int ret = -EINVAL; 216 u32 prioidx = cgrp_netprio_state(cgrp)->prioidx; 217 unsigned long priority; 218 char *priostr; 219 struct net_device *dev; 220 struct netprio_map *map; 221 222 if (!devname) 223 return -ENOMEM; 224 225 /* 226 * Minimally sized valid priomap string 227 */ 228 if (strlen(devname) < 3) 229 goto out_free_devname; 230 231 priostr = strstr(devname, " "); 232 if (!priostr) 233 goto out_free_devname; 234 235 /* 236 *Separate the devname from the associated priority 237 *and advance the priostr pointer to the priority value 238 */ 239 *priostr = '\0'; 240 priostr++; 241 242 /* 243 * If the priostr points to NULL, we're at the end of the passed 244 * in string, and its not a valid write 245 */ 246 if (*priostr == '\0') 247 goto out_free_devname; 248 249 ret = kstrtoul(priostr, 10, &priority); 250 if (ret < 0) 251 goto out_free_devname; 252 253 ret = -ENODEV; 254 255 dev = dev_get_by_name(&init_net, devname); 256 if (!dev) 257 goto out_free_devname; 258 259 ret = write_update_netdev_table(dev); 260 if (ret < 0) 261 goto out_put_dev; 262 263 rcu_read_lock(); 264 map = rcu_dereference(dev->priomap); 265 if (map) 266 map->priomap[prioidx] = priority; 267 rcu_read_unlock(); 268 269 out_put_dev: 270 dev_put(dev); 271 272 out_free_devname: 273 kfree(devname); 274 return ret; 275 } 276 277 void net_prio_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 278 { 279 struct task_struct *p; 280 char *tmp = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL); 281 282 if (!tmp) { 283 pr_warn("Unable to attach cgrp due to alloc failure!\n"); 284 return; 285 } 286 287 cgroup_taskset_for_each(p, cgrp, tset) { 288 unsigned int fd; 289 struct fdtable *fdt; 290 struct files_struct *files; 291 292 task_lock(p); 293 files = p->files; 294 if (!files) { 295 task_unlock(p); 296 continue; 297 } 298 299 rcu_read_lock(); 300 fdt = files_fdtable(files); 301 for (fd = 0; fd < fdt->max_fds; fd++) { 302 char *path; 303 struct file *file; 304 struct socket *sock; 305 unsigned long s; 306 int rv, err = 0; 307 308 file = fcheck_files(files, fd); 309 if (!file) 310 continue; 311 312 path = d_path(&file->f_path, tmp, PAGE_SIZE); 313 rv = sscanf(path, "socket:[%lu]", &s); 314 if (rv <= 0) 315 continue; 316 317 sock = sock_from_file(file, &err); 318 if (!err) 319 sock_update_netprioidx(sock->sk, p); 320 } 321 rcu_read_unlock(); 322 task_unlock(p); 323 } 324 kfree(tmp); 325 } 326 327 static struct cftype ss_files[] = { 328 { 329 .name = "prioidx", 330 .read_u64 = read_prioidx, 331 }, 332 { 333 .name = "ifpriomap", 334 .read_map = read_priomap, 335 .write_string = write_priomap, 336 }, 337 { } /* terminate */ 338 }; 339 340 struct cgroup_subsys net_prio_subsys = { 341 .name = "net_prio", 342 .create = cgrp_create, 343 .destroy = cgrp_destroy, 344 .attach = net_prio_attach, 345 #ifdef CONFIG_NETPRIO_CGROUP 346 .subsys_id = net_prio_subsys_id, 347 #endif 348 .base_cftypes = ss_files, 349 .module = THIS_MODULE 350 }; 351 352 static int netprio_device_event(struct notifier_block *unused, 353 unsigned long event, void *ptr) 354 { 355 struct net_device *dev = ptr; 356 struct netprio_map *old; 357 358 /* 359 * Note this is called with rtnl_lock held so we have update side 360 * protection on our rcu assignments 361 */ 362 363 switch (event) { 364 case NETDEV_UNREGISTER: 365 old = rtnl_dereference(dev->priomap); 366 RCU_INIT_POINTER(dev->priomap, NULL); 367 if (old) 368 kfree_rcu(old, rcu); 369 break; 370 } 371 return NOTIFY_DONE; 372 } 373 374 static struct notifier_block netprio_device_notifier = { 375 .notifier_call = netprio_device_event 376 }; 377 378 static int __init init_cgroup_netprio(void) 379 { 380 int ret; 381 382 ret = cgroup_load_subsys(&net_prio_subsys); 383 if (ret) 384 goto out; 385 #ifndef CONFIG_NETPRIO_CGROUP 386 smp_wmb(); 387 net_prio_subsys_id = net_prio_subsys.subsys_id; 388 #endif 389 390 register_netdevice_notifier(&netprio_device_notifier); 391 392 out: 393 return ret; 394 } 395 396 static void __exit exit_cgroup_netprio(void) 397 { 398 struct netprio_map *old; 399 struct net_device *dev; 400 401 unregister_netdevice_notifier(&netprio_device_notifier); 402 403 cgroup_unload_subsys(&net_prio_subsys); 404 405 #ifndef CONFIG_NETPRIO_CGROUP 406 net_prio_subsys_id = -1; 407 synchronize_rcu(); 408 #endif 409 410 rtnl_lock(); 411 for_each_netdev(&init_net, dev) { 412 old = rtnl_dereference(dev->priomap); 413 RCU_INIT_POINTER(dev->priomap, NULL); 414 if (old) 415 kfree_rcu(old, rcu); 416 } 417 rtnl_unlock(); 418 } 419 420 module_init(init_cgroup_netprio); 421 module_exit(exit_cgroup_netprio); 422 MODULE_LICENSE("GPL v2"); 423