xref: /openbmc/linux/fs/proc/proc_sysctl.c (revision 9eb47c26)
1 /*
2  * /proc/sys support
3  */
4 #include <linux/init.h>
5 #include <linux/sysctl.h>
6 #include <linux/poll.h>
7 #include <linux/proc_fs.h>
8 #include <linux/security.h>
9 #include <linux/namei.h>
10 #include <linux/module.h>
11 #include "internal.h"
12 
/*
 * Forward declarations of the operation tables defined further down in
 * this file.  proc_sys_make_inode(), proc_sys_lookup() and
 * proc_sys_fill_cache() take their addresses before the definitions appear.
 */
static const struct dentry_operations proc_sys_dentry_operations;
static const struct file_operations proc_sys_file_operations;
static const struct inode_operations proc_sys_inode_operations;
static const struct file_operations proc_sys_dir_file_operations;
static const struct inode_operations proc_sys_dir_operations;
18 
19 void proc_sys_poll_notify(struct ctl_table_poll *poll)
20 {
21 	if (!poll)
22 		return;
23 
24 	atomic_inc(&poll->event);
25 	wake_up_interruptible(&poll->wait);
26 }
27 
/*
 * The built-in root table.  Entry 0 is the unnamed root directory
 * (r-xr-xr-x) whose child pointer refers to entry 1, the all-zero
 * sentinel, i.e. an initially empty directory.
 */
static struct ctl_table root_table[] = {
	{
		.procname = "",
		.mode = S_IRUGO|S_IXUGO,
		.child = &root_table[1],
	},
	{ }
};
/* Tentative definition so root_table_header can refer to the root below. */
static struct ctl_table_root sysctl_table_root;
/*
 * Header describing root_table.  Its ctl_entry and the default set's list
 * head are statically initialised to point at each other, so the header is
 * a member of the default set without any runtime registration.
 * NOTE(review): the nested braces presumably initialise leading anonymous
 * members of struct ctl_table_header - confirm against <linux/sysctl.h>.
 */
static struct ctl_table_header root_table_header = {
	{{.count = 1,
	  .nreg = 1,
	  .ctl_table = root_table,
	  .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}},
	.root = &sysctl_table_root,
	.set = &sysctl_table_root.default_set,
};
/*
 * The default sysctl root.  root_list starts out pointing at itself;
 * register_sysctl_root() appends further roots to it.
 */
static struct ctl_table_root sysctl_table_root = {
	.root_list = LIST_HEAD_INIT(sysctl_table_root.root_list),
	.default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry),
	.default_set.root = &sysctl_table_root,
};

/*
 * Taken by the helpers in this file around header use counts, reference
 * counts, and updates of the header and root lists.
 */
static DEFINE_SPINLOCK(sysctl_lock);
52 
/*
 * Compare two length-delimited names.
 *
 * The bytes of the common prefix are compared first; when they are equal
 * the difference of the lengths decides, so the shorter name sorts first.
 * Returns a negative value, zero or a positive value, like memcmp().
 */
static int namecmp(const char *name1, int len1, const char *name2, int len2)
{
	int common = (len1 > len2) ? len2 : len1;
	int diff = memcmp(name1, name2, common);

	return diff ? diff : len1 - len2;
}
67 
68 static struct ctl_table *find_entry(struct ctl_table_header **phead,
69 	struct ctl_table_set *set,
70 	struct ctl_table_header *dir_head, struct ctl_table *dir,
71 	const char *name, int namelen)
72 {
73 	struct ctl_table_header *head;
74 	struct ctl_table *entry;
75 
76 	if (dir_head->set == set) {
77 		for (entry = dir; entry->procname; entry++) {
78 			const char *procname = entry->procname;
79 			if (namecmp(procname, strlen(procname), name, namelen) == 0) {
80 				*phead = dir_head;
81 				return entry;
82 			}
83 		}
84 	}
85 
86 	list_for_each_entry(head, &set->list, ctl_entry) {
87 		if (head->unregistering)
88 			continue;
89 		if (head->attached_to != dir)
90 			continue;
91 		for (entry = head->attached_by; entry->procname; entry++) {
92 			const char *procname = entry->procname;
93 			if (namecmp(procname, strlen(procname), name, namelen) == 0) {
94 				*phead = head;
95 				return entry;
96 			}
97 		}
98 	}
99 	return NULL;
100 }
101 
102 static void init_header(struct ctl_table_header *head,
103 	struct ctl_table_root *root, struct ctl_table_set *set,
104 	struct ctl_table *table)
105 {
106 	head->ctl_table_arg = table;
107 	INIT_LIST_HEAD(&head->ctl_entry);
108 	head->used = 0;
109 	head->count = 1;
110 	head->nreg = 1;
111 	head->unregistering = NULL;
112 	head->root = root;
113 	head->set = set;
114 	head->parent = NULL;
115 }
116 
117 static void erase_header(struct ctl_table_header *head)
118 {
119 	list_del_init(&head->ctl_entry);
120 }
121 
122 static void insert_header(struct ctl_table_header *header)
123 {
124 	header->parent->count++;
125 	list_add_tail(&header->ctl_entry, &header->set->list);
126 }
127 
128 /* called under sysctl_lock */
129 static int use_table(struct ctl_table_header *p)
130 {
131 	if (unlikely(p->unregistering))
132 		return 0;
133 	p->used++;
134 	return 1;
135 }
136 
137 /* called under sysctl_lock */
138 static void unuse_table(struct ctl_table_header *p)
139 {
140 	if (!--p->used)
141 		if (unlikely(p->unregistering))
142 			complete(p->unregistering);
143 }
144 
145 /* called under sysctl_lock, will reacquire if has to wait */
146 static void start_unregistering(struct ctl_table_header *p)
147 {
148 	/*
149 	 * if p->used is 0, nobody will ever touch that entry again;
150 	 * we'll eliminate all paths to it before dropping sysctl_lock
151 	 */
152 	if (unlikely(p->used)) {
153 		struct completion wait;
154 		init_completion(&wait);
155 		p->unregistering = &wait;
156 		spin_unlock(&sysctl_lock);
157 		wait_for_completion(&wait);
158 		spin_lock(&sysctl_lock);
159 	} else {
160 		/* anything non-NULL; we'll never dereference it */
161 		p->unregistering = ERR_PTR(-EINVAL);
162 	}
163 	/*
164 	 * do not remove from the list until nobody holds it; walking the
165 	 * list in do_sysctl() relies on that.
166 	 */
167 	erase_header(p);
168 }
169 
170 static void sysctl_head_get(struct ctl_table_header *head)
171 {
172 	spin_lock(&sysctl_lock);
173 	head->count++;
174 	spin_unlock(&sysctl_lock);
175 }
176 
177 void sysctl_head_put(struct ctl_table_header *head)
178 {
179 	spin_lock(&sysctl_lock);
180 	if (!--head->count)
181 		kfree_rcu(head, rcu);
182 	spin_unlock(&sysctl_lock);
183 }
184 
185 static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
186 {
187 	if (!head)
188 		BUG();
189 	spin_lock(&sysctl_lock);
190 	if (!use_table(head))
191 		head = ERR_PTR(-ENOENT);
192 	spin_unlock(&sysctl_lock);
193 	return head;
194 }
195 
196 static void sysctl_head_finish(struct ctl_table_header *head)
197 {
198 	if (!head)
199 		return;
200 	spin_lock(&sysctl_lock);
201 	unuse_table(head);
202 	spin_unlock(&sysctl_lock);
203 }
204 
205 static struct ctl_table_set *
206 lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces)
207 {
208 	struct ctl_table_set *set = &root->default_set;
209 	if (root->lookup)
210 		set = root->lookup(root, namespaces);
211 	return set;
212 }
213 
214 static struct list_head *
215 lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces)
216 {
217 	struct ctl_table_set *set = lookup_header_set(root, namespaces);
218 	return &set->list;
219 }
220 
221 static struct ctl_table *lookup_entry(struct ctl_table_header **phead,
222 				      struct ctl_table_header *dir_head,
223 				      struct ctl_table *dir,
224 				      const char *name, int namelen)
225 {
226 	struct ctl_table_header *head;
227 	struct ctl_table *entry;
228 	struct ctl_table_root *root;
229 	struct ctl_table_set *set;
230 
231 	spin_lock(&sysctl_lock);
232 	root = &sysctl_table_root;
233 	do {
234 		set = lookup_header_set(root, current->nsproxy);
235 		entry = find_entry(&head, set, dir_head, dir, name, namelen);
236 		if (entry && use_table(head))
237 			*phead = head;
238 		else
239 			entry = NULL;
240 		root = list_entry(root->root_list.next,
241 				  struct ctl_table_root, root_list);
242 	} while (!entry && root != &sysctl_table_root);
243 	spin_unlock(&sysctl_lock);
244 	return entry;
245 }
246 
/*
 * Find the next header that contributes entries to directory @dir.
 *
 * The search resumes just after list position @tmp in the header list of
 * @root (as selected for the current task's namespaces).  When that list
 * is exhausted it continues with the following roots that have a
 * non-empty list, and stops once the walk is back at sysctl_table_root.
 *
 * Both callers in this file, first_entry() and next_entry(), hold
 * sysctl_lock around the call.
 *
 * Returns a header on which use_table() succeeded, or NULL when there are
 * no more candidates.
 */
static struct ctl_table_header *next_usable_entry(struct ctl_table *dir,
	struct ctl_table_root *root, struct list_head *tmp)
{
	struct nsproxy *namespaces = current->nsproxy;
	struct list_head *header_list;
	struct ctl_table_header *head;

	/* @tmp itself is not a candidate: step past it before examining anything */
	goto next;
	for (;;) {
		head = list_entry(tmp, struct ctl_table_header, ctl_entry);
		root = head->root;

		/*
		 * Skip headers attached elsewhere, headers whose attached
		 * table is empty, and headers that cannot be used any more.
		 */
		if (head->attached_to != dir ||
		    !head->attached_by->procname ||
		    !use_table(head))
			goto next;

		return head;
	next:
		tmp = tmp->next;
		header_list = lookup_header_list(root, namespaces);
		/* not back at the list head yet: examine this node */
		if (tmp != header_list)
			continue;

		/* end of this root's list: move on to the next non-empty root */
		do {
			root = list_entry(root->root_list.next,
					struct ctl_table_root, root_list);
			/* wrapped around to the default root: nothing left */
			if (root == &sysctl_table_root)
				goto out;
			header_list = lookup_header_list(root, namespaces);
		} while (list_empty(header_list));
		tmp = header_list->next;
	}
out:
	return NULL;
}
283 
284 static void first_entry(
285 	struct ctl_table_header *dir_head, struct ctl_table *dir,
286 	struct ctl_table_header **phead, struct ctl_table **pentry)
287 {
288 	struct ctl_table_header *head = dir_head;
289 	struct ctl_table *entry = dir;
290 
291 	spin_lock(&sysctl_lock);
292 	if (entry->procname) {
293 		use_table(head);
294 	} else {
295 		head = next_usable_entry(dir, &sysctl_table_root,
296 					 &sysctl_table_root.default_set.list);
297 		if (head)
298 			entry = head->attached_by;
299 	}
300 	spin_unlock(&sysctl_lock);
301 	*phead = head;
302 	*pentry = entry;
303 }
304 
305 static void next_entry(struct ctl_table *dir,
306 	struct ctl_table_header **phead, struct ctl_table **pentry)
307 {
308 	struct ctl_table_header *head = *phead;
309 	struct ctl_table *entry = *pentry;
310 
311 	entry++;
312 	if (!entry->procname) {
313 		struct ctl_table_root *root = head->root;
314 		struct list_head *tmp = &head->ctl_entry;
315 		if (head->attached_to != dir) {
316 			root = &sysctl_table_root;
317 			tmp = &sysctl_table_root.default_set.list;
318 		}
319 		spin_lock(&sysctl_lock);
320 		unuse_table(head);
321 		head = next_usable_entry(dir, root, tmp);
322 		spin_unlock(&sysctl_lock);
323 		if (head)
324 			entry = head->attached_by;
325 	}
326 	*phead = head;
327 	*pentry = entry;
328 }
329 
330 void register_sysctl_root(struct ctl_table_root *root)
331 {
332 	spin_lock(&sysctl_lock);
333 	list_add_tail(&root->root_list, &sysctl_table_root.root_list);
334 	spin_unlock(&sysctl_lock);
335 }
336 
337 /*
338  * sysctl_perm does NOT grant the superuser all rights automatically, because
339  * some sysctl variables are readonly even to root.
340  */
341 
342 static int test_perm(int mode, int op)
343 {
344 	if (!current_euid())
345 		mode >>= 6;
346 	else if (in_egroup_p(0))
347 		mode >>= 3;
348 	if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0)
349 		return 0;
350 	return -EACCES;
351 }
352 
353 static int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
354 {
355 	int mode;
356 
357 	if (root->permissions)
358 		mode = root->permissions(root, current->nsproxy, table);
359 	else
360 		mode = table->mode;
361 
362 	return test_perm(mode, op);
363 }
364 
365 static struct inode *proc_sys_make_inode(struct super_block *sb,
366 		struct ctl_table_header *head, struct ctl_table *table)
367 {
368 	struct inode *inode;
369 	struct proc_inode *ei;
370 
371 	inode = new_inode(sb);
372 	if (!inode)
373 		goto out;
374 
375 	inode->i_ino = get_next_ino();
376 
377 	sysctl_head_get(head);
378 	ei = PROC_I(inode);
379 	ei->sysctl = head;
380 	ei->sysctl_entry = table;
381 
382 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
383 	inode->i_mode = table->mode;
384 	if (!table->child) {
385 		inode->i_mode |= S_IFREG;
386 		inode->i_op = &proc_sys_inode_operations;
387 		inode->i_fop = &proc_sys_file_operations;
388 	} else {
389 		inode->i_mode |= S_IFDIR;
390 		inode->i_op = &proc_sys_dir_operations;
391 		inode->i_fop = &proc_sys_dir_file_operations;
392 	}
393 out:
394 	return inode;
395 }
396 
397 static struct ctl_table_header *grab_header(struct inode *inode)
398 {
399 	struct ctl_table_header *head = PROC_I(inode)->sysctl;
400 	if (!head)
401 		head = &root_table_header;
402 	return sysctl_head_grab(head);
403 }
404 
405 static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
406 					struct nameidata *nd)
407 {
408 	struct ctl_table_header *head = grab_header(dir);
409 	struct ctl_table *table = PROC_I(dir)->sysctl_entry;
410 	struct ctl_table_header *h = NULL;
411 	struct qstr *name = &dentry->d_name;
412 	struct ctl_table *p;
413 	struct inode *inode;
414 	struct dentry *err = ERR_PTR(-ENOENT);
415 
416 	if (IS_ERR(head))
417 		return ERR_CAST(head);
418 
419 	if (table && !table->child) {
420 		WARN_ON(1);
421 		goto out;
422 	}
423 
424 	table = table ? table->child : &head->ctl_table[1];
425 
426 	p = lookup_entry(&h, head, table, name->name, name->len);
427 	if (!p)
428 		goto out;
429 
430 	err = ERR_PTR(-ENOMEM);
431 	inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p);
432 	if (h)
433 		sysctl_head_finish(h);
434 
435 	if (!inode)
436 		goto out;
437 
438 	err = NULL;
439 	d_set_d_op(dentry, &proc_sys_dentry_operations);
440 	d_add(dentry, inode);
441 
442 out:
443 	sysctl_head_finish(head);
444 	return err;
445 }
446 
447 static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
448 		size_t count, loff_t *ppos, int write)
449 {
450 	struct inode *inode = filp->f_path.dentry->d_inode;
451 	struct ctl_table_header *head = grab_header(inode);
452 	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
453 	ssize_t error;
454 	size_t res;
455 
456 	if (IS_ERR(head))
457 		return PTR_ERR(head);
458 
459 	/*
460 	 * At this point we know that the sysctl was not unregistered
461 	 * and won't be until we finish.
462 	 */
463 	error = -EPERM;
464 	if (sysctl_perm(head->root, table, write ? MAY_WRITE : MAY_READ))
465 		goto out;
466 
467 	/* if that can happen at all, it should be -EINVAL, not -EISDIR */
468 	error = -EINVAL;
469 	if (!table->proc_handler)
470 		goto out;
471 
472 	/* careful: calling conventions are nasty here */
473 	res = count;
474 	error = table->proc_handler(table, write, buf, &res, ppos);
475 	if (!error)
476 		error = res;
477 out:
478 	sysctl_head_finish(head);
479 
480 	return error;
481 }
482 
483 static ssize_t proc_sys_read(struct file *filp, char __user *buf,
484 				size_t count, loff_t *ppos)
485 {
486 	return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 0);
487 }
488 
489 static ssize_t proc_sys_write(struct file *filp, const char __user *buf,
490 				size_t count, loff_t *ppos)
491 {
492 	return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1);
493 }
494 
495 static int proc_sys_open(struct inode *inode, struct file *filp)
496 {
497 	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
498 
499 	if (table->poll)
500 		filp->private_data = proc_sys_poll_event(table->poll);
501 
502 	return 0;
503 }
504 
505 static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)
506 {
507 	struct inode *inode = filp->f_path.dentry->d_inode;
508 	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
509 	unsigned long event = (unsigned long)filp->private_data;
510 	unsigned int ret = DEFAULT_POLLMASK;
511 
512 	if (!table->proc_handler)
513 		goto out;
514 
515 	if (!table->poll)
516 		goto out;
517 
518 	poll_wait(filp, &table->poll->wait, wait);
519 
520 	if (event != atomic_read(&table->poll->event)) {
521 		filp->private_data = proc_sys_poll_event(table->poll);
522 		ret = POLLIN | POLLRDNORM | POLLERR | POLLPRI;
523 	}
524 
525 out:
526 	return ret;
527 }
528 
529 static int proc_sys_fill_cache(struct file *filp, void *dirent,
530 				filldir_t filldir,
531 				struct ctl_table_header *head,
532 				struct ctl_table *table)
533 {
534 	struct dentry *child, *dir = filp->f_path.dentry;
535 	struct inode *inode;
536 	struct qstr qname;
537 	ino_t ino = 0;
538 	unsigned type = DT_UNKNOWN;
539 
540 	qname.name = table->procname;
541 	qname.len  = strlen(table->procname);
542 	qname.hash = full_name_hash(qname.name, qname.len);
543 
544 	child = d_lookup(dir, &qname);
545 	if (!child) {
546 		child = d_alloc(dir, &qname);
547 		if (child) {
548 			inode = proc_sys_make_inode(dir->d_sb, head, table);
549 			if (!inode) {
550 				dput(child);
551 				return -ENOMEM;
552 			} else {
553 				d_set_d_op(child, &proc_sys_dentry_operations);
554 				d_add(child, inode);
555 			}
556 		} else {
557 			return -ENOMEM;
558 		}
559 	}
560 	inode = child->d_inode;
561 	ino  = inode->i_ino;
562 	type = inode->i_mode >> 12;
563 	dput(child);
564 	return !!filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type);
565 }
566 
567 static int scan(struct ctl_table_header *head, ctl_table *table,
568 		unsigned long *pos, struct file *file,
569 		void *dirent, filldir_t filldir)
570 {
571 	int res;
572 
573 	if ((*pos)++ < file->f_pos)
574 		return 0;
575 
576 	res = proc_sys_fill_cache(file, dirent, filldir, head, table);
577 
578 	if (res == 0)
579 		file->f_pos = *pos;
580 
581 	return res;
582 }
583 
584 static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir)
585 {
586 	struct dentry *dentry = filp->f_path.dentry;
587 	struct inode *inode = dentry->d_inode;
588 	struct ctl_table_header *head = grab_header(inode);
589 	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
590 	struct ctl_table_header *h = NULL;
591 	struct ctl_table *entry;
592 	unsigned long pos;
593 	int ret = -EINVAL;
594 
595 	if (IS_ERR(head))
596 		return PTR_ERR(head);
597 
598 	if (table && !table->child) {
599 		WARN_ON(1);
600 		goto out;
601 	}
602 
603 	table = table ? table->child : &head->ctl_table[1];
604 
605 	ret = 0;
606 	/* Avoid a switch here: arm builds fail with missing __cmpdi2 */
607 	if (filp->f_pos == 0) {
608 		if (filldir(dirent, ".", 1, filp->f_pos,
609 				inode->i_ino, DT_DIR) < 0)
610 			goto out;
611 		filp->f_pos++;
612 	}
613 	if (filp->f_pos == 1) {
614 		if (filldir(dirent, "..", 2, filp->f_pos,
615 				parent_ino(dentry), DT_DIR) < 0)
616 			goto out;
617 		filp->f_pos++;
618 	}
619 	pos = 2;
620 
621 	for (first_entry(head, table, &h, &entry); h; next_entry(table, &h, &entry)) {
622 		ret = scan(h, entry, &pos, filp, dirent, filldir);
623 		if (ret) {
624 			sysctl_head_finish(h);
625 			break;
626 		}
627 	}
628 	ret = 1;
629 out:
630 	sysctl_head_finish(head);
631 	return ret;
632 }
633 
634 static int proc_sys_permission(struct inode *inode, int mask)
635 {
636 	/*
637 	 * sysctl entries that are not writeable,
638 	 * are _NOT_ writeable, capabilities or not.
639 	 */
640 	struct ctl_table_header *head;
641 	struct ctl_table *table;
642 	int error;
643 
644 	/* Executable files are not allowed under /proc/sys/ */
645 	if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))
646 		return -EACCES;
647 
648 	head = grab_header(inode);
649 	if (IS_ERR(head))
650 		return PTR_ERR(head);
651 
652 	table = PROC_I(inode)->sysctl_entry;
653 	if (!table) /* global root - r-xr-xr-x */
654 		error = mask & MAY_WRITE ? -EACCES : 0;
655 	else /* Use the permissions on the sysctl table entry */
656 		error = sysctl_perm(head->root, table, mask & ~MAY_NOT_BLOCK);
657 
658 	sysctl_head_finish(head);
659 	return error;
660 }
661 
662 static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr)
663 {
664 	struct inode *inode = dentry->d_inode;
665 	int error;
666 
667 	if (attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID))
668 		return -EPERM;
669 
670 	error = inode_change_ok(inode, attr);
671 	if (error)
672 		return error;
673 
674 	if ((attr->ia_valid & ATTR_SIZE) &&
675 	    attr->ia_size != i_size_read(inode)) {
676 		error = vmtruncate(inode, attr->ia_size);
677 		if (error)
678 			return error;
679 	}
680 
681 	setattr_copy(inode, attr);
682 	mark_inode_dirty(inode);
683 	return 0;
684 }
685 
686 static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
687 {
688 	struct inode *inode = dentry->d_inode;
689 	struct ctl_table_header *head = grab_header(inode);
690 	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
691 
692 	if (IS_ERR(head))
693 		return PTR_ERR(head);
694 
695 	generic_fillattr(inode, stat);
696 	if (table)
697 		stat->mode = (stat->mode & S_IFMT) | table->mode;
698 
699 	sysctl_head_finish(head);
700 	return 0;
701 }
702 
/* File operations installed by proc_sys_make_inode() on leaf entries. */
static const struct file_operations proc_sys_file_operations = {
	.open		= proc_sys_open,
	.poll		= proc_sys_poll,
	.read		= proc_sys_read,
	.write		= proc_sys_write,
	.llseek		= default_llseek,
};

/* File operations installed by proc_sys_make_inode() on directories. */
static const struct file_operations proc_sys_dir_file_operations = {
	.read		= generic_read_dir,
	.readdir	= proc_sys_readdir,
	.llseek		= generic_file_llseek,
};

/* Inode operations for leaf entries (no lookup). */
static const struct inode_operations proc_sys_inode_operations = {
	.permission	= proc_sys_permission,
	.setattr	= proc_sys_setattr,
	.getattr	= proc_sys_getattr,
};

/* Inode operations for directories: as for leaves, plus name lookup. */
static const struct inode_operations proc_sys_dir_operations = {
	.lookup		= proc_sys_lookup,
	.permission	= proc_sys_permission,
	.setattr	= proc_sys_setattr,
	.getattr	= proc_sys_getattr,
};
729 
730 static int proc_sys_revalidate(struct dentry *dentry, struct nameidata *nd)
731 {
732 	if (nd->flags & LOOKUP_RCU)
733 		return -ECHILD;
734 	return !PROC_I(dentry->d_inode)->sysctl->unregistering;
735 }
736 
737 static int proc_sys_delete(const struct dentry *dentry)
738 {
739 	return !!PROC_I(dentry->d_inode)->sysctl->unregistering;
740 }
741 
742 static int sysctl_is_seen(struct ctl_table_header *p)
743 {
744 	struct ctl_table_set *set = p->set;
745 	int res;
746 	spin_lock(&sysctl_lock);
747 	if (p->unregistering)
748 		res = 0;
749 	else if (!set->is_seen)
750 		res = 1;
751 	else
752 		res = set->is_seen(set);
753 	spin_unlock(&sysctl_lock);
754 	return res;
755 }
756 
757 static int proc_sys_compare(const struct dentry *parent,
758 		const struct inode *pinode,
759 		const struct dentry *dentry, const struct inode *inode,
760 		unsigned int len, const char *str, const struct qstr *name)
761 {
762 	struct ctl_table_header *head;
763 	/* Although proc doesn't have negative dentries, rcu-walk means
764 	 * that inode here can be NULL */
765 	/* AV: can it, indeed? */
766 	if (!inode)
767 		return 1;
768 	if (name->len != len)
769 		return 1;
770 	if (memcmp(name->name, str, len))
771 		return 1;
772 	head = rcu_dereference(PROC_I(inode)->sysctl);
773 	return !head || !sysctl_is_seen(head);
774 }
775 
/*
 * Dentry operations set on every /proc/sys dentry by proc_sys_lookup()
 * and proc_sys_fill_cache().
 */
static const struct dentry_operations proc_sys_dentry_operations = {
	.d_revalidate	= proc_sys_revalidate,
	.d_delete	= proc_sys_delete,
	.d_compare	= proc_sys_compare,
};
781 
782 static struct ctl_table *is_branch_in(struct ctl_table *branch,
783 				      struct ctl_table *table)
784 {
785 	struct ctl_table *p;
786 	const char *s = branch->procname;
787 
788 	/* branch should have named subdirectory as its first element */
789 	if (!s || !branch->child)
790 		return NULL;
791 
792 	/* ... and nothing else */
793 	if (branch[1].procname)
794 		return NULL;
795 
796 	/* table should contain subdirectory with the same name */
797 	for (p = table; p->procname; p++) {
798 		if (!p->child)
799 			continue;
800 		if (p->procname && strcmp(p->procname, s) == 0)
801 			return p;
802 	}
803 	return NULL;
804 }
805 
/*
 * see if attaching q to p would be an improvement
 *
 * Starting from the tops of both tables, descend for as long as q's table
 * is a lone subdirectory (see is_branch_in()) that also exists in p's
 * table.  If the descent qualifies, q's attachment point is replaced by
 * the pair of tables where it stopped and p becomes q's parent.
 */
static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
{
	struct ctl_table *to = p->ctl_table, *by = q->ctl_table;
	struct ctl_table *next;
	int is_better = 0;
	/* a header with no attached_by of its own needs no further check */
	int not_in_parent = !p->attached_by;

	while ((next = is_branch_in(by, to)) != NULL) {
		/* the descent passed q's current attachment point */
		if (by == q->attached_by)
			is_better = 1;
		/* the descent passed the table p itself is attached by */
		if (to == p->attached_by)
			not_in_parent = 1;
		by = by->child;
		to = next->child;
	}

	if (is_better && not_in_parent) {
		q->attached_by = by;
		q->attached_to = to;
		q->parent = p;
	}
}
829 
830 static int sysctl_check_table_dups(const char *path, struct ctl_table *old,
831 	struct ctl_table *table)
832 {
833 	struct ctl_table *entry, *test;
834 	int error = 0;
835 
836 	for (entry = old; entry->procname; entry++) {
837 		for (test = table; test->procname; test++) {
838 			if (strcmp(entry->procname, test->procname) == 0) {
839 				printk(KERN_ERR "sysctl duplicate entry: %s/%s\n",
840 					path, test->procname);
841 				error = -EEXIST;
842 			}
843 		}
844 	}
845 	return error;
846 }
847 
848 static int sysctl_check_dups(struct nsproxy *namespaces,
849 	struct ctl_table_header *header,
850 	const char *path, struct ctl_table *table)
851 {
852 	struct ctl_table_root *root;
853 	struct ctl_table_set *set;
854 	struct ctl_table_header *dir_head, *head;
855 	struct ctl_table *dir_table;
856 	int error = 0;
857 
858 	/* No dups if we are the only member of our directory */
859 	if (header->attached_by != table)
860 		return 0;
861 
862 	dir_head = header->parent;
863 	dir_table = header->attached_to;
864 
865 	error = sysctl_check_table_dups(path, dir_table, table);
866 
867 	root = &sysctl_table_root;
868 	do {
869 		set = lookup_header_set(root, namespaces);
870 
871 		list_for_each_entry(head, &set->list, ctl_entry) {
872 			if (head->unregistering)
873 				continue;
874 			if (head->attached_to != dir_table)
875 				continue;
876 			error = sysctl_check_table_dups(path, head->attached_by,
877 							table);
878 		}
879 		root = list_entry(root->root_list.next,
880 				  struct ctl_table_root, root_list);
881 	} while (root != &sysctl_table_root);
882 	return error;
883 }
884 
885 static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...)
886 {
887 	struct va_format vaf;
888 	va_list args;
889 
890 	va_start(args, fmt);
891 	vaf.fmt = fmt;
892 	vaf.va = &args;
893 
894 	printk(KERN_ERR "sysctl table check failed: %s/%s %pV\n",
895 		path, table->procname, &vaf);
896 
897 	va_end(args);
898 	return -EINVAL;
899 }
900 
901 static int sysctl_check_table(const char *path, struct ctl_table *table)
902 {
903 	int err = 0;
904 	for (; table->procname; table++) {
905 		if (table->child)
906 			err = sysctl_err(path, table, "Not a file");
907 
908 		if ((table->proc_handler == proc_dostring) ||
909 		    (table->proc_handler == proc_dointvec) ||
910 		    (table->proc_handler == proc_dointvec_minmax) ||
911 		    (table->proc_handler == proc_dointvec_jiffies) ||
912 		    (table->proc_handler == proc_dointvec_userhz_jiffies) ||
913 		    (table->proc_handler == proc_dointvec_ms_jiffies) ||
914 		    (table->proc_handler == proc_doulongvec_minmax) ||
915 		    (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
916 			if (!table->data)
917 				err = sysctl_err(path, table, "No data");
918 			if (!table->maxlen)
919 				err = sysctl_err(path, table, "No maxlen");
920 		}
921 		if (!table->proc_handler)
922 			err = sysctl_err(path, table, "No proc_handler");
923 
924 		if ((table->mode & (S_IRUGO|S_IWUGO)) != table->mode)
925 			err = sysctl_err(path, table, "bogus .mode 0%o",
926 				table->mode);
927 	}
928 	return err;
929 }
930 
/**
 * __register_sysctl_table - register a leaf sysctl table
 * @root: List of sysctl headers to register on
 * @namespaces: Data to compute which lists of sysctl entries are visible
 * @path: The path to the directory the sysctl table is in.
 * @table: the top-level table structure
 *
 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
 * array. A completely 0 filled entry terminates the table.
 *
 * The members of the &struct ctl_table structure are used as follows:
 *
 * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
 *            enter a sysctl file
 *
 * data - a pointer to data for use by proc_handler
 *
 * maxlen - the maximum size in bytes of the data
 *
 * mode - the file permissions for the /proc/sys file
 *
 * child - must be %NULL.
 *
 * proc_handler - the text handler routine (described below)
 *
 * extra1, extra2 - extra pointers usable by the proc handler routines
 *
 * Leaf nodes in the sysctl tree will be represented by a single file
 * under /proc; non-leaf nodes will be represented by directories.
 *
 * There must be a proc_handler routine for any terminal nodes.
 * Several default handlers are available to cover common cases -
 *
 * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(),
 * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(),
 * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax()
 *
 * It is the handler's job to read the input buffer from user memory
 * and process it. The handler should return 0 on success.
 *
 * The header, one two-element directory table per component of @path and
 * the copies of the component names are carved out of one allocation.
 * Empty components of @path (repeated, leading or trailing '/') are
 * skipped.
 *
 * This routine returns %NULL on a failure to register, and a pointer
 * to the table header on success.  Failure means the allocation failed,
 * @table did not pass sysctl_check_table(), or it contains a name that
 * already exists in the target directory; in the latter two cases a
 * stack trace is dumped as well.
 */
struct ctl_table_header *__register_sysctl_table(
	struct ctl_table_root *root,
	struct nsproxy *namespaces,
	const char *path, struct ctl_table *table)
{
	struct ctl_table_header *header;
	struct ctl_table *new, **prevp;
	const char *name, *nextname;
	unsigned int npath = 0;
	struct ctl_table_set *set;
	size_t path_bytes = 0;
	char *new_name;

	/* Count the path components and the bytes needed for their names */
	for (name = path; name; name = nextname) {
		int namelen;
		nextname = strchr(name, '/');
		if (nextname) {
			namelen = nextname - name;
			nextname++;
		} else {
			namelen = strlen(name);
		}
		/* skip empty components */
		if (namelen == 0)
			continue;
		/* +1 for the terminating NUL of the copied name */
		path_bytes += namelen + 1;
		npath++;
	}

	/*
	 * For each path component, allocate a 2-element ctl_table array.
	 * The first array element will be filled with the sysctl entry
	 * for this, the second will be the sentinel (procname == 0).
	 *
	 * We allocate everything in one go so that we don't have to
	 * worry about freeing additional memory in unregister_sysctl_table.
	 */
	header = kzalloc(sizeof(struct ctl_table_header) + path_bytes +
			 (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL);
	if (!header)
		return NULL;

	/* layout: header | 2 * npath ctl_table entries | name strings */
	new = (struct ctl_table *) (header + 1);
	new_name = (char *)(new + (2 * npath));

	/*
	 * Now connect the dots: chain one directory entry per component,
	 * each one's child pointing at the next, the last one at @table.
	 */
	prevp = &header->ctl_table;
	for (name = path; name; name = nextname) {
		int namelen;
		nextname = strchr(name, '/');
		if (nextname) {
			namelen = nextname - name;
			nextname++;
		} else {
			namelen = strlen(name);
		}
		if (namelen == 0)
			continue;
		memcpy(new_name, name, namelen);
		new_name[namelen] = '\0';

		new->procname = new_name;
		new->mode     = 0555;

		*prevp = new;
		prevp = &new->child;

		/* skip this entry and its zeroed sentinel */
		new += 2;
		new_name += namelen + 1;
	}
	*prevp = table;

	/* the set is filled in below, under sysctl_lock */
	init_header(header, root, NULL, table);
	if (sysctl_check_table(path, table))
		goto fail;

	spin_lock(&sysctl_lock);
	header->set = lookup_header_set(root, namespaces);
	/* default attachment: the whole chain, directly below the root */
	header->attached_by = header->ctl_table;
	header->attached_to = &root_table[1];
	header->parent = &root_table_header;
	set = header->set;
	root = header->root;
	/*
	 * Offer every live header of this root's set, and of the roots
	 * before it on the root list, back to sysctl_table_root, as a
	 * better attachment point.
	 */
	for (;;) {
		struct ctl_table_header *p;
		list_for_each_entry(p, &set->list, ctl_entry) {
			if (p->unregistering)
				continue;
			try_attach(p, header);
		}
		if (root == &sysctl_table_root)
			break;
		root = list_entry(root->root_list.prev,
				  struct ctl_table_root, root_list);
		set = lookup_header_set(root, namespaces);
	}
	if (sysctl_check_dups(namespaces, header, path, table))
		goto fail_locked;
	insert_header(header);
	spin_unlock(&sysctl_lock);

	return header;
fail_locked:
	spin_unlock(&sysctl_lock);
fail:
	kfree(header);
	dump_stack();
	return NULL;
}
1083 
/*
 * Append "@name/" to the path being built in the buffer starting at @path.
 *
 * @pos is the current end of the path inside that buffer.  Returns the
 * new end, which points at the terminating NUL, or NULL without writing
 * anything if the result would reach PATH_MAX.
 */
static char *append_path(const char *path, char *pos, const char *name)
{
	int namelen = strlen(name);
	char *end;

	if ((pos - path) + namelen + 2 >= PATH_MAX)
		return NULL;

	memcpy(pos, name, namelen);
	end = pos + namelen;
	*end++ = '/';
	*end = '\0';
	return end;
}
1096 
1097 static int count_subheaders(struct ctl_table *table)
1098 {
1099 	int has_files = 0;
1100 	int nr_subheaders = 0;
1101 	struct ctl_table *entry;
1102 
1103 	/* special case: no directory and empty directory */
1104 	if (!table || !table->procname)
1105 		return 1;
1106 
1107 	for (entry = table; entry->procname; entry++) {
1108 		if (entry->child)
1109 			nr_subheaders += count_subheaders(entry->child);
1110 		else
1111 			has_files = 1;
1112 	}
1113 	return nr_subheaders + has_files;
1114 }
1115 
1116 static int register_leaf_sysctl_tables(const char *path, char *pos,
1117 	struct ctl_table_header ***subheader,
1118 	struct ctl_table_root *root, struct nsproxy *namespaces,
1119 	struct ctl_table *table)
1120 {
1121 	struct ctl_table *ctl_table_arg = NULL;
1122 	struct ctl_table *entry, *files;
1123 	int nr_files = 0;
1124 	int nr_dirs = 0;
1125 	int err = -ENOMEM;
1126 
1127 	for (entry = table; entry->procname; entry++) {
1128 		if (entry->child)
1129 			nr_dirs++;
1130 		else
1131 			nr_files++;
1132 	}
1133 
1134 	files = table;
1135 	/* If there are mixed files and directories we need a new table */
1136 	if (nr_dirs && nr_files) {
1137 		struct ctl_table *new;
1138 		files = kzalloc(sizeof(struct ctl_table) * (nr_files + 1),
1139 				GFP_KERNEL);
1140 		if (!files)
1141 			goto out;
1142 
1143 		ctl_table_arg = files;
1144 		for (new = files, entry = table; entry->procname; entry++) {
1145 			if (entry->child)
1146 				continue;
1147 			*new = *entry;
1148 			new++;
1149 		}
1150 	}
1151 
1152 	/* Register everything except a directory full of subdirectories */
1153 	if (nr_files || !nr_dirs) {
1154 		struct ctl_table_header *header;
1155 		header = __register_sysctl_table(root, namespaces, path, files);
1156 		if (!header) {
1157 			kfree(ctl_table_arg);
1158 			goto out;
1159 		}
1160 
1161 		/* Remember if we need to free the file table */
1162 		header->ctl_table_arg = ctl_table_arg;
1163 		**subheader = header;
1164 		(*subheader)++;
1165 	}
1166 
1167 	/* Recurse into the subdirectories. */
1168 	for (entry = table; entry->procname; entry++) {
1169 		char *child_pos;
1170 
1171 		if (!entry->child)
1172 			continue;
1173 
1174 		err = -ENAMETOOLONG;
1175 		child_pos = append_path(path, pos, entry->procname);
1176 		if (!child_pos)
1177 			goto out;
1178 
1179 		err = register_leaf_sysctl_tables(path, child_pos, subheader,
1180 						  root, namespaces, entry->child);
1181 		pos[0] = '\0';
1182 		if (err)
1183 			goto out;
1184 	}
1185 	err = 0;
1186 out:
1187 	/* On failure our caller will unregister all registered subheaders */
1188 	return err;
1189 }
1190 
1191 /**
1192  * __register_sysctl_paths - register a sysctl table hierarchy
1193  * @root: List of sysctl headers to register on
1194  * @namespaces: Data to compute which lists of sysctl entries are visible
1195  * @path: The path to the directory the sysctl table is in.
1196  * @table: the top-level table structure
1197  *
1198  * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1199  * array. A completely 0 filled entry terminates the table.
1200  *
1201  * See __register_sysctl_table for more details.
1202  */
1203 struct ctl_table_header *__register_sysctl_paths(
1204 	struct ctl_table_root *root,
1205 	struct nsproxy *namespaces,
1206 	const struct ctl_path *path, struct ctl_table *table)
1207 {
1208 	struct ctl_table *ctl_table_arg = table;
1209 	int nr_subheaders = count_subheaders(table);
1210 	struct ctl_table_header *header = NULL, **subheaders, **subheader;
1211 	const struct ctl_path *component;
1212 	char *new_path, *pos;
1213 
1214 	pos = new_path = kmalloc(PATH_MAX, GFP_KERNEL);
1215 	if (!new_path)
1216 		return NULL;
1217 
1218 	pos[0] = '\0';
1219 	for (component = path; component->procname; component++) {
1220 		pos = append_path(new_path, pos, component->procname);
1221 		if (!pos)
1222 			goto out;
1223 	}
1224 	while (table->procname && table->child && !table[1].procname) {
1225 		pos = append_path(new_path, pos, table->procname);
1226 		if (!pos)
1227 			goto out;
1228 		table = table->child;
1229 	}
1230 	if (nr_subheaders == 1) {
1231 		header = __register_sysctl_table(root, namespaces, new_path, table);
1232 		if (header)
1233 			header->ctl_table_arg = ctl_table_arg;
1234 	} else {
1235 		header = kzalloc(sizeof(*header) +
1236 				 sizeof(*subheaders)*nr_subheaders, GFP_KERNEL);
1237 		if (!header)
1238 			goto out;
1239 
1240 		subheaders = (struct ctl_table_header **) (header + 1);
1241 		subheader = subheaders;
1242 		header->ctl_table_arg = ctl_table_arg;
1243 
1244 		if (register_leaf_sysctl_tables(new_path, pos, &subheader,
1245 						root, namespaces, table))
1246 			goto err_register_leaves;
1247 	}
1248 
1249 out:
1250 	kfree(new_path);
1251 	return header;
1252 
1253 err_register_leaves:
1254 	while (subheader > subheaders) {
1255 		struct ctl_table_header *subh = *(--subheader);
1256 		struct ctl_table *table = subh->ctl_table_arg;
1257 		unregister_sysctl_table(subh);
1258 		kfree(table);
1259 	}
1260 	kfree(header);
1261 	header = NULL;
1262 	goto out;
1263 }
1264 
1265 /**
1266  * register_sysctl_table_path - register a sysctl table hierarchy
1267  * @path: The path to the directory the sysctl table is in.
1268  * @table: the top-level table structure
1269  *
1270  * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1271  * array. A completely 0 filled entry terminates the table.
1272  *
1273  * See __register_sysctl_paths for more details.
1274  */
1275 struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
1276 						struct ctl_table *table)
1277 {
1278 	return __register_sysctl_paths(&sysctl_table_root, current->nsproxy,
1279 					path, table);
1280 }
1281 EXPORT_SYMBOL(register_sysctl_paths);
1282 
1283 /**
1284  * register_sysctl_table - register a sysctl table hierarchy
1285  * @table: the top-level table structure
1286  *
1287  * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1288  * array. A completely 0 filled entry terminates the table.
1289  *
1290  * See register_sysctl_paths for more details.
1291  */
1292 struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
1293 {
1294 	static const struct ctl_path null_path[] = { {} };
1295 
1296 	return register_sysctl_paths(null_path, table);
1297 }
1298 EXPORT_SYMBOL(register_sysctl_table);
1299 
1300 static void drop_sysctl_table(struct ctl_table_header *header)
1301 {
1302 	if (--header->nreg)
1303 		return;
1304 
1305 	start_unregistering(header);
1306 	if (!--header->parent->count) {
1307 		WARN_ON(1);
1308 		kfree_rcu(header->parent, rcu);
1309 	}
1310 	if (!--header->count)
1311 		kfree_rcu(header, rcu);
1312 }
1313 
1314 /**
1315  * unregister_sysctl_table - unregister a sysctl table hierarchy
1316  * @header: the header returned from register_sysctl_table
1317  *
1318  * Unregisters the sysctl table and all children. proc entries may not
1319  * actually be removed until they are no longer used by anyone.
1320  */
1321 void unregister_sysctl_table(struct ctl_table_header * header)
1322 {
1323 	int nr_subheaders;
1324 	might_sleep();
1325 
1326 	if (header == NULL)
1327 		return;
1328 
1329 	nr_subheaders = count_subheaders(header->ctl_table_arg);
1330 	if (unlikely(nr_subheaders > 1)) {
1331 		struct ctl_table_header **subheaders;
1332 		int i;
1333 
1334 		subheaders = (struct ctl_table_header **)(header + 1);
1335 		for (i = nr_subheaders -1; i >= 0; i--) {
1336 			struct ctl_table_header *subh = subheaders[i];
1337 			struct ctl_table *table = subh->ctl_table_arg;
1338 			unregister_sysctl_table(subh);
1339 			kfree(table);
1340 		}
1341 		kfree(header);
1342 		return;
1343 	}
1344 
1345 	spin_lock(&sysctl_lock);
1346 	drop_sysctl_table(header);
1347 	spin_unlock(&sysctl_lock);
1348 }
1349 EXPORT_SYMBOL(unregister_sysctl_table);
1350 
1351 void setup_sysctl_set(struct ctl_table_set *p,
1352 	struct ctl_table_root *root,
1353 	int (*is_seen)(struct ctl_table_set *))
1354 {
1355 	INIT_LIST_HEAD(&p->list);
1356 	p->root = root;
1357 	p->is_seen = is_seen;
1358 }
1359 
1360 void retire_sysctl_set(struct ctl_table_set *set)
1361 {
1362 	WARN_ON(!list_empty(&set->list));
1363 }
1364 
1365 int __init proc_sys_init(void)
1366 {
1367 	struct proc_dir_entry *proc_sys_root;
1368 
1369 	proc_sys_root = proc_mkdir("sys", NULL);
1370 	proc_sys_root->proc_iops = &proc_sys_dir_operations;
1371 	proc_sys_root->proc_fops = &proc_sys_dir_file_operations;
1372 	proc_sys_root->nlink = 0;
1373 
1374 	return sysctl_init();
1375 }
1376