xref: /openbmc/linux/fs/ceph/xattr.c (revision f220d3eb)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/ceph/ceph_debug.h>
3 #include <linux/ceph/pagelist.h>
4 
5 #include "super.h"
6 #include "mds_client.h"
7 
8 #include <linux/ceph/decode.h>
9 
10 #include <linux/xattr.h>
11 #include <linux/posix_acl_xattr.h>
12 #include <linux/slab.h>
13 
14 #define XATTR_CEPH_PREFIX "ceph."
15 #define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1)
16 
17 static int __remove_xattr(struct ceph_inode_info *ci,
18 			  struct ceph_inode_xattr *xattr);
19 
20 static const struct xattr_handler ceph_other_xattr_handler;
21 
22 /*
23  * List of handlers for synthetic system.* attributes. Other
24  * attributes are handled directly.
25  */
26 const struct xattr_handler *ceph_xattr_handlers[] = {
27 #ifdef CONFIG_CEPH_FS_POSIX_ACL
28 	&posix_acl_access_xattr_handler,
29 	&posix_acl_default_xattr_handler,
30 #endif
31 	&ceph_other_xattr_handler,
32 	NULL,
33 };
34 
35 static bool ceph_is_valid_xattr(const char *name)
36 {
37 	return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) ||
38 	       !strncmp(name, XATTR_SECURITY_PREFIX,
39 			XATTR_SECURITY_PREFIX_LEN) ||
40 	       !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
41 	       !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
42 }
43 
44 /*
45  * These define virtual xattrs exposing the recursive directory
46  * statistics and layout metadata.
47  */
48 struct ceph_vxattr {
49 	char *name;
50 	size_t name_size;	/* strlen(name) + 1 (for '\0') */
51 	size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
52 			      size_t size);
53 	bool (*exists_cb)(struct ceph_inode_info *ci);
54 	unsigned int flags;
55 };
56 
57 #define VXATTR_FLAG_READONLY		(1<<0)
58 #define VXATTR_FLAG_HIDDEN		(1<<1)
59 #define VXATTR_FLAG_RSTAT		(1<<2)
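/*
 * READONLY: the vxattr cannot be changed through setxattr (-EOPNOTSUPP).
 * HIDDEN:   not reported by listxattr(), but still readable by name.
 * RSTAT:    reading it first fetches recursive stats from the MDS.
 */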
60 
61 /* layouts */
62 
63 static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci)
64 {
65 	struct ceph_file_layout *fl = &ci->i_layout;
66 	return (fl->stripe_unit > 0 || fl->stripe_count > 0 ||
67 		fl->object_size > 0 || fl->pool_id >= 0 ||
68 		rcu_dereference_raw(fl->pool_ns) != NULL);
69 }
70 
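/*
 * The composed value looks like, e.g. (pool name and namespace made up):
 *   "stripe_unit=4194304 stripe_count=1 object_size=4194304 pool=cephfs_data pool_namespace=ns1"
 * The " pool_namespace=..." part is appended only when a namespace is set,
 * and the numeric pool id is printed if the pool name is unknown locally.
 */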
71 static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
72 				   size_t size)
73 {
74 	struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
75 	struct ceph_osd_client *osdc = &fsc->client->osdc;
76 	struct ceph_string *pool_ns;
77 	s64 pool = ci->i_layout.pool_id;
78 	const char *pool_name;
79 	const char *ns_field = " pool_namespace=";
80 	char buf[128];
81 	size_t len, total_len = 0;
82 	int ret;
83 
84 	pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
85 
86 	dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
87 	down_read(&osdc->lock);
88 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
89 	if (pool_name) {
90 		len = snprintf(buf, sizeof(buf),
91 		"stripe_unit=%u stripe_count=%u object_size=%u pool=",
92 		ci->i_layout.stripe_unit, ci->i_layout.stripe_count,
93 	        ci->i_layout.object_size);
94 		total_len = len + strlen(pool_name);
95 	} else {
96 		len = snprintf(buf, sizeof(buf),
97 		"stripe_unit=%u stripe_count=%u object_size=%u pool=%lld",
98 		ci->i_layout.stripe_unit, ci->i_layout.stripe_count,
99 	        ci->i_layout.object_size, (long long)pool);
100 		total_len = len;
101 	}
102 
103 	if (pool_ns)
104 		total_len += strlen(ns_field) + pool_ns->len;
105 
106 	if (!size) {
107 		ret = total_len;
108 	} else if (total_len > size) {
109 		ret = -ERANGE;
110 	} else {
111 		memcpy(val, buf, len);
112 		ret = len;
113 		if (pool_name) {
114 			len = strlen(pool_name);
115 			memcpy(val + ret, pool_name, len);
116 			ret += len;
117 		}
118 		if (pool_ns) {
119 			len = strlen(ns_field);
120 			memcpy(val + ret, ns_field, len);
121 			ret += len;
122 			memcpy(val + ret, pool_ns->str, pool_ns->len);
123 			ret += pool_ns->len;
124 		}
125 	}
126 	up_read(&osdc->lock);
127 	ceph_put_string(pool_ns);
128 	return ret;
129 }
130 
131 static size_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci,
132 					       char *val, size_t size)
133 {
134 	return snprintf(val, size, "%u", ci->i_layout.stripe_unit);
135 }
136 
137 static size_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci,
138 						char *val, size_t size)
139 {
140 	return snprintf(val, size, "%u", ci->i_layout.stripe_count);
141 }
142 
143 static size_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci,
144 					       char *val, size_t size)
145 {
146 	return snprintf(val, size, "%u", ci->i_layout.object_size);
147 }
148 
149 static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
150 					char *val, size_t size)
151 {
152 	int ret;
153 	struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
154 	struct ceph_osd_client *osdc = &fsc->client->osdc;
155 	s64 pool = ci->i_layout.pool_id;
156 	const char *pool_name;
157 
158 	down_read(&osdc->lock);
159 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
160 	if (pool_name)
161 		ret = snprintf(val, size, "%s", pool_name);
162 	else
163 		ret = snprintf(val, size, "%lld", (long long)pool);
164 	up_read(&osdc->lock);
165 	return ret;
166 }
167 
168 static size_t ceph_vxattrcb_layout_pool_namespace(struct ceph_inode_info *ci,
169 						  char *val, size_t size)
170 {
171 	int ret = 0;
172 	struct ceph_string *ns = ceph_try_get_string(ci->i_layout.pool_ns);
173 	if (ns) {
174 		ret = snprintf(val, size, "%.*s", (int)ns->len, ns->str);
175 		ceph_put_string(ns);
176 	}
177 	return ret;
178 }
179 
180 /* directories */
181 
182 static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val,
183 					size_t size)
184 {
185 	return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs);
186 }
187 
188 static size_t ceph_vxattrcb_dir_files(struct ceph_inode_info *ci, char *val,
189 				      size_t size)
190 {
191 	return snprintf(val, size, "%lld", ci->i_files);
192 }
193 
194 static size_t ceph_vxattrcb_dir_subdirs(struct ceph_inode_info *ci, char *val,
195 					size_t size)
196 {
197 	return snprintf(val, size, "%lld", ci->i_subdirs);
198 }
199 
200 static size_t ceph_vxattrcb_dir_rentries(struct ceph_inode_info *ci, char *val,
201 					 size_t size)
202 {
203 	return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs);
204 }
205 
206 static size_t ceph_vxattrcb_dir_rfiles(struct ceph_inode_info *ci, char *val,
207 				       size_t size)
208 {
209 	return snprintf(val, size, "%lld", ci->i_rfiles);
210 }
211 
212 static size_t ceph_vxattrcb_dir_rsubdirs(struct ceph_inode_info *ci, char *val,
213 					 size_t size)
214 {
215 	return snprintf(val, size, "%lld", ci->i_rsubdirs);
216 }
217 
218 static size_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val,
219 				       size_t size)
220 {
221 	return snprintf(val, size, "%lld", ci->i_rbytes);
222 }
223 
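/*
 * Reported as "<seconds>.<nanoseconds>" with the nanosecond part
 * zero-padded to nine digits, e.g. "1534158669.000000042" (made-up value).
 */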
224 static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
225 				       size_t size)
226 {
227 	return snprintf(val, size, "%lld.%09ld", ci->i_rctime.tv_sec,
228 			ci->i_rctime.tv_nsec);
229 }
230 
231 /* quotas */
232 
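/*
 * Quota vxattrs are reported as present only when a quota is actually
 * configured (max_files or max_bytes non-zero) on a non-snapshot inode
 * that is the root of its own snap realm.
 */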
233 static bool ceph_vxattrcb_quota_exists(struct ceph_inode_info *ci)
234 {
235 	bool ret = false;
236 	spin_lock(&ci->i_ceph_lock);
237 	if ((ci->i_max_files || ci->i_max_bytes) &&
238 	    ci->i_vino.snap == CEPH_NOSNAP &&
239 	    ci->i_snap_realm &&
240 	    ci->i_snap_realm->ino == ci->i_vino.ino)
241 		ret = true;
242 	spin_unlock(&ci->i_ceph_lock);
243 	return ret;
244 }
245 
246 static size_t ceph_vxattrcb_quota(struct ceph_inode_info *ci, char *val,
247 				  size_t size)
248 {
249 	return snprintf(val, size, "max_bytes=%llu max_files=%llu",
250 			ci->i_max_bytes, ci->i_max_files);
251 }
252 
253 static size_t ceph_vxattrcb_quota_max_bytes(struct ceph_inode_info *ci,
254 					    char *val, size_t size)
255 {
256 	return snprintf(val, size, "%llu", ci->i_max_bytes);
257 }
258 
259 static size_t ceph_vxattrcb_quota_max_files(struct ceph_inode_info *ci,
260 					    char *val, size_t size)
261 {
262 	return snprintf(val, size, "%llu", ci->i_max_files);
263 }
264 
265 #define CEPH_XATTR_NAME(_type, _name)	XATTR_CEPH_PREFIX #_type "." #_name
266 #define CEPH_XATTR_NAME2(_type, _name, _name2)	\
267 	XATTR_CEPH_PREFIX #_type "." #_name "." #_name2
268 
269 #define XATTR_NAME_CEPH(_type, _name, _flags)				\
270 	{								\
271 		.name = CEPH_XATTR_NAME(_type, _name),			\
272 		.name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \
273 		.getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
274 		.exists_cb = NULL,					\
275 		.flags = (VXATTR_FLAG_READONLY | _flags),		\
276 	}
277 #define XATTR_RSTAT_FIELD(_type, _name)			\
278 	XATTR_NAME_CEPH(_type, _name, VXATTR_FLAG_RSTAT)
279 #define XATTR_LAYOUT_FIELD(_type, _name, _field)			\
280 	{								\
281 		.name = CEPH_XATTR_NAME2(_type, _name, _field),	\
282 		.name_size = sizeof (CEPH_XATTR_NAME2(_type, _name, _field)), \
283 		.getxattr_cb = ceph_vxattrcb_ ## _name ## _ ## _field, \
284 		.exists_cb = ceph_vxattrcb_layout_exists,	\
285 		.flags = VXATTR_FLAG_HIDDEN,			\
286 	}
287 #define XATTR_QUOTA_FIELD(_type, _name)					\
288 	{								\
289 		.name = CEPH_XATTR_NAME(_type, _name),			\
290 		.name_size = sizeof(CEPH_XATTR_NAME(_type, _name)),	\
291 		.getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name,	\
292 		.exists_cb = ceph_vxattrcb_quota_exists,		\
293 		.flags = VXATTR_FLAG_HIDDEN,				\
294 	}
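/*
 * e.g. XATTR_RSTAT_FIELD(dir, rbytes) expands to an entry named
 * "ceph.dir.rbytes", backed by ceph_vxattrcb_dir_rbytes(), with
 * VXATTR_FLAG_READONLY | VXATTR_FLAG_RSTAT set.
 */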
295 
296 static struct ceph_vxattr ceph_dir_vxattrs[] = {
297 	{
298 		.name = "ceph.dir.layout",
299 		.name_size = sizeof("ceph.dir.layout"),
300 		.getxattr_cb = ceph_vxattrcb_layout,
301 		.exists_cb = ceph_vxattrcb_layout_exists,
302 		.flags = VXATTR_FLAG_HIDDEN,
303 	},
304 	XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
305 	XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
306 	XATTR_LAYOUT_FIELD(dir, layout, object_size),
307 	XATTR_LAYOUT_FIELD(dir, layout, pool),
308 	XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
309 	XATTR_NAME_CEPH(dir, entries, 0),
310 	XATTR_NAME_CEPH(dir, files, 0),
311 	XATTR_NAME_CEPH(dir, subdirs, 0),
312 	XATTR_RSTAT_FIELD(dir, rentries),
313 	XATTR_RSTAT_FIELD(dir, rfiles),
314 	XATTR_RSTAT_FIELD(dir, rsubdirs),
315 	XATTR_RSTAT_FIELD(dir, rbytes),
316 	XATTR_RSTAT_FIELD(dir, rctime),
317 	{
318 		.name = "ceph.quota",
319 		.name_size = sizeof("ceph.quota"),
320 		.getxattr_cb = ceph_vxattrcb_quota,
321 		.exists_cb = ceph_vxattrcb_quota_exists,
322 		.flags = VXATTR_FLAG_HIDDEN,
323 	},
324 	XATTR_QUOTA_FIELD(quota, max_bytes),
325 	XATTR_QUOTA_FIELD(quota, max_files),
326 	{ .name = NULL, 0 }	/* Required table terminator */
327 };
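/*
 * Userspace reads these through the normal xattr syscalls; an
 * illustrative example ("/mnt/cephfs/some/dir" is a hypothetical path):
 *
 *	char buf[64];
 *	ssize_t n = getxattr("/mnt/cephfs/some/dir", "ceph.dir.rbytes",
 *			     buf, sizeof(buf));
 *
 * Entries flagged VXATTR_FLAG_HIDDEN can still be read by name like
 * this, but are not listed by listxattr(2).
 */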
328 static size_t ceph_dir_vxattrs_name_size;	/* total size of all names */
329 
330 /* files */
331 
332 static struct ceph_vxattr ceph_file_vxattrs[] = {
333 	{
334 		.name = "ceph.file.layout",
335 		.name_size = sizeof("ceph.file.layout"),
336 		.getxattr_cb = ceph_vxattrcb_layout,
337 		.exists_cb = ceph_vxattrcb_layout_exists,
338 		.flags = VXATTR_FLAG_HIDDEN,
339 	},
340 	XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
341 	XATTR_LAYOUT_FIELD(file, layout, stripe_count),
342 	XATTR_LAYOUT_FIELD(file, layout, object_size),
343 	XATTR_LAYOUT_FIELD(file, layout, pool),
344 	XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
345 	{ .name = NULL, 0 }	/* Required table terminator */
346 };
347 static size_t ceph_file_vxattrs_name_size;	/* total size of all names */
348 
349 static struct ceph_vxattr *ceph_inode_vxattrs(struct inode *inode)
350 {
351 	if (S_ISDIR(inode->i_mode))
352 		return ceph_dir_vxattrs;
353 	else if (S_ISREG(inode->i_mode))
354 		return ceph_file_vxattrs;
355 	return NULL;
356 }
357 
358 static size_t ceph_vxattrs_name_size(struct ceph_vxattr *vxattrs)
359 {
360 	if (vxattrs == ceph_dir_vxattrs)
361 		return ceph_dir_vxattrs_name_size;
362 	if (vxattrs == ceph_file_vxattrs)
363 		return ceph_file_vxattrs_name_size;
364 	BUG_ON(vxattrs);
365 	return 0;
366 }
367 
368 /*
369  * Compute the aggregate size (including terminating '\0') of all
370  * non-hidden virtual extended attribute names in the given vxattr table.
371  */
372 static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs)
373 {
374 	struct ceph_vxattr *vxattr;
375 	size_t size = 0;
376 
377 	for (vxattr = vxattrs; vxattr->name; vxattr++) {
378 		if (!(vxattr->flags & VXATTR_FLAG_HIDDEN))
379 			size += vxattr->name_size;
380 	}
381 
382 	return size;
383 }
384 
385 /* Routines called at initialization and exit time */
386 
387 void __init ceph_xattr_init(void)
388 {
389 	ceph_dir_vxattrs_name_size = vxattrs_name_size(ceph_dir_vxattrs);
390 	ceph_file_vxattrs_name_size = vxattrs_name_size(ceph_file_vxattrs);
391 }
392 
393 void ceph_xattr_exit(void)
394 {
395 	ceph_dir_vxattrs_name_size = 0;
396 	ceph_file_vxattrs_name_size = 0;
397 }
398 
399 static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode,
400 						const char *name)
401 {
402 	struct ceph_vxattr *vxattr = ceph_inode_vxattrs(inode);
403 
404 	if (vxattr) {
405 		while (vxattr->name) {
406 			if (!strcmp(vxattr->name, name))
407 				return vxattr;
408 			vxattr++;
409 		}
410 	}
411 
412 	return NULL;
413 }
414 
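/*
 * Insert or update @name in the per-inode xattr rb-tree.
 *
 * @update_xattr: 0 when (re)populating from the MDS xattr blob,
 * > 0 for a locally initiated set, < 0 for a removal.  @newxattr is a
 * preallocated node that is consumed on insert and freed otherwise;
 * for locally initiated updates, @name and @val are likewise either
 * consumed or freed here on all paths.
 */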
415 static int __set_xattr(struct ceph_inode_info *ci,
416 			   const char *name, int name_len,
417 			   const char *val, int val_len,
418 			   int flags, int update_xattr,
419 			   struct ceph_inode_xattr **newxattr)
420 {
421 	struct rb_node **p;
422 	struct rb_node *parent = NULL;
423 	struct ceph_inode_xattr *xattr = NULL;
424 	int c;
425 	int new = 0;
426 
427 	p = &ci->i_xattrs.index.rb_node;
428 	while (*p) {
429 		parent = *p;
430 		xattr = rb_entry(parent, struct ceph_inode_xattr, node);
431 		c = strncmp(name, xattr->name, min(name_len, xattr->name_len));
432 		if (c < 0)
433 			p = &(*p)->rb_left;
434 		else if (c > 0)
435 			p = &(*p)->rb_right;
436 		else {
437 			if (name_len == xattr->name_len)
438 				break;
439 			else if (name_len < xattr->name_len)
440 				p = &(*p)->rb_left;
441 			else
442 				p = &(*p)->rb_right;
443 		}
444 		xattr = NULL;
445 	}
446 
447 	if (update_xattr) {
448 		int err = 0;
449 
450 		if (xattr && (flags & XATTR_CREATE))
451 			err = -EEXIST;
452 		else if (!xattr && (flags & XATTR_REPLACE))
453 			err = -ENODATA;
454 		if (err) {
455 			kfree(name);
456 			kfree(val);
457 			kfree(*newxattr);
458 			return err;
459 		}
460 		if (update_xattr < 0) {
461 			if (xattr)
462 				__remove_xattr(ci, xattr);
463 			kfree(name);
464 			kfree(*newxattr);
465 			return 0;
466 		}
467 	}
468 
469 	if (!xattr) {
470 		new = 1;
471 		xattr = *newxattr;
472 		xattr->name = name;
473 		xattr->name_len = name_len;
474 		xattr->should_free_name = update_xattr;
475 
476 		ci->i_xattrs.count++;
477 		dout("__set_xattr count=%d\n", ci->i_xattrs.count);
478 	} else {
479 		kfree(*newxattr);
480 		*newxattr = NULL;
481 		if (xattr->should_free_val)
482 			kfree((void *)xattr->val);
483 
484 		if (update_xattr) {
485 			kfree((void *)name);
486 			name = xattr->name;
487 		}
488 		ci->i_xattrs.names_size -= xattr->name_len;
489 		ci->i_xattrs.vals_size -= xattr->val_len;
490 	}
491 	ci->i_xattrs.names_size += name_len;
492 	ci->i_xattrs.vals_size += val_len;
493 	if (val)
494 		xattr->val = val;
495 	else
496 		xattr->val = "";
497 
498 	xattr->val_len = val_len;
499 	xattr->dirty = update_xattr;
500 	xattr->should_free_val = (val && update_xattr);
501 
502 	if (new) {
503 		rb_link_node(&xattr->node, parent, p);
504 		rb_insert_color(&xattr->node, &ci->i_xattrs.index);
505 		dout("__set_xattr_val p=%p\n", p);
506 	}
507 
508 	dout("__set_xattr_val added %llx.%llx xattr %p %s=%.*s\n",
509 	     ceph_vinop(&ci->vfs_inode), xattr, name, val_len, val);
510 
511 	return 0;
512 }
513 
514 static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
515 			   const char *name)
516 {
517 	struct rb_node **p;
518 	struct rb_node *parent = NULL;
519 	struct ceph_inode_xattr *xattr = NULL;
520 	int name_len = strlen(name);
521 	int c;
522 
523 	p = &ci->i_xattrs.index.rb_node;
524 	while (*p) {
525 		parent = *p;
526 		xattr = rb_entry(parent, struct ceph_inode_xattr, node);
527 		c = strncmp(name, xattr->name, xattr->name_len);
528 		if (c == 0 && name_len > xattr->name_len)
529 			c = 1;
530 		if (c < 0)
531 			p = &(*p)->rb_left;
532 		else if (c > 0)
533 			p = &(*p)->rb_right;
534 		else {
535 			dout("__get_xattr %s: found %.*s\n", name,
536 			     xattr->val_len, xattr->val);
537 			return xattr;
538 		}
539 	}
540 
541 	dout("__get_xattr %s: not found\n", name);
542 
543 	return NULL;
544 }
545 
546 static void __free_xattr(struct ceph_inode_xattr *xattr)
547 {
548 	BUG_ON(!xattr);
549 
550 	if (xattr->should_free_name)
551 		kfree((void *)xattr->name);
552 	if (xattr->should_free_val)
553 		kfree((void *)xattr->val);
554 
555 	kfree(xattr);
556 }
557 
558 static int __remove_xattr(struct ceph_inode_info *ci,
559 			  struct ceph_inode_xattr *xattr)
560 {
561 	if (!xattr)
562 		return -ENODATA;
563 
564 	rb_erase(&xattr->node, &ci->i_xattrs.index);
565 
566 	if (xattr->should_free_name)
567 		kfree((void *)xattr->name);
568 	if (xattr->should_free_val)
569 		kfree((void *)xattr->val);
570 
571 	ci->i_xattrs.names_size -= xattr->name_len;
572 	ci->i_xattrs.vals_size -= xattr->val_len;
573 	ci->i_xattrs.count--;
574 	kfree(xattr);
575 
576 	return 0;
577 }
578 
579 static char *__copy_xattr_names(struct ceph_inode_info *ci,
580 				char *dest)
581 {
582 	struct rb_node *p;
583 	struct ceph_inode_xattr *xattr = NULL;
584 
585 	p = rb_first(&ci->i_xattrs.index);
586 	dout("__copy_xattr_names count=%d\n", ci->i_xattrs.count);
587 
588 	while (p) {
589 		xattr = rb_entry(p, struct ceph_inode_xattr, node);
590 		memcpy(dest, xattr->name, xattr->name_len);
591 		dest[xattr->name_len] = '\0';
592 
593 		dout("dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name,
594 		     xattr->name_len, ci->i_xattrs.names_size);
595 
596 		dest += xattr->name_len + 1;
597 		p = rb_next(p);
598 	}
599 
600 	return dest;
601 }
602 
603 void __ceph_destroy_xattrs(struct ceph_inode_info *ci)
604 {
605 	struct rb_node *p, *tmp;
606 	struct ceph_inode_xattr *xattr = NULL;
607 
608 	p = rb_first(&ci->i_xattrs.index);
609 
610 	dout("__ceph_destroy_xattrs p=%p\n", p);
611 
612 	while (p) {
613 		xattr = rb_entry(p, struct ceph_inode_xattr, node);
614 		tmp = p;
615 		p = rb_next(tmp);
616 		dout("__ceph_destroy_xattrs next p=%p (%.*s)\n", p,
617 		     xattr->name_len, xattr->name);
618 		rb_erase(tmp, &ci->i_xattrs.index);
619 
620 		__free_xattr(xattr);
621 	}
622 
623 	ci->i_xattrs.names_size = 0;
624 	ci->i_xattrs.vals_size = 0;
625 	ci->i_xattrs.index_version = 0;
626 	ci->i_xattrs.count = 0;
627 	ci->i_xattrs.index = RB_ROOT;
628 }
629 
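/*
 * Decode the raw xattr blob received from the MDS into the rb-tree.
 * The blob is a __le32 xattr count followed, for each xattr, by a
 * __le32 name length, the name bytes, a __le32 value length and the
 * value bytes.  i_ceph_lock is dropped while allocating; if the blob
 * changed in the meantime, we start over.
 */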
630 static int __build_xattrs(struct inode *inode)
631 	__releases(ci->i_ceph_lock)
632 	__acquires(ci->i_ceph_lock)
633 {
634 	u32 namelen;
635 	u32 numattr = 0;
636 	void *p, *end;
637 	u32 len;
638 	const char *name, *val;
639 	struct ceph_inode_info *ci = ceph_inode(inode);
640 	int xattr_version;
641 	struct ceph_inode_xattr **xattrs = NULL;
642 	int err = 0;
643 	int i;
644 
645 	dout("__build_xattrs() len=%d\n",
646 	     ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0);
647 
648 	if (ci->i_xattrs.index_version >= ci->i_xattrs.version)
649 		return 0; /* already built */
650 
651 	__ceph_destroy_xattrs(ci);
652 
653 start:
654 	/* update the internal xattr rb tree */
655 	if (ci->i_xattrs.blob && ci->i_xattrs.blob->vec.iov_len > 4) {
656 		p = ci->i_xattrs.blob->vec.iov_base;
657 		end = p + ci->i_xattrs.blob->vec.iov_len;
658 		ceph_decode_32_safe(&p, end, numattr, bad);
659 		xattr_version = ci->i_xattrs.version;
660 		spin_unlock(&ci->i_ceph_lock);
661 
662 		xattrs = kcalloc(numattr, sizeof(struct ceph_inode_xattr *),
663 				 GFP_NOFS);
664 		err = -ENOMEM;
665 		if (!xattrs)
666 			goto bad_lock;
667 
668 		for (i = 0; i < numattr; i++) {
669 			xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr),
670 					    GFP_NOFS);
671 			if (!xattrs[i])
672 				goto bad_lock;
673 		}
674 
675 		spin_lock(&ci->i_ceph_lock);
676 		if (ci->i_xattrs.version != xattr_version) {
677 			/* lost a race, retry */
678 			for (i = 0; i < numattr; i++)
679 				kfree(xattrs[i]);
680 			kfree(xattrs);
681 			xattrs = NULL;
682 			goto start;
683 		}
684 		err = -EIO;
685 		while (numattr--) {
686 			ceph_decode_32_safe(&p, end, len, bad);
687 			namelen = len;
688 			name = p;
689 			p += len;
690 			ceph_decode_32_safe(&p, end, len, bad);
691 			val = p;
692 			p += len;
693 
694 			err = __set_xattr(ci, name, namelen, val, len,
695 					  0, 0, &xattrs[numattr]);
696 
697 			if (err < 0)
698 				goto bad;
699 		}
700 		kfree(xattrs);
701 	}
702 	ci->i_xattrs.index_version = ci->i_xattrs.version;
703 	ci->i_xattrs.dirty = false;
704 
705 	return err;
706 bad_lock:
707 	spin_lock(&ci->i_ceph_lock);
708 bad:
709 	if (xattrs) {
710 		for (i = 0; i < numattr; i++)
711 			kfree(xattrs[i]);
712 		kfree(xattrs);
713 	}
714 	ci->i_xattrs.names_size = 0;
715 	return err;
716 }
717 
718 static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size,
719 				    int val_size)
720 {
721 	/*
722 	 * 4 bytes for the xattr count, plus a 4-byte name length and a 4-byte
723 	 * value length per xattr, plus the name and value bytes themselves.
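	 *
	 * e.g. a single xattr "user.foo" = "bar" (hypothetical) needs
	 * 4 + (4 + 4) + 8 + 3 = 23 bytes.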
724 	 */
725 	int size = 4 + ci->i_xattrs.count*(4 + 4) +
726 			     ci->i_xattrs.names_size +
727 			     ci->i_xattrs.vals_size;
728 	dout("__get_required_blob_size c=%d names.size=%d vals.size=%d\n",
729 	     ci->i_xattrs.count, ci->i_xattrs.names_size,
730 	     ci->i_xattrs.vals_size);
731 
732 	if (name_size)
733 		size += 4 + 4 + name_size + val_size;
734 
735 	return size;
736 }
737 
738 /*
739  * If there are dirty xattrs, reencode xattrs into the prealloc_blob
740  * and swap into place.
741  */
742 void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
743 {
744 	struct rb_node *p;
745 	struct ceph_inode_xattr *xattr = NULL;
746 	void *dest;
747 
748 	dout("__build_xattrs_blob %p\n", &ci->vfs_inode);
749 	if (ci->i_xattrs.dirty) {
750 		int need = __get_required_blob_size(ci, 0, 0);
751 
752 		BUG_ON(need > ci->i_xattrs.prealloc_blob->alloc_len);
753 
754 		p = rb_first(&ci->i_xattrs.index);
755 		dest = ci->i_xattrs.prealloc_blob->vec.iov_base;
756 
757 		ceph_encode_32(&dest, ci->i_xattrs.count);
758 		while (p) {
759 			xattr = rb_entry(p, struct ceph_inode_xattr, node);
760 
761 			ceph_encode_32(&dest, xattr->name_len);
762 			memcpy(dest, xattr->name, xattr->name_len);
763 			dest += xattr->name_len;
764 			ceph_encode_32(&dest, xattr->val_len);
765 			memcpy(dest, xattr->val, xattr->val_len);
766 			dest += xattr->val_len;
767 
768 			p = rb_next(p);
769 		}
770 
771 		/* adjust buffer len; it may be larger than we need */
772 		ci->i_xattrs.prealloc_blob->vec.iov_len =
773 			dest - ci->i_xattrs.prealloc_blob->vec.iov_base;
774 
775 		if (ci->i_xattrs.blob)
776 			ceph_buffer_put(ci->i_xattrs.blob);
777 		ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
778 		ci->i_xattrs.prealloc_blob = NULL;
779 		ci->i_xattrs.dirty = false;
780 		ci->i_xattrs.version++;
781 	}
782 }
783 
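/*
 * While an MDS reply trace is being filled in, current->journal_info
 * points at the in-flight request; recover the CEPH_STAT_CAP_* mask
 * that was requested so __ceph_getxattr() can tell whether the reply
 * already carries the xattrs for this inode.
 */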
784 static inline int __get_request_mask(struct inode *in) {
785 	struct ceph_mds_request *req = current->journal_info;
786 	int mask = 0;
787 	if (req && req->r_target_inode == in) {
788 		if (req->r_op == CEPH_MDS_OP_LOOKUP ||
789 		    req->r_op == CEPH_MDS_OP_LOOKUPINO ||
790 		    req->r_op == CEPH_MDS_OP_LOOKUPPARENT ||
791 		    req->r_op == CEPH_MDS_OP_GETATTR) {
792 			mask = le32_to_cpu(req->r_args.getattr.mask);
793 		} else if (req->r_op == CEPH_MDS_OP_OPEN ||
794 			   req->r_op == CEPH_MDS_OP_CREATE) {
795 			mask = le32_to_cpu(req->r_args.open.mask);
796 		}
797 	}
798 	return mask;
799 }
800 
801 ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
802 		      size_t size)
803 {
804 	struct ceph_inode_info *ci = ceph_inode(inode);
805 	struct ceph_inode_xattr *xattr;
806 	struct ceph_vxattr *vxattr = NULL;
807 	int req_mask;
808 	int err;
809 
810 	/* let's see if a virtual xattr was requested */
811 	vxattr = ceph_match_vxattr(inode, name);
812 	if (vxattr) {
813 		int mask = 0;
814 		if (vxattr->flags & VXATTR_FLAG_RSTAT)
815 			mask |= CEPH_STAT_RSTAT;
816 		err = ceph_do_getattr(inode, mask, true);
817 		if (err)
818 			return err;
819 		err = -ENODATA;
820 		if (!(vxattr->exists_cb && !vxattr->exists_cb(ci)))
821 			err = vxattr->getxattr_cb(ci, value, size);
822 		return err;
823 	}
824 
825 	req_mask = __get_request_mask(inode);
826 
827 	spin_lock(&ci->i_ceph_lock);
828 	dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
829 	     ci->i_xattrs.version, ci->i_xattrs.index_version);
830 
831 	if (ci->i_xattrs.version == 0 ||
832 	    !((req_mask & CEPH_CAP_XATTR_SHARED) ||
833 	      __ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1))) {
834 		spin_unlock(&ci->i_ceph_lock);
835 
836 		/* security module gets xattr while filling trace */
837 		if (current->journal_info) {
838 			pr_warn_ratelimited("sync getxattr %p "
839 					    "during filling trace\n", inode);
840 			return -EBUSY;
841 		}
842 
843 		/* get xattrs from mds (if we don't already have them) */
844 		err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR, true);
845 		if (err)
846 			return err;
847 		spin_lock(&ci->i_ceph_lock);
848 	}
849 
850 	err = __build_xattrs(inode);
851 	if (err < 0)
852 		goto out;
853 
854 	err = -ENODATA;  /* == ENOATTR */
855 	xattr = __get_xattr(ci, name);
856 	if (!xattr)
857 		goto out;
858 
859 	err = -ERANGE;
860 	if (size && size < xattr->val_len)
861 		goto out;
862 
863 	err = xattr->val_len;
864 	if (size == 0)
865 		goto out;
866 
867 	memcpy(value, xattr->val, xattr->val_len);
868 
869 	if (current->journal_info &&
870 	    !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
871 		ci->i_ceph_flags |= CEPH_I_SEC_INITED;
872 out:
873 	spin_unlock(&ci->i_ceph_lock);
874 	return err;
875 }
876 
877 ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
878 {
879 	struct inode *inode = d_inode(dentry);
880 	struct ceph_inode_info *ci = ceph_inode(inode);
881 	struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode);
882 	u32 vir_namelen = 0;
883 	u32 namelen;
884 	int err;
885 	u32 len;
886 	int i;
887 
888 	spin_lock(&ci->i_ceph_lock);
889 	dout("listxattr %p ver=%lld index_ver=%lld\n", inode,
890 	     ci->i_xattrs.version, ci->i_xattrs.index_version);
891 
892 	if (ci->i_xattrs.version == 0 ||
893 	    !__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1)) {
894 		spin_unlock(&ci->i_ceph_lock);
895 		err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR, true);
896 		if (err)
897 			return err;
898 		spin_lock(&ci->i_ceph_lock);
899 	}
900 
901 	err = __build_xattrs(inode);
902 	if (err < 0)
903 		goto out;
904 	/*
905 	 * Start with the size of the virtual xattr names (if any), including
906 	 * a terminating '\0' for each.
907 	 */
908 	vir_namelen = ceph_vxattrs_name_size(vxattrs);
909 
910 	/* add 1 byte per name for the terminating '\0' */
911 	namelen = ci->i_xattrs.names_size + ci->i_xattrs.count;
912 	err = -ERANGE;
913 	if (size && vir_namelen + namelen > size)
914 		goto out;
915 
916 	err = namelen + vir_namelen;
917 	if (size == 0)
918 		goto out;
919 
920 	names = __copy_xattr_names(ci, names);
921 
922 	/* virtual xattr names, too */
923 	err = namelen;
924 	if (vxattrs) {
925 		for (i = 0; vxattrs[i].name; i++) {
926 			if (!(vxattrs[i].flags & VXATTR_FLAG_HIDDEN) &&
927 			    !(vxattrs[i].exists_cb &&
928 			      !vxattrs[i].exists_cb(ci))) {
929 				len = sprintf(names, "%s", vxattrs[i].name);
930 				names += len + 1;
931 				err += len + 1;
932 			}
933 		}
934 	}
935 
936 out:
937 	spin_unlock(&ci->i_ceph_lock);
938 	return err;
939 }
940 
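/*
 * Synchronously send a SETXATTR (or RMXATTR, when removing) request to
 * the auth MDS; a non-empty value is carried in a pagelist attached to
 * the request.
 */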
941 static int ceph_sync_setxattr(struct inode *inode, const char *name,
942 			      const char *value, size_t size, int flags)
943 {
944 	struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
945 	struct ceph_inode_info *ci = ceph_inode(inode);
946 	struct ceph_mds_request *req;
947 	struct ceph_mds_client *mdsc = fsc->mdsc;
948 	struct ceph_pagelist *pagelist = NULL;
949 	int op = CEPH_MDS_OP_SETXATTR;
950 	int err;
951 
952 	if (size > 0) {
953 		/* copy value into pagelist */
954 		pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
955 		if (!pagelist)
956 			return -ENOMEM;
957 
958 		ceph_pagelist_init(pagelist);
959 		err = ceph_pagelist_append(pagelist, value, size);
960 		if (err)
961 			goto out;
962 	} else if (!value) {
963 		if (flags & CEPH_XATTR_REPLACE)
964 			op = CEPH_MDS_OP_RMXATTR;
965 		else
966 			flags |= CEPH_XATTR_REMOVE;
967 	}
968 
969 	dout("setxattr value=%.*s\n", (int)size, value);
970 
971 	/* do request */
972 	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
973 	if (IS_ERR(req)) {
974 		err = PTR_ERR(req);
975 		goto out;
976 	}
977 
978 	req->r_path2 = kstrdup(name, GFP_NOFS);
979 	if (!req->r_path2) {
980 		ceph_mdsc_put_request(req);
981 		err = -ENOMEM;
982 		goto out;
983 	}
984 
985 	if (op == CEPH_MDS_OP_SETXATTR) {
986 		req->r_args.setxattr.flags = cpu_to_le32(flags);
987 		req->r_pagelist = pagelist;
988 		pagelist = NULL;
989 	}
990 
991 	req->r_inode = inode;
992 	ihold(inode);
993 	req->r_num_caps = 1;
994 	req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
995 
996 	dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
997 	err = ceph_mdsc_do_request(mdsc, NULL, req);
998 	ceph_mdsc_put_request(req);
999 	dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
1000 
1001 out:
1002 	if (pagelist)
1003 		ceph_pagelist_release(pagelist);
1004 	return err;
1005 }
1006 
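/*
 * Set (or, with a NULL value, remove) an xattr.  If we hold the
 * XATTR_EXCL cap, the change is applied to the local rb-tree and the
 * cap is marked dirty; otherwise it is sent synchronously to the MDS
 * via ceph_sync_setxattr().  ceph.* names (other than read-only
 * vxattrs) are always passed through to the MDS.
 */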
1007 int __ceph_setxattr(struct inode *inode, const char *name,
1008 			const void *value, size_t size, int flags)
1009 {
1010 	struct ceph_vxattr *vxattr;
1011 	struct ceph_inode_info *ci = ceph_inode(inode);
1012 	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
1013 	struct ceph_cap_flush *prealloc_cf = NULL;
1014 	int issued;
1015 	int err;
1016 	int dirty = 0;
1017 	int name_len = strlen(name);
1018 	int val_len = size;
1019 	char *newname = NULL;
1020 	char *newval = NULL;
1021 	struct ceph_inode_xattr *xattr = NULL;
1022 	int required_blob_size;
1023 	bool check_realm = false;
1024 	bool lock_snap_rwsem = false;
1025 
1026 	if (ceph_snap(inode) != CEPH_NOSNAP)
1027 		return -EROFS;
1028 
1029 	vxattr = ceph_match_vxattr(inode, name);
1030 	if (vxattr) {
1031 		if (vxattr->flags & VXATTR_FLAG_READONLY)
1032 			return -EOPNOTSUPP;
1033 		if (value && !strncmp(vxattr->name, "ceph.quota", 10))
1034 			check_realm = true;
1035 	}
1036 
1037 	/* pass any unhandled ceph.* xattrs through to the MDS */
1038 	if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
1039 		goto do_sync_unlocked;
1040 
1041 	/* preallocate memory for xattr name, value, index node */
1042 	err = -ENOMEM;
1043 	newname = kmemdup(name, name_len + 1, GFP_NOFS);
1044 	if (!newname)
1045 		goto out;
1046 
1047 	if (val_len) {
1048 		newval = kmemdup(value, val_len, GFP_NOFS);
1049 		if (!newval)
1050 			goto out;
1051 	}
1052 
1053 	xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS);
1054 	if (!xattr)
1055 		goto out;
1056 
1057 	prealloc_cf = ceph_alloc_cap_flush();
1058 	if (!prealloc_cf)
1059 		goto out;
1060 
1061 	spin_lock(&ci->i_ceph_lock);
1062 retry:
1063 	issued = __ceph_caps_issued(ci, NULL);
1064 	if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL))
1065 		goto do_sync;
1066 
1067 	if (!lock_snap_rwsem && !ci->i_head_snapc) {
1068 		lock_snap_rwsem = true;
1069 		if (!down_read_trylock(&mdsc->snap_rwsem)) {
1070 			spin_unlock(&ci->i_ceph_lock);
1071 			down_read(&mdsc->snap_rwsem);
1072 			spin_lock(&ci->i_ceph_lock);
1073 			goto retry;
1074 		}
1075 	}
1076 
1077 	dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
1078 	__build_xattrs(inode);
1079 
1080 	required_blob_size = __get_required_blob_size(ci, name_len, val_len);
1081 
1082 	if (!ci->i_xattrs.prealloc_blob ||
1083 	    required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
1084 		struct ceph_buffer *blob;
1085 
1086 		spin_unlock(&ci->i_ceph_lock);
1087 		dout(" preallocating new blob size=%d\n", required_blob_size);
1088 		blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
1089 		if (!blob)
1090 			goto do_sync_unlocked;
1091 		spin_lock(&ci->i_ceph_lock);
1092 		if (ci->i_xattrs.prealloc_blob)
1093 			ceph_buffer_put(ci->i_xattrs.prealloc_blob);
1094 		ci->i_xattrs.prealloc_blob = blob;
1095 		goto retry;
1096 	}
1097 
1098 	err = __set_xattr(ci, newname, name_len, newval, val_len,
1099 			  flags, value ? 1 : -1, &xattr);
1100 
1101 	if (!err) {
1102 		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL,
1103 					       &prealloc_cf);
1104 		ci->i_xattrs.dirty = true;
1105 		inode->i_ctime = current_time(inode);
1106 	}
1107 
1108 	spin_unlock(&ci->i_ceph_lock);
1109 	if (lock_snap_rwsem)
1110 		up_read(&mdsc->snap_rwsem);
1111 	if (dirty)
1112 		__mark_inode_dirty(inode, dirty);
1113 	ceph_free_cap_flush(prealloc_cf);
1114 	return err;
1115 
1116 do_sync:
1117 	spin_unlock(&ci->i_ceph_lock);
1118 do_sync_unlocked:
1119 	if (lock_snap_rwsem)
1120 		up_read(&mdsc->snap_rwsem);
1121 
1122 	/* security module sets xattr while filling trace */
1123 	if (current->journal_info) {
1124 		pr_warn_ratelimited("sync setxattr %p "
1125 				    "during filling trace\n", inode);
1126 		err = -EBUSY;
1127 	} else {
1128 		err = ceph_sync_setxattr(inode, name, value, size, flags);
1129 		if (err >= 0 && check_realm) {
1130 			/* check if snaprealm was created for quota inode */
1131 			spin_lock(&ci->i_ceph_lock);
1132 			if ((ci->i_max_files || ci->i_max_bytes) &&
1133 			    !(ci->i_snap_realm &&
1134 			      ci->i_snap_realm->ino == ci->i_vino.ino))
1135 				err = -EOPNOTSUPP;
1136 			spin_unlock(&ci->i_ceph_lock);
1137 		}
1138 	}
1139 out:
1140 	ceph_free_cap_flush(prealloc_cf);
1141 	kfree(newname);
1142 	kfree(newval);
1143 	kfree(xattr);
1144 	return err;
1145 }
1146 
1147 static int ceph_get_xattr_handler(const struct xattr_handler *handler,
1148 				  struct dentry *dentry, struct inode *inode,
1149 				  const char *name, void *value, size_t size)
1150 {
1151 	if (!ceph_is_valid_xattr(name))
1152 		return -EOPNOTSUPP;
1153 	return __ceph_getxattr(inode, name, value, size);
1154 }
1155 
1156 static int ceph_set_xattr_handler(const struct xattr_handler *handler,
1157 				  struct dentry *unused, struct inode *inode,
1158 				  const char *name, const void *value,
1159 				  size_t size, int flags)
1160 {
1161 	if (!ceph_is_valid_xattr(name))
1162 		return -EOPNOTSUPP;
1163 	return __ceph_setxattr(inode, name, value, size, flags);
1164 }
1165 
1166 static const struct xattr_handler ceph_other_xattr_handler = {
1167 	.prefix = "",  /* match any name => handlers called with full name */
1168 	.get = ceph_get_xattr_handler,
1169 	.set = ceph_set_xattr_handler,
1170 };
1171 
1172 #ifdef CONFIG_SECURITY
1173 bool ceph_security_xattr_wanted(struct inode *in)
1174 {
1175 	return in->i_security != NULL;
1176 }
1177 
1178 bool ceph_security_xattr_deadlock(struct inode *in)
1179 {
1180 	struct ceph_inode_info *ci;
1181 	bool ret;
1182 	if (!in->i_security)
1183 		return false;
1184 	ci = ceph_inode(in);
1185 	spin_lock(&ci->i_ceph_lock);
1186 	ret = !(ci->i_ceph_flags & CEPH_I_SEC_INITED) &&
1187 	      !(ci->i_xattrs.version > 0 &&
1188 		__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0));
1189 	spin_unlock(&ci->i_ceph_lock);
1190 	return ret;
1191 }
1192 #endif
1193