1 /*
2 * QEMU Xen emulation: The actual implementation of XenStore
3 *
4 * Copyright © 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
5 *
6 * Authors: David Woodhouse <dwmw2@infradead.org>, Paul Durrant <paul@xen.org>
7 *
8 * This work is licensed under the terms of the GNU GPL, version 2 or later.
9 * See the COPYING file in the top-level directory.
10 */
11
12 #include "qemu/osdep.h"
13 #include "qom/object.h"
14
15 #include "hw/xen/xen.h"
16
17 #include "xen_xenstore.h"
18 #include "xenstore_impl.h"
19
20 #include "hw/xen/interface/io/xs_wire.h"
21
22 #define XS_MAX_WATCHES 128
23 #define XS_MAX_DOMAIN_NODES 1000
24 #define XS_MAX_NODE_SIZE 2048
25 #define XS_MAX_TRANSACTIONS 10
26 #define XS_MAX_PERMS_PER_NODE 5
27
28 #define XS_VALID_CHARS "abcdefghijklmnopqrstuvwxyz" \
29 "ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
30 "0123456789-/_"
31
32 typedef struct XsNode {
33 uint32_t ref;
34 GByteArray *content;
35 GList *perms;
36 GHashTable *children;
37 uint64_t gencnt;
38 bool deleted_in_tx;
39 bool modified_in_tx;
40 unsigned int serialized_tx;
41 #ifdef XS_NODE_UNIT_TEST
42 gchar *name; /* debug only */
43 #endif
44 } XsNode;
45
46 typedef struct XsWatch {
47 struct XsWatch *next;
48 xs_impl_watch_fn *cb;
49 void *cb_opaque;
50 char *token;
51 unsigned int dom_id;
52 int rel_prefix;
53 } XsWatch;
54
55 typedef struct XsTransaction {
56 XsNode *root;
57 unsigned int nr_nodes;
58 unsigned int base_tx;
59 unsigned int tx_id;
60 unsigned int dom_id;
61 } XsTransaction;
62
63 struct XenstoreImplState {
64 XsNode *root;
65 unsigned int nr_nodes;
66 GHashTable *watches;
67 unsigned int nr_domu_watches;
68 GHashTable *transactions;
69 unsigned int nr_domu_transactions;
70 unsigned int root_tx;
71 unsigned int last_tx;
72 bool serialized;
73 };
74
75
nobble_tx(gpointer key,gpointer value,gpointer user_data)76 static void nobble_tx(gpointer key, gpointer value, gpointer user_data)
77 {
78 unsigned int *new_tx_id = user_data;
79 XsTransaction *tx = value;
80
81 if (tx->base_tx == *new_tx_id) {
82 /* Transactions based on XBT_NULL will always fail */
83 tx->base_tx = XBT_NULL;
84 }
85 }
86
next_tx(struct XenstoreImplState * s)87 static inline unsigned int next_tx(struct XenstoreImplState *s)
88 {
89 unsigned int tx_id;
90
91 /* Find the next TX id which isn't either XBT_NULL or in use. */
92 do {
93 tx_id = ++s->last_tx;
94 } while (tx_id == XBT_NULL || tx_id == s->root_tx ||
95 g_hash_table_lookup(s->transactions, GINT_TO_POINTER(tx_id)));
96
97 /*
98 * It is vanishingly unlikely, but ensure that no outstanding transaction
99 * is based on the (previous incarnation of the) newly-allocated TX id.
100 */
101 g_hash_table_foreach(s->transactions, nobble_tx, &tx_id);
102
103 return tx_id;
104 }
105
xs_node_new(void)106 static inline XsNode *xs_node_new(void)
107 {
108 XsNode *n = g_new0(XsNode, 1);
109 n->ref = 1;
110
111 #ifdef XS_NODE_UNIT_TEST
112 nr_xs_nodes++;
113 xs_node_list = g_list_prepend(xs_node_list, n);
114 #endif
115 return n;
116 }
117
xs_node_ref(XsNode * n)118 static inline XsNode *xs_node_ref(XsNode *n)
119 {
120 /* With just 10 transactions, it can never get anywhere near this. */
121 g_assert(n->ref < INT_MAX);
122
123 g_assert(n->ref);
124 n->ref++;
125 return n;
126 }
127
xs_node_unref(XsNode * n)128 static inline void xs_node_unref(XsNode *n)
129 {
130 if (!n) {
131 return;
132 }
133 g_assert(n->ref);
134 if (--n->ref) {
135 return;
136 }
137
138 if (n->content) {
139 g_byte_array_unref(n->content);
140 }
141 if (n->perms) {
142 g_list_free_full(n->perms, g_free);
143 }
144 if (n->children) {
145 g_hash_table_unref(n->children);
146 }
147 #ifdef XS_NODE_UNIT_TEST
148 g_free(n->name);
149 nr_xs_nodes--;
150 xs_node_list = g_list_remove(xs_node_list, n);
151 #endif
152 g_free(n);
153 }
154
xs_perm_as_string(unsigned int perm,unsigned int domid)155 char *xs_perm_as_string(unsigned int perm, unsigned int domid)
156 {
157 char letter;
158
159 switch (perm) {
160 case XS_PERM_READ | XS_PERM_WRITE:
161 letter = 'b';
162 break;
163 case XS_PERM_READ:
164 letter = 'r';
165 break;
166 case XS_PERM_WRITE:
167 letter = 'w';
168 break;
169 case XS_PERM_NONE:
170 default:
171 letter = 'n';
172 break;
173 }
174
175 return g_strdup_printf("%c%u", letter, domid);
176 }
177
do_perm_copy(gconstpointer src,gpointer user_data)178 static gpointer do_perm_copy(gconstpointer src, gpointer user_data)
179 {
180 return g_strdup(src);
181 }
182
xs_node_create(const char * name,GList * perms)183 static XsNode *xs_node_create(const char *name, GList *perms)
184 {
185 XsNode *n = xs_node_new();
186
187 #ifdef XS_NODE_UNIT_TEST
188 if (name) {
189 n->name = g_strdup(name);
190 }
191 #endif
192
193 n->perms = g_list_copy_deep(perms, do_perm_copy, NULL);
194
195 return n;
196 }
197
198 /* For copying from one hash table to another using g_hash_table_foreach() */
do_child_insert(gpointer key,gpointer value,gpointer user_data)199 static void do_child_insert(gpointer key, gpointer value, gpointer user_data)
200 {
201 g_hash_table_insert(user_data, g_strdup(key), xs_node_ref(value));
202 }
203
xs_node_copy(XsNode * old)204 static XsNode *xs_node_copy(XsNode *old)
205 {
206 XsNode *n = xs_node_new();
207
208 n->gencnt = old->gencnt;
209
210 #ifdef XS_NODE_UNIT_TEST
211 if (n->name) {
212 n->name = g_strdup(old->name);
213 }
214 #endif
215
216 assert(old);
217 if (old->children) {
218 n->children = g_hash_table_new_full(g_str_hash, g_str_equal, g_free,
219 (GDestroyNotify)xs_node_unref);
220 g_hash_table_foreach(old->children, do_child_insert, n->children);
221 }
222 if (old->perms) {
223 n->perms = g_list_copy_deep(old->perms, do_perm_copy, NULL);
224 }
225 if (old->content) {
226 n->content = g_byte_array_ref(old->content);
227 }
228 return n;
229 }
230
231 /* Returns true if it made a change to the hash table */
xs_node_add_child(XsNode * n,const char * path_elem,XsNode * child)232 static bool xs_node_add_child(XsNode *n, const char *path_elem, XsNode *child)
233 {
234 assert(!strchr(path_elem, '/'));
235
236 if (!child) {
237 assert(n->children);
238 return g_hash_table_remove(n->children, path_elem);
239 }
240
241 #ifdef XS_NODE_UNIT_TEST
242 g_free(child->name);
243 child->name = g_strdup(path_elem);
244 #endif
245 if (!n->children) {
246 n->children = g_hash_table_new_full(g_str_hash, g_str_equal, g_free,
247 (GDestroyNotify)xs_node_unref);
248 }
249
250 /*
251 * The documentation for g_hash_table_insert() says that it "returns a
252 * boolean value to indicate whether the newly added value was already
253 * in the hash table or not."
254 *
255 * It could perhaps be clearer that returning TRUE means it wasn't,
256 */
257 return g_hash_table_insert(n->children, g_strdup(path_elem), child);
258 }
259
260 struct walk_op {
261 struct XenstoreImplState *s;
262 char path[XENSTORE_ABS_PATH_MAX + 2]; /* Two NUL terminators */
263 int (*op_fn)(XsNode **n, struct walk_op *op);
264 void *op_opaque;
265 void *op_opaque2;
266
267 GList *watches;
268 unsigned int dom_id;
269 unsigned int tx_id;
270
271 /* The number of nodes which will exist in the tree if this op succeeds. */
272 unsigned int new_nr_nodes;
273
274 /*
275 * This is maintained on the way *down* the walk to indicate
276 * whether nodes can be modified in place or whether COW is
277 * required. It starts off being true, as we're always going to
278 * replace the root node. If we walk into a shared subtree it
279 * becomes false. If we start *creating* new nodes for a write,
280 * it becomes true again.
281 *
282 * Do not use it on the way back up.
283 */
284 bool inplace;
285 bool mutating;
286 bool create_dirs;
287 bool in_transaction;
288
289 /* Tracking during recursion so we know which is first. */
290 bool deleted_in_tx;
291 };
292
fire_watches(struct walk_op * op,bool parents)293 static void fire_watches(struct walk_op *op, bool parents)
294 {
295 GList *l = NULL;
296 XsWatch *w;
297
298 if (!op->mutating || op->in_transaction) {
299 return;
300 }
301
302 if (parents) {
303 l = op->watches;
304 }
305
306 w = g_hash_table_lookup(op->s->watches, op->path);
307 while (w || l) {
308 if (!w) {
309 /* Fire the parent nodes from 'op' if asked to */
310 w = l->data;
311 l = l->next;
312 continue;
313 }
314
315 assert(strlen(op->path) > w->rel_prefix);
316 w->cb(w->cb_opaque, op->path + w->rel_prefix, w->token);
317
318 w = w->next;
319 }
320 }
321
xs_node_add_content(XsNode ** n,struct walk_op * op)322 static int xs_node_add_content(XsNode **n, struct walk_op *op)
323 {
324 GByteArray *data = op->op_opaque;
325
326 if (op->dom_id) {
327 /*
328 * The real XenStored includes permissions and names of child nodes
329 * in the calculated datasize but life's too short. For a single
330 * tenant internal XenStore, we don't have to be quite as pedantic.
331 */
332 if (data->len > XS_MAX_NODE_SIZE) {
333 return E2BIG;
334 }
335 }
336 /* We *are* the node to be written. Either this or a copy. */
337 if (!op->inplace) {
338 XsNode *old = *n;
339 *n = xs_node_copy(old);
340 xs_node_unref(old);
341 }
342
343 if ((*n)->content) {
344 g_byte_array_unref((*n)->content);
345 }
346 (*n)->content = g_byte_array_ref(data);
347 if (op->tx_id != XBT_NULL) {
348 (*n)->modified_in_tx = true;
349 }
350 return 0;
351 }
352
xs_node_get_content(XsNode ** n,struct walk_op * op)353 static int xs_node_get_content(XsNode **n, struct walk_op *op)
354 {
355 GByteArray *data = op->op_opaque;
356 GByteArray *node_data;
357
358 assert(op->inplace);
359 assert(*n);
360
361 node_data = (*n)->content;
362 if (node_data) {
363 g_byte_array_append(data, node_data->data, node_data->len);
364 }
365
366 return 0;
367 }
368
node_rm_recurse(gpointer key,gpointer value,gpointer user_data)369 static int node_rm_recurse(gpointer key, gpointer value, gpointer user_data)
370 {
371 struct walk_op *op = user_data;
372 int path_len = strlen(op->path);
373 int key_len = strlen(key);
374 XsNode *n = value;
375 bool this_inplace = op->inplace;
376
377 if (n->ref != 1) {
378 op->inplace = 0;
379 }
380
381 assert(key_len + path_len + 2 <= sizeof(op->path));
382 op->path[path_len] = '/';
383 memcpy(op->path + path_len + 1, key, key_len + 1);
384
385 if (n->children) {
386 g_hash_table_foreach_remove(n->children, node_rm_recurse, op);
387 }
388 op->new_nr_nodes--;
389
390 /*
391 * Fire watches on *this* node but not the parents because they are
392 * going to be deleted too, so the watch will fire for them anyway.
393 */
394 fire_watches(op, false);
395 op->path[path_len] = '\0';
396
397 /*
398 * Actually deleting the child here is just an optimisation; if we
399 * don't then the final unref on the topmost victim will just have
400 * to cascade down again repeating all the g_hash_table_foreach()
401 * calls.
402 */
403 return this_inplace;
404 }
405
406 static XsNode *xs_node_copy_deleted(XsNode *old, struct walk_op *op);
copy_deleted_recurse(gpointer key,gpointer value,gpointer user_data)407 static void copy_deleted_recurse(gpointer key, gpointer value,
408 gpointer user_data)
409 {
410 struct walk_op *op = user_data;
411 GHashTable *siblings = op->op_opaque2;
412 XsNode *n = xs_node_copy_deleted(value, op);
413
414 /*
415 * Reinsert the deleted_in_tx copy of the node into the parent's
416 * 'children' hash table. Having stashed it from op->op_opaque2
417 * before the recursive call to xs_node_copy_deleted() scribbled
418 * over it.
419 */
420 g_hash_table_insert(siblings, g_strdup(key), n);
421 }
422
xs_node_copy_deleted(XsNode * old,struct walk_op * op)423 static XsNode *xs_node_copy_deleted(XsNode *old, struct walk_op *op)
424 {
425 XsNode *n = xs_node_new();
426
427 n->gencnt = old->gencnt;
428
429 #ifdef XS_NODE_UNIT_TEST
430 if (old->name) {
431 n->name = g_strdup(old->name);
432 }
433 #endif
434
435 if (old->children) {
436 n->children = g_hash_table_new_full(g_str_hash, g_str_equal, g_free,
437 (GDestroyNotify)xs_node_unref);
438 op->op_opaque2 = n->children;
439 g_hash_table_foreach(old->children, copy_deleted_recurse, op);
440 }
441 if (old->perms) {
442 n->perms = g_list_copy_deep(old->perms, do_perm_copy, NULL);
443 }
444 n->deleted_in_tx = true;
445 /* If it gets resurrected we only fire a watch if it lost its content */
446 if (old->content) {
447 n->modified_in_tx = true;
448 }
449 op->new_nr_nodes--;
450 return n;
451 }
452
xs_node_rm(XsNode ** n,struct walk_op * op)453 static int xs_node_rm(XsNode **n, struct walk_op *op)
454 {
455 bool this_inplace = op->inplace;
456
457 if (op->tx_id != XBT_NULL) {
458 /* It's not trivial to do inplace handling for this one */
459 XsNode *old = *n;
460 *n = xs_node_copy_deleted(old, op);
461 xs_node_unref(old);
462 return 0;
463 }
464
465 /* Fire watches for, and count, nodes in the subtree which get deleted */
466 if ((*n)->children) {
467 g_hash_table_foreach_remove((*n)->children, node_rm_recurse, op);
468 }
469 op->new_nr_nodes--;
470
471 if (this_inplace) {
472 xs_node_unref(*n);
473 }
474 *n = NULL;
475 return 0;
476 }
477
xs_node_get_perms(XsNode ** n,struct walk_op * op)478 static int xs_node_get_perms(XsNode **n, struct walk_op *op)
479 {
480 GList **perms = op->op_opaque;
481
482 assert(op->inplace);
483 assert(*n);
484
485 *perms = g_list_copy_deep((*n)->perms, do_perm_copy, NULL);
486 return 0;
487 }
488
parse_perm(const char * perm,char * letter,unsigned int * dom_id)489 static void parse_perm(const char *perm, char *letter, unsigned int *dom_id)
490 {
491 unsigned int n = sscanf(perm, "%c%u", letter, dom_id);
492
493 assert(n == 2);
494 }
495
can_access(unsigned int dom_id,GList * perms,const char * letters)496 static bool can_access(unsigned int dom_id, GList *perms, const char *letters)
497 {
498 unsigned int i, n;
499 char perm_letter;
500 unsigned int perm_dom_id;
501 bool access;
502
503 if (dom_id == 0) {
504 return true;
505 }
506
507 n = g_list_length(perms);
508 assert(n >= 1);
509
510 /*
511 * The dom_id of the first perm is the owner, and the owner always has
512 * read-write access.
513 */
514 parse_perm(g_list_nth_data(perms, 0), &perm_letter, &perm_dom_id);
515 if (dom_id == perm_dom_id) {
516 return true;
517 }
518
519 /*
520 * The letter of the first perm specified the default access for all other
521 * domains.
522 */
523 access = !!strchr(letters, perm_letter);
524 for (i = 1; i < n; i++) {
525 parse_perm(g_list_nth_data(perms, i), &perm_letter, &perm_dom_id);
526 if (dom_id != perm_dom_id) {
527 continue;
528 }
529 access = !!strchr(letters, perm_letter);
530 }
531
532 return access;
533 }
534
xs_node_set_perms(XsNode ** n,struct walk_op * op)535 static int xs_node_set_perms(XsNode **n, struct walk_op *op)
536 {
537 GList *perms = op->op_opaque;
538
539 if (op->dom_id) {
540 unsigned int perm_dom_id;
541 char perm_letter;
542
543 /* A guest may not change permissions on nodes it does not own */
544 if (!can_access(op->dom_id, (*n)->perms, "")) {
545 return EPERM;
546 }
547
548 /* A guest may not change the owner of a node it owns. */
549 parse_perm(perms->data, &perm_letter, &perm_dom_id);
550 if (perm_dom_id != op->dom_id) {
551 return EPERM;
552 }
553
554 if (g_list_length(perms) > XS_MAX_PERMS_PER_NODE) {
555 return ENOSPC;
556 }
557 }
558
559 /* We *are* the node to be written. Either this or a copy. */
560 if (!op->inplace) {
561 XsNode *old = *n;
562 *n = xs_node_copy(old);
563 xs_node_unref(old);
564 }
565
566 if ((*n)->perms) {
567 g_list_free_full((*n)->perms, g_free);
568 }
569 (*n)->perms = g_list_copy_deep(perms, do_perm_copy, NULL);
570 if (op->tx_id != XBT_NULL) {
571 (*n)->modified_in_tx = true;
572 }
573 return 0;
574 }
575
576 /*
577 * Passed a full reference in *n which it may free if it needs to COW.
578 *
579 * When changing the tree, the op->inplace flag indicates whether this
580 * node may be modified in place (i.e. it and all its parents had a
581 * refcount of one). If walking down the tree we find a node whose
582 * refcount is higher, we must clear op->inplace and COW from there
583 * down. Unless we are creating new nodes as scaffolding for a write
584 * (which works like 'mkdir -p' does). In which case those newly
585 * created nodes can (and must) be modified in place again.
586 */
xs_node_walk(XsNode ** n,struct walk_op * op)587 static int xs_node_walk(XsNode **n, struct walk_op *op)
588 {
589 char *child_name = NULL;
590 size_t namelen;
591 XsNode *old = *n, *child = NULL;
592 bool stole_child = false;
593 bool this_inplace;
594 XsWatch *watch;
595 int err;
596
597 namelen = strlen(op->path);
598 watch = g_hash_table_lookup(op->s->watches, op->path);
599
600 /* Is there a child, or do we hit the double-NUL termination? */
601 if (op->path[namelen + 1]) {
602 char *slash;
603 child_name = op->path + namelen + 1;
604 slash = strchr(child_name, '/');
605 if (slash) {
606 *slash = '\0';
607 }
608 op->path[namelen] = '/';
609 }
610
611 /* If we walk into a subtree which is shared, we must COW */
612 if (op->mutating && old->ref != 1) {
613 op->inplace = false;
614 }
615
616 if (!child_name) {
617 const char *letters = op->mutating ? "wb" : "rb";
618
619 if (!can_access(op->dom_id, old->perms, letters)) {
620 err = EACCES;
621 goto out;
622 }
623
624 /* This is the actual node on which the operation shall be performed */
625 err = op->op_fn(n, op);
626 if (!err) {
627 fire_watches(op, true);
628 }
629 goto out;
630 }
631
632 /* op->inplace will be further modified during the recursion */
633 this_inplace = op->inplace;
634
635 if (old && old->children) {
636 child = g_hash_table_lookup(old->children, child_name);
637 /* This is a *weak* reference to 'child', owned by the hash table */
638 }
639
640 if (child) {
641 if (child->deleted_in_tx) {
642 assert(child->ref == 1);
643 /* Cannot actually set child->deleted_in_tx = false until later */
644 }
645 xs_node_ref(child);
646 /*
647 * Now we own it too. But if we can modify inplace, that's going to
648 * foil the check and force it to COW. We want to be the *only* owner
649 * so that it can be modified in place, so remove it from the hash
650 * table in that case. We'll add it (or its replacement) back later.
651 */
652 if (op->mutating && this_inplace) {
653 g_hash_table_remove(old->children, child_name);
654 stole_child = true;
655 }
656 } else if (op->create_dirs) {
657 assert(op->mutating);
658
659 if (!can_access(op->dom_id, old->perms, "wb")) {
660 err = EACCES;
661 goto out;
662 }
663
664 if (op->dom_id && op->new_nr_nodes >= XS_MAX_DOMAIN_NODES) {
665 err = ENOSPC;
666 goto out;
667 }
668
669 child = xs_node_create(child_name, old->perms);
670 op->new_nr_nodes++;
671
672 /*
673 * If we're creating a new child, we can clearly modify it (and its
674 * children) in place from here on down.
675 */
676 op->inplace = true;
677 } else {
678 err = ENOENT;
679 goto out;
680 }
681
682 /*
683 * If there's a watch on this node, add it to the list to be fired
684 * (with the correct full pathname for the modified node) at the end.
685 */
686 if (watch) {
687 op->watches = g_list_append(op->watches, watch);
688 }
689
690 /*
691 * Except for the temporary child-stealing as noted, our node has not
692 * changed yet. We don't yet know the overall operation will complete.
693 */
694 err = xs_node_walk(&child, op);
695
696 if (watch) {
697 op->watches = g_list_remove(op->watches, watch);
698 }
699
700 if (err || !op->mutating) {
701 if (stole_child) {
702 /* Put it back as it was. */
703 g_hash_table_replace(old->children, g_strdup(child_name), child);
704 } else {
705 xs_node_unref(child);
706 }
707 goto out;
708 }
709
710 /*
711 * Now we know the operation has completed successfully and we're on
712 * the way back up. Make the change, substituting 'child' in the
713 * node at our level.
714 */
715 if (!this_inplace) {
716 *n = xs_node_copy(old);
717 xs_node_unref(old);
718 }
719
720 /*
721 * If we resurrected a deleted_in_tx node, we can mark it as no longer
722 * deleted now that we know the overall operation has succeeded.
723 */
724 if (op->create_dirs && child && child->deleted_in_tx) {
725 op->new_nr_nodes++;
726 child->deleted_in_tx = false;
727 }
728
729 /*
730 * The child may be NULL here, for a remove operation. Either way,
731 * xs_node_add_child() will do the right thing and return a value
732 * indicating whether it changed the parent's hash table or not.
733 *
734 * We bump the parent gencnt if it adds a child that we *didn't*
735 * steal from it in the first place, or if child==NULL and was
736 * thus removed (whether we stole it earlier and didn't put it
737 * back, or xs_node_add_child() actually removed it now).
738 */
739 if ((xs_node_add_child(*n, child_name, child) && !stole_child) || !child) {
740 (*n)->gencnt++;
741 }
742
743 out:
744 op->path[namelen] = '\0';
745 if (!namelen) {
746 assert(!op->watches);
747 /*
748 * On completing the recursion back up the path walk and reaching the
749 * top, assign the new node count if the operation was successful. If
750 * the main tree was changed, bump its tx ID so that outstanding
751 * transactions correctly fail. But don't bump it every time; only
752 * if it makes a difference.
753 */
754 if (!err && op->mutating) {
755 if (!op->in_transaction) {
756 if (op->s->root_tx != op->s->last_tx) {
757 op->s->root_tx = next_tx(op->s);
758 }
759 op->s->nr_nodes = op->new_nr_nodes;
760 } else {
761 XsTransaction *tx = g_hash_table_lookup(op->s->transactions,
762 GINT_TO_POINTER(op->tx_id));
763 assert(tx);
764 tx->nr_nodes = op->new_nr_nodes;
765 }
766 }
767 }
768 return err;
769 }
770
append_directory_item(gpointer key,gpointer value,gpointer user_data)771 static void append_directory_item(gpointer key, gpointer value,
772 gpointer user_data)
773 {
774 GList **items = user_data;
775
776 *items = g_list_insert_sorted(*items, g_strdup(key), (GCompareFunc)strcmp);
777 }
778
779 /* Populates items with char * names which caller must free. */
xs_node_directory(XsNode ** n,struct walk_op * op)780 static int xs_node_directory(XsNode **n, struct walk_op *op)
781 {
782 GList **items = op->op_opaque;
783
784 assert(op->inplace);
785 assert(*n);
786
787 if ((*n)->children) {
788 g_hash_table_foreach((*n)->children, append_directory_item, items);
789 }
790
791 if (op->op_opaque2) {
792 *(uint64_t *)op->op_opaque2 = (*n)->gencnt;
793 }
794
795 return 0;
796 }
797
validate_path(char * outpath,const char * userpath,unsigned int dom_id)798 static int validate_path(char *outpath, const char *userpath,
799 unsigned int dom_id)
800 {
801 size_t i, pathlen = strlen(userpath);
802
803 if (!pathlen || userpath[pathlen] == '/' || strstr(userpath, "//")) {
804 return EINVAL;
805 }
806 for (i = 0; i < pathlen; i++) {
807 if (!strchr(XS_VALID_CHARS, userpath[i])) {
808 return EINVAL;
809 }
810 }
811 if (userpath[0] == '/') {
812 if (pathlen > XENSTORE_ABS_PATH_MAX) {
813 return E2BIG;
814 }
815 memcpy(outpath, userpath, pathlen + 1);
816 } else {
817 if (pathlen > XENSTORE_REL_PATH_MAX) {
818 return E2BIG;
819 }
820 snprintf(outpath, XENSTORE_ABS_PATH_MAX, "/local/domain/%u/%s", dom_id,
821 userpath);
822 }
823 return 0;
824 }
825
826
init_walk_op(XenstoreImplState * s,struct walk_op * op,xs_transaction_t tx_id,unsigned int dom_id,const char * path,XsNode *** rootp)827 static int init_walk_op(XenstoreImplState *s, struct walk_op *op,
828 xs_transaction_t tx_id, unsigned int dom_id,
829 const char *path, XsNode ***rootp)
830 {
831 int ret = validate_path(op->path, path, dom_id);
832 if (ret) {
833 return ret;
834 }
835
836 /*
837 * We use *two* NUL terminators at the end of the path, as during the walk
838 * we will temporarily turn each '/' into a NUL to allow us to use that
839 * path element for the lookup.
840 */
841 op->path[strlen(op->path) + 1] = '\0';
842 op->watches = NULL;
843 op->path[0] = '\0';
844 op->inplace = true;
845 op->mutating = false;
846 op->create_dirs = false;
847 op->in_transaction = false;
848 op->dom_id = dom_id;
849 op->tx_id = tx_id;
850 op->s = s;
851
852 if (tx_id == XBT_NULL) {
853 *rootp = &s->root;
854 op->new_nr_nodes = s->nr_nodes;
855 } else {
856 XsTransaction *tx = g_hash_table_lookup(s->transactions,
857 GINT_TO_POINTER(tx_id));
858 if (!tx) {
859 return ENOENT;
860 }
861 *rootp = &tx->root;
862 op->new_nr_nodes = tx->nr_nodes;
863 op->in_transaction = true;
864 }
865
866 return 0;
867 }
868
xs_impl_read(XenstoreImplState * s,unsigned int dom_id,xs_transaction_t tx_id,const char * path,GByteArray * data)869 int xs_impl_read(XenstoreImplState *s, unsigned int dom_id,
870 xs_transaction_t tx_id, const char *path, GByteArray *data)
871 {
872 /*
873 * The data GByteArray shall exist, and will be freed by caller.
874 * Just g_byte_array_append() to it.
875 */
876 struct walk_op op;
877 XsNode **n;
878 int ret;
879
880 ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
881 if (ret) {
882 return ret;
883 }
884 op.op_fn = xs_node_get_content;
885 op.op_opaque = data;
886 return xs_node_walk(n, &op);
887 }
888
xs_impl_write(XenstoreImplState * s,unsigned int dom_id,xs_transaction_t tx_id,const char * path,GByteArray * data)889 int xs_impl_write(XenstoreImplState *s, unsigned int dom_id,
890 xs_transaction_t tx_id, const char *path, GByteArray *data)
891 {
892 /*
893 * The data GByteArray shall exist, will be freed by caller. You are
894 * free to use g_byte_array_steal() and keep the data. Or just ref it.
895 */
896 struct walk_op op;
897 XsNode **n;
898 int ret;
899
900 ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
901 if (ret) {
902 return ret;
903 }
904 op.op_fn = xs_node_add_content;
905 op.op_opaque = data;
906 op.mutating = true;
907 op.create_dirs = true;
908 return xs_node_walk(n, &op);
909 }
910
xs_impl_directory(XenstoreImplState * s,unsigned int dom_id,xs_transaction_t tx_id,const char * path,uint64_t * gencnt,GList ** items)911 int xs_impl_directory(XenstoreImplState *s, unsigned int dom_id,
912 xs_transaction_t tx_id, const char *path,
913 uint64_t *gencnt, GList **items)
914 {
915 /*
916 * The items are (char *) to be freed by caller. Although it's consumed
917 * immediately so if you want to change it to (const char *) and keep
918 * them, go ahead and change the caller.
919 */
920 struct walk_op op;
921 XsNode **n;
922 int ret;
923
924 ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
925 if (ret) {
926 return ret;
927 }
928 op.op_fn = xs_node_directory;
929 op.op_opaque = items;
930 op.op_opaque2 = gencnt;
931 return xs_node_walk(n, &op);
932 }
933
xs_impl_transaction_start(XenstoreImplState * s,unsigned int dom_id,xs_transaction_t * tx_id)934 int xs_impl_transaction_start(XenstoreImplState *s, unsigned int dom_id,
935 xs_transaction_t *tx_id)
936 {
937 XsTransaction *tx;
938
939 if (*tx_id != XBT_NULL) {
940 return EINVAL;
941 }
942
943 if (dom_id && s->nr_domu_transactions >= XS_MAX_TRANSACTIONS) {
944 return ENOSPC;
945 }
946
947 tx = g_new0(XsTransaction, 1);
948
949 tx->nr_nodes = s->nr_nodes;
950 tx->tx_id = next_tx(s);
951 tx->base_tx = s->root_tx;
952 tx->root = xs_node_ref(s->root);
953 tx->dom_id = dom_id;
954
955 g_hash_table_insert(s->transactions, GINT_TO_POINTER(tx->tx_id), tx);
956 if (dom_id) {
957 s->nr_domu_transactions++;
958 }
959 *tx_id = tx->tx_id;
960 return 0;
961 }
962
tx_commit_walk(gpointer key,gpointer value,gpointer user_data)963 static gboolean tx_commit_walk(gpointer key, gpointer value,
964 gpointer user_data)
965 {
966 struct walk_op *op = user_data;
967 int path_len = strlen(op->path);
968 int key_len = strlen(key);
969 bool fire_parents = true;
970 XsWatch *watch;
971 XsNode *n = value;
972
973 if (n->ref != 1) {
974 return false;
975 }
976
977 if (n->deleted_in_tx) {
978 /*
979 * We fire watches on our parents if we are the *first* node
980 * to be deleted (the topmost one). This matches the behaviour
981 * when deleting in the live tree.
982 */
983 fire_parents = !op->deleted_in_tx;
984
985 /* Only used on the way down so no need to clear it later */
986 op->deleted_in_tx = true;
987 }
988
989 assert(key_len + path_len + 2 <= sizeof(op->path));
990 op->path[path_len] = '/';
991 memcpy(op->path + path_len + 1, key, key_len + 1);
992
993 watch = g_hash_table_lookup(op->s->watches, op->path);
994 if (watch) {
995 op->watches = g_list_append(op->watches, watch);
996 }
997
998 if (n->children) {
999 g_hash_table_foreach_remove(n->children, tx_commit_walk, op);
1000 }
1001
1002 if (watch) {
1003 op->watches = g_list_remove(op->watches, watch);
1004 }
1005
1006 /*
1007 * Don't fire watches if this node was only copied because a
1008 * descendent was changed. The modified_in_tx flag indicates the
1009 * ones which were really changed.
1010 */
1011 if (n->modified_in_tx || n->deleted_in_tx) {
1012 fire_watches(op, fire_parents);
1013 n->modified_in_tx = false;
1014 }
1015 op->path[path_len] = '\0';
1016
1017 /* Deleted nodes really do get expunged when we commit */
1018 return n->deleted_in_tx;
1019 }
1020
transaction_commit(XenstoreImplState * s,XsTransaction * tx)1021 static int transaction_commit(XenstoreImplState *s, XsTransaction *tx)
1022 {
1023 struct walk_op op;
1024 XsNode **n;
1025 int ret;
1026
1027 if (s->root_tx != tx->base_tx) {
1028 return EAGAIN;
1029 }
1030 xs_node_unref(s->root);
1031 s->root = tx->root;
1032 tx->root = NULL;
1033 s->root_tx = tx->tx_id;
1034 s->nr_nodes = tx->nr_nodes;
1035
1036 ret = init_walk_op(s, &op, XBT_NULL, tx->dom_id, "/", &n);
1037 /*
1038 * There are two reasons why init_walk_op() may fail: an invalid tx_id,
1039 * or an invalid path. We pass XBT_NULL and "/", and it cannot fail.
1040 * If it does, the world is broken. And returning 'ret' would be weird
1041 * because the transaction *was* committed, and all this tree walk is
1042 * trying to do is fire the resulting watches on newly-committed nodes.
1043 */
1044 g_assert(!ret);
1045
1046 op.deleted_in_tx = false;
1047 op.mutating = true;
1048
1049 /*
1050 * Walk the new root and fire watches on any node which has a
1051 * refcount of one (which is therefore unique to this transaction).
1052 */
1053 if (s->root->children) {
1054 g_hash_table_foreach_remove(s->root->children, tx_commit_walk, &op);
1055 }
1056
1057 return 0;
1058 }
1059
xs_impl_transaction_end(XenstoreImplState * s,unsigned int dom_id,xs_transaction_t tx_id,bool commit)1060 int xs_impl_transaction_end(XenstoreImplState *s, unsigned int dom_id,
1061 xs_transaction_t tx_id, bool commit)
1062 {
1063 int ret = 0;
1064 XsTransaction *tx = g_hash_table_lookup(s->transactions,
1065 GINT_TO_POINTER(tx_id));
1066
1067 if (!tx || tx->dom_id != dom_id) {
1068 return ENOENT;
1069 }
1070
1071 if (commit) {
1072 ret = transaction_commit(s, tx);
1073 }
1074
1075 g_hash_table_remove(s->transactions, GINT_TO_POINTER(tx_id));
1076 if (dom_id) {
1077 assert(s->nr_domu_transactions);
1078 s->nr_domu_transactions--;
1079 }
1080 return ret;
1081 }
1082
xs_impl_rm(XenstoreImplState * s,unsigned int dom_id,xs_transaction_t tx_id,const char * path)1083 int xs_impl_rm(XenstoreImplState *s, unsigned int dom_id,
1084 xs_transaction_t tx_id, const char *path)
1085 {
1086 struct walk_op op;
1087 XsNode **n;
1088 int ret;
1089
1090 ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
1091 if (ret) {
1092 return ret;
1093 }
1094 op.op_fn = xs_node_rm;
1095 op.mutating = true;
1096 return xs_node_walk(n, &op);
1097 }
1098
xs_impl_get_perms(XenstoreImplState * s,unsigned int dom_id,xs_transaction_t tx_id,const char * path,GList ** perms)1099 int xs_impl_get_perms(XenstoreImplState *s, unsigned int dom_id,
1100 xs_transaction_t tx_id, const char *path, GList **perms)
1101 {
1102 struct walk_op op;
1103 XsNode **n;
1104 int ret;
1105
1106 ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
1107 if (ret) {
1108 return ret;
1109 }
1110 op.op_fn = xs_node_get_perms;
1111 op.op_opaque = perms;
1112 return xs_node_walk(n, &op);
1113 }
1114
is_valid_perm(gpointer data,gpointer user_data)1115 static void is_valid_perm(gpointer data, gpointer user_data)
1116 {
1117 char *perm = data;
1118 bool *valid = user_data;
1119 char letter;
1120 unsigned int dom_id;
1121
1122 if (!*valid) {
1123 return;
1124 }
1125
1126 if (sscanf(perm, "%c%u", &letter, &dom_id) != 2) {
1127 *valid = false;
1128 return;
1129 }
1130
1131 switch (letter) {
1132 case 'n':
1133 case 'r':
1134 case 'w':
1135 case 'b':
1136 break;
1137
1138 default:
1139 *valid = false;
1140 break;
1141 }
1142 }
1143
xs_impl_set_perms(XenstoreImplState * s,unsigned int dom_id,xs_transaction_t tx_id,const char * path,GList * perms)1144 int xs_impl_set_perms(XenstoreImplState *s, unsigned int dom_id,
1145 xs_transaction_t tx_id, const char *path, GList *perms)
1146 {
1147 struct walk_op op;
1148 XsNode **n;
1149 bool valid = true;
1150 int ret;
1151
1152 if (!g_list_length(perms)) {
1153 return EINVAL;
1154 }
1155
1156 g_list_foreach(perms, is_valid_perm, &valid);
1157 if (!valid) {
1158 return EINVAL;
1159 }
1160
1161 ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
1162 if (ret) {
1163 return ret;
1164 }
1165 op.op_fn = xs_node_set_perms;
1166 op.op_opaque = perms;
1167 op.mutating = true;
1168 return xs_node_walk(n, &op);
1169 }
1170
do_xs_impl_watch(XenstoreImplState * s,unsigned int dom_id,const char * path,const char * token,xs_impl_watch_fn fn,void * opaque)1171 static int do_xs_impl_watch(XenstoreImplState *s, unsigned int dom_id,
1172 const char *path, const char *token,
1173 xs_impl_watch_fn fn, void *opaque)
1174
1175 {
1176 char abspath[XENSTORE_ABS_PATH_MAX + 1];
1177 XsWatch *w, *l;
1178 int ret;
1179
1180 ret = validate_path(abspath, path, dom_id);
1181 if (ret) {
1182 return ret;
1183 }
1184
1185 /* Check for duplicates */
1186 l = w = g_hash_table_lookup(s->watches, abspath);
1187 while (w) {
1188 if (!g_strcmp0(token, w->token) && opaque == w->cb_opaque &&
1189 fn == w->cb && dom_id == w->dom_id) {
1190 return EEXIST;
1191 }
1192 w = w->next;
1193 }
1194
1195 if (dom_id && s->nr_domu_watches >= XS_MAX_WATCHES) {
1196 return E2BIG;
1197 }
1198
1199 w = g_new0(XsWatch, 1);
1200 w->token = g_strdup(token);
1201 w->cb = fn;
1202 w->cb_opaque = opaque;
1203 w->dom_id = dom_id;
1204 w->rel_prefix = strlen(abspath) - strlen(path);
1205
1206 /* l was looked up above when checking for duplicates */
1207 if (l) {
1208 w->next = l->next;
1209 l->next = w;
1210 } else {
1211 g_hash_table_insert(s->watches, g_strdup(abspath), w);
1212 }
1213 if (dom_id) {
1214 s->nr_domu_watches++;
1215 }
1216
1217 return 0;
1218 }
1219
xs_impl_watch(XenstoreImplState * s,unsigned int dom_id,const char * path,const char * token,xs_impl_watch_fn fn,void * opaque)1220 int xs_impl_watch(XenstoreImplState *s, unsigned int dom_id, const char *path,
1221 const char *token, xs_impl_watch_fn fn, void *opaque)
1222 {
1223 int ret = do_xs_impl_watch(s, dom_id, path, token, fn, opaque);
1224
1225 if (!ret) {
1226 /* A new watch should fire immediately */
1227 fn(opaque, path, token);
1228 }
1229
1230 return ret;
1231 }
1232
free_watch(XenstoreImplState * s,XsWatch * w)1233 static XsWatch *free_watch(XenstoreImplState *s, XsWatch *w)
1234 {
1235 XsWatch *next = w->next;
1236
1237 if (w->dom_id) {
1238 assert(s->nr_domu_watches);
1239 s->nr_domu_watches--;
1240 }
1241
1242 g_free(w->token);
1243 g_free(w);
1244
1245 return next;
1246 }
1247
xs_impl_unwatch(XenstoreImplState * s,unsigned int dom_id,const char * path,const char * token,xs_impl_watch_fn fn,void * opaque)1248 int xs_impl_unwatch(XenstoreImplState *s, unsigned int dom_id,
1249 const char *path, const char *token,
1250 xs_impl_watch_fn fn, void *opaque)
1251 {
1252 char abspath[XENSTORE_ABS_PATH_MAX + 1];
1253 XsWatch *w, **l;
1254 int ret;
1255
1256 ret = validate_path(abspath, path, dom_id);
1257 if (ret) {
1258 return ret;
1259 }
1260
1261 w = g_hash_table_lookup(s->watches, abspath);
1262 if (!w) {
1263 return ENOENT;
1264 }
1265
1266 /*
1267 * The hash table contains the first element of a list of
1268 * watches. Removing the first element in the list is a
1269 * special case because we have to update the hash table to
1270 * point to the next (or remove it if there's nothing left).
1271 */
1272 if (!g_strcmp0(token, w->token) && fn == w->cb && opaque == w->cb_opaque &&
1273 dom_id == w->dom_id) {
1274 if (w->next) {
1275 /* Insert the previous 'next' into the hash table */
1276 g_hash_table_insert(s->watches, g_strdup(abspath), w->next);
1277 } else {
1278 /* Nothing left; remove from hash table */
1279 g_hash_table_remove(s->watches, abspath);
1280 }
1281 free_watch(s, w);
1282 return 0;
1283 }
1284
1285 /*
1286 * We're all done messing with the hash table because the element
1287 * it points to has survived the cull. Now it's just a simple
1288 * linked list removal operation.
1289 */
1290 for (l = &w->next; *l; l = &w->next) {
1291 w = *l;
1292
1293 if (!g_strcmp0(token, w->token) && fn == w->cb &&
1294 opaque != w->cb_opaque && dom_id == w->dom_id) {
1295 *l = free_watch(s, w);
1296 return 0;
1297 }
1298 }
1299
1300 return ENOENT;
1301 }
1302
xs_impl_reset_watches(XenstoreImplState * s,unsigned int dom_id)1303 int xs_impl_reset_watches(XenstoreImplState *s, unsigned int dom_id)
1304 {
1305 char **watch_paths;
1306 guint nr_watch_paths;
1307 guint i;
1308
1309 watch_paths = (char **)g_hash_table_get_keys_as_array(s->watches,
1310 &nr_watch_paths);
1311
1312 for (i = 0; i < nr_watch_paths; i++) {
1313 XsWatch *w1 = g_hash_table_lookup(s->watches, watch_paths[i]);
1314 XsWatch *w2, *w, **l;
1315
1316 /*
1317 * w1 is the original list. The hash table has this pointer.
1318 * w2 is the head of our newly-filtered list.
1319 * w and l are temporary for processing. w is somewhat redundant
1320 * with *l but makes my eyes bleed less.
1321 */
1322
1323 w = w2 = w1;
1324 l = &w;
1325 while (w) {
1326 if (w->dom_id == dom_id) {
1327 /* If we're freeing the head of the list, bump w2 */
1328 if (w2 == w) {
1329 w2 = w->next;
1330 }
1331 *l = free_watch(s, w);
1332 } else {
1333 l = &w->next;
1334 }
1335 w = *l;
1336 }
1337 /*
1338 * If the head of the list survived the cull, we don't need to
1339 * touch the hash table and we're done with this path. Else...
1340 */
1341 if (w1 != w2) {
1342 g_hash_table_steal(s->watches, watch_paths[i]);
1343
1344 /*
1345 * It was already freed. (Don't worry, this whole thing is
1346 * single-threaded and nobody saw it in the meantime). And
1347 * having *stolen* it, we now own the watch_paths[i] string
1348 * so if we don't give it back to the hash table, we need
1349 * to free it.
1350 */
1351 if (w2) {
1352 g_hash_table_insert(s->watches, watch_paths[i], w2);
1353 } else {
1354 g_free(watch_paths[i]);
1355 }
1356 }
1357 }
1358 g_free(watch_paths);
1359 return 0;
1360 }
1361
xs_tx_free(void * _tx)1362 static void xs_tx_free(void *_tx)
1363 {
1364 XsTransaction *tx = _tx;
1365 if (tx->root) {
1366 xs_node_unref(tx->root);
1367 }
1368 g_free(tx);
1369 }
1370
xs_impl_create(unsigned int dom_id)1371 XenstoreImplState *xs_impl_create(unsigned int dom_id)
1372 {
1373 XenstoreImplState *s = g_new0(XenstoreImplState, 1);
1374 GList *perms;
1375
1376 s->watches = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, NULL);
1377 s->transactions = g_hash_table_new_full(g_direct_hash, g_direct_equal,
1378 NULL, xs_tx_free);
1379
1380 perms = g_list_append(NULL, xs_perm_as_string(XS_PERM_NONE, 0));
1381 s->root = xs_node_create("/", perms);
1382 g_list_free_full(perms, g_free);
1383 s->nr_nodes = 1;
1384
1385 s->root_tx = s->last_tx = 1;
1386 return s;
1387 }
1388
1389
clear_serialized_tx(gpointer key,gpointer value,gpointer opaque)1390 static void clear_serialized_tx(gpointer key, gpointer value, gpointer opaque)
1391 {
1392 XsNode *n = value;
1393
1394 n->serialized_tx = XBT_NULL;
1395 if (n->children) {
1396 g_hash_table_foreach(n->children, clear_serialized_tx, NULL);
1397 }
1398 }
1399
clear_tx_serialized_tx(gpointer key,gpointer value,gpointer opaque)1400 static void clear_tx_serialized_tx(gpointer key, gpointer value,
1401 gpointer opaque)
1402 {
1403 XsTransaction *t = value;
1404
1405 clear_serialized_tx(NULL, t->root, NULL);
1406 }
1407
write_be32(GByteArray * save,uint32_t val)1408 static void write_be32(GByteArray *save, uint32_t val)
1409 {
1410 uint32_t be = htonl(val);
1411 g_byte_array_append(save, (void *)&be, sizeof(be));
1412 }
1413
1414
1415 struct save_state {
1416 GByteArray *bytes;
1417 unsigned int tx_id;
1418 };
1419
1420 #define MODIFIED_IN_TX (1U << 0)
1421 #define DELETED_IN_TX (1U << 1)
1422 #define NODE_REF (1U << 2)
1423
save_node(gpointer key,gpointer value,gpointer opaque)1424 static void save_node(gpointer key, gpointer value, gpointer opaque)
1425 {
1426 struct save_state *ss = opaque;
1427 XsNode *n = value;
1428 char *name = key;
1429 uint8_t flag = 0;
1430
1431 /* Child nodes (i.e. anything but the root) have a name */
1432 if (name) {
1433 g_byte_array_append(ss->bytes, key, strlen(key) + 1);
1434 }
1435
1436 /*
1437 * If we already wrote this node, refer to the previous copy.
1438 * There's no rename/move in XenStore, so all we need to find
1439 * it is the tx_id of the transaction in which it exists. Which
1440 * may be the root tx.
1441 */
1442 if (n->serialized_tx != XBT_NULL) {
1443 flag = NODE_REF;
1444 g_byte_array_append(ss->bytes, &flag, 1);
1445 write_be32(ss->bytes, n->serialized_tx);
1446 } else {
1447 GList *l;
1448 n->serialized_tx = ss->tx_id;
1449
1450 if (n->modified_in_tx) {
1451 flag |= MODIFIED_IN_TX;
1452 }
1453 if (n->deleted_in_tx) {
1454 flag |= DELETED_IN_TX;
1455 }
1456 g_byte_array_append(ss->bytes, &flag, 1);
1457
1458 if (n->content) {
1459 write_be32(ss->bytes, n->content->len);
1460 g_byte_array_append(ss->bytes, n->content->data, n->content->len);
1461 } else {
1462 write_be32(ss->bytes, 0);
1463 }
1464
1465 for (l = n->perms; l; l = l->next) {
1466 g_byte_array_append(ss->bytes, l->data, strlen(l->data) + 1);
1467 }
1468 /* NUL termination after perms */
1469 g_byte_array_append(ss->bytes, (void *)"", 1);
1470
1471 if (n->children) {
1472 g_hash_table_foreach(n->children, save_node, ss);
1473 }
1474 /* NUL termination after children (child name is NUL) */
1475 g_byte_array_append(ss->bytes, (void *)"", 1);
1476 }
1477 }
1478
save_tree(struct save_state * ss,uint32_t tx_id,XsNode * root)1479 static void save_tree(struct save_state *ss, uint32_t tx_id, XsNode *root)
1480 {
1481 write_be32(ss->bytes, tx_id);
1482 ss->tx_id = tx_id;
1483 save_node(NULL, root, ss);
1484 }
1485
save_tx(gpointer key,gpointer value,gpointer opaque)1486 static void save_tx(gpointer key, gpointer value, gpointer opaque)
1487 {
1488 uint32_t tx_id = GPOINTER_TO_INT(key);
1489 struct save_state *ss = opaque;
1490 XsTransaction *n = value;
1491
1492 write_be32(ss->bytes, n->base_tx);
1493 write_be32(ss->bytes, n->dom_id);
1494
1495 save_tree(ss, tx_id, n->root);
1496 }
1497
save_watch(gpointer key,gpointer value,gpointer opaque)1498 static void save_watch(gpointer key, gpointer value, gpointer opaque)
1499 {
1500 struct save_state *ss = opaque;
1501 XsWatch *w = value;
1502
1503 /* We only save the *guest* watches. */
1504 if (w->dom_id) {
1505 gpointer relpath = key + w->rel_prefix;
1506 g_byte_array_append(ss->bytes, relpath, strlen(relpath) + 1);
1507 g_byte_array_append(ss->bytes, (void *)w->token, strlen(w->token) + 1);
1508 }
1509 }
1510
xs_impl_serialize(XenstoreImplState * s)1511 GByteArray *xs_impl_serialize(XenstoreImplState *s)
1512 {
1513 struct save_state ss;
1514
1515 ss.bytes = g_byte_array_new();
1516
1517 /*
1518 * node = flags [ real_node / node_ref ]
1519 * flags = uint8_t (MODIFIED_IN_TX | DELETED_IN_TX | NODE_REF)
1520 * node_ref = tx_id (in which the original version of this node exists)
1521 * real_node = content perms child* NUL
1522 * content = len data
1523 * len = uint32_t
1524 * data = uint8_t{len}
1525 * perms = perm* NUL
1526 * perm = asciiz
1527 * child = name node
1528 * name = asciiz
1529 *
1530 * tree = tx_id node
1531 * tx_id = uint32_t
1532 *
1533 * transaction = base_tx_id dom_id tree
1534 * base_tx_id = uint32_t
1535 * dom_id = uint32_t
1536 *
1537 * tx_list = tree transaction* XBT_NULL
1538 *
1539 * watch = path token
1540 * path = asciiz
1541 * token = asciiz
1542 *
1543 * watch_list = watch* NUL
1544 *
1545 * xs_serialize_stream = last_tx tx_list watch_list
1546 * last_tx = uint32_t
1547 */
1548
1549 /* Clear serialized_tx in every node. */
1550 if (s->serialized) {
1551 clear_serialized_tx(NULL, s->root, NULL);
1552 g_hash_table_foreach(s->transactions, clear_tx_serialized_tx, NULL);
1553 }
1554
1555 s->serialized = true;
1556
1557 write_be32(ss.bytes, s->last_tx);
1558 save_tree(&ss, s->root_tx, s->root);
1559 g_hash_table_foreach(s->transactions, save_tx, &ss);
1560
1561 write_be32(ss.bytes, XBT_NULL);
1562
1563 g_hash_table_foreach(s->watches, save_watch, &ss);
1564 g_byte_array_append(ss.bytes, (void *)"", 1);
1565
1566 return ss.bytes;
1567 }
1568
1569 struct unsave_state {
1570 char path[XENSTORE_ABS_PATH_MAX + 1];
1571 XenstoreImplState *s;
1572 GByteArray *bytes;
1573 uint8_t *d;
1574 size_t l;
1575 bool root_walk;
1576 };
1577
consume_be32(struct unsave_state * us,unsigned int * val)1578 static int consume_be32(struct unsave_state *us, unsigned int *val)
1579 {
1580 uint32_t d;
1581
1582 if (us->l < sizeof(d)) {
1583 return -EINVAL;
1584 }
1585 memcpy(&d, us->d, sizeof(d));
1586 *val = ntohl(d);
1587 us->d += sizeof(d);
1588 us->l -= sizeof(d);
1589 return 0;
1590 }
1591
consume_string(struct unsave_state * us,char ** str,size_t * len)1592 static int consume_string(struct unsave_state *us, char **str, size_t *len)
1593 {
1594 size_t l;
1595
1596 if (!us->l) {
1597 return -EINVAL;
1598 }
1599
1600 l = strnlen((void *)us->d, us->l);
1601 if (l == us->l) {
1602 return -EINVAL;
1603 }
1604
1605 if (str) {
1606 *str = (void *)us->d;
1607 }
1608 if (len) {
1609 *len = l;
1610 }
1611
1612 us->d += l + 1;
1613 us->l -= l + 1;
1614 return 0;
1615 }
1616
lookup_node(XsNode * n,char * path)1617 static XsNode *lookup_node(XsNode *n, char *path)
1618 {
1619 char *slash = strchr(path, '/');
1620 XsNode *child;
1621
1622 if (path[0] == '\0') {
1623 return n;
1624 }
1625
1626 if (slash) {
1627 *slash = '\0';
1628 }
1629
1630 if (!n->children) {
1631 return NULL;
1632 }
1633 child = g_hash_table_lookup(n->children, path);
1634 if (!slash) {
1635 return child;
1636 }
1637
1638 *slash = '/';
1639 if (!child) {
1640 return NULL;
1641 }
1642 return lookup_node(child, slash + 1);
1643 }
1644
lookup_tx_node(struct unsave_state * us,unsigned int tx_id)1645 static XsNode *lookup_tx_node(struct unsave_state *us, unsigned int tx_id)
1646 {
1647 XsTransaction *t;
1648 if (tx_id == us->s->root_tx) {
1649 return lookup_node(us->s->root, us->path + 1);
1650 }
1651
1652 t = g_hash_table_lookup(us->s->transactions, GINT_TO_POINTER(tx_id));
1653 if (!t) {
1654 return NULL;
1655 }
1656 g_assert(t->root);
1657 return lookup_node(t->root, us->path + 1);
1658 }
1659
count_child_nodes(gpointer key,gpointer value,gpointer user_data)1660 static void count_child_nodes(gpointer key, gpointer value, gpointer user_data)
1661 {
1662 unsigned int *nr_nodes = user_data;
1663 XsNode *n = value;
1664
1665 (*nr_nodes)++;
1666
1667 if (n->children) {
1668 g_hash_table_foreach(n->children, count_child_nodes, nr_nodes);
1669 }
1670 }
1671
consume_node(struct unsave_state * us,XsNode ** nodep,unsigned int * nr_nodes)1672 static int consume_node(struct unsave_state *us, XsNode **nodep,
1673 unsigned int *nr_nodes)
1674 {
1675 XsNode *n = NULL;
1676 uint8_t flags;
1677 int ret;
1678
1679 if (us->l < 1) {
1680 return -EINVAL;
1681 }
1682 flags = us->d[0];
1683 us->d++;
1684 us->l--;
1685
1686 if (flags == NODE_REF) {
1687 unsigned int tx;
1688
1689 ret = consume_be32(us, &tx);
1690 if (ret) {
1691 return ret;
1692 }
1693
1694 n = lookup_tx_node(us, tx);
1695 if (!n) {
1696 return -EINVAL;
1697 }
1698 n->ref++;
1699 if (n->children) {
1700 g_hash_table_foreach(n->children, count_child_nodes, nr_nodes);
1701 }
1702 } else {
1703 uint32_t datalen;
1704
1705 if (flags & ~(DELETED_IN_TX | MODIFIED_IN_TX)) {
1706 return -EINVAL;
1707 }
1708 n = xs_node_new();
1709
1710 if (flags & DELETED_IN_TX) {
1711 n->deleted_in_tx = true;
1712 }
1713 if (flags & MODIFIED_IN_TX) {
1714 n->modified_in_tx = true;
1715 }
1716 ret = consume_be32(us, &datalen);
1717 if (ret) {
1718 xs_node_unref(n);
1719 return -EINVAL;
1720 }
1721 if (datalen) {
1722 if (datalen > us->l) {
1723 xs_node_unref(n);
1724 return -EINVAL;
1725 }
1726
1727 GByteArray *node_data = g_byte_array_new();
1728 g_byte_array_append(node_data, us->d, datalen);
1729 us->d += datalen;
1730 us->l -= datalen;
1731 n->content = node_data;
1732
1733 if (us->root_walk) {
1734 n->modified_in_tx = true;
1735 }
1736 }
1737 while (1) {
1738 char *perm = NULL;
1739 size_t permlen = 0;
1740
1741 ret = consume_string(us, &perm, &permlen);
1742 if (ret) {
1743 xs_node_unref(n);
1744 return ret;
1745 }
1746
1747 if (!permlen) {
1748 break;
1749 }
1750
1751 n->perms = g_list_append(n->perms, g_strdup(perm));
1752 }
1753
1754 /* Now children */
1755 while (1) {
1756 size_t childlen;
1757 char *childname;
1758 char *pathend;
1759 XsNode *child = NULL;
1760
1761 ret = consume_string(us, &childname, &childlen);
1762 if (ret) {
1763 xs_node_unref(n);
1764 return ret;
1765 }
1766
1767 if (!childlen) {
1768 break;
1769 }
1770
1771 pathend = us->path + strlen(us->path);
1772 strncat(us->path, "/", sizeof(us->path) - 1);
1773 strncat(us->path, childname, sizeof(us->path) - 1);
1774
1775 ret = consume_node(us, &child, nr_nodes);
1776 *pathend = '\0';
1777 if (ret) {
1778 xs_node_unref(n);
1779 return ret;
1780 }
1781 g_assert(child);
1782 xs_node_add_child(n, childname, child);
1783 }
1784
1785 /*
1786 * If the node has no data and no children we still want to fire
1787 * a watch on it.
1788 */
1789 if (us->root_walk && !n->children) {
1790 n->modified_in_tx = true;
1791 }
1792 }
1793
1794 if (!n->deleted_in_tx) {
1795 (*nr_nodes)++;
1796 }
1797
1798 *nodep = n;
1799 return 0;
1800 }
1801
consume_tree(struct unsave_state * us,XsTransaction * t)1802 static int consume_tree(struct unsave_state *us, XsTransaction *t)
1803 {
1804 int ret;
1805
1806 ret = consume_be32(us, &t->tx_id);
1807 if (ret) {
1808 return ret;
1809 }
1810
1811 if (t->tx_id > us->s->last_tx) {
1812 return -EINVAL;
1813 }
1814
1815 us->path[0] = '\0';
1816
1817 return consume_node(us, &t->root, &t->nr_nodes);
1818 }
1819
xs_impl_deserialize(XenstoreImplState * s,GByteArray * bytes,unsigned int dom_id,xs_impl_watch_fn watch_fn,void * watch_opaque)1820 int xs_impl_deserialize(XenstoreImplState *s, GByteArray *bytes,
1821 unsigned int dom_id, xs_impl_watch_fn watch_fn,
1822 void *watch_opaque)
1823 {
1824 struct unsave_state us;
1825 XsTransaction base_t = { 0 };
1826 int ret;
1827
1828 us.s = s;
1829 us.bytes = bytes;
1830 us.d = bytes->data;
1831 us.l = bytes->len;
1832
1833 xs_impl_reset_watches(s, dom_id);
1834 g_hash_table_remove_all(s->transactions);
1835
1836 xs_node_unref(s->root);
1837 s->root = NULL;
1838 s->root_tx = s->last_tx = XBT_NULL;
1839
1840 ret = consume_be32(&us, &s->last_tx);
1841 if (ret) {
1842 return ret;
1843 }
1844
1845 /*
1846 * Consume the base tree into a transaction so that watches can be
1847 * fired as we commit it. By setting us.root_walk we cause the nodes
1848 * to be marked as 'modified_in_tx' as they are created, so that the
1849 * watches are triggered on them.
1850 */
1851 base_t.dom_id = dom_id;
1852 base_t.base_tx = XBT_NULL;
1853 us.root_walk = true;
1854 ret = consume_tree(&us, &base_t);
1855 if (ret) {
1856 return ret;
1857 }
1858 us.root_walk = false;
1859
1860 /*
1861 * Commit the transaction now while the refcount on all nodes is 1.
1862 * Note that we haven't yet reinstated the *guest* watches but that's
1863 * OK because we don't want the guest to see any changes. Even any
1864 * backend nodes which get recreated should be *precisely* as they
1865 * were before the migration. Back ends may have been instantiated
1866 * already, and will see the frontend magically blink into existence
1867 * now (well, from the aio_bh which fires the watches). It's their
1868 * responsibility to rebuild everything precisely as it was before.
1869 */
1870 ret = transaction_commit(s, &base_t);
1871 if (ret) {
1872 return ret;
1873 }
1874
1875 while (1) {
1876 unsigned int base_tx;
1877 XsTransaction *t;
1878
1879 ret = consume_be32(&us, &base_tx);
1880 if (ret) {
1881 return ret;
1882 }
1883 if (base_tx == XBT_NULL) {
1884 break;
1885 }
1886
1887 t = g_new0(XsTransaction, 1);
1888 t->base_tx = base_tx;
1889
1890 ret = consume_be32(&us, &t->dom_id);
1891 if (!ret) {
1892 ret = consume_tree(&us, t);
1893 }
1894 if (ret) {
1895 g_free(t);
1896 return ret;
1897 }
1898 g_assert(t->root);
1899 if (t->dom_id) {
1900 s->nr_domu_transactions++;
1901 }
1902 g_hash_table_insert(s->transactions, GINT_TO_POINTER(t->tx_id), t);
1903 }
1904
1905 while (1) {
1906 char *path, *token;
1907 size_t pathlen, toklen;
1908
1909 ret = consume_string(&us, &path, &pathlen);
1910 if (ret) {
1911 return ret;
1912 }
1913 if (!pathlen) {
1914 break;
1915 }
1916
1917 ret = consume_string(&us, &token, &toklen);
1918 if (ret) {
1919 return ret;
1920 }
1921
1922 if (!watch_fn) {
1923 continue;
1924 }
1925
1926 ret = do_xs_impl_watch(s, dom_id, path, token, watch_fn, watch_opaque);
1927 if (ret) {
1928 return ret;
1929 }
1930 }
1931
1932 if (us.l) {
1933 return -EINVAL;
1934 }
1935
1936 return 0;
1937 }
1938