xref: /openbmc/qemu/hw/i386/kvm/xenstore_impl.c (revision 90bb6d676489b5cc063858ece263e1586795803f)
1 /*
2  * QEMU Xen emulation: The actual implementation of XenStore
3  *
4  * Copyright © 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
5  *
6  * Authors: David Woodhouse <dwmw2@infradead.org>, Paul Durrant <paul@xen.org>
7  *
8  * This work is licensed under the terms of the GNU GPL, version 2 or later.
9  * See the COPYING file in the top-level directory.
10  */
11 
12 #include "qemu/osdep.h"
13 #include "qom/object.h"
14 
15 #include "hw/xen/xen.h"
16 
17 #include "xen_xenstore.h"
18 #include "xenstore_impl.h"
19 
20 #include "hw/xen/interface/io/xs_wire.h"
21 
22 #define XS_MAX_WATCHES          128
23 #define XS_MAX_DOMAIN_NODES     1000
24 #define XS_MAX_NODE_SIZE        2048
25 #define XS_MAX_TRANSACTIONS     10
26 #define XS_MAX_PERMS_PER_NODE   5
27 
28 #define XS_VALID_CHARS "abcdefghijklmnopqrstuvwxyz" \
29                        "ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
30                        "0123456789-/_"
31 
32 typedef struct XsNode {
33     uint32_t ref;
34     GByteArray *content;
35     GList *perms;
36     GHashTable *children;
37     uint64_t gencnt;
38     bool deleted_in_tx;
39     bool modified_in_tx;
40     unsigned int serialized_tx;
41 #ifdef XS_NODE_UNIT_TEST
42     gchar *name; /* debug only */
43 #endif
44 } XsNode;
45 
46 typedef struct XsWatch {
47     struct XsWatch *next;
48     xs_impl_watch_fn *cb;
49     void *cb_opaque;
50     char *token;
51     unsigned int dom_id;
52     int rel_prefix;
53 } XsWatch;
54 
55 typedef struct XsTransaction {
56     XsNode *root;
57     unsigned int nr_nodes;
58     unsigned int base_tx;
59     unsigned int tx_id;
60     unsigned int dom_id;
61 } XsTransaction;
62 
63 struct XenstoreImplState {
64     XsNode *root;
65     unsigned int nr_nodes;
66     GHashTable *watches;
67     unsigned int nr_domu_watches;
68     GHashTable *transactions;
69     unsigned int nr_domu_transactions;
70     unsigned int root_tx;
71     unsigned int last_tx;
72     bool serialized;
73 };
74 
75 
76 static void nobble_tx(gpointer key, gpointer value, gpointer user_data)
77 {
78     unsigned int *new_tx_id = user_data;
79     XsTransaction *tx = value;
80 
81     if (tx->base_tx == *new_tx_id) {
82         /* Transactions based on XBT_NULL will always fail */
83         tx->base_tx = XBT_NULL;
84     }
85 }
86 
87 static inline unsigned int next_tx(struct XenstoreImplState *s)
88 {
89     unsigned int tx_id;
90 
91     /* Find the next TX id which isn't either XBT_NULL or in use. */
92     do {
93         tx_id = ++s->last_tx;
94     } while (tx_id == XBT_NULL || tx_id == s->root_tx ||
95              g_hash_table_lookup(s->transactions, GINT_TO_POINTER(tx_id)));
96 
97     /*
98      * It is vanishingly unlikely, but ensure that no outstanding transaction
99      * is based on the (previous incarnation of the) newly-allocated TX id.
100      */
101     g_hash_table_foreach(s->transactions, nobble_tx, &tx_id);
102 
103     return tx_id;
104 }
105 
106 static inline XsNode *xs_node_new(void)
107 {
108     XsNode *n = g_new0(XsNode, 1);
109     n->ref = 1;
110 
111 #ifdef XS_NODE_UNIT_TEST
112     nr_xs_nodes++;
113     xs_node_list = g_list_prepend(xs_node_list, n);
114 #endif
115     return n;
116 }
117 
118 static inline XsNode *xs_node_ref(XsNode *n)
119 {
120     /* With just 10 transactions, it can never get anywhere near this. */
121     g_assert(n->ref < INT_MAX);
122 
123     g_assert(n->ref);
124     n->ref++;
125     return n;
126 }
127 
128 static inline void xs_node_unref(XsNode *n)
129 {
130     if (!n) {
131         return;
132     }
133     g_assert(n->ref);
134     if (--n->ref) {
135         return;
136     }
137 
138     if (n->content) {
139         g_byte_array_unref(n->content);
140     }
141     if (n->perms) {
142         g_list_free_full(n->perms, g_free);
143     }
144     if (n->children) {
145         g_hash_table_unref(n->children);
146     }
147 #ifdef XS_NODE_UNIT_TEST
148     g_free(n->name);
149     nr_xs_nodes--;
150     xs_node_list = g_list_remove(xs_node_list, n);
151 #endif
152     g_free(n);
153 }
154 
155 char *xs_perm_as_string(unsigned int perm, unsigned int domid)
156 {
157     char letter;
158 
159     switch (perm) {
160     case XS_PERM_READ | XS_PERM_WRITE:
161         letter = 'b';
162         break;
163     case XS_PERM_READ:
164         letter = 'r';
165         break;
166     case XS_PERM_WRITE:
167         letter = 'w';
168         break;
169     case XS_PERM_NONE:
170     default:
171         letter = 'n';
172         break;
173     }
174 
175     return g_strdup_printf("%c%u", letter, domid);
176 }
177 
178 static gpointer do_perm_copy(gconstpointer src, gpointer user_data)
179 {
180     return g_strdup(src);
181 }
182 
183 static XsNode *xs_node_create(const char *name, GList *perms)
184 {
185     XsNode *n = xs_node_new();
186 
187 #ifdef XS_NODE_UNIT_TEST
188     if (name) {
189         n->name = g_strdup(name);
190     }
191 #endif
192 
193     n->perms = g_list_copy_deep(perms, do_perm_copy, NULL);
194 
195     return n;
196 }
197 
198 /* For copying from one hash table to another using g_hash_table_foreach() */
199 static void do_child_insert(gpointer key, gpointer value, gpointer user_data)
200 {
201     g_hash_table_insert(user_data, g_strdup(key), xs_node_ref(value));
202 }
203 
204 static XsNode *xs_node_copy(XsNode *old)
205 {
206     XsNode *n = xs_node_new();
207 
208     n->gencnt = old->gencnt;
209 
210 #ifdef XS_NODE_UNIT_TEST
211     if (n->name) {
212         n->name = g_strdup(old->name);
213     }
214 #endif
215 
216     assert(old);
217     if (old->children) {
218         n->children = g_hash_table_new_full(g_str_hash, g_str_equal, g_free,
219                                             (GDestroyNotify)xs_node_unref);
220         g_hash_table_foreach(old->children, do_child_insert, n->children);
221     }
222     if (old->perms) {
223         n->perms = g_list_copy_deep(old->perms, do_perm_copy, NULL);
224     }
225     if (old->content) {
226         n->content = g_byte_array_ref(old->content);
227     }
228     return n;
229 }
230 
231 /* Returns true if it made a change to the hash table */
232 static bool xs_node_add_child(XsNode *n, const char *path_elem, XsNode *child)
233 {
234     assert(!strchr(path_elem, '/'));
235 
236     if (!child) {
237         assert(n->children);
238         return g_hash_table_remove(n->children, path_elem);
239     }
240 
241 #ifdef XS_NODE_UNIT_TEST
242     g_free(child->name);
243     child->name = g_strdup(path_elem);
244 #endif
245     if (!n->children) {
246         n->children = g_hash_table_new_full(g_str_hash, g_str_equal, g_free,
247                                             (GDestroyNotify)xs_node_unref);
248     }
249 
250     /*
251      * The documentation for g_hash_table_insert() says that it "returns a
252      * boolean value to indicate whether the newly added value was already
253      * in the hash table or not."
254      *
255      * It could perhaps be clearer that returning TRUE means it wasn't,
256      */
257     return g_hash_table_insert(n->children, g_strdup(path_elem), child);
258 }
259 
260 struct walk_op {
261     struct XenstoreImplState *s;
262     char path[XENSTORE_ABS_PATH_MAX + 2]; /* Two NUL terminators */
263     int (*op_fn)(XsNode **n, struct walk_op *op);
264     void *op_opaque;
265     void *op_opaque2;
266 
267     GList *watches;
268     unsigned int dom_id;
269     unsigned int tx_id;
270 
271     /* The number of nodes which will exist in the tree if this op succeeds. */
272     unsigned int new_nr_nodes;
273 
274     /*
275      * This is maintained on the way *down* the walk to indicate
276      * whether nodes can be modified in place or whether COW is
277      * required. It starts off being true, as we're always going to
278      * replace the root node. If we walk into a shared subtree it
279      * becomes false. If we start *creating* new nodes for a write,
280      * it becomes true again.
281      *
282      * Do not use it on the way back up.
283      */
284     bool inplace;
285     bool mutating;
286     bool create_dirs;
287     bool in_transaction;
288 
289     /* Tracking during recursion so we know which is first. */
290     bool deleted_in_tx;
291 };
292 
293 static void fire_watches(struct walk_op *op, bool parents)
294 {
295     GList *l = NULL;
296     XsWatch *w;
297 
298     if (!op->mutating || op->in_transaction) {
299         return;
300     }
301 
302     if (parents) {
303         l = op->watches;
304     }
305 
306     w = g_hash_table_lookup(op->s->watches, op->path);
307     while (w || l) {
308         if (!w) {
309             /* Fire the parent nodes from 'op' if asked to */
310             w = l->data;
311             l = l->next;
312             continue;
313         }
314 
315         assert(strlen(op->path) > w->rel_prefix);
316         w->cb(w->cb_opaque, op->path + w->rel_prefix, w->token);
317 
318         w = w->next;
319     }
320 }
321 
322 static int xs_node_add_content(XsNode **n, struct walk_op *op)
323 {
324     GByteArray *data = op->op_opaque;
325 
326     if (op->dom_id) {
327         /*
328          * The real XenStored includes permissions and names of child nodes
329          * in the calculated datasize but life's too short. For a single
330          * tenant internal XenStore, we don't have to be quite as pedantic.
331          */
332         if (data->len > XS_MAX_NODE_SIZE) {
333             return E2BIG;
334         }
335     }
336     /* We *are* the node to be written. Either this or a copy. */
337     if (!op->inplace) {
338         XsNode *old = *n;
339         *n = xs_node_copy(old);
340         xs_node_unref(old);
341     }
342 
343     if ((*n)->content) {
344         g_byte_array_unref((*n)->content);
345     }
346     (*n)->content = g_byte_array_ref(data);
347     if (op->tx_id != XBT_NULL) {
348         (*n)->modified_in_tx = true;
349     }
350     return 0;
351 }
352 
353 static int xs_node_get_content(XsNode **n, struct walk_op *op)
354 {
355     GByteArray *data = op->op_opaque;
356     GByteArray *node_data;
357 
358     assert(op->inplace);
359     assert(*n);
360 
361     node_data = (*n)->content;
362     if (node_data) {
363         g_byte_array_append(data, node_data->data, node_data->len);
364     }
365 
366     return 0;
367 }
368 
369 static int node_rm_recurse(gpointer key, gpointer value, gpointer user_data)
370 {
371     struct walk_op *op = user_data;
372     int path_len = strlen(op->path);
373     int key_len = strlen(key);
374     XsNode *n = value;
375     bool this_inplace = op->inplace;
376 
377     if (n->ref != 1) {
378         op->inplace = 0;
379     }
380 
381     assert(key_len + path_len + 2 <= sizeof(op->path));
382     op->path[path_len] = '/';
383     memcpy(op->path + path_len + 1, key, key_len + 1);
384 
385     if (n->children) {
386         g_hash_table_foreach_remove(n->children, node_rm_recurse, op);
387     }
388     op->new_nr_nodes--;
389 
390     /*
391      * Fire watches on *this* node but not the parents because they are
392      * going to be deleted too, so the watch will fire for them anyway.
393      */
394     fire_watches(op, false);
395     op->path[path_len] = '\0';
396 
397     /*
398      * Actually deleting the child here is just an optimisation; if we
399      * don't then the final unref on the topmost victim will just have
400      * to cascade down again repeating all the g_hash_table_foreach()
401      * calls.
402      */
403     return this_inplace;
404 }
405 
406 static XsNode *xs_node_copy_deleted(XsNode *old, struct walk_op *op);
407 static void copy_deleted_recurse(gpointer key, gpointer value,
408                                  gpointer user_data)
409 {
410     struct walk_op *op = user_data;
411     GHashTable *siblings = op->op_opaque2;
412     XsNode *n = xs_node_copy_deleted(value, op);
413 
414     /*
415      * Reinsert the deleted_in_tx copy of the node into the parent's
416      * 'children' hash table. Having stashed it from op->op_opaque2
417      * before the recursive call to xs_node_copy_deleted() scribbled
418      * over it.
419      */
420     g_hash_table_insert(siblings, g_strdup(key), n);
421 }
422 
423 static XsNode *xs_node_copy_deleted(XsNode *old, struct walk_op *op)
424 {
425     XsNode *n = xs_node_new();
426 
427     n->gencnt = old->gencnt;
428 
429 #ifdef XS_NODE_UNIT_TEST
430     if (old->name) {
431         n->name = g_strdup(old->name);
432     }
433 #endif
434 
435     if (old->children) {
436         n->children = g_hash_table_new_full(g_str_hash, g_str_equal, g_free,
437                                             (GDestroyNotify)xs_node_unref);
438         op->op_opaque2 = n->children;
439         g_hash_table_foreach(old->children, copy_deleted_recurse, op);
440     }
441     if (old->perms) {
442         n->perms = g_list_copy_deep(old->perms, do_perm_copy, NULL);
443     }
444     n->deleted_in_tx = true;
445     /* If it gets resurrected we only fire a watch if it lost its content */
446     if (old->content) {
447         n->modified_in_tx = true;
448     }
449     op->new_nr_nodes--;
450     return n;
451 }
452 
453 static int xs_node_rm(XsNode **n, struct walk_op *op)
454 {
455     bool this_inplace = op->inplace;
456 
457     if (op->tx_id != XBT_NULL) {
458         /* It's not trivial to do inplace handling for this one */
459         XsNode *old = *n;
460         *n = xs_node_copy_deleted(old, op);
461         xs_node_unref(old);
462         return 0;
463     }
464 
465     /* Fire watches for, and count, nodes in the subtree which get deleted */
466     if ((*n)->children) {
467         g_hash_table_foreach_remove((*n)->children, node_rm_recurse, op);
468     }
469     op->new_nr_nodes--;
470 
471     if (this_inplace) {
472         xs_node_unref(*n);
473     }
474     *n = NULL;
475     return 0;
476 }
477 
478 static int xs_node_get_perms(XsNode **n, struct walk_op *op)
479 {
480     GList **perms = op->op_opaque;
481 
482     assert(op->inplace);
483     assert(*n);
484 
485     *perms = g_list_copy_deep((*n)->perms, do_perm_copy, NULL);
486     return 0;
487 }
488 
489 static void parse_perm(const char *perm, char *letter, unsigned int *dom_id)
490 {
491     unsigned int n = sscanf(perm, "%c%u", letter, dom_id);
492 
493     assert(n == 2);
494 }
495 
496 static bool can_access(unsigned int dom_id, GList *perms, const char *letters)
497 {
498     unsigned int i, n;
499     char perm_letter;
500     unsigned int perm_dom_id;
501     bool access;
502 
503     if (dom_id == 0) {
504         return true;
505     }
506 
507     n = g_list_length(perms);
508     assert(n >= 1);
509 
510     /*
511      * The dom_id of the first perm is the owner, and the owner always has
512      * read-write access.
513      */
514     parse_perm(g_list_nth_data(perms, 0), &perm_letter, &perm_dom_id);
515     if (dom_id == perm_dom_id) {
516         return true;
517     }
518 
519     /*
520      * The letter of the first perm specified the default access for all other
521      * domains.
522      */
523     access = !!strchr(letters, perm_letter);
524     for (i = 1; i < n; i++) {
525         parse_perm(g_list_nth_data(perms, i), &perm_letter, &perm_dom_id);
526         if (dom_id != perm_dom_id) {
527             continue;
528         }
529         access = !!strchr(letters, perm_letter);
530     }
531 
532     return access;
533 }
534 
535 static int xs_node_set_perms(XsNode **n, struct walk_op *op)
536 {
537     GList *perms = op->op_opaque;
538 
539     if (op->dom_id) {
540         unsigned int perm_dom_id;
541         char perm_letter;
542 
543         /* A guest may not change permissions on nodes it does not own */
544         if (!can_access(op->dom_id, (*n)->perms, "")) {
545             return EPERM;
546         }
547 
548         /* A guest may not change the owner of a node it owns. */
549         parse_perm(perms->data, &perm_letter, &perm_dom_id);
550         if (perm_dom_id != op->dom_id) {
551             return EPERM;
552         }
553 
554         if (g_list_length(perms) > XS_MAX_PERMS_PER_NODE) {
555             return ENOSPC;
556         }
557     }
558 
559     /* We *are* the node to be written. Either this or a copy. */
560     if (!op->inplace) {
561         XsNode *old = *n;
562         *n = xs_node_copy(old);
563         xs_node_unref(old);
564     }
565 
566     if ((*n)->perms) {
567         g_list_free_full((*n)->perms, g_free);
568     }
569     (*n)->perms = g_list_copy_deep(perms, do_perm_copy, NULL);
570     if (op->tx_id != XBT_NULL) {
571         (*n)->modified_in_tx = true;
572     }
573     return 0;
574 }
575 
576 /*
577  * Passed a full reference in *n which it may free if it needs to COW.
578  *
579  * When changing the tree, the op->inplace flag indicates whether this
580  * node may be modified in place (i.e. it and all its parents had a
581  * refcount of one). If walking down the tree we find a node whose
582  * refcount is higher, we must clear op->inplace and COW from there
583  * down. Unless we are creating new nodes as scaffolding for a write
584  * (which works like 'mkdir -p' does). In which case those newly
585  * created nodes can (and must) be modified in place again.
586  */
587 static int xs_node_walk(XsNode **n, struct walk_op *op)
588 {
589     char *child_name = NULL;
590     size_t namelen;
591     XsNode *old = *n, *child = NULL;
592     bool stole_child = false;
593     bool this_inplace;
594     XsWatch *watch;
595     int err;
596 
597     namelen = strlen(op->path);
598     watch = g_hash_table_lookup(op->s->watches, op->path);
599 
600     /* Is there a child, or do we hit the double-NUL termination? */
601     if (op->path[namelen + 1]) {
602         char *slash;
603         child_name = op->path + namelen + 1;
604         slash = strchr(child_name, '/');
605         if (slash) {
606             *slash = '\0';
607         }
608         op->path[namelen] = '/';
609     }
610 
611     /* If we walk into a subtree which is shared, we must COW */
612     if (op->mutating && old->ref != 1) {
613         op->inplace = false;
614     }
615 
616     if (!child_name) {
617         const char *letters = op->mutating ? "wb" : "rb";
618 
619         if (!can_access(op->dom_id, old->perms, letters)) {
620             err = EACCES;
621             goto out;
622         }
623 
624         /* This is the actual node on which the operation shall be performed */
625         err = op->op_fn(n, op);
626         if (!err) {
627             fire_watches(op, true);
628         }
629         goto out;
630     }
631 
632     /* op->inplace will be further modified during the recursion */
633     this_inplace = op->inplace;
634 
635     if (old && old->children) {
636         child = g_hash_table_lookup(old->children, child_name);
637         /* This is a *weak* reference to 'child', owned by the hash table */
638     }
639 
640     if (child) {
641         if (child->deleted_in_tx) {
642             assert(child->ref == 1);
643             /* Cannot actually set child->deleted_in_tx = false until later */
644         }
645         xs_node_ref(child);
646         /*
647          * Now we own it too. But if we can modify inplace, that's going to
648          * foil the check and force it to COW. We want to be the *only* owner
649          * so that it can be modified in place, so remove it from the hash
650          * table in that case. We'll add it (or its replacement) back later.
651          */
652         if (op->mutating && this_inplace) {
653             g_hash_table_remove(old->children, child_name);
654             stole_child = true;
655         }
656     } else if (op->create_dirs) {
657         assert(op->mutating);
658 
659         if (!can_access(op->dom_id, old->perms, "wb")) {
660             err = EACCES;
661             goto out;
662         }
663 
664         if (op->dom_id && op->new_nr_nodes >= XS_MAX_DOMAIN_NODES) {
665             err = ENOSPC;
666             goto out;
667         }
668 
669         child = xs_node_create(child_name, old->perms);
670         op->new_nr_nodes++;
671 
672         /*
673          * If we're creating a new child, we can clearly modify it (and its
674          * children) in place from here on down.
675          */
676         op->inplace = true;
677     } else {
678         err = ENOENT;
679         goto out;
680     }
681 
682     /*
683      * If there's a watch on this node, add it to the list to be fired
684      * (with the correct full pathname for the modified node) at the end.
685      */
686     if (watch) {
687         op->watches = g_list_append(op->watches, watch);
688     }
689 
690     /*
691      * Except for the temporary child-stealing as noted, our node has not
692      * changed yet. We don't yet know the overall operation will complete.
693      */
694     err = xs_node_walk(&child, op);
695 
696     if (watch) {
697         op->watches = g_list_remove(op->watches, watch);
698     }
699 
700     if (err || !op->mutating) {
701         if (stole_child) {
702             /* Put it back as it was. */
703             g_hash_table_replace(old->children, g_strdup(child_name), child);
704         } else {
705             xs_node_unref(child);
706         }
707         goto out;
708     }
709 
710     /*
711      * Now we know the operation has completed successfully and we're on
712      * the way back up. Make the change, substituting 'child' in the
713      * node at our level.
714      */
715     if (!this_inplace) {
716         *n = xs_node_copy(old);
717         xs_node_unref(old);
718     }
719 
720     /*
721      * If we resurrected a deleted_in_tx node, we can mark it as no longer
722      * deleted now that we know the overall operation has succeeded.
723      */
724     if (op->create_dirs && child && child->deleted_in_tx) {
725         op->new_nr_nodes++;
726         child->deleted_in_tx = false;
727     }
728 
729     /*
730      * The child may be NULL here, for a remove operation. Either way,
731      * xs_node_add_child() will do the right thing and return a value
732      * indicating whether it changed the parent's hash table or not.
733      *
734      * We bump the parent gencnt if it adds a child that we *didn't*
735      * steal from it in the first place, or if child==NULL and was
736      * thus removed (whether we stole it earlier and didn't put it
737      * back, or xs_node_add_child() actually removed it now).
738      */
739     if ((xs_node_add_child(*n, child_name, child) && !stole_child) || !child) {
740         (*n)->gencnt++;
741     }
742 
743  out:
744     op->path[namelen] = '\0';
745     if (!namelen) {
746         assert(!op->watches);
747         /*
748          * On completing the recursion back up the path walk and reaching the
749          * top, assign the new node count if the operation was successful. If
750          * the main tree was changed, bump its tx ID so that outstanding
751          * transactions correctly fail. But don't bump it every time; only
752          * if it makes a difference.
753          */
754         if (!err && op->mutating) {
755             if (!op->in_transaction) {
756                 if (op->s->root_tx != op->s->last_tx) {
757                     op->s->root_tx = next_tx(op->s);
758                 }
759                 op->s->nr_nodes = op->new_nr_nodes;
760             } else {
761                 XsTransaction *tx = g_hash_table_lookup(op->s->transactions,
762                                                         GINT_TO_POINTER(op->tx_id));
763                 assert(tx);
764                 tx->nr_nodes = op->new_nr_nodes;
765             }
766         }
767     }
768     return err;
769 }
770 
771 static void append_directory_item(gpointer key, gpointer value,
772                                   gpointer user_data)
773 {
774     GList **items = user_data;
775 
776     *items = g_list_insert_sorted(*items, g_strdup(key), (GCompareFunc)strcmp);
777 }
778 
779 /* Populates items with char * names which caller must free. */
780 static int xs_node_directory(XsNode **n, struct walk_op *op)
781 {
782     GList **items = op->op_opaque;
783 
784     assert(op->inplace);
785     assert(*n);
786 
787     if ((*n)->children) {
788         g_hash_table_foreach((*n)->children, append_directory_item, items);
789     }
790 
791     if (op->op_opaque2) {
792         *(uint64_t *)op->op_opaque2 = (*n)->gencnt;
793     }
794 
795     return 0;
796 }
797 
798 static int validate_path(char *outpath, const char *userpath,
799                          unsigned int dom_id)
800 {
801     size_t i, pathlen = strlen(userpath);
802 
803     if (!pathlen || userpath[pathlen] == '/' || strstr(userpath, "//")) {
804         return EINVAL;
805     }
806     for (i = 0; i < pathlen; i++) {
807         if (!strchr(XS_VALID_CHARS, userpath[i])) {
808             return EINVAL;
809         }
810     }
811     if (userpath[0] == '/') {
812         if (pathlen > XENSTORE_ABS_PATH_MAX) {
813             return E2BIG;
814         }
815         memcpy(outpath, userpath, pathlen + 1);
816     } else {
817         if (pathlen > XENSTORE_REL_PATH_MAX) {
818             return E2BIG;
819         }
820         snprintf(outpath, XENSTORE_ABS_PATH_MAX, "/local/domain/%u/%s", dom_id,
821                  userpath);
822     }
823     return 0;
824 }
825 
826 
827 static int init_walk_op(XenstoreImplState *s, struct walk_op *op,
828                         xs_transaction_t tx_id, unsigned int dom_id,
829                         const char *path, XsNode ***rootp)
830 {
831     int ret = validate_path(op->path, path, dom_id);
832     if (ret) {
833         return ret;
834     }
835 
836     /*
837      * We use *two* NUL terminators at the end of the path, as during the walk
838      * we will temporarily turn each '/' into a NUL to allow us to use that
839      * path element for the lookup.
840      */
841     op->path[strlen(op->path) + 1] = '\0';
842     op->watches = NULL;
843     op->path[0] = '\0';
844     op->inplace = true;
845     op->mutating = false;
846     op->create_dirs = false;
847     op->in_transaction = false;
848     op->dom_id = dom_id;
849     op->tx_id = tx_id;
850     op->s = s;
851 
852     if (tx_id == XBT_NULL) {
853         *rootp = &s->root;
854         op->new_nr_nodes = s->nr_nodes;
855     } else {
856         XsTransaction *tx = g_hash_table_lookup(s->transactions,
857                                                 GINT_TO_POINTER(tx_id));
858         if (!tx) {
859             return ENOENT;
860         }
861         *rootp = &tx->root;
862         op->new_nr_nodes = tx->nr_nodes;
863         op->in_transaction = true;
864     }
865 
866     return 0;
867 }
868 
869 int xs_impl_read(XenstoreImplState *s, unsigned int dom_id,
870                  xs_transaction_t tx_id, const char *path, GByteArray *data)
871 {
872     /*
873      * The data GByteArray shall exist, and will be freed by caller.
874      * Just g_byte_array_append() to it.
875      */
876     struct walk_op op;
877     XsNode **n;
878     int ret;
879 
880     ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
881     if (ret) {
882         return ret;
883     }
884     op.op_fn = xs_node_get_content;
885     op.op_opaque = data;
886     return xs_node_walk(n, &op);
887 }
888 
889 int xs_impl_write(XenstoreImplState *s, unsigned int dom_id,
890                   xs_transaction_t tx_id, const char *path, GByteArray *data)
891 {
892     /*
893      * The data GByteArray shall exist, will be freed by caller. You are
894      * free to use g_byte_array_steal() and keep the data. Or just ref it.
895      */
896     struct walk_op op;
897     XsNode **n;
898     int ret;
899 
900     ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
901     if (ret) {
902         return ret;
903     }
904     op.op_fn = xs_node_add_content;
905     op.op_opaque = data;
906     op.mutating = true;
907     op.create_dirs = true;
908     return xs_node_walk(n, &op);
909 }
910 
911 int xs_impl_directory(XenstoreImplState *s, unsigned int dom_id,
912                       xs_transaction_t tx_id, const char *path,
913                       uint64_t *gencnt, GList **items)
914 {
915     /*
916      * The items are (char *) to be freed by caller. Although it's consumed
917      * immediately so if you want to change it to (const char *) and keep
918      * them, go ahead and change the caller.
919      */
920     struct walk_op op;
921     XsNode **n;
922     int ret;
923 
924     ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
925     if (ret) {
926         return ret;
927     }
928     op.op_fn = xs_node_directory;
929     op.op_opaque = items;
930     op.op_opaque2 = gencnt;
931     return xs_node_walk(n, &op);
932 }
933 
934 int xs_impl_transaction_start(XenstoreImplState *s, unsigned int dom_id,
935                               xs_transaction_t *tx_id)
936 {
937     XsTransaction *tx;
938 
939     if (*tx_id != XBT_NULL) {
940         return EINVAL;
941     }
942 
943     if (dom_id && s->nr_domu_transactions >= XS_MAX_TRANSACTIONS) {
944         return ENOSPC;
945     }
946 
947     tx = g_new0(XsTransaction, 1);
948 
949     tx->nr_nodes = s->nr_nodes;
950     tx->tx_id = next_tx(s);
951     tx->base_tx = s->root_tx;
952     tx->root = xs_node_ref(s->root);
953     tx->dom_id = dom_id;
954 
955     g_hash_table_insert(s->transactions, GINT_TO_POINTER(tx->tx_id), tx);
956     if (dom_id) {
957         s->nr_domu_transactions++;
958     }
959     *tx_id = tx->tx_id;
960     return 0;
961 }
962 
963 static gboolean tx_commit_walk(gpointer key, gpointer value,
964                                gpointer user_data)
965 {
966     struct walk_op *op = user_data;
967     int path_len = strlen(op->path);
968     int key_len = strlen(key);
969     bool fire_parents = true;
970     XsWatch *watch;
971     XsNode *n = value;
972 
973     if (n->ref != 1) {
974         return false;
975     }
976 
977     if (n->deleted_in_tx) {
978         /*
979          * We fire watches on our parents if we are the *first* node
980          * to be deleted (the topmost one). This matches the behaviour
981          * when deleting in the live tree.
982          */
983         fire_parents = !op->deleted_in_tx;
984 
985         /* Only used on the way down so no need to clear it later */
986         op->deleted_in_tx = true;
987     }
988 
989     assert(key_len + path_len + 2 <= sizeof(op->path));
990     op->path[path_len] = '/';
991     memcpy(op->path + path_len + 1, key, key_len + 1);
992 
993     watch = g_hash_table_lookup(op->s->watches, op->path);
994     if (watch) {
995         op->watches = g_list_append(op->watches, watch);
996     }
997 
998     if (n->children) {
999         g_hash_table_foreach_remove(n->children, tx_commit_walk, op);
1000     }
1001 
1002     if (watch) {
1003         op->watches = g_list_remove(op->watches, watch);
1004     }
1005 
1006     /*
1007      * Don't fire watches if this node was only copied because a
1008      * descendent was changed. The modified_in_tx flag indicates the
1009      * ones which were really changed.
1010      */
1011     if (n->modified_in_tx || n->deleted_in_tx) {
1012         fire_watches(op, fire_parents);
1013         n->modified_in_tx = false;
1014     }
1015     op->path[path_len] = '\0';
1016 
1017     /* Deleted nodes really do get expunged when we commit */
1018     return n->deleted_in_tx;
1019 }
1020 
1021 static int transaction_commit(XenstoreImplState *s, XsTransaction *tx)
1022 {
1023     struct walk_op op;
1024     XsNode **n;
1025     int ret;
1026 
1027     if (s->root_tx != tx->base_tx) {
1028         return EAGAIN;
1029     }
1030     xs_node_unref(s->root);
1031     s->root = tx->root;
1032     tx->root = NULL;
1033     s->root_tx = tx->tx_id;
1034     s->nr_nodes = tx->nr_nodes;
1035 
1036     ret = init_walk_op(s, &op, XBT_NULL, tx->dom_id, "/", &n);
1037     /*
1038      * There are two reasons why init_walk_op() may fail: an invalid tx_id,
1039      * or an invalid path. We pass XBT_NULL and "/", and it cannot fail.
1040      * If it does, the world is broken. And returning 'ret' would be weird
1041      * because the transaction *was* committed, and all this tree walk is
1042      * trying to do is fire the resulting watches on newly-committed nodes.
1043      */
1044     g_assert(!ret);
1045 
1046     op.deleted_in_tx = false;
1047     op.mutating = true;
1048 
1049     /*
1050      * Walk the new root and fire watches on any node which has a
1051      * refcount of one (which is therefore unique to this transaction).
1052      */
1053     if (s->root->children) {
1054         g_hash_table_foreach_remove(s->root->children, tx_commit_walk, &op);
1055     }
1056 
1057     return 0;
1058 }
1059 
1060 int xs_impl_transaction_end(XenstoreImplState *s, unsigned int dom_id,
1061                             xs_transaction_t tx_id, bool commit)
1062 {
1063     int ret = 0;
1064     XsTransaction *tx = g_hash_table_lookup(s->transactions,
1065                                             GINT_TO_POINTER(tx_id));
1066 
1067     if (!tx || tx->dom_id != dom_id) {
1068         return ENOENT;
1069     }
1070 
1071     if (commit) {
1072         ret = transaction_commit(s, tx);
1073     }
1074 
1075     g_hash_table_remove(s->transactions, GINT_TO_POINTER(tx_id));
1076     if (dom_id) {
1077         assert(s->nr_domu_transactions);
1078         s->nr_domu_transactions--;
1079     }
1080     return ret;
1081 }
1082 
1083 int xs_impl_rm(XenstoreImplState *s, unsigned int dom_id,
1084                xs_transaction_t tx_id, const char *path)
1085 {
1086     struct walk_op op;
1087     XsNode **n;
1088     int ret;
1089 
1090     ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
1091     if (ret) {
1092         return ret;
1093     }
1094     op.op_fn = xs_node_rm;
1095     op.mutating = true;
1096     return xs_node_walk(n, &op);
1097 }
1098 
1099 int xs_impl_get_perms(XenstoreImplState *s, unsigned int dom_id,
1100                       xs_transaction_t tx_id, const char *path, GList **perms)
1101 {
1102     struct walk_op op;
1103     XsNode **n;
1104     int ret;
1105 
1106     ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
1107     if (ret) {
1108         return ret;
1109     }
1110     op.op_fn = xs_node_get_perms;
1111     op.op_opaque = perms;
1112     return xs_node_walk(n, &op);
1113 }
1114 
1115 static void is_valid_perm(gpointer data, gpointer user_data)
1116 {
1117     char *perm = data;
1118     bool *valid = user_data;
1119     char letter;
1120     unsigned int dom_id;
1121 
1122     if (!*valid) {
1123         return;
1124     }
1125 
1126     if (sscanf(perm, "%c%u", &letter, &dom_id) != 2) {
1127         *valid = false;
1128         return;
1129     }
1130 
1131     switch (letter) {
1132     case 'n':
1133     case 'r':
1134     case 'w':
1135     case 'b':
1136         break;
1137 
1138     default:
1139         *valid = false;
1140         break;
1141     }
1142 }
1143 
1144 int xs_impl_set_perms(XenstoreImplState *s, unsigned int dom_id,
1145                       xs_transaction_t tx_id, const char *path, GList *perms)
1146 {
1147     struct walk_op op;
1148     XsNode **n;
1149     bool valid = true;
1150     int ret;
1151 
1152     if (!g_list_length(perms)) {
1153         return EINVAL;
1154     }
1155 
1156     g_list_foreach(perms, is_valid_perm, &valid);
1157     if (!valid) {
1158         return EINVAL;
1159     }
1160 
1161     ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
1162     if (ret) {
1163         return ret;
1164     }
1165     op.op_fn = xs_node_set_perms;
1166     op.op_opaque = perms;
1167     op.mutating = true;
1168     return xs_node_walk(n, &op);
1169 }
1170 
1171 static int do_xs_impl_watch(XenstoreImplState *s, unsigned int dom_id,
1172                             const char *path, const char *token,
1173                             xs_impl_watch_fn fn, void *opaque)
1174 
1175 {
1176     char abspath[XENSTORE_ABS_PATH_MAX + 1];
1177     XsWatch *w, *l;
1178     int ret;
1179 
1180     ret = validate_path(abspath, path, dom_id);
1181     if (ret) {
1182         return ret;
1183     }
1184 
1185     /* Check for duplicates */
1186     l = w = g_hash_table_lookup(s->watches, abspath);
1187     while (w) {
1188         if (!g_strcmp0(token, w->token) &&  opaque == w->cb_opaque &&
1189             fn == w->cb && dom_id == w->dom_id) {
1190             return EEXIST;
1191         }
1192         w = w->next;
1193     }
1194 
1195     if (dom_id && s->nr_domu_watches >= XS_MAX_WATCHES) {
1196         return E2BIG;
1197     }
1198 
1199     w = g_new0(XsWatch, 1);
1200     w->token = g_strdup(token);
1201     w->cb = fn;
1202     w->cb_opaque = opaque;
1203     w->dom_id = dom_id;
1204     w->rel_prefix = strlen(abspath) - strlen(path);
1205 
1206     /* l was looked up above when checking for duplicates */
1207     if (l) {
1208         w->next = l->next;
1209         l->next = w;
1210     } else {
1211         g_hash_table_insert(s->watches, g_strdup(abspath), w);
1212     }
1213     if (dom_id) {
1214         s->nr_domu_watches++;
1215     }
1216 
1217     return 0;
1218 }
1219 
1220 int xs_impl_watch(XenstoreImplState *s, unsigned int dom_id, const char *path,
1221                   const char *token, xs_impl_watch_fn fn, void *opaque)
1222 {
1223     int ret = do_xs_impl_watch(s, dom_id, path, token, fn, opaque);
1224 
1225     if (!ret) {
1226         /* A new watch should fire immediately */
1227         fn(opaque, path, token);
1228     }
1229 
1230     return ret;
1231 }
1232 
1233 static XsWatch *free_watch(XenstoreImplState *s, XsWatch *w)
1234 {
1235     XsWatch *next = w->next;
1236 
1237     if (w->dom_id) {
1238         assert(s->nr_domu_watches);
1239         s->nr_domu_watches--;
1240     }
1241 
1242     g_free(w->token);
1243     g_free(w);
1244 
1245     return next;
1246 }
1247 
1248 int xs_impl_unwatch(XenstoreImplState *s, unsigned int dom_id,
1249                     const char *path, const char *token,
1250                     xs_impl_watch_fn fn, void *opaque)
1251 {
1252     char abspath[XENSTORE_ABS_PATH_MAX + 1];
1253     XsWatch *w, **l;
1254     int ret;
1255 
1256     ret = validate_path(abspath, path, dom_id);
1257     if (ret) {
1258         return ret;
1259     }
1260 
1261     w = g_hash_table_lookup(s->watches, abspath);
1262     if (!w) {
1263         return ENOENT;
1264     }
1265 
1266     /*
1267      * The hash table contains the first element of a list of
1268      * watches. Removing the first element in the list is a
1269      * special case because we have to update the hash table to
1270      * point to the next (or remove it if there's nothing left).
1271      */
1272     if (!g_strcmp0(token, w->token) && fn == w->cb && opaque == w->cb_opaque &&
1273         dom_id == w->dom_id) {
1274         if (w->next) {
1275             /* Insert the previous 'next' into the hash table */
1276             g_hash_table_insert(s->watches, g_strdup(abspath), w->next);
1277         } else {
1278             /* Nothing left; remove from hash table */
1279             g_hash_table_remove(s->watches, abspath);
1280         }
1281         free_watch(s, w);
1282         return 0;
1283     }
1284 
1285     /*
1286      * We're all done messing with the hash table because the element
1287      * it points to has survived the cull. Now it's just a simple
1288      * linked list removal operation.
1289      */
1290     for (l = &w->next; *l; l = &w->next) {
1291         w = *l;
1292 
1293         if (!g_strcmp0(token, w->token) && fn == w->cb &&
1294             opaque != w->cb_opaque && dom_id == w->dom_id) {
1295             *l = free_watch(s, w);
1296             return 0;
1297         }
1298     }
1299 
1300     return ENOENT;
1301 }
1302 
1303 int xs_impl_reset_watches(XenstoreImplState *s, unsigned int dom_id)
1304 {
1305     char **watch_paths;
1306     guint nr_watch_paths;
1307     guint i;
1308 
1309     watch_paths = (char **)g_hash_table_get_keys_as_array(s->watches,
1310                                                           &nr_watch_paths);
1311 
1312     for (i = 0; i < nr_watch_paths; i++) {
1313         XsWatch *w1 = g_hash_table_lookup(s->watches, watch_paths[i]);
1314         XsWatch *w2, *w, **l;
1315 
1316         /*
1317          * w1 is the original list. The hash table has this pointer.
1318          * w2 is the head of our newly-filtered list.
1319          * w and l are temporary for processing. w is somewhat redundant
1320          * with *l but makes my eyes bleed less.
1321          */
1322 
1323         w = w2 = w1;
1324         l = &w;
1325         while (w) {
1326             if (w->dom_id == dom_id) {
1327                 /* If we're freeing the head of the list, bump w2 */
1328                 if (w2 == w) {
1329                     w2 = w->next;
1330                 }
1331                 *l = free_watch(s, w);
1332             } else {
1333                 l = &w->next;
1334             }
1335             w = *l;
1336         }
1337         /*
1338          * If the head of the list survived the cull, we don't need to
1339          * touch the hash table and we're done with this path. Else...
1340          */
1341         if (w1 != w2) {
1342             g_hash_table_steal(s->watches, watch_paths[i]);
1343 
1344             /*
1345              * It was already freed. (Don't worry, this whole thing is
1346              * single-threaded and nobody saw it in the meantime). And
1347              * having *stolen* it, we now own the watch_paths[i] string
1348              * so if we don't give it back to the hash table, we need
1349              * to free it.
1350              */
1351             if (w2) {
1352                 g_hash_table_insert(s->watches, watch_paths[i], w2);
1353             } else {
1354                 g_free(watch_paths[i]);
1355             }
1356         }
1357     }
1358     g_free(watch_paths);
1359     return 0;
1360 }
1361 
1362 static void xs_tx_free(void *_tx)
1363 {
1364     XsTransaction *tx = _tx;
1365     if (tx->root) {
1366         xs_node_unref(tx->root);
1367     }
1368     g_free(tx);
1369 }
1370 
1371 XenstoreImplState *xs_impl_create(unsigned int dom_id)
1372 {
1373     XenstoreImplState *s = g_new0(XenstoreImplState, 1);
1374     GList *perms;
1375 
1376     s->watches = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, NULL);
1377     s->transactions = g_hash_table_new_full(g_direct_hash, g_direct_equal,
1378                                             NULL, xs_tx_free);
1379 
1380     perms = g_list_append(NULL, xs_perm_as_string(XS_PERM_NONE, 0));
1381     s->root = xs_node_create("/", perms);
1382     g_list_free_full(perms, g_free);
1383     s->nr_nodes = 1;
1384 
1385     s->root_tx = s->last_tx = 1;
1386     return s;
1387 }
1388 
1389 
1390 static void clear_serialized_tx(gpointer key, gpointer value, gpointer opaque)
1391 {
1392     XsNode *n = value;
1393 
1394     n->serialized_tx = XBT_NULL;
1395     if (n->children) {
1396         g_hash_table_foreach(n->children, clear_serialized_tx, NULL);
1397     }
1398 }
1399 
1400 static void clear_tx_serialized_tx(gpointer key, gpointer value,
1401                                    gpointer opaque)
1402 {
1403     XsTransaction *t = value;
1404 
1405     clear_serialized_tx(NULL, t->root, NULL);
1406 }
1407 
1408 static void write_be32(GByteArray *save, uint32_t val)
1409 {
1410     uint32_t be = htonl(val);
1411     g_byte_array_append(save, (void *)&be, sizeof(be));
1412 }
1413 
1414 
1415 struct save_state {
1416     GByteArray *bytes;
1417     unsigned int tx_id;
1418 };
1419 
1420 #define MODIFIED_IN_TX  (1U << 0)
1421 #define DELETED_IN_TX   (1U << 1)
1422 #define NODE_REF        (1U << 2)
1423 
1424 static void save_node(gpointer key, gpointer value, gpointer opaque)
1425 {
1426     struct save_state *ss = opaque;
1427     XsNode *n = value;
1428     char *name = key;
1429     uint8_t flag = 0;
1430 
1431     /* Child nodes (i.e. anything but the root) have a name */
1432     if (name) {
1433         g_byte_array_append(ss->bytes, key, strlen(key) + 1);
1434     }
1435 
1436     /*
1437      * If we already wrote this node, refer to the previous copy.
1438      * There's no rename/move in XenStore, so all we need to find
1439      * it is the tx_id of the transaction in which it exists. Which
1440      * may be the root tx.
1441      */
1442     if (n->serialized_tx != XBT_NULL) {
1443         flag = NODE_REF;
1444         g_byte_array_append(ss->bytes, &flag, 1);
1445         write_be32(ss->bytes, n->serialized_tx);
1446     } else {
1447         GList *l;
1448         n->serialized_tx = ss->tx_id;
1449 
1450         if (n->modified_in_tx) {
1451             flag |= MODIFIED_IN_TX;
1452         }
1453         if (n->deleted_in_tx) {
1454             flag |= DELETED_IN_TX;
1455         }
1456         g_byte_array_append(ss->bytes, &flag, 1);
1457 
1458         if (n->content) {
1459             write_be32(ss->bytes, n->content->len);
1460             g_byte_array_append(ss->bytes, n->content->data, n->content->len);
1461         } else {
1462             write_be32(ss->bytes, 0);
1463         }
1464 
1465         for (l = n->perms; l; l = l->next) {
1466             g_byte_array_append(ss->bytes, l->data, strlen(l->data) + 1);
1467         }
1468         /* NUL termination after perms */
1469         g_byte_array_append(ss->bytes, (void *)"", 1);
1470 
1471         if (n->children) {
1472             g_hash_table_foreach(n->children, save_node, ss);
1473         }
1474         /* NUL termination after children (child name is NUL) */
1475         g_byte_array_append(ss->bytes, (void *)"", 1);
1476     }
1477 }
1478 
1479 static void save_tree(struct save_state *ss, uint32_t tx_id, XsNode *root)
1480 {
1481     write_be32(ss->bytes, tx_id);
1482     ss->tx_id = tx_id;
1483     save_node(NULL, root, ss);
1484 }
1485 
1486 static void save_tx(gpointer key, gpointer value, gpointer opaque)
1487 {
1488     uint32_t tx_id = GPOINTER_TO_INT(key);
1489     struct save_state *ss = opaque;
1490     XsTransaction *n = value;
1491 
1492     write_be32(ss->bytes, n->base_tx);
1493     write_be32(ss->bytes, n->dom_id);
1494 
1495     save_tree(ss, tx_id, n->root);
1496 }
1497 
1498 static void save_watch(gpointer key, gpointer value, gpointer opaque)
1499 {
1500     struct save_state *ss = opaque;
1501     XsWatch *w = value;
1502 
1503     /* We only save the *guest* watches. */
1504     if (w->dom_id) {
1505         gpointer relpath = key + w->rel_prefix;
1506         g_byte_array_append(ss->bytes, relpath, strlen(relpath) + 1);
1507         g_byte_array_append(ss->bytes, (void *)w->token, strlen(w->token) + 1);
1508     }
1509 }
1510 
1511 GByteArray *xs_impl_serialize(XenstoreImplState *s)
1512 {
1513     struct save_state ss;
1514 
1515     ss.bytes = g_byte_array_new();
1516 
1517     /*
1518      * node = flags [ real_node / node_ref ]
1519      *   flags = uint8_t (MODIFIED_IN_TX | DELETED_IN_TX | NODE_REF)
1520      *   node_ref = tx_id (in which the original version of this node exists)
1521      *   real_node = content perms child* NUL
1522      *     content = len data
1523      *       len = uint32_t
1524      *       data = uint8_t{len}
1525      *     perms = perm* NUL
1526      *       perm = asciiz
1527      *   child = name node
1528      *     name = asciiz
1529      *
1530      * tree = tx_id node
1531      *   tx_id = uint32_t
1532      *
1533      * transaction = base_tx_id dom_id tree
1534      *   base_tx_id = uint32_t
1535      *   dom_id = uint32_t
1536      *
1537      * tx_list = tree transaction* XBT_NULL
1538      *
1539      * watch = path token
1540      *   path = asciiz
1541      *   token = asciiz
1542      *
1543      * watch_list = watch* NUL
1544      *
1545      * xs_serialize_stream = last_tx tx_list watch_list
1546      *   last_tx = uint32_t
1547      */
1548 
1549     /* Clear serialized_tx in every node. */
1550     if (s->serialized) {
1551         clear_serialized_tx(NULL, s->root, NULL);
1552         g_hash_table_foreach(s->transactions, clear_tx_serialized_tx, NULL);
1553     }
1554 
1555     s->serialized = true;
1556 
1557     write_be32(ss.bytes, s->last_tx);
1558     save_tree(&ss, s->root_tx, s->root);
1559     g_hash_table_foreach(s->transactions, save_tx, &ss);
1560 
1561     write_be32(ss.bytes, XBT_NULL);
1562 
1563     g_hash_table_foreach(s->watches, save_watch, &ss);
1564     g_byte_array_append(ss.bytes, (void *)"", 1);
1565 
1566     return ss.bytes;
1567 }
1568 
1569 struct unsave_state {
1570     char path[XENSTORE_ABS_PATH_MAX + 1];
1571     XenstoreImplState *s;
1572     GByteArray *bytes;
1573     uint8_t *d;
1574     size_t l;
1575     bool root_walk;
1576 };
1577 
1578 static int consume_be32(struct unsave_state *us, unsigned int *val)
1579 {
1580     uint32_t d;
1581 
1582     if (us->l < sizeof(d)) {
1583         return -EINVAL;
1584     }
1585     memcpy(&d, us->d, sizeof(d));
1586     *val = ntohl(d);
1587     us->d += sizeof(d);
1588     us->l -= sizeof(d);
1589     return 0;
1590 }
1591 
1592 static int consume_string(struct unsave_state *us, char **str, size_t *len)
1593 {
1594     size_t l;
1595 
1596     if (!us->l) {
1597         return -EINVAL;
1598     }
1599 
1600     l = strnlen((void *)us->d, us->l);
1601     if (l == us->l) {
1602         return -EINVAL;
1603     }
1604 
1605     if (str) {
1606         *str = (void *)us->d;
1607     }
1608     if (len) {
1609         *len = l;
1610     }
1611 
1612     us->d += l + 1;
1613     us->l -= l + 1;
1614     return 0;
1615 }
1616 
1617 static XsNode *lookup_node(XsNode *n, char *path)
1618 {
1619     char *slash = strchr(path, '/');
1620     XsNode *child;
1621 
1622     if (path[0] == '\0') {
1623         return n;
1624     }
1625 
1626     if (slash) {
1627         *slash = '\0';
1628     }
1629 
1630     if (!n->children) {
1631         return NULL;
1632     }
1633     child = g_hash_table_lookup(n->children, path);
1634     if (!slash) {
1635         return child;
1636     }
1637 
1638     *slash = '/';
1639     if (!child) {
1640         return NULL;
1641     }
1642     return lookup_node(child, slash + 1);
1643 }
1644 
1645 static XsNode *lookup_tx_node(struct unsave_state *us, unsigned int tx_id)
1646 {
1647     XsTransaction *t;
1648     if (tx_id == us->s->root_tx) {
1649         return lookup_node(us->s->root, us->path + 1);
1650     }
1651 
1652     t = g_hash_table_lookup(us->s->transactions, GINT_TO_POINTER(tx_id));
1653     if (!t) {
1654         return NULL;
1655     }
1656     g_assert(t->root);
1657     return lookup_node(t->root, us->path + 1);
1658 }
1659 
1660 static void count_child_nodes(gpointer key, gpointer value, gpointer user_data)
1661 {
1662     unsigned int *nr_nodes = user_data;
1663     XsNode *n = value;
1664 
1665     (*nr_nodes)++;
1666 
1667     if (n->children) {
1668         g_hash_table_foreach(n->children, count_child_nodes, nr_nodes);
1669     }
1670 }
1671 
1672 static int consume_node(struct unsave_state *us, XsNode **nodep,
1673                         unsigned int *nr_nodes)
1674 {
1675     XsNode *n = NULL;
1676     uint8_t flags;
1677     int ret;
1678 
1679     if (us->l < 1) {
1680         return -EINVAL;
1681     }
1682     flags = us->d[0];
1683     us->d++;
1684     us->l--;
1685 
1686     if (flags == NODE_REF) {
1687         unsigned int tx;
1688 
1689         ret = consume_be32(us, &tx);
1690         if (ret) {
1691             return ret;
1692         }
1693 
1694         n = lookup_tx_node(us, tx);
1695         if (!n) {
1696             return -EINVAL;
1697         }
1698         n->ref++;
1699         if (n->children) {
1700             g_hash_table_foreach(n->children, count_child_nodes, nr_nodes);
1701         }
1702     } else {
1703         uint32_t datalen;
1704 
1705         if (flags & ~(DELETED_IN_TX | MODIFIED_IN_TX)) {
1706             return -EINVAL;
1707         }
1708         n = xs_node_new();
1709 
1710         if (flags & DELETED_IN_TX) {
1711             n->deleted_in_tx = true;
1712         }
1713         if (flags & MODIFIED_IN_TX) {
1714             n->modified_in_tx = true;
1715         }
1716         ret = consume_be32(us, &datalen);
1717         if (ret) {
1718             xs_node_unref(n);
1719             return -EINVAL;
1720         }
1721         if (datalen) {
1722             if (datalen > us->l) {
1723                 xs_node_unref(n);
1724                 return -EINVAL;
1725             }
1726 
1727             GByteArray *node_data = g_byte_array_new();
1728             g_byte_array_append(node_data, us->d, datalen);
1729             us->d += datalen;
1730             us->l -= datalen;
1731             n->content = node_data;
1732 
1733             if (us->root_walk) {
1734                 n->modified_in_tx = true;
1735             }
1736         }
1737         while (1) {
1738             char *perm = NULL;
1739             size_t permlen = 0;
1740 
1741             ret = consume_string(us, &perm, &permlen);
1742             if (ret) {
1743                 xs_node_unref(n);
1744                 return ret;
1745             }
1746 
1747             if (!permlen) {
1748                 break;
1749             }
1750 
1751             n->perms = g_list_append(n->perms, g_strdup(perm));
1752         }
1753 
1754         /* Now children */
1755         while (1) {
1756             size_t childlen;
1757             char *childname;
1758             char *pathend;
1759             XsNode *child = NULL;
1760 
1761             ret = consume_string(us, &childname, &childlen);
1762             if (ret) {
1763                 xs_node_unref(n);
1764                 return ret;
1765             }
1766 
1767             if (!childlen) {
1768                 break;
1769             }
1770 
1771             pathend = us->path + strlen(us->path);
1772             strncat(us->path, "/", sizeof(us->path) - 1);
1773             strncat(us->path, childname, sizeof(us->path) - 1);
1774 
1775             ret = consume_node(us, &child, nr_nodes);
1776             *pathend = '\0';
1777             if (ret) {
1778                 xs_node_unref(n);
1779                 return ret;
1780             }
1781             g_assert(child);
1782             xs_node_add_child(n, childname, child);
1783         }
1784 
1785         /*
1786          * If the node has no data and no children we still want to fire
1787          * a watch on it.
1788          */
1789         if (us->root_walk && !n->children) {
1790             n->modified_in_tx = true;
1791         }
1792     }
1793 
1794     if (!n->deleted_in_tx) {
1795         (*nr_nodes)++;
1796     }
1797 
1798     *nodep = n;
1799     return 0;
1800 }
1801 
1802 static int consume_tree(struct unsave_state *us, XsTransaction *t)
1803 {
1804     int ret;
1805 
1806     ret = consume_be32(us, &t->tx_id);
1807     if (ret) {
1808         return ret;
1809     }
1810 
1811     if (t->tx_id > us->s->last_tx) {
1812         return -EINVAL;
1813     }
1814 
1815     us->path[0] = '\0';
1816 
1817     return consume_node(us, &t->root, &t->nr_nodes);
1818 }
1819 
1820 int xs_impl_deserialize(XenstoreImplState *s, GByteArray *bytes,
1821                         unsigned int dom_id, xs_impl_watch_fn watch_fn,
1822                         void *watch_opaque)
1823 {
1824     struct unsave_state us;
1825     XsTransaction base_t = { 0 };
1826     int ret;
1827 
1828     us.s = s;
1829     us.bytes = bytes;
1830     us.d = bytes->data;
1831     us.l = bytes->len;
1832 
1833     xs_impl_reset_watches(s, dom_id);
1834     g_hash_table_remove_all(s->transactions);
1835 
1836     xs_node_unref(s->root);
1837     s->root = NULL;
1838     s->root_tx = s->last_tx = XBT_NULL;
1839 
1840     ret = consume_be32(&us, &s->last_tx);
1841     if (ret) {
1842         return ret;
1843     }
1844 
1845     /*
1846      * Consume the base tree into a transaction so that watches can be
1847      * fired as we commit it. By setting us.root_walk we cause the nodes
1848      * to be marked as 'modified_in_tx' as they are created, so that the
1849      * watches are triggered on them.
1850      */
1851     base_t.dom_id = dom_id;
1852     base_t.base_tx = XBT_NULL;
1853     us.root_walk = true;
1854     ret = consume_tree(&us, &base_t);
1855     if (ret) {
1856         return ret;
1857     }
1858     us.root_walk = false;
1859 
1860     /*
1861      * Commit the transaction now while the refcount on all nodes is 1.
1862      * Note that we haven't yet reinstated the *guest* watches but that's
1863      * OK because we don't want the guest to see any changes. Even any
1864      * backend nodes which get recreated should be *precisely* as they
1865      * were before the migration. Back ends may have been instantiated
1866      * already, and will see the frontend magically blink into existence
1867      * now (well, from the aio_bh which fires the watches). It's their
1868      * responsibility to rebuild everything precisely as it was before.
1869      */
1870     ret = transaction_commit(s, &base_t);
1871     if (ret) {
1872         return ret;
1873     }
1874 
1875     while (1) {
1876         unsigned int base_tx;
1877         XsTransaction *t;
1878 
1879         ret = consume_be32(&us, &base_tx);
1880         if (ret) {
1881             return ret;
1882         }
1883         if (base_tx == XBT_NULL) {
1884             break;
1885         }
1886 
1887         t = g_new0(XsTransaction, 1);
1888         t->base_tx = base_tx;
1889 
1890         ret = consume_be32(&us, &t->dom_id);
1891         if (!ret) {
1892             ret = consume_tree(&us, t);
1893         }
1894         if (ret) {
1895             g_free(t);
1896             return ret;
1897         }
1898         g_assert(t->root);
1899         if (t->dom_id) {
1900             s->nr_domu_transactions++;
1901         }
1902         g_hash_table_insert(s->transactions, GINT_TO_POINTER(t->tx_id), t);
1903     }
1904 
1905     while (1) {
1906         char *path, *token;
1907         size_t pathlen, toklen;
1908 
1909         ret = consume_string(&us, &path, &pathlen);
1910         if (ret) {
1911             return ret;
1912         }
1913         if (!pathlen) {
1914             break;
1915         }
1916 
1917         ret = consume_string(&us, &token, &toklen);
1918         if (ret) {
1919             return ret;
1920         }
1921 
1922         if (!watch_fn) {
1923             continue;
1924         }
1925 
1926         ret = do_xs_impl_watch(s, dom_id, path, token, watch_fn, watch_opaque);
1927         if (ret) {
1928             return ret;
1929         }
1930     }
1931 
1932     if (us.l) {
1933         return -EINVAL;
1934     }
1935 
1936     return 0;
1937 }
1938