xref: /openbmc/u-boot/fs/zfs/zfs.c (revision 13bdce8f8cadf07bc81d7000a04e48f3028de543)
1  /*
2   *
3   * ZFS filesystem ported to u-boot by
4   * Jorgen Lundman <lundman at lundman.net>
5   *
6   *	GRUB  --  GRand Unified Bootloader
7   *	Copyright (C) 1999,2000,2001,2002,2003,2004
8   *	Free Software Foundation, Inc.
9   *	Copyright 2004	Sun Microsystems, Inc.
10   *
11   * SPDX-License-Identifier:	GPL-2.0+
12   */
13  
14  #include <common.h>
15  #include <malloc.h>
16  #include <linux/stat.h>
17  #include <linux/time.h>
18  #include <linux/ctype.h>
19  #include <asm/byteorder.h>
20  #include "zfs_common.h"
21  #include "div64.h"
22  
/*
 * Block device descriptor pointer for the ZFS code.  Nothing in the visible
 * part of this file reads or writes it; presumably it is set by whoever
 * selects the device to read from -- TODO confirm against the callers.
 */
struct blk_desc *zfs_dev_desc;
24  
25  /*
26   * The zfs plug-in routines for GRUB are:
27   *
28   * zfs_mount() - locates a valid uberblock of the root pool and reads
29   *		in its MOS at the memory address MOS.
30   *
31   * zfs_open() - locates a plain file object by following the MOS
32   *		and places its dnode at the memory address DNODE.
33   *
34   * zfs_read() - read in the data blocks pointed by the DNODE.
35   *
36   */
37  
38  #include <zfs/zfs.h>
39  #include <zfs/zio.h>
40  #include <zfs/dnode.h>
41  #include <zfs/uberblock_impl.h>
42  #include <zfs/vdev_impl.h>
43  #include <zfs/zio_checksum.h>
44  #include <zfs/zap_impl.h>
45  #include <zfs/zap_leaf.h>
46  #include <zfs/zfs_znode.h>
47  #include <zfs/dmu.h>
48  #include <zfs/dmu_objset.h>
49  #include <zfs/sa_impl.h>
50  #include <zfs/dsl_dir.h>
51  #include <zfs/dsl_dataset.h>
52  
53  
/* Pool property name string "bootfs". */
#define	ZPOOL_PROP_BOOTFS	"bootfs"


/*
 * Constants for nvlist manipulation (taken from nvpair.h).
 */

/* nvlist encoding methods */
#define	NV_ENCODE_NATIVE	0
#define	NV_ENCODE_XDR		1

/* nvlist byte-order markers */
#define	NV_BIG_ENDIAN		0
#define	NV_LITTLE_ENDIAN	1

/* nvpair data type codes */
#define	DATA_TYPE_UINT64	8
#define	DATA_TYPE_STRING	9
#define	DATA_TYPE_NVLIST	19
#define	DATA_TYPE_NVLIST_ARRAY	20
68  
69  
/*
 * Macros to get fields in a bp or DVA.
 */

/* Offset of x within its align-sized window (align must be a power of two). */
#define	P2PHASE(x, align)		((x) & ((align) - 1))

/*
 * Convert a DVA byte offset into a sector number: add
 * VDEV_LABEL_START_SIZE, then shift right by SPA_MINBLOCKSHIFT.
 * The argument is parenthesized so that compound expressions expand
 * correctly.
 */
#define	DVA_OFFSET_TO_PHYS_SECTOR(offset)					\
	(((offset) + VDEV_LABEL_START_SIZE) >> SPA_MINBLOCKSHIFT)

/*
 * return x rounded down to an align boundary
 * eg, P2ALIGN(1200, 1024) == 1024 (1*align)
 * eg, P2ALIGN(1024, 1024) == 1024 (1*align)
 * eg, P2ALIGN(0x1234, 0x100) == 0x1200 (0x12*align)
 * eg, P2ALIGN(0x5600, 0x100) == 0x5600 (0x56*align)
 */
#define	P2ALIGN(x, align)		((x) & -(align))
85  
/*
 * FAT ZAP data structures
 */
#define	ZFS_CRC64_POLY 0xC96C5795D7870F42ULL	/* ECMA-182, reflected form */
/* Top n bits of a 64-bit hash; n == 0 yields 0 instead of shifting by 64. */
#define	ZAP_HASH_IDX(hash, n)	(((n) == 0) ? 0 : ((hash) >> (64 - (n))))
#define	CHAIN_END	0xffff	/* end of the chunk chain */

/*
 * The amount of space within the chunk available for the array is:
 * chunk size - space for type (1) - space for next pointer (2)
 */
#define	ZAP_LEAF_ARRAY_BYTES (ZAP_LEAF_CHUNKSIZE - 3)

/* bs is the log2 of the leaf block size in bytes. */
#define	ZAP_LEAF_HASH_SHIFT(bs)	((bs) - 5)
#define	ZAP_LEAF_HASH_NUMENTRIES(bs) (1 << ZAP_LEAF_HASH_SHIFT(bs))
/*
 * Index into the leaf hash table for hash h.
 * NOTE: this macro reads l->l_hdr.lh_prefix_len from a variable named 'l'
 * that must exist in the caller's scope; it is not a macro parameter.
 */
#define	LEAF_HASH(bs, h)												\
	((ZAP_LEAF_HASH_NUMENTRIES(bs) - 1) &								\
	 ((h) >> (64 - ZAP_LEAF_HASH_SHIFT(bs) - l->l_hdr.lh_prefix_len)))

/*
 * The amount of space available for chunks is:
 * block size shift - hash entry size (2) * number of hash
 * entries - header space (2*chunksize)
 */
#define	ZAP_LEAF_NUMCHUNKS(bs)								\
	(((1 << (bs)) - 2 * ZAP_LEAF_HASH_NUMENTRIES(bs)) /	\
	 ZAP_LEAF_CHUNKSIZE - 2)

/*
 * The chunks start immediately after the hash table.  The end of the
 * hash table is at l_hash + HASH_NUMENTRIES, which we simply cast to a
 * chunk_t.  The expansion is an lvalue, so callers may take its address
 * or select a member from it.
 */
#define	ZAP_LEAF_CHUNK(l, bs, idx)										\
	(((zap_leaf_chunk_t *)((l)->l_hash + ZAP_LEAF_HASH_NUMENTRIES(bs)))[idx])
#define	ZAP_LEAF_ENTRY(l, bs, idx) (&ZAP_LEAF_CHUNK(l, bs, idx).l_entry)
122  
123  
/*
 * Decompression Entry - lzjb
 */

/* Number of bits per byte, unless a header already provided it. */
#if !defined(NBBY)
#define	NBBY	8
#endif



/*
 * Signature shared by all decompression routines: source buffer,
 * destination buffer, source length, destination length.
 */
typedef int zfs_decomp_func_t(void *s_start, void *d_start,
							  uint32_t s_len, uint32_t d_len);

/* One row of the decompression table: algorithm name plus its routine. */
typedef struct decomp_entry {
	char *name;						/* algorithm name */
	zfs_decomp_func_t *decomp_func;	/* routine, or NULL when there is none */
} decomp_entry_t;
139  
140  typedef struct dnode_end {
141  	dnode_phys_t dn;
142  	zfs_endian_t endian;
143  } dnode_end_t;
144  
145  struct zfs_data {
146  	/* cache for a file block of the currently zfs_open()-ed file */
147  	char *file_buf;
148  	uint64_t file_start;
149  	uint64_t file_end;
150  
151  	/* XXX: ashift is per vdev, not per pool.  We currently only ever touch
152  	 * a single vdev, but when/if raid-z or stripes are supported, this
153  	 * may need revision.
154  	 */
155  	uint64_t vdev_ashift;
156  	uint64_t label_txg;
157  	uint64_t pool_guid;
158  
159  	/* cache for a dnode block */
160  	dnode_phys_t *dnode_buf;
161  	dnode_phys_t *dnode_mdn;
162  	uint64_t dnode_start;
163  	uint64_t dnode_end;
164  	zfs_endian_t dnode_endian;
165  
166  	uberblock_t current_uberblock;
167  
168  	dnode_end_t mos;
169  	dnode_end_t mdn;
170  	dnode_end_t dnode;
171  
172  	uint64_t vdev_phys_sector;
173  
174  	int (*userhook)(const char *, const struct zfs_dirhook_info *);
175  	struct zfs_dirhook_info *dirinfo;
176  
177  };
178  
179  
180  
181  
182  static int
183  zlib_decompress(void *s, void *d,
184  				uint32_t slen, uint32_t dlen)
185  {
186  	if (zlib_decompress(s, d, slen, dlen) < 0)
187  		return ZFS_ERR_BAD_FS;
188  	return ZFS_ERR_NONE;
189  }
190  
191  static decomp_entry_t decomp_table[ZIO_COMPRESS_FUNCTIONS] = {
192  	{"inherit", NULL},		/* ZIO_COMPRESS_INHERIT */
193  	{"on", lzjb_decompress},	/* ZIO_COMPRESS_ON */
194  	{"off", NULL},		/* ZIO_COMPRESS_OFF */
195  	{"lzjb", lzjb_decompress},	/* ZIO_COMPRESS_LZJB */
196  	{"empty", NULL},		/* ZIO_COMPRESS_EMPTY */
197  	{"gzip-1", zlib_decompress},  /* ZIO_COMPRESS_GZIP1 */
198  	{"gzip-2", zlib_decompress},  /* ZIO_COMPRESS_GZIP2 */
199  	{"gzip-3", zlib_decompress},  /* ZIO_COMPRESS_GZIP3 */
200  	{"gzip-4", zlib_decompress},  /* ZIO_COMPRESS_GZIP4 */
201  	{"gzip-5", zlib_decompress},  /* ZIO_COMPRESS_GZIP5 */
202  	{"gzip-6", zlib_decompress},  /* ZIO_COMPRESS_GZIP6 */
203  	{"gzip-7", zlib_decompress},  /* ZIO_COMPRESS_GZIP7 */
204  	{"gzip-8", zlib_decompress},  /* ZIO_COMPRESS_GZIP8 */
205  	{"gzip-9", zlib_decompress},  /* ZIO_COMPRESS_GZIP9 */
206  };
207  
208  
209  
210  static int zio_read_data(blkptr_t *bp, zfs_endian_t endian,
211  						 void *buf, struct zfs_data *data);
212  
213  static int
214  zio_read(blkptr_t *bp, zfs_endian_t endian, void **buf,
215  		 size_t *size, struct zfs_data *data);
216  
/*
 * Integer base-2 logarithm, rounded down (same thing as highbit()-1).
 * Both 0 and 1 yield 0.
 */
static int
zfs_log2(uint64_t num)
{
	int bits;

	/* Count how many times num can be halved before it reaches 1 (or 0). */
	for (bits = 0; num > 1; num >>= 1)
		bits++;

	return bits;
}
232  
233  
234  /* Checksum Functions */
235  static void
236  zio_checksum_off(const void *buf __attribute__ ((unused)),
237  				 uint64_t size __attribute__ ((unused)),
238  				 zfs_endian_t endian __attribute__ ((unused)),
239  				 zio_cksum_t *zcp)
240  {
241  	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
242  }
243  
244  /* Checksum Table and Values */
245  static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
246  	{NULL, 0, 0, "inherit"},
247  	{NULL, 0, 0, "on"},
248  	{zio_checksum_off, 0, 0, "off"},
249  	{zio_checksum_SHA256, 1, 1, "label"},
250  	{zio_checksum_SHA256, 1, 1, "gang_header"},
251  	{NULL, 0, 0, "zilog"},
252  	{fletcher_2_endian, 0, 0, "fletcher2"},
253  	{fletcher_4_endian, 1, 0, "fletcher4"},
254  	{zio_checksum_SHA256, 1, 0, "SHA256"},
255  	{NULL, 0, 0, "zilog2"},
256  };
257  
258  /*
259   * zio_checksum_verify: Provides support for checksum verification.
260   *
261   * Fletcher2, Fletcher4, and SHA256 are supported.
262   *
263   */
264  static int
265  zio_checksum_verify(zio_cksum_t zc, uint32_t checksum,
266  					zfs_endian_t endian, char *buf, int size)
267  {
268  	zio_eck_t *zec = (zio_eck_t *) (buf + size) - 1;
269  	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
270  	zio_cksum_t actual_cksum, expected_cksum;
271  
272  	if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func == NULL) {
273  		printf("zfs unknown checksum function %d\n", checksum);
274  		return ZFS_ERR_NOT_IMPLEMENTED_YET;
275  	}
276  
277  	if (ci->ci_eck) {
278  		expected_cksum = zec->zec_cksum;
279  		zec->zec_cksum = zc;
280  		ci->ci_func(buf, size, endian, &actual_cksum);
281  		zec->zec_cksum = expected_cksum;
282  		zc = expected_cksum;
283  	} else {
284  		ci->ci_func(buf, size, endian, &actual_cksum);
285  	}
286  
287  	if ((actual_cksum.zc_word[0] != zc.zc_word[0])
288  		|| (actual_cksum.zc_word[1] != zc.zc_word[1])
289  		|| (actual_cksum.zc_word[2] != zc.zc_word[2])
290  		|| (actual_cksum.zc_word[3] != zc.zc_word[3])) {
291  		return ZFS_ERR_BAD_FS;
292  	}
293  
294  	return ZFS_ERR_NONE;
295  }
296  
297  /*
298   * vdev_uberblock_compare takes two uberblock structures and returns an integer
299   * indicating the more recent of the two.
300   *	Return Value = 1 if ub2 is more recent
301   *	Return Value = -1 if ub1 is more recent
302   * The most recent uberblock is determined using its transaction number and
303   * timestamp.  The uberblock with the highest transaction number is
304   * considered "newer".	If the transaction numbers of the two blocks match, the
305   * timestamps are compared to determine the "newer" of the two.
306   */
307  static int
308  vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2)
309  {
310  	zfs_endian_t ub1_endian, ub2_endian;
311  	if (zfs_to_cpu64(ub1->ub_magic, LITTLE_ENDIAN) == UBERBLOCK_MAGIC)
312  		ub1_endian = LITTLE_ENDIAN;
313  	else
314  		ub1_endian = BIG_ENDIAN;
315  	if (zfs_to_cpu64(ub2->ub_magic, LITTLE_ENDIAN) == UBERBLOCK_MAGIC)
316  		ub2_endian = LITTLE_ENDIAN;
317  	else
318  		ub2_endian = BIG_ENDIAN;
319  
320  	if (zfs_to_cpu64(ub1->ub_txg, ub1_endian)
321  		< zfs_to_cpu64(ub2->ub_txg, ub2_endian))
322  		return -1;
323  	if (zfs_to_cpu64(ub1->ub_txg, ub1_endian)
324  		> zfs_to_cpu64(ub2->ub_txg, ub2_endian))
325  		return 1;
326  
327  	if (zfs_to_cpu64(ub1->ub_timestamp, ub1_endian)
328  		< zfs_to_cpu64(ub2->ub_timestamp, ub2_endian))
329  		return -1;
330  	if (zfs_to_cpu64(ub1->ub_timestamp, ub1_endian)
331  		> zfs_to_cpu64(ub2->ub_timestamp, ub2_endian))
332  		return 1;
333  
334  	return 0;
335  }
336  
337  /*
338   * Three pieces of information are needed to verify an uberblock: the magic
339   * number, the version number, and the checksum.
340   *
341   * Currently Implemented: version number, magic number, label txg
342   * Need to Implement: checksum
343   *
344   */
345  static int
346  uberblock_verify(uberblock_t *uber, int offset, struct zfs_data *data)
347  {
348  	int err;
349  	zfs_endian_t endian = UNKNOWN_ENDIAN;
350  	zio_cksum_t zc;
351  
352  	if (uber->ub_txg < data->label_txg) {
353  		debug("ignoring partially written label: uber_txg < label_txg %llu %llu\n",
354  			  uber->ub_txg, data->label_txg);
355  		return ZFS_ERR_BAD_FS;
356  	}
357  
358  	if (zfs_to_cpu64(uber->ub_magic, LITTLE_ENDIAN) == UBERBLOCK_MAGIC
359  		&& zfs_to_cpu64(uber->ub_version, LITTLE_ENDIAN) > 0
360  		&& zfs_to_cpu64(uber->ub_version, LITTLE_ENDIAN) <= SPA_VERSION)
361  		endian = LITTLE_ENDIAN;
362  
363  	if (zfs_to_cpu64(uber->ub_magic, BIG_ENDIAN) == UBERBLOCK_MAGIC
364  		&& zfs_to_cpu64(uber->ub_version, BIG_ENDIAN) > 0
365  		&& zfs_to_cpu64(uber->ub_version, BIG_ENDIAN) <= SPA_VERSION)
366  		endian = BIG_ENDIAN;
367  
368  	if (endian == UNKNOWN_ENDIAN) {
369  		printf("invalid uberblock magic\n");
370  		return ZFS_ERR_BAD_FS;
371  	}
372  
373  	memset(&zc, 0, sizeof(zc));
374  	zc.zc_word[0] = cpu_to_zfs64(offset, endian);
375  	err = zio_checksum_verify(zc, ZIO_CHECKSUM_LABEL, endian,
376  							  (char *) uber, UBERBLOCK_SIZE(data->vdev_ashift));
377  
378  	if (!err) {
379  		/* Check that the data pointed by the rootbp is usable. */
380  		void *osp = NULL;
381  		size_t ospsize;
382  		err = zio_read(&uber->ub_rootbp, endian, &osp, &ospsize, data);
383  		free(osp);
384  
385  		if (!err && ospsize < OBJSET_PHYS_SIZE_V14) {
386  			printf("uberblock rootbp points to invalid data\n");
387  			return ZFS_ERR_BAD_FS;
388  		}
389  	}
390  
391  	return err;
392  }
393  
394  /*
395   * Find the best uberblock.
396   * Return:
397   *	  Success - Pointer to the best uberblock.
398   *	  Failure - NULL
399   */
400  static uberblock_t *find_bestub(char *ub_array, struct zfs_data *data)
401  {
402  	const uint64_t sector = data->vdev_phys_sector;
403  	uberblock_t *ubbest = NULL;
404  	uberblock_t *ubnext;
405  	unsigned int i, offset, pickedub = 0;
406  	int err = ZFS_ERR_NONE;
407  
408  	const unsigned int UBCOUNT = UBERBLOCK_COUNT(data->vdev_ashift);
409  	const uint64_t UBBYTES = UBERBLOCK_SIZE(data->vdev_ashift);
410  
411  	for (i = 0; i < UBCOUNT; i++) {
412  		ubnext = (uberblock_t *) (i * UBBYTES + ub_array);
413  		offset = (sector << SPA_MINBLOCKSHIFT) + VDEV_PHYS_SIZE + (i * UBBYTES);
414  
415  		err = uberblock_verify(ubnext, offset, data);
416  		if (err)
417  			continue;
418  
419  		if (ubbest == NULL || vdev_uberblock_compare(ubnext, ubbest) > 0) {
420  			ubbest = ubnext;
421  			pickedub = i;
422  		}
423  	}
424  
425  	if (ubbest)
426  		debug("zfs Found best uberblock at idx %d, txg %llu\n",
427  			  pickedub, (unsigned long long) ubbest->ub_txg);
428  
429  	return ubbest;
430  }
431  
432  static inline size_t
433  get_psize(blkptr_t *bp, zfs_endian_t endian)
434  {
435  	return (((zfs_to_cpu64((bp)->blk_prop, endian) >> 16) & 0xffff) + 1)
436  			<< SPA_MINBLOCKSHIFT;
437  }
438  
439  static uint64_t
440  dva_get_offset(dva_t *dva, zfs_endian_t endian)
441  {
442  	return zfs_to_cpu64((dva)->dva_word[1],
443  							 endian) << SPA_MINBLOCKSHIFT;
444  }
445  
446  /*
447   * Read a block of data based on the gang block address dva,
448   * and put its data in buf.
449   *
450   */
451  static int
452  zio_read_gang(blkptr_t *bp, zfs_endian_t endian, dva_t *dva, void *buf,
453  			  struct zfs_data *data)
454  {
455  	zio_gbh_phys_t *zio_gb;
456  	uint64_t offset, sector;
457  	unsigned i;
458  	int err;
459  	zio_cksum_t zc;
460  
461  	memset(&zc, 0, sizeof(zc));
462  
463  	zio_gb = malloc(SPA_GANGBLOCKSIZE);
464  	if (!zio_gb)
465  		return ZFS_ERR_OUT_OF_MEMORY;
466  
467  	offset = dva_get_offset(dva, endian);
468  	sector = DVA_OFFSET_TO_PHYS_SECTOR(offset);
469  
470  	/* read in the gang block header */
471  	err = zfs_devread(sector, 0, SPA_GANGBLOCKSIZE, (char *) zio_gb);
472  
473  	if (err) {
474  		free(zio_gb);
475  		return err;
476  	}
477  
478  	/* XXX */
479  	/* self checksuming the gang block header */
480  	ZIO_SET_CHECKSUM(&zc, DVA_GET_VDEV(dva),
481  					 dva_get_offset(dva, endian), bp->blk_birth, 0);
482  	err = zio_checksum_verify(zc, ZIO_CHECKSUM_GANG_HEADER, endian,
483  							  (char *) zio_gb, SPA_GANGBLOCKSIZE);
484  	if (err) {
485  		free(zio_gb);
486  		return err;
487  	}
488  
489  	endian = (zfs_to_cpu64(bp->blk_prop, endian) >> 63) & 1;
490  
491  	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
492  		if (zio_gb->zg_blkptr[i].blk_birth == 0)
493  			continue;
494  
495  		err = zio_read_data(&zio_gb->zg_blkptr[i], endian, buf, data);
496  		if (err) {
497  			free(zio_gb);
498  			return err;
499  		}
500  		buf = (char *) buf + get_psize(&zio_gb->zg_blkptr[i], endian);
501  	}
502  	free(zio_gb);
503  	return ZFS_ERR_NONE;
504  }
505  
506  /*
507   * Read in a block of raw data to buf.
508   */
509  static int
510  zio_read_data(blkptr_t *bp, zfs_endian_t endian, void *buf,
511  			  struct zfs_data *data)
512  {
513  	int i, psize;
514  	int err = ZFS_ERR_NONE;
515  
516  	psize = get_psize(bp, endian);
517  
518  	/* pick a good dva from the block pointer */
519  	for (i = 0; i < SPA_DVAS_PER_BP; i++) {
520  		uint64_t offset, sector;
521  
522  		if (bp->blk_dva[i].dva_word[0] == 0 && bp->blk_dva[i].dva_word[1] == 0)
523  			continue;
524  
525  		if ((zfs_to_cpu64(bp->blk_dva[i].dva_word[1], endian)>>63) & 1) {
526  			err = zio_read_gang(bp, endian, &bp->blk_dva[i], buf, data);
527  		} else {
528  			/* read in a data block */
529  			offset = dva_get_offset(&bp->blk_dva[i], endian);
530  			sector = DVA_OFFSET_TO_PHYS_SECTOR(offset);
531  
532  			err = zfs_devread(sector, 0, psize, buf);
533  		}
534  
535  		if (!err) {
536  			/*Check the underlying checksum before we rule this DVA as "good"*/
537  			uint32_t checkalgo = (zfs_to_cpu64((bp)->blk_prop, endian) >> 40) & 0xff;
538  
539  			err = zio_checksum_verify(bp->blk_cksum, checkalgo, endian, buf, psize);
540  			if (!err)
541  				return ZFS_ERR_NONE;
542  		}
543  
544  		/* If read failed or checksum bad, reset the error.	 Hopefully we've got some more DVA's to try.*/
545  	}
546  
547  	if (!err) {
548  		printf("couldn't find a valid DVA\n");
549  		err = ZFS_ERR_BAD_FS;
550  	}
551  
552  	return err;
553  }
554  
555  /*
556   * Read in a block of data, verify its checksum, decompress if needed,
557   * and put the uncompressed data in buf.
558   */
559  static int
560  zio_read(blkptr_t *bp, zfs_endian_t endian, void **buf,
561  		 size_t *size, struct zfs_data *data)
562  {
563  	size_t lsize, psize;
564  	unsigned int comp;
565  	char *compbuf = NULL;
566  	int err;
567  
568  	*buf = NULL;
569  
570  	comp = (zfs_to_cpu64((bp)->blk_prop, endian)>>32) & 0xff;
571  	lsize = (BP_IS_HOLE(bp) ? 0 :
572  			 (((zfs_to_cpu64((bp)->blk_prop, endian) & 0xffff) + 1)
573  			  << SPA_MINBLOCKSHIFT));
574  	psize = get_psize(bp, endian);
575  
576  	if (size)
577  		*size = lsize;
578  
579  	if (comp >= ZIO_COMPRESS_FUNCTIONS) {
580  		printf("compression algorithm %u not supported\n", (unsigned int) comp);
581  		return ZFS_ERR_NOT_IMPLEMENTED_YET;
582  	}
583  
584  	if (comp != ZIO_COMPRESS_OFF && decomp_table[comp].decomp_func == NULL) {
585  		printf("compression algorithm %s not supported\n", decomp_table[comp].name);
586  		return ZFS_ERR_NOT_IMPLEMENTED_YET;
587  	}
588  
589  	if (comp != ZIO_COMPRESS_OFF) {
590  		compbuf = malloc(psize);
591  		if (!compbuf)
592  			return ZFS_ERR_OUT_OF_MEMORY;
593  	} else {
594  		compbuf = *buf = malloc(lsize);
595  	}
596  
597  	err = zio_read_data(bp, endian, compbuf, data);
598  	if (err) {
599  		free(compbuf);
600  		*buf = NULL;
601  		return err;
602  	}
603  
604  	if (comp != ZIO_COMPRESS_OFF) {
605  		*buf = malloc(lsize);
606  		if (!*buf) {
607  			free(compbuf);
608  			return ZFS_ERR_OUT_OF_MEMORY;
609  		}
610  
611  		err = decomp_table[comp].decomp_func(compbuf, *buf, psize, lsize);
612  		free(compbuf);
613  		if (err) {
614  			free(*buf);
615  			*buf = NULL;
616  			return err;
617  		}
618  	}
619  
620  	return ZFS_ERR_NONE;
621  }
622  
623  /*
624   * Get the block from a block id.
625   * push the block onto the stack.
626   *
627   */
628  static int
629  dmu_read(dnode_end_t *dn, uint64_t blkid, void **buf,
630  		 zfs_endian_t *endian_out, struct zfs_data *data)
631  {
632  	int idx, level;
633  	blkptr_t *bp_array = dn->dn.dn_blkptr;
634  	int epbs = dn->dn.dn_indblkshift - SPA_BLKPTRSHIFT;
635  	blkptr_t *bp;
636  	void *tmpbuf = 0;
637  	zfs_endian_t endian;
638  	int err = ZFS_ERR_NONE;
639  
640  	bp = malloc(sizeof(blkptr_t));
641  	if (!bp)
642  		return ZFS_ERR_OUT_OF_MEMORY;
643  
644  	endian = dn->endian;
645  	for (level = dn->dn.dn_nlevels - 1; level >= 0; level--) {
646  		idx = (blkid >> (epbs * level)) & ((1 << epbs) - 1);
647  		*bp = bp_array[idx];
648  		if (bp_array != dn->dn.dn_blkptr) {
649  			free(bp_array);
650  			bp_array = 0;
651  		}
652  
653  		if (BP_IS_HOLE(bp)) {
654  			size_t size = zfs_to_cpu16(dn->dn.dn_datablkszsec,
655  											dn->endian)
656  				<< SPA_MINBLOCKSHIFT;
657  			*buf = malloc(size);
658  			if (*buf) {
659  				err = ZFS_ERR_OUT_OF_MEMORY;
660  				break;
661  			}
662  			memset(*buf, 0, size);
663  			endian = (zfs_to_cpu64(bp->blk_prop, endian) >> 63) & 1;
664  			break;
665  		}
666  		if (level == 0) {
667  			err = zio_read(bp, endian, buf, 0, data);
668  			endian = (zfs_to_cpu64(bp->blk_prop, endian) >> 63) & 1;
669  			break;
670  		}
671  		err = zio_read(bp, endian, &tmpbuf, 0, data);
672  		endian = (zfs_to_cpu64(bp->blk_prop, endian) >> 63) & 1;
673  		if (err)
674  			break;
675  		bp_array = tmpbuf;
676  	}
677  	if (bp_array != dn->dn.dn_blkptr)
678  		free(bp_array);
679  	if (endian_out)
680  		*endian_out = endian;
681  
682  	free(bp);
683  	return err;
684  }
685  
686  /*
687   * mzap_lookup: Looks up property described by "name" and returns the value
688   * in "value".
689   */
690  static int
691  mzap_lookup(mzap_phys_t *zapobj, zfs_endian_t endian,
692  			int objsize, char *name, uint64_t * value)
693  {
694  	int i, chunks;
695  	mzap_ent_phys_t *mzap_ent = zapobj->mz_chunk;
696  
697  	chunks = objsize / MZAP_ENT_LEN - 1;
698  	for (i = 0; i < chunks; i++) {
699  		if (strcmp(mzap_ent[i].mze_name, name) == 0) {
700  			*value = zfs_to_cpu64(mzap_ent[i].mze_value, endian);
701  			return ZFS_ERR_NONE;
702  		}
703  	}
704  
705  	printf("couldn't find '%s'\n", name);
706  	return ZFS_ERR_FILE_NOT_FOUND;
707  }
708  
709  static int
710  mzap_iterate(mzap_phys_t *zapobj, zfs_endian_t endian, int objsize,
711  			 int (*hook)(const char *name,
712  						 uint64_t val,
713  						 struct zfs_data *data),
714  			 struct zfs_data *data)
715  {
716  	int i, chunks;
717  	mzap_ent_phys_t *mzap_ent = zapobj->mz_chunk;
718  
719  	chunks = objsize / MZAP_ENT_LEN - 1;
720  	for (i = 0; i < chunks; i++) {
721  		if (hook(mzap_ent[i].mze_name,
722  				 zfs_to_cpu64(mzap_ent[i].mze_value, endian),
723  				 data))
724  			return 1;
725  	}
726  
727  	return 0;
728  }
729  
730  static uint64_t
731  zap_hash(uint64_t salt, const char *name)
732  {
733  	static uint64_t table[256];
734  	const uint8_t *cp;
735  	uint8_t c;
736  	uint64_t crc = salt;
737  
738  	if (table[128] == 0) {
739  		uint64_t *ct = NULL;
740  		int i, j;
741  		for (i = 0; i < 256; i++) {
742  			for (ct = table + i, *ct = i, j = 8; j > 0; j--)
743  				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
744  		}
745  	}
746  
747  	for (cp = (const uint8_t *) name; (c = *cp) != '\0'; cp++)
748  		crc = (crc >> 8) ^ table[(crc ^ c) & 0xFF];
749  
750  	/*
751  	 * Only use 28 bits, since we need 4 bits in the cookie for the
752  	 * collision differentiator.  We MUST use the high bits, since
753  	 * those are the onces that we first pay attention to when
754  	 * chosing the bucket.
755  	 */
756  	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
757  
758  	return crc;
759  }
760  
761  /*
762   * Only to be used on 8-bit arrays.
763   * array_len is actual len in bytes (not encoded le_value_length).
764   * buf is null-terminated.
765   */
766  /* XXX */
767  static int
768  zap_leaf_array_equal(zap_leaf_phys_t *l, zfs_endian_t endian,
769  					 int blksft, int chunk, int array_len, const char *buf)
770  {
771  	int bseen = 0;
772  
773  	while (bseen < array_len) {
774  		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, blksft, chunk).l_array;
775  		int toread = min(array_len - bseen, ZAP_LEAF_ARRAY_BYTES);
776  
777  		if (chunk >= ZAP_LEAF_NUMCHUNKS(blksft))
778  			return 0;
779  
780  		if (memcmp(la->la_array, buf + bseen, toread) != 0)
781  			break;
782  		chunk = zfs_to_cpu16(la->la_next, endian);
783  		bseen += toread;
784  	}
785  	return (bseen == array_len);
786  }
787  
788  /* XXX */
789  static int
790  zap_leaf_array_get(zap_leaf_phys_t *l, zfs_endian_t endian, int blksft,
791  				   int chunk, int array_len, char *buf)
792  {
793  	int bseen = 0;
794  
795  	while (bseen < array_len) {
796  		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, blksft, chunk).l_array;
797  		int toread = min(array_len - bseen, ZAP_LEAF_ARRAY_BYTES);
798  
799  		if (chunk >= ZAP_LEAF_NUMCHUNKS(blksft))
800  			/* Don't use errno because this error is to be ignored.  */
801  			return ZFS_ERR_BAD_FS;
802  
803  		memcpy(buf + bseen, la->la_array,  toread);
804  		chunk = zfs_to_cpu16(la->la_next, endian);
805  		bseen += toread;
806  	}
807  	return ZFS_ERR_NONE;
808  }
809  
810  
811  /*
812   * Given a zap_leaf_phys_t, walk thru the zap leaf chunks to get the
813   * value for the property "name".
814   *
815   */
816  /* XXX */
817  static int
818  zap_leaf_lookup(zap_leaf_phys_t *l, zfs_endian_t endian,
819  				int blksft, uint64_t h,
820  				const char *name, uint64_t *value)
821  {
822  	uint16_t chunk;
823  	struct zap_leaf_entry *le;
824  
825  	/* Verify if this is a valid leaf block */
826  	if (zfs_to_cpu64(l->l_hdr.lh_block_type, endian) != ZBT_LEAF) {
827  		printf("invalid leaf type\n");
828  		return ZFS_ERR_BAD_FS;
829  	}
830  	if (zfs_to_cpu32(l->l_hdr.lh_magic, endian) != ZAP_LEAF_MAGIC) {
831  		printf("invalid leaf magic\n");
832  		return ZFS_ERR_BAD_FS;
833  	}
834  
835  	for (chunk = zfs_to_cpu16(l->l_hash[LEAF_HASH(blksft, h)], endian);
836  		 chunk != CHAIN_END; chunk = le->le_next) {
837  
838  		if (chunk >= ZAP_LEAF_NUMCHUNKS(blksft)) {
839  			printf("invalid chunk number\n");
840  			return ZFS_ERR_BAD_FS;
841  		}
842  
843  		le = ZAP_LEAF_ENTRY(l, blksft, chunk);
844  
845  		/* Verify the chunk entry */
846  		if (le->le_type != ZAP_CHUNK_ENTRY) {
847  			printf("invalid chunk entry\n");
848  			return ZFS_ERR_BAD_FS;
849  		}
850  
851  		if (zfs_to_cpu64(le->le_hash, endian) != h)
852  			continue;
853  
854  		if (zap_leaf_array_equal(l, endian, blksft,
855  								 zfs_to_cpu16(le->le_name_chunk, endian),
856  								 zfs_to_cpu16(le->le_name_length, endian),
857  								 name)) {
858  			struct zap_leaf_array *la;
859  
860  			if (le->le_int_size != 8 || le->le_value_length != 1) {
861  				printf("invalid leaf chunk entry\n");
862  				return ZFS_ERR_BAD_FS;
863  			}
864  			/* get the uint64_t property value */
865  			la = &ZAP_LEAF_CHUNK(l, blksft, le->le_value_chunk).l_array;
866  
867  			*value = be64_to_cpu(la->la_array64);
868  
869  			return ZFS_ERR_NONE;
870  		}
871  	}
872  
873  	printf("couldn't find '%s'\n", name);
874  	return ZFS_ERR_FILE_NOT_FOUND;
875  }
876  
877  
878  /* Verify if this is a fat zap header block */
879  static int
880  zap_verify(zap_phys_t *zap)
881  {
882  	if (zap->zap_magic != (uint64_t) ZAP_MAGIC) {
883  		printf("bad ZAP magic\n");
884  		return ZFS_ERR_BAD_FS;
885  	}
886  
887  	if (zap->zap_flags != 0) {
888  		printf("bad ZAP flags\n");
889  		return ZFS_ERR_BAD_FS;
890  	}
891  
892  	if (zap->zap_salt == 0) {
893  		printf("bad ZAP salt\n");
894  		return ZFS_ERR_BAD_FS;
895  	}
896  
897  	return ZFS_ERR_NONE;
898  }
899  
900  /*
901   * Fat ZAP lookup
902   *
903   */
904  /* XXX */
905  static int
906  fzap_lookup(dnode_end_t *zap_dnode, zap_phys_t *zap,
907  			char *name, uint64_t *value, struct zfs_data *data)
908  {
909  	void *l;
910  	uint64_t hash, idx, blkid;
911  	int blksft = zfs_log2(zfs_to_cpu16(zap_dnode->dn.dn_datablkszsec,
912  											zap_dnode->endian) << DNODE_SHIFT);
913  	int err;
914  	zfs_endian_t leafendian;
915  
916  	err = zap_verify(zap);
917  	if (err)
918  		return err;
919  
920  	hash = zap_hash(zap->zap_salt, name);
921  
922  	/* get block id from index */
923  	if (zap->zap_ptrtbl.zt_numblks != 0) {
924  		printf("external pointer tables not supported\n");
925  		return ZFS_ERR_NOT_IMPLEMENTED_YET;
926  	}
927  	idx = ZAP_HASH_IDX(hash, zap->zap_ptrtbl.zt_shift);
928  	blkid = ((uint64_t *) zap)[idx + (1 << (blksft - 3 - 1))];
929  
930  	/* Get the leaf block */
931  	if ((1U << blksft) < sizeof(zap_leaf_phys_t)) {
932  		printf("ZAP leaf is too small\n");
933  		return ZFS_ERR_BAD_FS;
934  	}
935  	err = dmu_read(zap_dnode, blkid, &l, &leafendian, data);
936  	if (err)
937  		return err;
938  
939  	err = zap_leaf_lookup(l, leafendian, blksft, hash, name, value);
940  	free(l);
941  	return err;
942  }
943  
944  /* XXX */
945  static int
946  fzap_iterate(dnode_end_t *zap_dnode, zap_phys_t *zap,
947  			 int (*hook)(const char *name,
948  						 uint64_t val,
949  						 struct zfs_data *data),
950  			 struct zfs_data *data)
951  {
952  	zap_leaf_phys_t *l;
953  	void *l_in;
954  	uint64_t idx, blkid;
955  	uint16_t chunk;
956  	int blksft = zfs_log2(zfs_to_cpu16(zap_dnode->dn.dn_datablkszsec,
957  											zap_dnode->endian) << DNODE_SHIFT);
958  	int err;
959  	zfs_endian_t endian;
960  
961  	if (zap_verify(zap))
962  		return 0;
963  
964  	/* get block id from index */
965  	if (zap->zap_ptrtbl.zt_numblks != 0) {
966  		printf("external pointer tables not supported\n");
967  		return 0;
968  	}
969  	/* Get the leaf block */
970  	if ((1U << blksft) < sizeof(zap_leaf_phys_t)) {
971  		printf("ZAP leaf is too small\n");
972  		return 0;
973  	}
974  	for (idx = 0; idx < zap->zap_ptrtbl.zt_numblks; idx++) {
975  		blkid = ((uint64_t *) zap)[idx + (1 << (blksft - 3 - 1))];
976  
977  		err = dmu_read(zap_dnode, blkid, &l_in, &endian, data);
978  		l = l_in;
979  		if (err)
980  			continue;
981  
982  		/* Verify if this is a valid leaf block */
983  		if (zfs_to_cpu64(l->l_hdr.lh_block_type, endian) != ZBT_LEAF) {
984  			free(l);
985  			continue;
986  		}
987  		if (zfs_to_cpu32(l->l_hdr.lh_magic, endian) != ZAP_LEAF_MAGIC) {
988  			free(l);
989  			continue;
990  		}
991  
992  		for (chunk = 0; chunk < ZAP_LEAF_NUMCHUNKS(blksft); chunk++) {
993  			char *buf;
994  			struct zap_leaf_array *la;
995  			struct zap_leaf_entry *le;
996  			uint64_t val;
997  			le = ZAP_LEAF_ENTRY(l, blksft, chunk);
998  
999  			/* Verify the chunk entry */
1000  			if (le->le_type != ZAP_CHUNK_ENTRY)
1001  				continue;
1002  
1003  			buf = malloc(zfs_to_cpu16(le->le_name_length, endian)
1004  						 + 1);
1005  			if (zap_leaf_array_get(l, endian, blksft, le->le_name_chunk,
1006  								   le->le_name_length, buf)) {
1007  				free(buf);
1008  				continue;
1009  			}
1010  			buf[le->le_name_length] = 0;
1011  
1012  			if (le->le_int_size != 8
1013  				|| zfs_to_cpu16(le->le_value_length, endian) != 1)
1014  				continue;
1015  
1016  			/* get the uint64_t property value */
1017  			la = &ZAP_LEAF_CHUNK(l, blksft, le->le_value_chunk).l_array;
1018  			val = be64_to_cpu(la->la_array64);
1019  			if (hook(buf, val, data))
1020  				return 1;
1021  			free(buf);
1022  		}
1023  	}
1024  	return 0;
1025  }
1026  
1027  
1028  /*
1029   * Read in the data of a zap object and find the value for a matching
1030   * property name.
1031   *
1032   */
1033  static int
1034  zap_lookup(dnode_end_t *zap_dnode, char *name, uint64_t *val,
1035  		   struct zfs_data *data)
1036  {
1037  	uint64_t block_type;
1038  	int size;
1039  	void *zapbuf;
1040  	int err;
1041  	zfs_endian_t endian;
1042  
1043  	/* Read in the first block of the zap object data. */
1044  	size = zfs_to_cpu16(zap_dnode->dn.dn_datablkszsec,
1045  							 zap_dnode->endian) << SPA_MINBLOCKSHIFT;
1046  	err = dmu_read(zap_dnode, 0, &zapbuf, &endian, data);
1047  	if (err)
1048  		return err;
1049  	block_type = zfs_to_cpu64(*((uint64_t *) zapbuf), endian);
1050  
1051  	if (block_type == ZBT_MICRO) {
1052  		err = (mzap_lookup(zapbuf, endian, size, name, val));
1053  		free(zapbuf);
1054  		return err;
1055  	} else if (block_type == ZBT_HEADER) {
1056  		/* this is a fat zap */
1057  		err = (fzap_lookup(zap_dnode, zapbuf, name, val, data));
1058  		free(zapbuf);
1059  		return err;
1060  	}
1061  
1062  	printf("unknown ZAP type\n");
1063  	free(zapbuf);
1064  	return ZFS_ERR_BAD_FS;
1065  }
1066  
1067  static int
1068  zap_iterate(dnode_end_t *zap_dnode,
1069  			int (*hook)(const char *name, uint64_t val,
1070  						struct zfs_data *data),
1071  			struct zfs_data *data)
1072  {
1073  	uint64_t block_type;
1074  	int size;
1075  	void *zapbuf;
1076  	int err;
1077  	int ret;
1078  	zfs_endian_t endian;
1079  
1080  	/* Read in the first block of the zap object data. */
1081  	size = zfs_to_cpu16(zap_dnode->dn.dn_datablkszsec, zap_dnode->endian) << SPA_MINBLOCKSHIFT;
1082  	err = dmu_read(zap_dnode, 0, &zapbuf, &endian, data);
1083  	if (err)
1084  		return 0;
1085  	block_type = zfs_to_cpu64(*((uint64_t *) zapbuf), endian);
1086  
1087  	if (block_type == ZBT_MICRO) {
1088  		ret = mzap_iterate(zapbuf, endian, size, hook, data);
1089  		free(zapbuf);
1090  		return ret;
1091  	} else if (block_type == ZBT_HEADER) {
1092  		/* this is a fat zap */
1093  		ret = fzap_iterate(zap_dnode, zapbuf, hook, data);
1094  		free(zapbuf);
1095  		return ret;
1096  	}
1097  	printf("unknown ZAP type\n");
1098  	free(zapbuf);
1099  	return 0;
1100  }
1101  
1102  
1103  /*
1104   * Get the dnode of an object number from the metadnode of an object set.
1105   *
1106   * Input
1107   *	mdn - metadnode to get the object dnode
1108   *	objnum - object number for the object dnode
1109   *	buf - data buffer that holds the returning dnode
1110   */
1111  static int
1112  dnode_get(dnode_end_t *mdn, uint64_t objnum, uint8_t type,
1113  		  dnode_end_t *buf, struct zfs_data *data)
1114  {
1115  	uint64_t blkid, blksz;	/* the block id this object dnode is in */
1116  	int epbs;			/* shift of number of dnodes in a block */
1117  	int idx;			/* index within a block */
1118  	void *dnbuf;
1119  	int err;
1120  	zfs_endian_t endian;
1121  
1122  	blksz = zfs_to_cpu16(mdn->dn.dn_datablkszsec,
1123  							  mdn->endian) << SPA_MINBLOCKSHIFT;
1124  
1125  	epbs = zfs_log2(blksz) - DNODE_SHIFT;
1126  	blkid = objnum >> epbs;
1127  	idx = objnum & ((1 << epbs) - 1);
1128  
1129  	if (data->dnode_buf != NULL && memcmp(data->dnode_mdn, mdn,
1130  										  sizeof(*mdn)) == 0
1131  		&& objnum >= data->dnode_start && objnum < data->dnode_end) {
1132  		memmove(&(buf->dn), &(data->dnode_buf)[idx], DNODE_SIZE);
1133  		buf->endian = data->dnode_endian;
1134  		if (type && buf->dn.dn_type != type)  {
1135  			printf("incorrect dnode type: %02X != %02x\n", buf->dn.dn_type, type);
1136  			return ZFS_ERR_BAD_FS;
1137  		}
1138  		return ZFS_ERR_NONE;
1139  	}
1140  
1141  	err = dmu_read(mdn, blkid, &dnbuf, &endian, data);
1142  	if (err)
1143  		return err;
1144  
1145  	free(data->dnode_buf);
1146  	free(data->dnode_mdn);
1147  	data->dnode_mdn = malloc(sizeof(*mdn));
1148  	if (!data->dnode_mdn) {
1149  		data->dnode_buf = 0;
1150  	} else {
1151  		memcpy(data->dnode_mdn, mdn, sizeof(*mdn));
1152  		data->dnode_buf = dnbuf;
1153  		data->dnode_start = blkid << epbs;
1154  		data->dnode_end = (blkid + 1) << epbs;
1155  		data->dnode_endian = endian;
1156  	}
1157  
1158  	memmove(&(buf->dn), (dnode_phys_t *) dnbuf + idx, DNODE_SIZE);
1159  	buf->endian = endian;
1160  	if (type && buf->dn.dn_type != type) {
1161  		printf("incorrect dnode type\n");
1162  		return ZFS_ERR_BAD_FS;
1163  	}
1164  
1165  	return ZFS_ERR_NONE;
1166  }
1167  
1168  /*
1169   * Get the file dnode for a given file name where mdn is the meta dnode
1170   * for this ZFS object set. When found, place the file dnode in dn.
1171   * The 'path' argument will be mangled.
1172   *
1173   */
1174  static int
1175  dnode_get_path(dnode_end_t *mdn, const char *path_in, dnode_end_t *dn,
1176  			   struct zfs_data *data)
1177  {
1178  	uint64_t objnum, version;
1179  	char *cname, ch;
1180  	int err = ZFS_ERR_NONE;
1181  	char *path, *path_buf;
1182  	struct dnode_chain {
1183  		struct dnode_chain *next;
1184  		dnode_end_t dn;
1185  	};
1186  	struct dnode_chain *dnode_path = 0, *dn_new, *root;
1187  
1188  	dn_new = malloc(sizeof(*dn_new));
1189  	if (!dn_new)
1190  		return ZFS_ERR_OUT_OF_MEMORY;
1191  	dn_new->next = 0;
1192  	dnode_path = root = dn_new;
1193  
1194  	err = dnode_get(mdn, MASTER_NODE_OBJ, DMU_OT_MASTER_NODE,
1195  					&(dnode_path->dn), data);
1196  	if (err) {
1197  		free(dn_new);
1198  		return err;
1199  	}
1200  
1201  	err = zap_lookup(&(dnode_path->dn), ZPL_VERSION_STR, &version, data);
1202  	if (err) {
1203  		free(dn_new);
1204  		return err;
1205  	}
1206  	if (version > ZPL_VERSION) {
1207  		free(dn_new);
1208  		printf("too new ZPL version\n");
1209  		return ZFS_ERR_NOT_IMPLEMENTED_YET;
1210  	}
1211  
1212  	err = zap_lookup(&(dnode_path->dn), ZFS_ROOT_OBJ, &objnum, data);
1213  	if (err) {
1214  		free(dn_new);
1215  		return err;
1216  	}
1217  
1218  	err = dnode_get(mdn, objnum, 0, &(dnode_path->dn), data);
1219  	if (err) {
1220  		free(dn_new);
1221  		return err;
1222  	}
1223  
1224  	path = path_buf = strdup(path_in);
1225  	if (!path_buf) {
1226  		free(dn_new);
1227  		return ZFS_ERR_OUT_OF_MEMORY;
1228  	}
1229  
1230  	while (1) {
1231  		/* skip leading slashes */
1232  		while (*path == '/')
1233  			path++;
1234  		if (!*path)
1235  			break;
1236  		/* get the next component name */
1237  		cname = path;
1238  		while (*path && *path != '/')
1239  			path++;
1240  		/* Skip dot.  */
1241  		if (cname + 1 == path && cname[0] == '.')
1242  			continue;
1243  		/* Handle double dot.  */
1244  		if (cname + 2 == path && cname[0] == '.' && cname[1] == '.')  {
1245  			if (dn_new->next) {
1246  				dn_new = dnode_path;
1247  				dnode_path = dn_new->next;
1248  				free(dn_new);
1249  			} else {
1250  				printf("can't resolve ..\n");
1251  				err = ZFS_ERR_FILE_NOT_FOUND;
1252  				break;
1253  			}
1254  			continue;
1255  		}
1256  
1257  		ch = *path;
1258  		*path = 0;		/* ensure null termination */
1259  
1260  		if (dnode_path->dn.dn.dn_type != DMU_OT_DIRECTORY_CONTENTS) {
1261  			free(path_buf);
1262  			printf("not a directory\n");
1263  			return ZFS_ERR_BAD_FILE_TYPE;
1264  		}
1265  		err = zap_lookup(&(dnode_path->dn), cname, &objnum, data);
1266  		if (err)
1267  			break;
1268  
1269  		dn_new = malloc(sizeof(*dn_new));
1270  		if (!dn_new) {
1271  			err = ZFS_ERR_OUT_OF_MEMORY;
1272  			break;
1273  		}
1274  		dn_new->next = dnode_path;
1275  		dnode_path = dn_new;
1276  
1277  		objnum = ZFS_DIRENT_OBJ(objnum);
1278  		err = dnode_get(mdn, objnum, 0, &(dnode_path->dn), data);
1279  		if (err)
1280  			break;
1281  
1282  		*path = ch;
1283  	}
1284  
1285  	if (!err)
1286  		memcpy(dn, &(dnode_path->dn), sizeof(*dn));
1287  
1288  	while (dnode_path) {
1289  		dn_new = dnode_path->next;
1290  		free(dnode_path);
1291  		dnode_path = dn_new;
1292  	}
1293  	free(path_buf);
1294  	return err;
1295  }
1296  
1297  
1298  /*
1299   * Given a MOS metadnode, get the metadnode of a given filesystem name (fsname),
1300   * e.g. pool/rootfs, or a given object number (obj), e.g. the object number
1301   * of pool/rootfs.
1302   *
1303   * If no fsname and no obj are given, return the DSL_DIR metadnode.
1304   * If fsname is given, return its metadnode and its matching object number.
1305   * If only obj is given, return the metadnode for this object number.
1306   *
1307   */
1308  static int
1309  get_filesystem_dnode(dnode_end_t *mosmdn, char *fsname,
1310  					 dnode_end_t *mdn, struct zfs_data *data)
1311  {
1312  	uint64_t objnum;
1313  	int err;
1314  
1315  	err = dnode_get(mosmdn, DMU_POOL_DIRECTORY_OBJECT,
1316  					DMU_OT_OBJECT_DIRECTORY, mdn, data);
1317  	if (err)
1318  		return err;
1319  
1320  	err = zap_lookup(mdn, DMU_POOL_ROOT_DATASET, &objnum, data);
1321  	if (err)
1322  		return err;
1323  
1324  	err = dnode_get(mosmdn, objnum, DMU_OT_DSL_DIR, mdn, data);
1325  	if (err)
1326  		return err;
1327  
1328  	while (*fsname) {
1329  		uint64_t childobj;
1330  		char *cname, ch;
1331  
1332  		while (*fsname == '/')
1333  			fsname++;
1334  
1335  		if (!*fsname || *fsname == '@')
1336  			break;
1337  
1338  		cname = fsname;
1339  		while (*fsname && !isspace(*fsname) && *fsname != '/')
1340  			fsname++;
1341  		ch = *fsname;
1342  		*fsname = 0;
1343  
1344  		childobj = zfs_to_cpu64((((dsl_dir_phys_t *) DN_BONUS(&mdn->dn)))->dd_child_dir_zapobj, mdn->endian);
1345  		err = dnode_get(mosmdn, childobj,
1346  						DMU_OT_DSL_DIR_CHILD_MAP, mdn, data);
1347  		if (err)
1348  			return err;
1349  
1350  		err = zap_lookup(mdn, cname, &objnum, data);
1351  		if (err)
1352  			return err;
1353  
1354  		err = dnode_get(mosmdn, objnum, DMU_OT_DSL_DIR, mdn, data);
1355  		if (err)
1356  			return err;
1357  
1358  		*fsname = ch;
1359  	}
1360  	return ZFS_ERR_NONE;
1361  }
1362  
1363  static int
1364  make_mdn(dnode_end_t *mdn, struct zfs_data *data)
1365  {
1366  	void *osp;
1367  	blkptr_t *bp;
1368  	size_t ospsize;
1369  	int err;
1370  
1371  	bp = &(((dsl_dataset_phys_t *) DN_BONUS(&mdn->dn))->ds_bp);
1372  	err = zio_read(bp, mdn->endian, &osp, &ospsize, data);
1373  	if (err)
1374  		return err;
1375  	if (ospsize < OBJSET_PHYS_SIZE_V14) {
1376  		free(osp);
1377  		printf("too small osp\n");
1378  		return ZFS_ERR_BAD_FS;
1379  	}
1380  
1381  	mdn->endian = (zfs_to_cpu64(bp->blk_prop, mdn->endian)>>63) & 1;
1382  	memmove((char *) &(mdn->dn),
1383  			(char *) &((objset_phys_t *) osp)->os_meta_dnode, DNODE_SIZE);
1384  	free(osp);
1385  	return ZFS_ERR_NONE;
1386  }
1387  
1388  static int
1389  dnode_get_fullpath(const char *fullpath, dnode_end_t *mdn,
1390  				   uint64_t *mdnobj, dnode_end_t *dn, int *isfs,
1391  				   struct zfs_data *data)
1392  {
1393  	char *fsname, *snapname;
1394  	const char *ptr_at, *filename;
1395  	uint64_t headobj;
1396  	int err;
1397  
1398  	ptr_at = strchr(fullpath, '@');
1399  	if (!ptr_at) {
1400  		*isfs = 1;
1401  		filename = 0;
1402  		snapname = 0;
1403  		fsname = strdup(fullpath);
1404  	} else {
1405  		const char *ptr_slash = strchr(ptr_at, '/');
1406  
1407  		*isfs = 0;
1408  		fsname = malloc(ptr_at - fullpath + 1);
1409  		if (!fsname)
1410  			return ZFS_ERR_OUT_OF_MEMORY;
1411  		memcpy(fsname, fullpath, ptr_at - fullpath);
1412  		fsname[ptr_at - fullpath] = 0;
1413  		if (ptr_at[1] && ptr_at[1] != '/') {
1414  			snapname = malloc(ptr_slash - ptr_at);
1415  			if (!snapname) {
1416  				free(fsname);
1417  				return ZFS_ERR_OUT_OF_MEMORY;
1418  			}
1419  			memcpy(snapname, ptr_at + 1, ptr_slash - ptr_at - 1);
1420  			snapname[ptr_slash - ptr_at - 1] = 0;
1421  		} else {
1422  			snapname = 0;
1423  		}
1424  		if (ptr_slash)
1425  			filename = ptr_slash;
1426  		else
1427  			filename = "/";
1428  		printf("zfs fsname = '%s' snapname='%s' filename = '%s'\n",
1429  			   fsname, snapname, filename);
1430  	}
1431  
1432  
1433  	err = get_filesystem_dnode(&(data->mos), fsname, dn, data);
1434  
1435  	if (err) {
1436  		free(fsname);
1437  		free(snapname);
1438  		return err;
1439  	}
1440  
1441  	headobj = zfs_to_cpu64(((dsl_dir_phys_t *) DN_BONUS(&dn->dn))->dd_head_dataset_obj, dn->endian);
1442  
1443  	err = dnode_get(&(data->mos), headobj, DMU_OT_DSL_DATASET, mdn, data);
1444  	if (err) {
1445  		free(fsname);
1446  		free(snapname);
1447  		return err;
1448  	}
1449  
1450  	if (snapname) {
1451  		uint64_t snapobj;
1452  
1453  		snapobj = zfs_to_cpu64(((dsl_dataset_phys_t *) DN_BONUS(&mdn->dn))->ds_snapnames_zapobj, mdn->endian);
1454  
1455  		err = dnode_get(&(data->mos), snapobj,
1456  						DMU_OT_DSL_DS_SNAP_MAP, mdn, data);
1457  		if (!err)
1458  			err = zap_lookup(mdn, snapname, &headobj, data);
1459  		if (!err)
1460  			err = dnode_get(&(data->mos), headobj, DMU_OT_DSL_DATASET, mdn, data);
1461  		if (err) {
1462  			free(fsname);
1463  			free(snapname);
1464  			return err;
1465  		}
1466  	}
1467  
1468  	if (mdnobj)
1469  		*mdnobj = headobj;
1470  
1471  	make_mdn(mdn, data);
1472  
1473  	if (*isfs) {
1474  		free(fsname);
1475  		free(snapname);
1476  		return ZFS_ERR_NONE;
1477  	}
1478  	err = dnode_get_path(mdn, filename, dn, data);
1479  	free(fsname);
1480  	free(snapname);
1481  	return err;
1482  }
1483  
1484  /*
1485   * For a given XDR packed nvlist, verify the first 4 bytes and move on.
1486   *
1487   * An XDR packed nvlist is encoded as (comments from nvs_xdr_create) :
1488   *
1489   *		encoding method/host endian		(4 bytes)
1490   *		nvl_version						(4 bytes)
1491   *		nvl_nvflag						(4 bytes)
1492   *	encoded nvpairs:
1493   *		encoded size of the nvpair		(4 bytes)
1494   *		decoded size of the nvpair		(4 bytes)
1495   *		name string size				(4 bytes)
1496   *		name string data				(sizeof(NV_ALIGN4(string))
1497   *		data type						(4 bytes)
1498   *		# of elements in the nvpair		(4 bytes)
1499   *		data
1500   *		2 zero's for the last nvpair
1501   *		(end of the entire list)	(8 bytes)
1502   *
1503   */
1504  
1505  static int
1506  nvlist_find_value(char *nvlist, char *name, int valtype, char **val,
1507  				  size_t *size_out, size_t *nelm_out)
1508  {
1509  	int name_len, type, encode_size;
1510  	char *nvpair, *nvp_name;
1511  
1512  	/* Verify if the 1st and 2nd byte in the nvlist are valid. */
1513  	/* NOTE: independently of what endianness header announces all
1514  	   subsequent values are big-endian.  */
1515  	if (nvlist[0] != NV_ENCODE_XDR || (nvlist[1] != NV_LITTLE_ENDIAN
1516  									   && nvlist[1] != NV_BIG_ENDIAN)) {
1517  		printf("zfs incorrect nvlist header\n");
1518  		return ZFS_ERR_BAD_FS;
1519  	}
1520  
1521  	/* skip the header, nvl_version, and nvl_nvflag */
1522  	nvlist = nvlist + 4 * 3;
1523  	/*
1524  	 * Loop thru the nvpair list
1525  	 * The XDR representation of an integer is in big-endian byte order.
1526  	 */
1527  	while ((encode_size = be32_to_cpu(*(uint32_t *) nvlist))) {
1528  		int nelm;
1529  
1530  		nvpair = nvlist + 4 * 2;	/* skip the encode/decode size */
1531  
1532  		name_len = be32_to_cpu(*(uint32_t *) nvpair);
1533  		nvpair += 4;
1534  
1535  		nvp_name = nvpair;
1536  		nvpair = nvpair + ((name_len + 3) & ~3);	/* align */
1537  
1538  		type = be32_to_cpu(*(uint32_t *) nvpair);
1539  		nvpair += 4;
1540  
1541  		nelm = be32_to_cpu(*(uint32_t *) nvpair);
1542  		if (nelm < 1) {
1543  			printf("empty nvpair\n");
1544  			return ZFS_ERR_BAD_FS;
1545  		}
1546  
1547  		nvpair += 4;
1548  
1549  		if ((strncmp(nvp_name, name, name_len) == 0) && type == valtype) {
1550  			*val = nvpair;
1551  			*size_out = encode_size;
1552  			if (nelm_out)
1553  				*nelm_out = nelm;
1554  			return 1;
1555  		}
1556  
1557  		nvlist += encode_size;	/* goto the next nvpair */
1558  	}
1559  	return 0;
1560  }
1561  
1562  int
1563  zfs_nvlist_lookup_uint64(char *nvlist, char *name, uint64_t *out)
1564  {
1565  	char *nvpair;
1566  	size_t size;
1567  	int found;
1568  
1569  	found = nvlist_find_value(nvlist, name, DATA_TYPE_UINT64, &nvpair, &size, 0);
1570  	if (!found)
1571  		return 0;
1572  	if (size < sizeof(uint64_t)) {
1573  		printf("invalid uint64\n");
1574  		return ZFS_ERR_BAD_FS;
1575  	}
1576  
1577  	*out = be64_to_cpu(*(uint64_t *) nvpair);
1578  	return 1;
1579  }
1580  
1581  char *
1582  zfs_nvlist_lookup_string(char *nvlist, char *name)
1583  {
1584  	char *nvpair;
1585  	char *ret;
1586  	size_t slen;
1587  	size_t size;
1588  	int found;
1589  
1590  	found = nvlist_find_value(nvlist, name, DATA_TYPE_STRING, &nvpair, &size, 0);
1591  	if (!found)
1592  		return 0;
1593  	if (size < 4) {
1594  		printf("invalid string\n");
1595  		return 0;
1596  	}
1597  	slen = be32_to_cpu(*(uint32_t *) nvpair);
1598  	if (slen > size - 4)
1599  		slen = size - 4;
1600  	ret = malloc(slen + 1);
1601  	if (!ret)
1602  		return 0;
1603  	memcpy(ret, nvpair + 4, slen);
1604  	ret[slen] = 0;
1605  	return ret;
1606  }
1607  
1608  char *
1609  zfs_nvlist_lookup_nvlist(char *nvlist, char *name)
1610  {
1611  	char *nvpair;
1612  	char *ret;
1613  	size_t size;
1614  	int found;
1615  
1616  	found = nvlist_find_value(nvlist, name, DATA_TYPE_NVLIST, &nvpair,
1617  							  &size, 0);
1618  	if (!found)
1619  		return 0;
1620  	ret = calloc(1, size + 3 * sizeof(uint32_t));
1621  	if (!ret)
1622  		return 0;
1623  	memcpy(ret, nvlist, sizeof(uint32_t));
1624  
1625  	memcpy(ret + sizeof(uint32_t), nvpair, size);
1626  	return ret;
1627  }
1628  
1629  int
1630  zfs_nvlist_lookup_nvlist_array_get_nelm(char *nvlist, char *name)
1631  {
1632  	char *nvpair;
1633  	size_t nelm, size;
1634  	int found;
1635  
1636  	found = nvlist_find_value(nvlist, name, DATA_TYPE_NVLIST, &nvpair,
1637  							  &size, &nelm);
1638  	if (!found)
1639  		return -1;
1640  	return nelm;
1641  }
1642  
1643  char *
1644  zfs_nvlist_lookup_nvlist_array(char *nvlist, char *name,
1645  									size_t index)
1646  {
1647  	char *nvpair, *nvpairptr;
1648  	int found;
1649  	char *ret;
1650  	size_t size;
1651  	unsigned i;
1652  	size_t nelm;
1653  
1654  	found = nvlist_find_value(nvlist, name, DATA_TYPE_NVLIST, &nvpair,
1655  							  &size, &nelm);
1656  	if (!found)
1657  		return 0;
1658  	if (index >= nelm) {
1659  		printf("trying to lookup past nvlist array\n");
1660  		return 0;
1661  	}
1662  
1663  	nvpairptr = nvpair;
1664  
1665  	for (i = 0; i < index; i++) {
1666  		uint32_t encode_size;
1667  
1668  		/* skip the header, nvl_version, and nvl_nvflag */
1669  		nvpairptr = nvpairptr + 4 * 2;
1670  
1671  		while (nvpairptr < nvpair + size
1672  			   && (encode_size = be32_to_cpu(*(uint32_t *) nvpairptr)))
1673  			nvlist += encode_size;	/* goto the next nvpair */
1674  
1675  		nvlist = nvlist + 4 * 2;	/* skip the ending 2 zeros - 8 bytes */
1676  	}
1677  
1678  	if (nvpairptr >= nvpair + size
1679  		|| nvpairptr + be32_to_cpu(*(uint32_t *) (nvpairptr + 4 * 2))
1680  		>= nvpair + size) {
1681  		printf("incorrect nvlist array\n");
1682  		return 0;
1683  	}
1684  
1685  	ret = calloc(1, be32_to_cpu(*(uint32_t *) (nvpairptr + 4 * 2))
1686  				 + 3 * sizeof(uint32_t));
1687  	if (!ret)
1688  		return 0;
1689  	memcpy(ret, nvlist, sizeof(uint32_t));
1690  
1691  	memcpy(ret + sizeof(uint32_t), nvpairptr, size);
1692  	return ret;
1693  }
1694  
1695  static int
1696  int_zfs_fetch_nvlist(struct zfs_data *data, char **nvlist)
1697  {
1698  	int err;
1699  
1700  	*nvlist = malloc(VDEV_PHYS_SIZE);
1701  	/* Read in the vdev name-value pair list (112K). */
1702  	err = zfs_devread(data->vdev_phys_sector, 0, VDEV_PHYS_SIZE, *nvlist);
1703  	if (err) {
1704  		free(*nvlist);
1705  		*nvlist = 0;
1706  		return err;
1707  	}
1708  	return ZFS_ERR_NONE;
1709  }
1710  
1711  /*
1712   * Check the disk label information and retrieve needed vdev name-value pairs.
1713   *
1714   */
1715  static int
1716  check_pool_label(struct zfs_data *data)
1717  {
1718  	uint64_t pool_state;
1719  	char *nvlist;			/* for the pool */
1720  	char *vdevnvlist;		/* for the vdev */
1721  	uint64_t diskguid;
1722  	uint64_t version;
1723  	int found;
1724  	int err;
1725  
1726  	err = int_zfs_fetch_nvlist(data, &nvlist);
1727  	if (err)
1728  		return err;
1729  
1730  	found = zfs_nvlist_lookup_uint64(nvlist, ZPOOL_CONFIG_POOL_STATE,
1731  										  &pool_state);
1732  	if (!found) {
1733  		free(nvlist);
1734  		printf("zfs pool state not found\n");
1735  		return ZFS_ERR_BAD_FS;
1736  	}
1737  
1738  	if (pool_state == POOL_STATE_DESTROYED) {
1739  		free(nvlist);
1740  		printf("zpool is marked as destroyed\n");
1741  		return ZFS_ERR_BAD_FS;
1742  	}
1743  
1744  	data->label_txg = 0;
1745  	found = zfs_nvlist_lookup_uint64(nvlist, ZPOOL_CONFIG_POOL_TXG,
1746  										  &data->label_txg);
1747  	if (!found) {
1748  		free(nvlist);
1749  		printf("zfs pool txg not found\n");
1750  		return ZFS_ERR_BAD_FS;
1751  	}
1752  
1753  	/* not an active device */
1754  	if (data->label_txg == 0) {
1755  		free(nvlist);
1756  		printf("zpool is not active\n");
1757  		return ZFS_ERR_BAD_FS;
1758  	}
1759  
1760  	found = zfs_nvlist_lookup_uint64(nvlist, ZPOOL_CONFIG_VERSION,
1761  										  &version);
1762  	if (!found) {
1763  		free(nvlist);
1764  		printf("zpool config version not found\n");
1765  		return ZFS_ERR_BAD_FS;
1766  	}
1767  
1768  	if (version > SPA_VERSION) {
1769  		free(nvlist);
1770  		printf("SPA version too new %llu > %llu\n",
1771  			   (unsigned long long) version,
1772  			   (unsigned long long) SPA_VERSION);
1773  		return ZFS_ERR_NOT_IMPLEMENTED_YET;
1774  	}
1775  
1776  	vdevnvlist = zfs_nvlist_lookup_nvlist(nvlist, ZPOOL_CONFIG_VDEV_TREE);
1777  	if (!vdevnvlist) {
1778  		free(nvlist);
1779  		printf("ZFS config vdev tree not found\n");
1780  		return ZFS_ERR_BAD_FS;
1781  	}
1782  
1783  	found = zfs_nvlist_lookup_uint64(vdevnvlist, ZPOOL_CONFIG_ASHIFT,
1784  										  &data->vdev_ashift);
1785  	free(vdevnvlist);
1786  	if (!found) {
1787  		free(nvlist);
1788  		printf("ZPOOL config ashift not found\n");
1789  		return ZFS_ERR_BAD_FS;
1790  	}
1791  
1792  	found = zfs_nvlist_lookup_uint64(nvlist, ZPOOL_CONFIG_GUID, &diskguid);
1793  	if (!found) {
1794  		free(nvlist);
1795  		printf("ZPOOL config guid not found\n");
1796  		return ZFS_ERR_BAD_FS;
1797  	}
1798  
1799  	found = zfs_nvlist_lookup_uint64(nvlist, ZPOOL_CONFIG_POOL_GUID, &data->pool_guid);
1800  	if (!found) {
1801  		free(nvlist);
1802  		printf("ZPOOL config pool guid not found\n");
1803  		return ZFS_ERR_BAD_FS;
1804  	}
1805  
1806  	free(nvlist);
1807  
1808  	printf("ZFS Pool GUID: %llu (%016llx) Label: GUID: %llu (%016llx), txg: %llu, SPA v%llu, ashift: %llu\n",
1809  		   (unsigned long long) data->pool_guid,
1810  		   (unsigned long long) data->pool_guid,
1811  		   (unsigned long long) diskguid,
1812  		   (unsigned long long) diskguid,
1813  		   (unsigned long long) data->label_txg,
1814  		   (unsigned long long) version,
1815  		   (unsigned long long) data->vdev_ashift);
1816  
1817  	return ZFS_ERR_NONE;
1818  }
1819  
1820  /*
1821   * vdev_label_start returns the physical disk offset (in bytes) of
1822   * label "l".
1823   */
1824  static uint64_t vdev_label_start(uint64_t psize, int l)
1825  {
1826  	return (l * sizeof(vdev_label_t) + (l < VDEV_LABELS / 2 ?
1827  										0 : psize -
1828  										VDEV_LABELS * sizeof(vdev_label_t)));
1829  }
1830  
1831  void
1832  zfs_unmount(struct zfs_data *data)
1833  {
1834  	free(data->dnode_buf);
1835  	free(data->dnode_mdn);
1836  	free(data->file_buf);
1837  	free(data);
1838  }
1839  
1840  /*
1841   * zfs_mount() locates a valid uberblock of the root pool and read in its MOS
1842   * to the memory address MOS.
1843   *
1844   */
1845  struct zfs_data *
1846  zfs_mount(device_t dev)
1847  {
1848  	struct zfs_data *data = 0;
1849  	int label = 0, bestlabel = -1;
1850  	char *ub_array;
1851  	uberblock_t *ubbest;
1852  	uberblock_t *ubcur = NULL;
1853  	void *osp = 0;
1854  	size_t ospsize;
1855  	int err;
1856  
1857  	data = malloc(sizeof(*data));
1858  	if (!data)
1859  		return 0;
1860  	memset(data, 0, sizeof(*data));
1861  
1862  	ub_array = malloc(VDEV_UBERBLOCK_RING);
1863  	if (!ub_array) {
1864  		zfs_unmount(data);
1865  		return 0;
1866  	}
1867  
1868  	ubbest = malloc(sizeof(*ubbest));
1869  	if (!ubbest) {
1870  		free(ub_array);
1871  		zfs_unmount(data);
1872  		return 0;
1873  	}
1874  	memset(ubbest, 0, sizeof(*ubbest));
1875  
1876  	/*
1877  	 * some eltorito stacks don't give us a size and
1878  	 * we end up setting the size to MAXUINT, further
1879  	 * some of these devices stop working once a single
1880  	 * read past the end has been issued. Checking
1881  	 * for a maximum part_length and skipping the backup
1882  	 * labels at the end of the slice/partition/device
1883  	 * avoids breaking down on such devices.
1884  	 */
1885  	const int vdevnum =
1886  		dev->part_length == 0 ?
1887  		VDEV_LABELS / 2 : VDEV_LABELS;
1888  
1889  	/* Size in bytes of the device (disk or partition) aligned to label size*/
1890  	uint64_t device_size =
1891  		dev->part_length << SECTOR_BITS;
1892  
1893  	const uint64_t alignedbytes =
1894  		P2ALIGN(device_size, (uint64_t) sizeof(vdev_label_t));
1895  
1896  	for (label = 0; label < vdevnum; label++) {
1897  		uint64_t labelstartbytes = vdev_label_start(alignedbytes, label);
1898  		uint64_t labelstart = labelstartbytes >> SECTOR_BITS;
1899  
1900  		debug("zfs reading label %d at sector %llu (byte %llu)\n",
1901  			  label, (unsigned long long) labelstart,
1902  			  (unsigned long long) labelstartbytes);
1903  
1904  		data->vdev_phys_sector = labelstart +
1905  			((VDEV_SKIP_SIZE + VDEV_BOOT_HEADER_SIZE) >> SECTOR_BITS);
1906  
1907  		err = check_pool_label(data);
1908  		if (err) {
1909  			printf("zfs error checking label %d\n", label);
1910  			continue;
1911  		}
1912  
1913  		/* Read in the uberblock ring (128K). */
1914  		err = zfs_devread(data->vdev_phys_sector  +
1915  						  (VDEV_PHYS_SIZE >> SECTOR_BITS),
1916  						  0, VDEV_UBERBLOCK_RING, ub_array);
1917  		if (err) {
1918  			printf("zfs error reading uberblock ring for label %d\n", label);
1919  			continue;
1920  		}
1921  
1922  		ubcur = find_bestub(ub_array, data);
1923  		if (!ubcur) {
1924  			printf("zfs No good uberblocks found in label %d\n", label);
1925  			continue;
1926  		}
1927  
1928  		if (vdev_uberblock_compare(ubcur, ubbest) > 0) {
1929  			/* Looks like the block is good, so use it.*/
1930  			memcpy(ubbest, ubcur, sizeof(*ubbest));
1931  			bestlabel = label;
1932  			debug("zfs Current best uberblock found in label %d\n", label);
1933  		}
1934  	}
1935  	free(ub_array);
1936  
1937  	/* We zero'd the structure to begin with.  If we never assigned to it,
1938  	   magic will still be zero. */
1939  	if (!ubbest->ub_magic) {
1940  		printf("couldn't find a valid ZFS label\n");
1941  		zfs_unmount(data);
1942  		free(ubbest);
1943  		return 0;
1944  	}
1945  
1946  	debug("zfs ubbest %p in label %d\n", ubbest, bestlabel);
1947  
1948  	zfs_endian_t ub_endian =
1949  		zfs_to_cpu64(ubbest->ub_magic, LITTLE_ENDIAN) == UBERBLOCK_MAGIC
1950  		? LITTLE_ENDIAN : BIG_ENDIAN;
1951  
1952  	debug("zfs endian set to %s\n", !ub_endian ? "big" : "little");
1953  
1954  	err = zio_read(&ubbest->ub_rootbp, ub_endian, &osp, &ospsize, data);
1955  
1956  	if (err) {
1957  		printf("couldn't zio_read object directory\n");
1958  		zfs_unmount(data);
1959  		free(osp);
1960  		free(ubbest);
1961  		return 0;
1962  	}
1963  
1964  	if (ospsize < OBJSET_PHYS_SIZE_V14) {
1965  		printf("osp too small\n");
1966  		zfs_unmount(data);
1967  		free(osp);
1968  		free(ubbest);
1969  		return 0;
1970  	}
1971  
1972  	/* Got the MOS. Save it at the memory addr MOS. */
1973  	memmove(&(data->mos.dn), &((objset_phys_t *) osp)->os_meta_dnode, DNODE_SIZE);
1974  	data->mos.endian =
1975  		(zfs_to_cpu64(ubbest->ub_rootbp.blk_prop, ub_endian) >> 63) & 1;
1976  	memmove(&(data->current_uberblock), ubbest, sizeof(uberblock_t));
1977  
1978  	free(osp);
1979  	free(ubbest);
1980  
1981  	return data;
1982  }
1983  
1984  int
1985  zfs_fetch_nvlist(device_t dev, char **nvlist)
1986  {
1987  	struct zfs_data *zfs;
1988  	int err;
1989  
1990  	zfs = zfs_mount(dev);
1991  	if (!zfs)
1992  		return ZFS_ERR_BAD_FS;
1993  	err = int_zfs_fetch_nvlist(zfs, nvlist);
1994  	zfs_unmount(zfs);
1995  	return err;
1996  }
1997  
1998  /*
1999   * zfs_open() locates a file in the rootpool by following the
2000   * MOS and places the dnode of the file in the memory address DNODE.
2001   */
2002  int
2003  zfs_open(struct zfs_file *file, const char *fsfilename)
2004  {
2005  	struct zfs_data *data;
2006  	int err;
2007  	int isfs;
2008  
2009  	data = zfs_mount(file->device);
2010  	if (!data)
2011  		return ZFS_ERR_BAD_FS;
2012  
2013  	err = dnode_get_fullpath(fsfilename, &(data->mdn), 0,
2014  							 &(data->dnode), &isfs, data);
2015  	if (err) {
2016  		zfs_unmount(data);
2017  		return err;
2018  	}
2019  
2020  	if (isfs) {
2021  		zfs_unmount(data);
2022  		printf("Missing @ or / separator\n");
2023  		return ZFS_ERR_FILE_NOT_FOUND;
2024  	}
2025  
2026  	/* We found the dnode for this file. Verify if it is a plain file. */
2027  	if (data->dnode.dn.dn_type != DMU_OT_PLAIN_FILE_CONTENTS) {
2028  		zfs_unmount(data);
2029  		printf("not a file\n");
2030  		return ZFS_ERR_BAD_FILE_TYPE;
2031  	}
2032  
2033  	/* get the file size and set the file position to 0 */
2034  
2035  	/*
2036  	 * For DMU_OT_SA we will need to locate the SIZE attribute
2037  	 * attribute, which could be either in the bonus buffer
2038  	 * or the "spill" block.
2039  	 */
2040  	if (data->dnode.dn.dn_bonustype == DMU_OT_SA) {
2041  		void *sahdrp;
2042  		int hdrsize;
2043  
2044  		if (data->dnode.dn.dn_bonuslen != 0) {
2045  			sahdrp = (sa_hdr_phys_t *) DN_BONUS(&data->dnode.dn);
2046  		} else if (data->dnode.dn.dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
2047  			blkptr_t *bp = &data->dnode.dn.dn_spill;
2048  
2049  			err = zio_read(bp, data->dnode.endian, &sahdrp, NULL, data);
2050  			if (err)
2051  				return err;
2052  		} else {
2053  			printf("filesystem is corrupt :(\n");
2054  			return ZFS_ERR_BAD_FS;
2055  		}
2056  
2057  		hdrsize = SA_HDR_SIZE(((sa_hdr_phys_t *) sahdrp));
2058  		file->size = *(uint64_t *) ((char *) sahdrp + hdrsize + SA_SIZE_OFFSET);
2059  		if ((data->dnode.dn.dn_bonuslen == 0) &&
2060  			(data->dnode.dn.dn_flags & DNODE_FLAG_SPILL_BLKPTR))
2061  			free(sahdrp);
2062  	} else {
2063  		file->size = zfs_to_cpu64(((znode_phys_t *) DN_BONUS(&data->dnode.dn))->zp_size, data->dnode.endian);
2064  	}
2065  
2066  	file->data = data;
2067  	file->offset = 0;
2068  
2069  	return ZFS_ERR_NONE;
2070  }
2071  
2072  uint64_t
2073  zfs_read(zfs_file_t file, char *buf, uint64_t len)
2074  {
2075  	struct zfs_data *data = (struct zfs_data *) file->data;
2076  	int blksz, movesize;
2077  	uint64_t length;
2078  	int64_t red;
2079  	int err;
2080  
2081  	if (data->file_buf == NULL) {
2082  		data->file_buf = malloc(SPA_MAXBLOCKSIZE);
2083  		if (!data->file_buf)
2084  			return -1;
2085  		data->file_start = data->file_end = 0;
2086  	}
2087  
2088  	/*
2089  	 * If offset is in memory, move it into the buffer provided and return.
2090  	 */
2091  	if (file->offset >= data->file_start
2092  		&& file->offset + len <= data->file_end) {
2093  		memmove(buf, data->file_buf + file->offset - data->file_start,
2094  				len);
2095  		return len;
2096  	}
2097  
2098  	blksz = zfs_to_cpu16(data->dnode.dn.dn_datablkszsec,
2099  							  data->dnode.endian) << SPA_MINBLOCKSHIFT;
2100  
2101  	/*
2102  	 * Entire Dnode is too big to fit into the space available.	 We
2103  	 * will need to read it in chunks.	This could be optimized to
2104  	 * read in as large a chunk as there is space available, but for
2105  	 * now, this only reads in one data block at a time.
2106  	 */
2107  	length = len;
2108  	red = 0;
2109  	while (length) {
2110  		void *t;
2111  		/*
2112  		 * Find requested blkid and the offset within that block.
2113  		 */
2114  		uint64_t blkid = file->offset + red;
2115  		blkid = do_div(blkid, blksz);
2116  		free(data->file_buf);
2117  		data->file_buf = 0;
2118  
2119  		err = dmu_read(&(data->dnode), blkid, &t,
2120  					   0, data);
2121  		data->file_buf = t;
2122  		if (err)
2123  			return -1;
2124  
2125  		data->file_start = blkid * blksz;
2126  		data->file_end = data->file_start + blksz;
2127  
2128  		movesize = min(length, data->file_end - (int)file->offset - red);
2129  
2130  		memmove(buf, data->file_buf + file->offset + red
2131  				- data->file_start, movesize);
2132  		buf += movesize;
2133  		length -= movesize;
2134  		red += movesize;
2135  	}
2136  
2137  	return len;
2138  }
2139  
2140  int
2141  zfs_close(zfs_file_t file)
2142  {
2143  	zfs_unmount((struct zfs_data *) file->data);
2144  	return ZFS_ERR_NONE;
2145  }
2146  
2147  int
2148  zfs_getmdnobj(device_t dev, const char *fsfilename,
2149  				   uint64_t *mdnobj)
2150  {
2151  	struct zfs_data *data;
2152  	int err;
2153  	int isfs;
2154  
2155  	data = zfs_mount(dev);
2156  	if (!data)
2157  		return ZFS_ERR_BAD_FS;
2158  
2159  	err = dnode_get_fullpath(fsfilename, &(data->mdn), mdnobj,
2160  							 &(data->dnode), &isfs, data);
2161  	zfs_unmount(data);
2162  	return err;
2163  }
2164  
2165  static void
2166  fill_fs_info(struct zfs_dirhook_info *info,
2167  			 dnode_end_t mdn, struct zfs_data *data)
2168  {
2169  	int err;
2170  	dnode_end_t dn;
2171  	uint64_t objnum;
2172  	uint64_t headobj;
2173  
2174  	memset(info, 0, sizeof(*info));
2175  
2176  	info->dir = 1;
2177  
2178  	if (mdn.dn.dn_type == DMU_OT_DSL_DIR) {
2179  		headobj = zfs_to_cpu64(((dsl_dir_phys_t *) DN_BONUS(&mdn.dn))->dd_head_dataset_obj, mdn.endian);
2180  
2181  		err = dnode_get(&(data->mos), headobj, DMU_OT_DSL_DATASET, &mdn, data);
2182  		if (err) {
2183  			printf("zfs failed here 1\n");
2184  			return;
2185  		}
2186  	}
2187  	make_mdn(&mdn, data);
2188  	err = dnode_get(&mdn, MASTER_NODE_OBJ, DMU_OT_MASTER_NODE,
2189  					&dn, data);
2190  	if (err) {
2191  		printf("zfs failed here 2\n");
2192  		return;
2193  	}
2194  
2195  	err = zap_lookup(&dn, ZFS_ROOT_OBJ, &objnum, data);
2196  	if (err) {
2197  		printf("zfs failed here 3\n");
2198  		return;
2199  	}
2200  
2201  	err = dnode_get(&mdn, objnum, 0, &dn, data);
2202  	if (err) {
2203  		printf("zfs failed here 4\n");
2204  		return;
2205  	}
2206  
2207  	info->mtimeset = 1;
2208  	info->mtime = zfs_to_cpu64(((znode_phys_t *) DN_BONUS(&dn.dn))->zp_mtime[0], dn.endian);
2209  
2210  	return;
2211  }
2212  
2213  static int iterate_zap(const char *name, uint64_t val, struct zfs_data *data)
2214  {
2215  	struct zfs_dirhook_info info;
2216  	dnode_end_t dn;
2217  
2218  	memset(&info, 0, sizeof(info));
2219  
2220  	dnode_get(&(data->mdn), val, 0, &dn, data);
2221  	info.mtimeset = 1;
2222  	info.mtime = zfs_to_cpu64(((znode_phys_t *) DN_BONUS(&dn.dn))->zp_mtime[0], dn.endian);
2223  	info.dir = (dn.dn.dn_type == DMU_OT_DIRECTORY_CONTENTS);
2224  	debug("zfs type=%d, name=%s\n",
2225  		  (int)dn.dn.dn_type, (char *)name);
2226  	if (!data->userhook)
2227  		return 0;
2228  	return data->userhook(name, &info);
2229  }
2230  
2231  static int iterate_zap_fs(const char *name, uint64_t val, struct zfs_data *data)
2232  {
2233  	struct zfs_dirhook_info info;
2234  	dnode_end_t mdn;
2235  	int err;
2236  	err = dnode_get(&(data->mos), val, 0, &mdn, data);
2237  	if (err)
2238  		return 0;
2239  	if (mdn.dn.dn_type != DMU_OT_DSL_DIR)
2240  		return 0;
2241  
2242  	fill_fs_info(&info, mdn, data);
2243  
2244  	if (!data->userhook)
2245  		return 0;
2246  	return data->userhook(name, &info);
2247  }
2248  
2249  static int iterate_zap_snap(const char *name, uint64_t val, struct zfs_data *data)
2250  {
2251  	struct zfs_dirhook_info info;
2252  	char *name2;
2253  	int ret = 0;
2254  	dnode_end_t mdn;
2255  	int err;
2256  
2257  	err = dnode_get(&(data->mos), val, 0, &mdn, data);
2258  	if (err)
2259  		return 0;
2260  
2261  	if (mdn.dn.dn_type != DMU_OT_DSL_DATASET)
2262  		return 0;
2263  
2264  	fill_fs_info(&info, mdn, data);
2265  
2266  	name2 = malloc(strlen(name) + 2);
2267  	name2[0] = '@';
2268  	memcpy(name2 + 1, name, strlen(name) + 1);
2269  	if (data->userhook)
2270  		ret = data->userhook(name2, &info);
2271  	free(name2);
2272  	return ret;
2273  }
2274  
2275  int
2276  zfs_ls(device_t device, const char *path,
2277  	   int (*hook)(const char *, const struct zfs_dirhook_info *))
2278  {
2279  	struct zfs_data *data;
2280  	int err;
2281  	int isfs;
2282  
2283  	data = zfs_mount(device);
2284  	if (!data)
2285  		return ZFS_ERR_BAD_FS;
2286  
2287  	data->userhook = hook;
2288  
2289  	err = dnode_get_fullpath(path, &(data->mdn), 0, &(data->dnode), &isfs, data);
2290  	if (err) {
2291  		zfs_unmount(data);
2292  		return err;
2293  	}
2294  	if (isfs) {
2295  		uint64_t childobj, headobj;
2296  		uint64_t snapobj;
2297  		dnode_end_t dn;
2298  		struct zfs_dirhook_info info;
2299  
2300  		fill_fs_info(&info, data->dnode, data);
2301  		hook("@", &info);
2302  
2303  		childobj = zfs_to_cpu64(((dsl_dir_phys_t *) DN_BONUS(&data->dnode.dn))->dd_child_dir_zapobj, data->dnode.endian);
2304  		headobj = zfs_to_cpu64(((dsl_dir_phys_t *) DN_BONUS(&data->dnode.dn))->dd_head_dataset_obj, data->dnode.endian);
2305  		err = dnode_get(&(data->mos), childobj,
2306  						DMU_OT_DSL_DIR_CHILD_MAP, &dn, data);
2307  		if (err) {
2308  			zfs_unmount(data);
2309  			return err;
2310  		}
2311  
2312  
2313  		zap_iterate(&dn, iterate_zap_fs, data);
2314  
2315  		err = dnode_get(&(data->mos), headobj, DMU_OT_DSL_DATASET, &dn, data);
2316  		if (err) {
2317  			zfs_unmount(data);
2318  			return err;
2319  		}
2320  
2321  		snapobj = zfs_to_cpu64(((dsl_dataset_phys_t *) DN_BONUS(&dn.dn))->ds_snapnames_zapobj, dn.endian);
2322  
2323  		err = dnode_get(&(data->mos), snapobj,
2324  						DMU_OT_DSL_DS_SNAP_MAP, &dn, data);
2325  		if (err) {
2326  			zfs_unmount(data);
2327  			return err;
2328  		}
2329  
2330  		zap_iterate(&dn, iterate_zap_snap, data);
2331  	} else {
2332  		if (data->dnode.dn.dn_type != DMU_OT_DIRECTORY_CONTENTS) {
2333  			zfs_unmount(data);
2334  			printf("not a directory\n");
2335  			return ZFS_ERR_BAD_FILE_TYPE;
2336  		}
2337  		zap_iterate(&(data->dnode), iterate_zap, data);
2338  	}
2339  	zfs_unmount(data);
2340  	return ZFS_ERR_NONE;
2341  }
2342