xref: /openbmc/linux/drivers/md/dm-switch.c (revision b9b77222)
1 /*
2  * Copyright (C) 2010-2012 by Dell Inc.  All rights reserved.
3  * Copyright (C) 2011-2013 Red Hat, Inc.
4  *
5  * This file is released under the GPL.
6  *
7  * dm-switch is a device-mapper target that maps IO to underlying block
8  * devices efficiently when there are a large number of fixed-sized
9  * address regions but there is no simple pattern to allow for a compact
10  * mapping representation such as dm-stripe.
11  */
12 
13 #include <linux/device-mapper.h>
14 
15 #include <linux/module.h>
16 #include <linux/init.h>
17 #include <linux/vmalloc.h>
18 
19 #define DM_MSG_PREFIX "switch"
20 
/*
 * Storage unit of the region table.  A single region_table_slot_t packs
 * <region_entries_per_slot> region table entries, each of which occupies
 * <region_table_entry_bits> bits.
 */
typedef unsigned long region_table_slot_t;
26 
27 /*
28  * A device with the offset to its start sector.
29  */
30 struct switch_path {
31 	struct dm_dev *dmdev;
32 	sector_t start;
33 };
34 
35 /*
36  * Context block for a dm switch device.
37  */
38 struct switch_ctx {
39 	struct dm_target *ti;
40 
41 	unsigned nr_paths;		/* Number of paths in path_list. */
42 
43 	unsigned region_size;		/* Region size in 512-byte sectors */
44 	unsigned long nr_regions;	/* Number of regions making up the device */
45 	signed char region_size_bits;	/* log2 of region_size or -1 */
46 
47 	unsigned char region_table_entry_bits;	/* Number of bits in one region table entry */
48 	unsigned char region_entries_per_slot;	/* Number of entries in one region table slot */
49 	signed char region_entries_per_slot_bits;	/* log2 of region_entries_per_slot or -1 */
50 
51 	region_table_slot_t *region_table;	/* Region table */
52 
53 	/*
54 	 * Array of dm devices to switch between.
55 	 */
56 	struct switch_path path_list[0];
57 };
58 
59 static struct switch_ctx *alloc_switch_ctx(struct dm_target *ti, unsigned nr_paths,
60 					   unsigned region_size)
61 {
62 	struct switch_ctx *sctx;
63 
64 	sctx = kzalloc(sizeof(struct switch_ctx) + nr_paths * sizeof(struct switch_path),
65 		       GFP_KERNEL);
66 	if (!sctx)
67 		return NULL;
68 
69 	sctx->ti = ti;
70 	sctx->region_size = region_size;
71 
72 	ti->private = sctx;
73 
74 	return sctx;
75 }
76 
77 static int alloc_region_table(struct dm_target *ti, unsigned nr_paths)
78 {
79 	struct switch_ctx *sctx = ti->private;
80 	sector_t nr_regions = ti->len;
81 	sector_t nr_slots;
82 
83 	if (!(sctx->region_size & (sctx->region_size - 1)))
84 		sctx->region_size_bits = __ffs(sctx->region_size);
85 	else
86 		sctx->region_size_bits = -1;
87 
88 	sctx->region_table_entry_bits = 1;
89 	while (sctx->region_table_entry_bits < sizeof(region_table_slot_t) * 8 &&
90 	       (region_table_slot_t)1 << sctx->region_table_entry_bits < nr_paths)
91 		sctx->region_table_entry_bits++;
92 
93 	sctx->region_entries_per_slot = (sizeof(region_table_slot_t) * 8) / sctx->region_table_entry_bits;
94 	if (!(sctx->region_entries_per_slot & (sctx->region_entries_per_slot - 1)))
95 		sctx->region_entries_per_slot_bits = __ffs(sctx->region_entries_per_slot);
96 	else
97 		sctx->region_entries_per_slot_bits = -1;
98 
99 	if (sector_div(nr_regions, sctx->region_size))
100 		nr_regions++;
101 
102 	if (nr_regions >= ULONG_MAX) {
103 		ti->error = "Region table too large";
104 		return -EINVAL;
105 	}
106 	sctx->nr_regions = nr_regions;
107 
108 	nr_slots = nr_regions;
109 	if (sector_div(nr_slots, sctx->region_entries_per_slot))
110 		nr_slots++;
111 
112 	if (nr_slots > ULONG_MAX / sizeof(region_table_slot_t)) {
113 		ti->error = "Region table too large";
114 		return -EINVAL;
115 	}
116 
117 	sctx->region_table = vmalloc(array_size(nr_slots,
118 						sizeof(region_table_slot_t)));
119 	if (!sctx->region_table) {
120 		ti->error = "Cannot allocate region table";
121 		return -ENOMEM;
122 	}
123 
124 	return 0;
125 }
126 
127 static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr,
128 				unsigned long *region_index, unsigned *bit)
129 {
130 	if (sctx->region_entries_per_slot_bits >= 0) {
131 		*region_index = region_nr >> sctx->region_entries_per_slot_bits;
132 		*bit = region_nr & (sctx->region_entries_per_slot - 1);
133 	} else {
134 		*region_index = region_nr / sctx->region_entries_per_slot;
135 		*bit = region_nr % sctx->region_entries_per_slot;
136 	}
137 
138 	*bit *= sctx->region_table_entry_bits;
139 }
140 
141 static unsigned switch_region_table_read(struct switch_ctx *sctx, unsigned long region_nr)
142 {
143 	unsigned long region_index;
144 	unsigned bit;
145 
146 	switch_get_position(sctx, region_nr, &region_index, &bit);
147 
148 	return (READ_ONCE(sctx->region_table[region_index]) >> bit) &
149 		((1 << sctx->region_table_entry_bits) - 1);
150 }
151 
152 /*
153  * Find which path to use at given offset.
154  */
155 static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset)
156 {
157 	unsigned path_nr;
158 	sector_t p;
159 
160 	p = offset;
161 	if (sctx->region_size_bits >= 0)
162 		p >>= sctx->region_size_bits;
163 	else
164 		sector_div(p, sctx->region_size);
165 
166 	path_nr = switch_region_table_read(sctx, p);
167 
168 	/* This can only happen if the processor uses non-atomic stores. */
169 	if (unlikely(path_nr >= sctx->nr_paths))
170 		path_nr = 0;
171 
172 	return path_nr;
173 }
174 
175 static void switch_region_table_write(struct switch_ctx *sctx, unsigned long region_nr,
176 				      unsigned value)
177 {
178 	unsigned long region_index;
179 	unsigned bit;
180 	region_table_slot_t pte;
181 
182 	switch_get_position(sctx, region_nr, &region_index, &bit);
183 
184 	pte = sctx->region_table[region_index];
185 	pte &= ~((((region_table_slot_t)1 << sctx->region_table_entry_bits) - 1) << bit);
186 	pte |= (region_table_slot_t)value << bit;
187 	sctx->region_table[region_index] = pte;
188 }
189 
190 /*
191  * Fill the region table with an initial round robin pattern.
192  */
193 static void initialise_region_table(struct switch_ctx *sctx)
194 {
195 	unsigned path_nr = 0;
196 	unsigned long region_nr;
197 
198 	for (region_nr = 0; region_nr < sctx->nr_regions; region_nr++) {
199 		switch_region_table_write(sctx, region_nr, path_nr);
200 		if (++path_nr >= sctx->nr_paths)
201 			path_nr = 0;
202 	}
203 }
204 
205 static int parse_path(struct dm_arg_set *as, struct dm_target *ti)
206 {
207 	struct switch_ctx *sctx = ti->private;
208 	unsigned long long start;
209 	int r;
210 
211 	r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
212 			  &sctx->path_list[sctx->nr_paths].dmdev);
213 	if (r) {
214 		ti->error = "Device lookup failed";
215 		return r;
216 	}
217 
218 	if (kstrtoull(dm_shift_arg(as), 10, &start) || start != (sector_t)start) {
219 		ti->error = "Invalid device starting offset";
220 		dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);
221 		return -EINVAL;
222 	}
223 
224 	sctx->path_list[sctx->nr_paths].start = start;
225 
226 	sctx->nr_paths++;
227 
228 	return 0;
229 }
230 
231 /*
232  * Destructor: Don't free the dm_target, just the ti->private data (if any).
233  */
234 static void switch_dtr(struct dm_target *ti)
235 {
236 	struct switch_ctx *sctx = ti->private;
237 
238 	while (sctx->nr_paths--)
239 		dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);
240 
241 	vfree(sctx->region_table);
242 	kfree(sctx);
243 }
244 
245 /*
246  * Constructor arguments:
247  *   <num_paths> <region_size> <num_optional_args> [<optional_args>...]
248  *   [<dev_path> <offset>]+
249  *
250  * Optional args are to allow for future extension: currently this
251  * parameter must be 0.
252  */
253 static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv)
254 {
255 	static const struct dm_arg _args[] = {
256 		{1, (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_path), "Invalid number of paths"},
257 		{1, UINT_MAX, "Invalid region size"},
258 		{0, 0, "Invalid number of optional args"},
259 	};
260 
261 	struct switch_ctx *sctx;
262 	struct dm_arg_set as;
263 	unsigned nr_paths, region_size, nr_optional_args;
264 	int r;
265 
266 	as.argc = argc;
267 	as.argv = argv;
268 
269 	r = dm_read_arg(_args, &as, &nr_paths, &ti->error);
270 	if (r)
271 		return -EINVAL;
272 
273 	r = dm_read_arg(_args + 1, &as, &region_size, &ti->error);
274 	if (r)
275 		return r;
276 
277 	r = dm_read_arg_group(_args + 2, &as, &nr_optional_args, &ti->error);
278 	if (r)
279 		return r;
280 	/* parse optional arguments here, if we add any */
281 
282 	if (as.argc != nr_paths * 2) {
283 		ti->error = "Incorrect number of path arguments";
284 		return -EINVAL;
285 	}
286 
287 	sctx = alloc_switch_ctx(ti, nr_paths, region_size);
288 	if (!sctx) {
289 		ti->error = "Cannot allocate redirection context";
290 		return -ENOMEM;
291 	}
292 
293 	r = dm_set_target_max_io_len(ti, region_size);
294 	if (r)
295 		goto error;
296 
297 	while (as.argc) {
298 		r = parse_path(&as, ti);
299 		if (r)
300 			goto error;
301 	}
302 
303 	r = alloc_region_table(ti, nr_paths);
304 	if (r)
305 		goto error;
306 
307 	initialise_region_table(sctx);
308 
309 	/* For UNMAP, sending the request down any path is sufficient */
310 	ti->num_discard_bios = 1;
311 
312 	return 0;
313 
314 error:
315 	switch_dtr(ti);
316 
317 	return r;
318 }
319 
320 static int switch_map(struct dm_target *ti, struct bio *bio)
321 {
322 	struct switch_ctx *sctx = ti->private;
323 	sector_t offset = dm_target_offset(ti, bio->bi_iter.bi_sector);
324 	unsigned path_nr = switch_get_path_nr(sctx, offset);
325 
326 	bio_set_dev(bio, sctx->path_list[path_nr].dmdev->bdev);
327 	bio->bi_iter.bi_sector = sctx->path_list[path_nr].start + offset;
328 
329 	return DM_MAPIO_REMAPPED;
330 }
331 
/*
 * We need to parse hex numbers in the message as quickly as possible.
 *
 * This table-based hex parser improves performance.
 * It reduces the time needed to load 1000000 entries compared to a
 * condition-based parser:
 *		table-based parser	condition-based parser
 * PA-RISC	0.29s			0.31s
 * Opteron	0.0495s			0.0498s
 *
 * The table is indexed by character code.  Hex digits ('0'-'9', 'A'-'F',
 * 'a'-'f') map to their value 0-15; every other character maps to 0xff.
 */
static const unsigned char hex_table[256] = {
	/* 0x00 */
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	/* 0x10 */
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	/* 0x20 */
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	/* 0x30: '0'-'9' */
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	/* 0x40: 'A'-'F' */
	0xff, 10, 11, 12, 13, 14, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	/* 0x50 */
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	/* 0x60: 'a'-'f' */
	0xff, 10, 11, 12, 13, 14, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	/* 0x70 */
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	/* 0x80 */
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	/* 0x90 */
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	/* 0xa0 */
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	/* 0xb0 */
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	/* 0xc0 */
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	/* 0xd0 */
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	/* 0xe0 */
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	/* 0xf0 */
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
360 
361 static __always_inline unsigned long parse_hex(const char **string)
362 {
363 	unsigned char d;
364 	unsigned long r = 0;
365 
366 	while ((d = hex_table[(unsigned char)**string]) < 16) {
367 		r = (r << 4) | d;
368 		(*string)++;
369 	}
370 
371 	return r;
372 }
373 
374 static int process_set_region_mappings(struct switch_ctx *sctx,
375 				       unsigned argc, char **argv)
376 {
377 	unsigned i;
378 	unsigned long region_index = 0;
379 
380 	for (i = 1; i < argc; i++) {
381 		unsigned long path_nr;
382 		const char *string = argv[i];
383 
384 		if ((*string & 0xdf) == 'R') {
385 			unsigned long cycle_length, num_write;
386 
387 			string++;
388 			if (unlikely(*string == ',')) {
389 				DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
390 				return -EINVAL;
391 			}
392 			cycle_length = parse_hex(&string);
393 			if (unlikely(*string != ',')) {
394 				DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
395 				return -EINVAL;
396 			}
397 			string++;
398 			if (unlikely(!*string)) {
399 				DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
400 				return -EINVAL;
401 			}
402 			num_write = parse_hex(&string);
403 			if (unlikely(*string)) {
404 				DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
405 				return -EINVAL;
406 			}
407 
408 			if (unlikely(!cycle_length) || unlikely(cycle_length - 1 > region_index)) {
409 				DMWARN("invalid set_region_mappings cycle length: %lu > %lu",
410 				       cycle_length - 1, region_index);
411 				return -EINVAL;
412 			}
413 			if (unlikely(region_index + num_write < region_index) ||
414 			    unlikely(region_index + num_write >= sctx->nr_regions)) {
415 				DMWARN("invalid set_region_mappings region number: %lu + %lu >= %lu",
416 				       region_index, num_write, sctx->nr_regions);
417 				return -EINVAL;
418 			}
419 
420 			while (num_write--) {
421 				region_index++;
422 				path_nr = switch_region_table_read(sctx, region_index - cycle_length);
423 				switch_region_table_write(sctx, region_index, path_nr);
424 			}
425 
426 			continue;
427 		}
428 
429 		if (*string == ':')
430 			region_index++;
431 		else {
432 			region_index = parse_hex(&string);
433 			if (unlikely(*string != ':')) {
434 				DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
435 				return -EINVAL;
436 			}
437 		}
438 
439 		string++;
440 		if (unlikely(!*string)) {
441 			DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
442 			return -EINVAL;
443 		}
444 
445 		path_nr = parse_hex(&string);
446 		if (unlikely(*string)) {
447 			DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
448 			return -EINVAL;
449 		}
450 		if (unlikely(region_index >= sctx->nr_regions)) {
451 			DMWARN("invalid set_region_mappings region number: %lu >= %lu", region_index, sctx->nr_regions);
452 			return -EINVAL;
453 		}
454 		if (unlikely(path_nr >= sctx->nr_paths)) {
455 			DMWARN("invalid set_region_mappings device: %lu >= %u", path_nr, sctx->nr_paths);
456 			return -EINVAL;
457 		}
458 
459 		switch_region_table_write(sctx, region_index, path_nr);
460 	}
461 
462 	return 0;
463 }
464 
465 /*
466  * Messages are processed one-at-a-time.
467  *
468  * Only set_region_mappings is supported.
469  */
470 static int switch_message(struct dm_target *ti, unsigned argc, char **argv,
471 			  char *result, unsigned maxlen)
472 {
473 	static DEFINE_MUTEX(message_mutex);
474 
475 	struct switch_ctx *sctx = ti->private;
476 	int r = -EINVAL;
477 
478 	mutex_lock(&message_mutex);
479 
480 	if (!strcasecmp(argv[0], "set_region_mappings"))
481 		r = process_set_region_mappings(sctx, argc, argv);
482 	else
483 		DMWARN("Unrecognised message received.");
484 
485 	mutex_unlock(&message_mutex);
486 
487 	return r;
488 }
489 
490 static void switch_status(struct dm_target *ti, status_type_t type,
491 			  unsigned status_flags, char *result, unsigned maxlen)
492 {
493 	struct switch_ctx *sctx = ti->private;
494 	unsigned sz = 0;
495 	int path_nr;
496 
497 	switch (type) {
498 	case STATUSTYPE_INFO:
499 		result[0] = '\0';
500 		break;
501 
502 	case STATUSTYPE_TABLE:
503 		DMEMIT("%u %u 0", sctx->nr_paths, sctx->region_size);
504 		for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++)
505 			DMEMIT(" %s %llu", sctx->path_list[path_nr].dmdev->name,
506 			       (unsigned long long)sctx->path_list[path_nr].start);
507 		break;
508 	}
509 }
510 
511 /*
512  * Switch ioctl:
513  *
514  * Passthrough all ioctls to the path for sector 0
515  */
516 static int switch_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
517 {
518 	struct switch_ctx *sctx = ti->private;
519 	unsigned path_nr;
520 
521 	path_nr = switch_get_path_nr(sctx, 0);
522 
523 	*bdev = sctx->path_list[path_nr].dmdev->bdev;
524 
525 	/*
526 	 * Only pass ioctls through if the device sizes match exactly.
527 	 */
528 	if (ti->len + sctx->path_list[path_nr].start !=
529 	    i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT)
530 		return 1;
531 	return 0;
532 }
533 
534 static int switch_iterate_devices(struct dm_target *ti,
535 				  iterate_devices_callout_fn fn, void *data)
536 {
537 	struct switch_ctx *sctx = ti->private;
538 	int path_nr;
539 	int r;
540 
541 	for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++) {
542 		r = fn(ti, sctx->path_list[path_nr].dmdev,
543 			 sctx->path_list[path_nr].start, ti->len, data);
544 		if (r)
545 			return r;
546 	}
547 
548 	return 0;
549 }
550 
551 static struct target_type switch_target = {
552 	.name = "switch",
553 	.version = {1, 1, 0},
554 	.module = THIS_MODULE,
555 	.ctr = switch_ctr,
556 	.dtr = switch_dtr,
557 	.map = switch_map,
558 	.message = switch_message,
559 	.status = switch_status,
560 	.prepare_ioctl = switch_prepare_ioctl,
561 	.iterate_devices = switch_iterate_devices,
562 };
563 
564 static int __init dm_switch_init(void)
565 {
566 	int r;
567 
568 	r = dm_register_target(&switch_target);
569 	if (r < 0)
570 		DMERR("dm_register_target() failed %d", r);
571 
572 	return r;
573 }
574 
575 static void __exit dm_switch_exit(void)
576 {
577 	dm_unregister_target(&switch_target);
578 }
579 
580 module_init(dm_switch_init);
581 module_exit(dm_switch_exit);
582 
583 MODULE_DESCRIPTION(DM_NAME " dynamic path switching target");
584 MODULE_AUTHOR("Kevin D. O'Kelley <Kevin_OKelley@dell.com>");
585 MODULE_AUTHOR("Narendran Ganapathy <Narendran_Ganapathy@dell.com>");
586 MODULE_AUTHOR("Jim Ramsay <Jim_Ramsay@dell.com>");
587 MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
588 MODULE_LICENSE("GPL");
589