xref: /openbmc/linux/kernel/trace/blktrace.c (revision b04b4f78)
1 /*
2  * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
3  *
4  * This program is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License version 2 as
6  * published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11  * GNU General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
16  *
17  */
18 #include <linux/kernel.h>
19 #include <linux/blkdev.h>
20 #include <linux/blktrace_api.h>
21 #include <linux/percpu.h>
22 #include <linux/init.h>
23 #include <linux/mutex.h>
24 #include <linux/debugfs.h>
25 #include <linux/time.h>
26 #include <trace/block.h>
27 #include <linux/uaccess.h>
28 #include "trace_output.h"
29 
30 static unsigned int blktrace_seq __read_mostly = 1;
31 
32 static struct trace_array *blk_tr;
33 static bool blk_tracer_enabled __read_mostly;
34 
35 /* Select an alternative, minimalistic output instead of the original one */
36 #define TRACE_BLK_OPT_CLASSIC	0x1
37 
38 static struct tracer_opt blk_tracer_opts[] = {
39 	/* The minimalistic (classic) output is disabled by default */
40 	{ TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) },
41 	{ }
42 };
43 
44 static struct tracer_flags blk_tracer_flags = {
45 	.val  = 0,
46 	.opts = blk_tracer_opts,
47 };
48 
49 /* Global reference count of probes */
50 static atomic_t blk_probes_ref = ATOMIC_INIT(0);
51 
52 static void blk_register_tracepoints(void);
53 static void blk_unregister_tracepoints(void);
54 
55 /*
56  * Send out a notify message.
57  */
58 static void trace_note(struct blk_trace *bt, pid_t pid, int action,
59 		       const void *data, size_t len)
60 {
61 	struct blk_io_trace *t;
62 	struct ring_buffer_event *event = NULL;
63 	int pc = 0;
64 	int cpu = smp_processor_id();
65 	bool blk_tracer = blk_tracer_enabled;
66 
67 	if (blk_tracer) {
68 		pc = preempt_count();
69 		event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK,
70 						  sizeof(*t) + len,
71 						  0, pc);
72 		if (!event)
73 			return;
74 		t = ring_buffer_event_data(event);
75 		goto record_it;
76 	}
77 
78 	if (!bt->rchan)
79 		return;
80 
81 	t = relay_reserve(bt->rchan, sizeof(*t) + len);
82 	if (t) {
83 		t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
84 		t->time = ktime_to_ns(ktime_get());
85 record_it:
86 		t->device = bt->dev;
87 		t->action = action;
88 		t->pid = pid;
89 		t->cpu = cpu;
90 		t->pdu_len = len;
91 		memcpy((void *) t + sizeof(*t), data, len);
92 
93 		if (blk_tracer)
94 			trace_buffer_unlock_commit(blk_tr, event, 0, pc);
95 	}
96 }
97 
98 /*
99  * Send out a notify for this process, if we haven't done so since a trace
100  * started
101  */
102 static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk)
103 {
104 	tsk->btrace_seq = blktrace_seq;
105 	trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm));
106 }
107 
108 static void trace_note_time(struct blk_trace *bt)
109 {
110 	struct timespec now;
111 	unsigned long flags;
112 	u32 words[2];
113 
114 	getnstimeofday(&now);
115 	words[0] = now.tv_sec;
116 	words[1] = now.tv_nsec;
117 
118 	local_irq_save(flags);
119 	trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words));
120 	local_irq_restore(flags);
121 }
122 
123 void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
124 {
125 	int n;
126 	va_list args;
127 	unsigned long flags;
128 	char *buf;
129 
130 	if (unlikely(bt->trace_state != Blktrace_running &&
131 		     !blk_tracer_enabled))
132 		return;
133 
134 	local_irq_save(flags);
135 	buf = per_cpu_ptr(bt->msg_data, smp_processor_id());
136 	va_start(args, fmt);
137 	n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
138 	va_end(args);
139 
140 	trace_note(bt, 0, BLK_TN_MESSAGE, buf, n);
141 	local_irq_restore(flags);
142 }
143 EXPORT_SYMBOL_GPL(__trace_note_message);
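/*
 * A minimal usage sketch, assuming a queue with tracing set up. Drivers
 * normally go through the blk_add_trace_msg() wrapper from blktrace_api.h,
 * which checks q->blk_trace before calling in here; the message text and
 * retry counter below are purely illustrative:
 *
 *	blk_add_trace_msg(q, "mydrv: resubmitting request, retry %d", retries);
 *
 * The message shows up as a BLK_TN_MESSAGE note in the trace stream, just
 * like a write to the per-device debugfs "msg" file.
 */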
144 
145 static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
146 			 pid_t pid)
147 {
148 	if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
149 		return 1;
150 	if (sector < bt->start_lba || sector > bt->end_lba)
151 		return 1;
152 	if (bt->pid && pid != bt->pid)
153 		return 1;
154 
155 	return 0;
156 }
157 
158 /*
159  * Data direction bit lookup
160  */
161 static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
162 				 BLK_TC_ACT(BLK_TC_WRITE) };
163 
164 /* The ilog2() calls fall out because they're constant */
165 #define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \
166 	  (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name))
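/*
 * Worked example of the mapping above: MASK_TC_BIT(rw, META) isolates the
 * BIO_RW_META bit from @rw and shifts it left by
 * (ilog2(BLK_TC_META) + BLK_TC_SHIFT - BIO_RW_META), which lands it exactly
 * on BLK_TC_ACT(BLK_TC_META) in the blktrace action word. The same holds
 * for the BARRIER, SYNCIO, AHEAD and DISCARD bits used in __blk_add_trace().
 */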
167 
168 /*
169  * The worker for the various blk_add_trace*() types. Fills out a
170  * blk_io_trace structure and places it in a per-cpu subbuffer.
171  */
172 static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
173 		     int rw, u32 what, int error, int pdu_len, void *pdu_data)
174 {
175 	struct task_struct *tsk = current;
176 	struct ring_buffer_event *event = NULL;
177 	struct blk_io_trace *t;
178 	unsigned long flags = 0;
179 	unsigned long *sequence;
180 	pid_t pid;
181 	int cpu, pc = 0;
182 	bool blk_tracer = blk_tracer_enabled;
183 
184 	if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
185 		return;
186 
187 	what |= ddir_act[rw & WRITE];
188 	what |= MASK_TC_BIT(rw, BARRIER);
189 	what |= MASK_TC_BIT(rw, SYNCIO);
190 	what |= MASK_TC_BIT(rw, AHEAD);
191 	what |= MASK_TC_BIT(rw, META);
192 	what |= MASK_TC_BIT(rw, DISCARD);
193 
194 	pid = tsk->pid;
195 	if (unlikely(act_log_check(bt, what, sector, pid)))
196 		return;
197 	cpu = raw_smp_processor_id();
198 
199 	if (blk_tracer) {
200 		tracing_record_cmdline(current);
201 
202 		pc = preempt_count();
203 		event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK,
204 						  sizeof(*t) + pdu_len,
205 						  0, pc);
206 		if (!event)
207 			return;
208 		t = ring_buffer_event_data(event);
209 		goto record_it;
210 	}
211 
212 	/*
213 	 * A word about the locking here - we disable interrupts to reserve
214 	 * some space in the relay per-cpu buffer, to prevent an irq
215 	 * from coming in and stepping on our toes.
216 	 */
217 	local_irq_save(flags);
218 
219 	if (unlikely(tsk->btrace_seq != blktrace_seq))
220 		trace_note_tsk(bt, tsk);
221 
222 	t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
223 	if (t) {
224 		sequence = per_cpu_ptr(bt->sequence, cpu);
225 
226 		t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
227 		t->sequence = ++(*sequence);
228 		t->time = ktime_to_ns(ktime_get());
229 record_it:
230 		/*
231 		 * These two fields are not needed by ftrace itself, since they
232 		 * already live in the generic trace_entry and are filled in by
233 		 * tracing_generic_entry_update(), but we set them here too for
234 		 * the benefit of the trace_event->binary() synthesizer.
235 		 */
236 		t->cpu = cpu;
237 		t->pid = pid;
238 
239 		t->sector = sector;
240 		t->bytes = bytes;
241 		t->action = what;
242 		t->device = bt->dev;
243 		t->error = error;
244 		t->pdu_len = pdu_len;
245 
246 		if (pdu_len)
247 			memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
248 
249 		if (blk_tracer) {
250 			trace_buffer_unlock_commit(blk_tr, event, 0, pc);
251 			return;
252 		}
253 	}
254 
255 	local_irq_restore(flags);
256 }
257 
258 static struct dentry *blk_tree_root;
259 static DEFINE_MUTEX(blk_tree_mutex);
260 
261 static void blk_trace_free(struct blk_trace *bt)
262 {
263 	debugfs_remove(bt->msg_file);
264 	debugfs_remove(bt->dropped_file);
265 	relay_close(bt->rchan);
266 	free_percpu(bt->sequence);
267 	free_percpu(bt->msg_data);
268 	kfree(bt);
269 }
270 
271 static void blk_trace_cleanup(struct blk_trace *bt)
272 {
273 	blk_trace_free(bt);
274 	if (atomic_dec_and_test(&blk_probes_ref))
275 		blk_unregister_tracepoints();
276 }
277 
278 int blk_trace_remove(struct request_queue *q)
279 {
280 	struct blk_trace *bt;
281 
282 	bt = xchg(&q->blk_trace, NULL);
283 	if (!bt)
284 		return -EINVAL;
285 
286 	if (bt->trace_state != Blktrace_running)
287 		blk_trace_cleanup(bt);
288 
289 	return 0;
290 }
291 EXPORT_SYMBOL_GPL(blk_trace_remove);
292 
293 static int blk_dropped_open(struct inode *inode, struct file *filp)
294 {
295 	filp->private_data = inode->i_private;
296 
297 	return 0;
298 }
299 
300 static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
301 				size_t count, loff_t *ppos)
302 {
303 	struct blk_trace *bt = filp->private_data;
304 	char buf[16];
305 
306 	snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped));
307 
308 	return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
309 }
310 
311 static const struct file_operations blk_dropped_fops = {
312 	.owner =	THIS_MODULE,
313 	.open =		blk_dropped_open,
314 	.read =		blk_dropped_read,
315 };
316 
317 static int blk_msg_open(struct inode *inode, struct file *filp)
318 {
319 	filp->private_data = inode->i_private;
320 
321 	return 0;
322 }
323 
324 static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
325 				size_t count, loff_t *ppos)
326 {
327 	char *msg;
328 	struct blk_trace *bt;
329 
330 	if (count >= BLK_TN_MAX_MSG)
331 		return -EINVAL;
332 
333 	msg = kmalloc(count + 1, GFP_KERNEL);
334 	if (msg == NULL)
335 		return -ENOMEM;
336 
337 	if (copy_from_user(msg, buffer, count)) {
338 		kfree(msg);
339 		return -EFAULT;
340 	}
341 
342 	msg[count] = '\0';
343 	bt = filp->private_data;
344 	__trace_note_message(bt, "%s", msg);
345 	kfree(msg);
346 
347 	return count;
348 }
349 
350 static const struct file_operations blk_msg_fops = {
351 	.owner =	THIS_MODULE,
352 	.open =		blk_msg_open,
353 	.write =	blk_msg_write,
354 };
355 
356 /*
357  * Keep track of how many times we encountered a full subbuffer, to aid
358  * the user space app in telling how many lost events there were.
359  */
360 static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
361 				     void *prev_subbuf, size_t prev_padding)
362 {
363 	struct blk_trace *bt;
364 
365 	if (!relay_buf_full(buf))
366 		return 1;
367 
368 	bt = buf->chan->private_data;
369 	atomic_inc(&bt->dropped);
370 	return 0;
371 }
372 
373 static int blk_remove_buf_file_callback(struct dentry *dentry)
374 {
375 	struct dentry *parent = dentry->d_parent;
376 	debugfs_remove(dentry);
377 
378 	/*
379 	 * This will fail for all but the last file, but that is OK. What we
380 	 * care about is the top-level buts->name directory going away when
381 	 * the last trace file is gone. Then we don't have to rmdir() it
382 	 * manually on trace stop, which nicely solves the issue with
383 	 * force-killing running traces.
384 	 */
385 
386 	debugfs_remove(parent);
387 	return 0;
388 }
389 
390 static struct dentry *blk_create_buf_file_callback(const char *filename,
391 						   struct dentry *parent,
392 						   int mode,
393 						   struct rchan_buf *buf,
394 						   int *is_global)
395 {
396 	return debugfs_create_file(filename, mode, parent, buf,
397 					&relay_file_operations);
398 }
399 
400 static struct rchan_callbacks blk_relay_callbacks = {
401 	.subbuf_start		= blk_subbuf_start_callback,
402 	.create_buf_file	= blk_create_buf_file_callback,
403 	.remove_buf_file	= blk_remove_buf_file_callback,
404 };
405 
406 /*
407  * Set up everything required to start tracing
408  */
409 int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
410 			struct blk_user_trace_setup *buts)
411 {
412 	struct blk_trace *old_bt, *bt = NULL;
413 	struct dentry *dir = NULL;
414 	int ret, i;
415 
416 	if (!buts->buf_size || !buts->buf_nr)
417 		return -EINVAL;
418 
419 	strncpy(buts->name, name, BLKTRACE_BDEV_SIZE);
420 	buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0';
421 
422 	/*
423 	 * Some device names contain slashes in their paths - convert them
424 	 * to underscores so the debugfs directory name works as expected.
425 	 */
426 	for (i = 0; i < strlen(buts->name); i++)
427 		if (buts->name[i] == '/')
428 			buts->name[i] = '_';
429 
430 	bt = kzalloc(sizeof(*bt), GFP_KERNEL);
431 	if (!bt)
432 		return -ENOMEM;
433 
434 	ret = -ENOMEM;
435 	bt->sequence = alloc_percpu(unsigned long);
436 	if (!bt->sequence)
437 		goto err;
438 
439 	bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char));
440 	if (!bt->msg_data)
441 		goto err;
442 
443 	ret = -ENOENT;
444 
445 	mutex_lock(&blk_tree_mutex);
446 	if (!blk_tree_root) {
447 		blk_tree_root = debugfs_create_dir("block", NULL);
448 		if (!blk_tree_root) {
449 			mutex_unlock(&blk_tree_mutex);
450 			goto err;
451 		}
452 	}
453 	mutex_unlock(&blk_tree_mutex);
454 
455 	dir = debugfs_create_dir(buts->name, blk_tree_root);
456 
457 	if (!dir)
458 		goto err;
459 
460 	bt->dir = dir;
461 	bt->dev = dev;
462 	atomic_set(&bt->dropped, 0);
463 
464 	ret = -EIO;
465 	bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt,
466 					       &blk_dropped_fops);
467 	if (!bt->dropped_file)
468 		goto err;
469 
470 	bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
471 	if (!bt->msg_file)
472 		goto err;
473 
474 	bt->rchan = relay_open("trace", dir, buts->buf_size,
475 				buts->buf_nr, &blk_relay_callbacks, bt);
476 	if (!bt->rchan)
477 		goto err;
478 
479 	bt->act_mask = buts->act_mask;
480 	if (!bt->act_mask)
481 		bt->act_mask = (u16) -1;
482 
483 	bt->start_lba = buts->start_lba;
484 	bt->end_lba = buts->end_lba;
485 	if (!bt->end_lba)
486 		bt->end_lba = -1ULL;
487 
488 	bt->pid = buts->pid;
489 	bt->trace_state = Blktrace_setup;
490 
491 	ret = -EBUSY;
492 	old_bt = xchg(&q->blk_trace, bt);
493 	if (old_bt) {
494 		(void) xchg(&q->blk_trace, old_bt);
495 		goto err;
496 	}
497 
498 	if (atomic_inc_return(&blk_probes_ref) == 1)
499 		blk_register_tracepoints();
500 
501 	return 0;
502 err:
503 	blk_trace_free(bt);
504 	return ret;
505 }
506 
507 int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
508 		    char __user *arg)
509 {
510 	struct blk_user_trace_setup buts;
511 	int ret;
512 
513 	ret = copy_from_user(&buts, arg, sizeof(buts));
514 	if (ret)
515 		return -EFAULT;
516 
517 	ret = do_blk_trace_setup(q, name, dev, &buts);
518 	if (ret)
519 		return ret;
520 
521 	if (copy_to_user(arg, &buts, sizeof(buts)))
522 		return -EFAULT;
523 
524 	return 0;
525 }
526 EXPORT_SYMBOL_GPL(blk_trace_setup);
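/*
 * A hedged userspace sketch of driving this setup path through the ioctls
 * dispatched by blk_trace_ioctl() below. Error handling is omitted and the
 * device node and buffer geometry are illustrative only; an act_mask of 0
 * is widened to "all actions" by do_blk_trace_setup():
 *
 *	struct blk_user_trace_setup buts = {
 *		.buf_size = 512 * 1024,
 *		.buf_nr   = 4,
 *		.act_mask = 0,
 *	};
 *	int fd = open("/dev/sdX", O_RDONLY);
 *
 *	ioctl(fd, BLKTRACESETUP, &buts);
 *	ioctl(fd, BLKTRACESTART);
 *	... consume the per-cpu "trace" relay files under debugfs ...
 *	ioctl(fd, BLKTRACESTOP);
 *	ioctl(fd, BLKTRACETEARDOWN);
 */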
527 
528 int blk_trace_startstop(struct request_queue *q, int start)
529 {
530 	int ret;
531 	struct blk_trace *bt = q->blk_trace;
532 
533 	if (bt == NULL)
534 		return -EINVAL;
535 
536 	/*
537 	 * For starting a trace, we can transition from a setup or stopped
538 	 * trace. For stopping a trace, the state must be running
539 	 */
540 	ret = -EINVAL;
541 	if (start) {
542 		if (bt->trace_state == Blktrace_setup ||
543 		    bt->trace_state == Blktrace_stopped) {
544 			blktrace_seq++;
545 			smp_mb();
546 			bt->trace_state = Blktrace_running;
547 
548 			trace_note_time(bt);
549 			ret = 0;
550 		}
551 	} else {
552 		if (bt->trace_state == Blktrace_running) {
553 			bt->trace_state = Blktrace_stopped;
554 			relay_flush(bt->rchan);
555 			ret = 0;
556 		}
557 	}
558 
559 	return ret;
560 }
561 EXPORT_SYMBOL_GPL(blk_trace_startstop);
562 
563 /**
564  * blk_trace_ioctl - handle the ioctls associated with tracing
565  * @bdev:	the block device
566  * @cmd:	the ioctl cmd
567  * @arg:	the argument data, if any
568  *
569  **/
570 int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
571 {
572 	struct request_queue *q;
573 	int ret, start = 0;
574 	char b[BDEVNAME_SIZE];
575 
576 	q = bdev_get_queue(bdev);
577 	if (!q)
578 		return -ENXIO;
579 
580 	mutex_lock(&bdev->bd_mutex);
581 
582 	switch (cmd) {
583 	case BLKTRACESETUP:
584 		bdevname(bdev, b);
585 		ret = blk_trace_setup(q, b, bdev->bd_dev, arg);
586 		break;
587 	case BLKTRACESTART:
588 		start = 1;	/* fall through */
589 	case BLKTRACESTOP:
590 		ret = blk_trace_startstop(q, start);
591 		break;
592 	case BLKTRACETEARDOWN:
593 		ret = blk_trace_remove(q);
594 		break;
595 	default:
596 		ret = -ENOTTY;
597 		break;
598 	}
599 
600 	mutex_unlock(&bdev->bd_mutex);
601 	return ret;
602 }
603 
604 /**
605  * blk_trace_shutdown - stop and clean up trace structures
606  * @q:    the request queue associated with the device
607  *
608  **/
609 void blk_trace_shutdown(struct request_queue *q)
610 {
611 	if (q->blk_trace) {
612 		blk_trace_startstop(q, 0);
613 		blk_trace_remove(q);
614 	}
615 }
616 
617 /*
618  * blktrace probes
619  */
620 
621 /**
622  * blk_add_trace_rq - Add a trace for a request oriented action
623  * @q:		queue the io is for
624  * @rq:		the source request
625  * @what:	the action
626  *
627  * Description:
628  *     Records an action against a request. Will log the request offset + size.
629  *
630  **/
631 static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
632 				    u32 what)
633 {
634 	struct blk_trace *bt = q->blk_trace;
635 	int rw = rq->cmd_flags & 0x03;
636 
637 	if (likely(!bt))
638 		return;
639 
640 	if (blk_discard_rq(rq))
641 		rw |= (1 << BIO_RW_DISCARD);
642 
643 	if (blk_pc_request(rq)) {
644 		what |= BLK_TC_ACT(BLK_TC_PC);
645 		__blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors,
646 				rq->cmd_len, rq->cmd);
647 	} else  {
648 		what |= BLK_TC_ACT(BLK_TC_FS);
649 		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
650 				rw, what, rq->errors, 0, NULL);
651 	}
652 }
653 
654 static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq)
655 {
656 	blk_add_trace_rq(q, rq, BLK_TA_ABORT);
657 }
658 
659 static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq)
660 {
661 	blk_add_trace_rq(q, rq, BLK_TA_INSERT);
662 }
663 
664 static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq)
665 {
666 	blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
667 }
668 
669 static void blk_add_trace_rq_requeue(struct request_queue *q,
670 				     struct request *rq)
671 {
672 	blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
673 }
674 
675 static void blk_add_trace_rq_complete(struct request_queue *q,
676 				      struct request *rq)
677 {
678 	blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
679 }
680 
681 /**
682  * blk_add_trace_bio - Add a trace for a bio oriented action
683  * @q:		queue the io is for
684  * @bio:	the source bio
685  * @what:	the action
686  *
687  * Description:
688  *     Records an action against a bio. Will log the bio offset + size.
689  *
690  **/
691 static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
692 				     u32 what)
693 {
694 	struct blk_trace *bt = q->blk_trace;
695 
696 	if (likely(!bt))
697 		return;
698 
699 	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
700 			!bio_flagged(bio, BIO_UPTODATE), 0, NULL);
701 }
702 
703 static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio)
704 {
705 	blk_add_trace_bio(q, bio, BLK_TA_BOUNCE);
706 }
707 
708 static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio)
709 {
710 	blk_add_trace_bio(q, bio, BLK_TA_COMPLETE);
711 }
712 
713 static void blk_add_trace_bio_backmerge(struct request_queue *q,
714 					struct bio *bio)
715 {
716 	blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
717 }
718 
719 static void blk_add_trace_bio_frontmerge(struct request_queue *q,
720 					 struct bio *bio)
721 {
722 	blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
723 }
724 
725 static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio)
726 {
727 	blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
728 }
729 
730 static void blk_add_trace_getrq(struct request_queue *q,
731 				struct bio *bio, int rw)
732 {
733 	if (bio)
734 		blk_add_trace_bio(q, bio, BLK_TA_GETRQ);
735 	else {
736 		struct blk_trace *bt = q->blk_trace;
737 
738 		if (bt)
739 			__blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL);
740 	}
741 }
742 
743 
744 static void blk_add_trace_sleeprq(struct request_queue *q,
745 				  struct bio *bio, int rw)
746 {
747 	if (bio)
748 		blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ);
749 	else {
750 		struct blk_trace *bt = q->blk_trace;
751 
752 		if (bt)
753 			__blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ,
754 					0, 0, NULL);
755 	}
756 }
757 
758 static void blk_add_trace_plug(struct request_queue *q)
759 {
760 	struct blk_trace *bt = q->blk_trace;
761 
762 	if (bt)
763 		__blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
764 }
765 
766 static void blk_add_trace_unplug_io(struct request_queue *q)
767 {
768 	struct blk_trace *bt = q->blk_trace;
769 
770 	if (bt) {
771 		unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
772 		__be64 rpdu = cpu_to_be64(pdu);
773 
774 		__blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0,
775 				sizeof(rpdu), &rpdu);
776 	}
777 }
778 
779 static void blk_add_trace_unplug_timer(struct request_queue *q)
780 {
781 	struct blk_trace *bt = q->blk_trace;
782 
783 	if (bt) {
784 		unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
785 		__be64 rpdu = cpu_to_be64(pdu);
786 
787 		__blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0,
788 				sizeof(rpdu), &rpdu);
789 	}
790 }
791 
792 static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
793 				unsigned int pdu)
794 {
795 	struct blk_trace *bt = q->blk_trace;
796 
797 	if (bt) {
798 		__be64 rpdu = cpu_to_be64(pdu);
799 
800 		__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw,
801 				BLK_TA_SPLIT, !bio_flagged(bio, BIO_UPTODATE),
802 				sizeof(rpdu), &rpdu);
803 	}
804 }
805 
806 /**
807  * blk_add_trace_remap - Add a trace for a remap operation
808  * @q:		queue the io is for
809  * @bio:	the source bio
810  * @dev:	target device
811  * @from:	source sector
812  * @to:		target sector
813  *
814  * Description:
815  *     Device mapper or raid targets sometimes need to split a bio because
816  *     it spans a stripe (or similar). Add a trace for that action.
817  *
818  **/
819 static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
820 				       dev_t dev, sector_t from, sector_t to)
821 {
822 	struct blk_trace *bt = q->blk_trace;
823 	struct blk_io_trace_remap r;
824 
825 	if (likely(!bt))
826 		return;
827 
828 	r.device = cpu_to_be32(dev);
829 	r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev);
830 	r.sector = cpu_to_be64(to);
831 
832 	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP,
833 			!bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
834 }
835 
836 /**
837  * blk_add_driver_data - Add binary message with driver-specific data
838  * @q:		queue the io is for
839  * @rq:		io request
840  * @data:	driver-specific data
841  * @len:	length of driver-specific data
842  *
843  * Description:
844  *     Some drivers might want to write driver-specific data per request.
845  *
846  **/
847 void blk_add_driver_data(struct request_queue *q,
848 			 struct request *rq,
849 			 void *data, size_t len)
850 {
851 	struct blk_trace *bt = q->blk_trace;
852 
853 	if (likely(!bt))
854 		return;
855 
856 	if (blk_pc_request(rq))
857 		__blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA,
858 				rq->errors, len, data);
859 	else
860 		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
861 				0, BLK_TA_DRV_DATA, rq->errors, len, data);
862 }
863 EXPORT_SYMBOL_GPL(blk_add_driver_data);
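/*
 * A minimal sketch of how a low-level driver might use this hook; the
 * struct and its fields are hypothetical, not taken from any real driver:
 *
 *	struct mydrv_completion_info info = {
 *		.fw_status = fw_status,
 *		.retries   = retries,
 *	};
 *
 *	blk_add_driver_data(rq->q, rq, &info, sizeof(info));
 *
 * The payload is emitted as a BLK_TA_DRV_DATA event whose pdu can then be
 * examined with the blktrace user space tools.
 */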
864 
865 static void blk_register_tracepoints(void)
866 {
867 	int ret;
868 
869 	ret = register_trace_block_rq_abort(blk_add_trace_rq_abort);
870 	WARN_ON(ret);
871 	ret = register_trace_block_rq_insert(blk_add_trace_rq_insert);
872 	WARN_ON(ret);
873 	ret = register_trace_block_rq_issue(blk_add_trace_rq_issue);
874 	WARN_ON(ret);
875 	ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue);
876 	WARN_ON(ret);
877 	ret = register_trace_block_rq_complete(blk_add_trace_rq_complete);
878 	WARN_ON(ret);
879 	ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce);
880 	WARN_ON(ret);
881 	ret = register_trace_block_bio_complete(blk_add_trace_bio_complete);
882 	WARN_ON(ret);
883 	ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
884 	WARN_ON(ret);
885 	ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
886 	WARN_ON(ret);
887 	ret = register_trace_block_bio_queue(blk_add_trace_bio_queue);
888 	WARN_ON(ret);
889 	ret = register_trace_block_getrq(blk_add_trace_getrq);
890 	WARN_ON(ret);
891 	ret = register_trace_block_sleeprq(blk_add_trace_sleeprq);
892 	WARN_ON(ret);
893 	ret = register_trace_block_plug(blk_add_trace_plug);
894 	WARN_ON(ret);
895 	ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer);
896 	WARN_ON(ret);
897 	ret = register_trace_block_unplug_io(blk_add_trace_unplug_io);
898 	WARN_ON(ret);
899 	ret = register_trace_block_split(blk_add_trace_split);
900 	WARN_ON(ret);
901 	ret = register_trace_block_remap(blk_add_trace_remap);
902 	WARN_ON(ret);
903 }
904 
905 static void blk_unregister_tracepoints(void)
906 {
907 	unregister_trace_block_remap(blk_add_trace_remap);
908 	unregister_trace_block_split(blk_add_trace_split);
909 	unregister_trace_block_unplug_io(blk_add_trace_unplug_io);
910 	unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer);
911 	unregister_trace_block_plug(blk_add_trace_plug);
912 	unregister_trace_block_sleeprq(blk_add_trace_sleeprq);
913 	unregister_trace_block_getrq(blk_add_trace_getrq);
914 	unregister_trace_block_bio_queue(blk_add_trace_bio_queue);
915 	unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
916 	unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
917 	unregister_trace_block_bio_complete(blk_add_trace_bio_complete);
918 	unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce);
919 	unregister_trace_block_rq_complete(blk_add_trace_rq_complete);
920 	unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue);
921 	unregister_trace_block_rq_issue(blk_add_trace_rq_issue);
922 	unregister_trace_block_rq_insert(blk_add_trace_rq_insert);
923 	unregister_trace_block_rq_abort(blk_add_trace_rq_abort);
924 
925 	tracepoint_synchronize_unregister();
926 }
927 
928 /*
929  * struct blk_io_trace formatting routines
930  */
931 
932 static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
933 {
934 	int i = 0;
935 	int tc = t->action >> BLK_TC_SHIFT;
936 
937 	if (t->action == BLK_TN_MESSAGE) {
938 		rwbs[i++] = 'N';
939 		goto out;
940 	}
941 
942 	if (tc & BLK_TC_DISCARD)
943 		rwbs[i++] = 'D';
944 	else if (tc & BLK_TC_WRITE)
945 		rwbs[i++] = 'W';
946 	else if (t->bytes)
947 		rwbs[i++] = 'R';
948 	else
949 		rwbs[i++] = 'N';
950 
951 	if (tc & BLK_TC_AHEAD)
952 		rwbs[i++] = 'A';
953 	if (tc & BLK_TC_BARRIER)
954 		rwbs[i++] = 'B';
955 	if (tc & BLK_TC_SYNC)
956 		rwbs[i++] = 'S';
957 	if (tc & BLK_TC_META)
958 		rwbs[i++] = 'M';
959 out:
960 	rwbs[i] = '\0';
961 }
962 
963 static inline
964 const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent)
965 {
966 	return (const struct blk_io_trace *)ent;
967 }
968 
969 static inline const void *pdu_start(const struct trace_entry *ent)
970 {
971 	return te_blk_io_trace(ent) + 1;
972 }
973 
974 static inline u32 t_sec(const struct trace_entry *ent)
975 {
976 	return te_blk_io_trace(ent)->bytes >> 9;
977 }
978 
979 static inline unsigned long long t_sector(const struct trace_entry *ent)
980 {
981 	return te_blk_io_trace(ent)->sector;
982 }
983 
984 static inline __u16 t_error(const struct trace_entry *ent)
985 {
986 	return te_blk_io_trace(ent)->error;
987 }
988 
989 static __u64 get_pdu_int(const struct trace_entry *ent)
990 {
991 	const __u64 *val = pdu_start(ent);
992 	return be64_to_cpu(*val);
993 }
994 
995 static void get_pdu_remap(const struct trace_entry *ent,
996 			  struct blk_io_trace_remap *r)
997 {
998 	const struct blk_io_trace_remap *__r = pdu_start(ent);
999 	__u64 sector = __r->sector;
1000 
1001 	r->device = be32_to_cpu(__r->device);
1002 	r->device_from = be32_to_cpu(__r->device_from);
1003 	r->sector = be64_to_cpu(sector);
1004 }
1005 
1006 typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act);
1007 
1008 static int blk_log_action_classic(struct trace_iterator *iter, const char *act)
1009 {
1010 	char rwbs[6];
1011 	unsigned long long ts  = iter->ts;
1012 	unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC);
1013 	unsigned secs	       = (unsigned long)ts;
1014 	const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
1015 
1016 	fill_rwbs(rwbs, t);
1017 
1018 	return trace_seq_printf(&iter->seq,
1019 				"%3d,%-3d %2d %5d.%09lu %5u %2s %3s ",
1020 				MAJOR(t->device), MINOR(t->device), iter->cpu,
1021 				secs, nsec_rem, iter->ent->pid, act, rwbs);
1022 }
1023 
1024 static int blk_log_action(struct trace_iterator *iter, const char *act)
1025 {
1026 	char rwbs[6];
1027 	const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
1028 
1029 	fill_rwbs(rwbs, t);
1030 	return trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
1031 				MAJOR(t->device), MINOR(t->device), act, rwbs);
1032 }
1033 
1034 static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
1035 {
1036 	char cmd[TASK_COMM_LEN];
1037 
1038 	trace_find_cmdline(ent->pid, cmd);
1039 
1040 	if (t_sec(ent))
1041 		return trace_seq_printf(s, "%llu + %u [%s]\n",
1042 					t_sector(ent), t_sec(ent), cmd);
1043 	return trace_seq_printf(s, "[%s]\n", cmd);
1044 }
1045 
1046 static int blk_log_with_error(struct trace_seq *s,
1047 			      const struct trace_entry *ent)
1048 {
1049 	if (t_sec(ent))
1050 		return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent),
1051 					t_sec(ent), t_error(ent));
1052 	return trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent));
1053 }
1054 
1055 static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
1056 {
1057 	struct blk_io_trace_remap r = { .device = 0, };
1058 
1059 	get_pdu_remap(ent, &r);
1060 	return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
1061 			       t_sector(ent),
1062 			       t_sec(ent), MAJOR(r.device), MINOR(r.device),
1063 			       (unsigned long long)r.sector);
1064 }
1065 
1066 static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
1067 {
1068 	char cmd[TASK_COMM_LEN];
1069 
1070 	trace_find_cmdline(ent->pid, cmd);
1071 
1072 	return trace_seq_printf(s, "[%s]\n", cmd);
1073 }
1074 
1075 static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent)
1076 {
1077 	char cmd[TASK_COMM_LEN];
1078 
1079 	trace_find_cmdline(ent->pid, cmd);
1080 
1081 	return trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent));
1082 }
1083 
1084 static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
1085 {
1086 	char cmd[TASK_COMM_LEN];
1087 
1088 	trace_find_cmdline(ent->pid, cmd);
1089 
1090 	return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
1091 				get_pdu_int(ent), cmd);
1092 }
1093 
1094 static int blk_log_msg(struct trace_seq *s, const struct trace_entry *ent)
1095 {
1096 	int ret;
1097 	const struct blk_io_trace *t = te_blk_io_trace(ent);
1098 
1099 	ret = trace_seq_putmem(s, t + 1, t->pdu_len);
1100 	if (ret)
1101 		return trace_seq_putc(s, '\n');
1102 	return ret;
1103 }
1104 
1105 /*
1106  * struct tracer operations
1107  */
1108 
1109 static void blk_tracer_print_header(struct seq_file *m)
1110 {
1111 	if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
1112 		return;
1113 	seq_puts(m, "# DEV   CPU TIMESTAMP     PID ACT FLG\n"
1114 		    "#  |     |     |           |   |   |\n");
1115 }
1116 
1117 static void blk_tracer_start(struct trace_array *tr)
1118 {
1119 	blk_tracer_enabled = true;
1120 	trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
1121 }
1122 
1123 static int blk_tracer_init(struct trace_array *tr)
1124 {
1125 	blk_tr = tr;
1126 	blk_tracer_start(tr);
1127 	return 0;
1128 }
1129 
1130 static void blk_tracer_stop(struct trace_array *tr)
1131 {
1132 	blk_tracer_enabled = false;
1133 	trace_flags |= TRACE_ITER_CONTEXT_INFO;
1134 }
1135 
1136 static void blk_tracer_reset(struct trace_array *tr)
1137 {
1138 	blk_tracer_stop(tr);
1139 }
1140 
1141 static const struct {
1142 	const char *act[2];
1143 	int	   (*print)(struct trace_seq *s, const struct trace_entry *ent);
1144 } what2act[] = {
1145 	[__BLK_TA_QUEUE]	= {{  "Q", "queue" },	   blk_log_generic },
1146 	[__BLK_TA_BACKMERGE]	= {{  "M", "backmerge" },  blk_log_generic },
1147 	[__BLK_TA_FRONTMERGE]	= {{  "F", "frontmerge" }, blk_log_generic },
1148 	[__BLK_TA_GETRQ]	= {{  "G", "getrq" },	   blk_log_generic },
1149 	[__BLK_TA_SLEEPRQ]	= {{  "S", "sleeprq" },	   blk_log_generic },
1150 	[__BLK_TA_REQUEUE]	= {{  "R", "requeue" },	   blk_log_with_error },
1151 	[__BLK_TA_ISSUE]	= {{  "D", "issue" },	   blk_log_generic },
1152 	[__BLK_TA_COMPLETE]	= {{  "C", "complete" },   blk_log_with_error },
1153 	[__BLK_TA_PLUG]		= {{  "P", "plug" },	   blk_log_plug },
1154 	[__BLK_TA_UNPLUG_IO]	= {{  "U", "unplug_io" },  blk_log_unplug },
1155 	[__BLK_TA_UNPLUG_TIMER]	= {{ "UT", "unplug_timer" }, blk_log_unplug },
1156 	[__BLK_TA_INSERT]	= {{  "I", "insert" },	   blk_log_generic },
1157 	[__BLK_TA_SPLIT]	= {{  "X", "split" },	   blk_log_split },
1158 	[__BLK_TA_BOUNCE]	= {{  "B", "bounce" },	   blk_log_generic },
1159 	[__BLK_TA_REMAP]	= {{  "A", "remap" },	   blk_log_remap },
1160 };
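/*
 * act[0] is the terse one/two letter code emitted by default; act[1] is the
 * long form used when the "verbose" trace option is set, see the long_act
 * selection in print_one_line() below.
 */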
1161 
1162 static enum print_line_t print_one_line(struct trace_iterator *iter,
1163 					bool classic)
1164 {
1165 	struct trace_seq *s = &iter->seq;
1166 	const struct blk_io_trace *t;
1167 	u16 what;
1168 	int ret;
1169 	bool long_act;
1170 	blk_log_action_t *log_action;
1171 
1172 	t	   = te_blk_io_trace(iter->ent);
1173 	what	   = t->action & ((1 << BLK_TC_SHIFT) - 1);
1174 	long_act   = !!(trace_flags & TRACE_ITER_VERBOSE);
1175 	log_action = classic ? &blk_log_action_classic : &blk_log_action;
1176 
1177 	if (t->action == BLK_TN_MESSAGE) {
1178 		ret = log_action(iter, long_act ? "message" : "m");
1179 		if (ret)
1180 			ret = blk_log_msg(s, iter->ent);
1181 		goto out;
1182 	}
1183 
1184 	if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
1185 		ret = trace_seq_printf(s, "Bad pc action %x\n", what);
1186 	else {
1187 		ret = log_action(iter, what2act[what].act[long_act]);
1188 		if (ret)
1189 			ret = what2act[what].print(s, iter->ent);
1190 	}
1191 out:
1192 	return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
1193 }
1194 
1195 static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
1196 					       int flags)
1197 {
1198 	if (!trace_print_context(iter))
1199 		return TRACE_TYPE_PARTIAL_LINE;
1200 
1201 	return print_one_line(iter, false);
1202 }
1203 
1204 static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
1205 {
1206 	struct trace_seq *s = &iter->seq;
1207 	struct blk_io_trace *t = (struct blk_io_trace *)iter->ent;
1208 	const int offset = offsetof(struct blk_io_trace, sector);
1209 	struct blk_io_trace old = {
1210 		.magic	  = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION,
1211 		.time     = iter->ts,
1212 	};
1213 
1214 	if (!trace_seq_putmem(s, &old, offset))
1215 		return 0;
1216 	return trace_seq_putmem(s, &t->sector,
1217 				sizeof(old) - offset + t->pdu_len);
1218 }
1219 
1220 static enum print_line_t
1221 blk_trace_event_print_binary(struct trace_iterator *iter, int flags)
1222 {
1223 	return blk_trace_synthesize_old_trace(iter) ?
1224 			TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
1225 }
1226 
1227 static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
1228 {
1229 	if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
1230 		return TRACE_TYPE_UNHANDLED;
1231 
1232 	return print_one_line(iter, true);
1233 }
1234 
1235 static struct tracer blk_tracer __read_mostly = {
1236 	.name		= "blk",
1237 	.init		= blk_tracer_init,
1238 	.reset		= blk_tracer_reset,
1239 	.start		= blk_tracer_start,
1240 	.stop		= blk_tracer_stop,
1241 	.print_header	= blk_tracer_print_header,
1242 	.print_line	= blk_tracer_print_line,
1243 	.flags		= &blk_tracer_flags,
1244 };
1245 
1246 static struct trace_event trace_blk_event = {
1247 	.type		= TRACE_BLK,
1248 	.trace		= blk_trace_event_print,
1249 	.binary		= blk_trace_event_print_binary,
1250 };
1251 
1252 static int __init init_blk_tracer(void)
1253 {
1254 	if (!register_ftrace_event(&trace_blk_event)) {
1255 		pr_warning("Warning: could not register block events\n");
1256 		return 1;
1257 	}
1258 
1259 	if (register_tracer(&blk_tracer) != 0) {
1260 		pr_warning("Warning: could not register the block tracer\n");
1261 		unregister_ftrace_event(&trace_blk_event);
1262 		return 1;
1263 	}
1264 
1265 	return 0;
1266 }
1267 
1268 device_initcall(init_blk_tracer);
1269 
1270 static int blk_trace_remove_queue(struct request_queue *q)
1271 {
1272 	struct blk_trace *bt;
1273 
1274 	bt = xchg(&q->blk_trace, NULL);
1275 	if (bt == NULL)
1276 		return -EINVAL;
1277 
1278 	if (atomic_dec_and_test(&blk_probes_ref))
1279 		blk_unregister_tracepoints();
1280 
1281 	blk_trace_free(bt);
1282 	return 0;
1283 }
1284 
1285 /*
1286  * Set up everything required to start tracing
1287  */
1288 static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
1289 {
1290 	struct blk_trace *old_bt, *bt = NULL;
1291 	int ret = -ENOMEM;
1292 
1293 	bt = kzalloc(sizeof(*bt), GFP_KERNEL);
1294 	if (!bt)
1295 		return -ENOMEM;
1296 
1297 	bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char));
1298 	if (!bt->msg_data)
1299 		goto free_bt;
1300 
1301 	bt->dev = dev;
1302 	bt->act_mask = (u16)-1;
1303 	bt->end_lba = -1ULL;
1304 
1305 	old_bt = xchg(&q->blk_trace, bt);
1306 	if (old_bt != NULL) {
1307 		(void)xchg(&q->blk_trace, old_bt);
1308 		ret = -EBUSY;
1309 		goto free_bt;
1310 	}
1311 
1312 	if (atomic_inc_return(&blk_probes_ref) == 1)
1313 		blk_register_tracepoints();
1314 	return 0;
1315 
1316 free_bt:
1317 	blk_trace_free(bt);
1318 	return ret;
1319 }
1320 
1321 /*
1322  * sysfs interface to enable and configure tracing
1323  */
1324 
1325 static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
1326 					 struct device_attribute *attr,
1327 					 char *buf);
1328 static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1329 					  struct device_attribute *attr,
1330 					  const char *buf, size_t count);
1331 #define BLK_TRACE_DEVICE_ATTR(_name) \
1332 	DEVICE_ATTR(_name, S_IRUGO | S_IWUSR, \
1333 		    sysfs_blk_trace_attr_show, \
1334 		    sysfs_blk_trace_attr_store)
1335 
1336 static BLK_TRACE_DEVICE_ATTR(enable);
1337 static BLK_TRACE_DEVICE_ATTR(act_mask);
1338 static BLK_TRACE_DEVICE_ATTR(pid);
1339 static BLK_TRACE_DEVICE_ATTR(start_lba);
1340 static BLK_TRACE_DEVICE_ATTR(end_lba);
1341 
1342 static struct attribute *blk_trace_attrs[] = {
1343 	&dev_attr_enable.attr,
1344 	&dev_attr_act_mask.attr,
1345 	&dev_attr_pid.attr,
1346 	&dev_attr_start_lba.attr,
1347 	&dev_attr_end_lba.attr,
1348 	NULL
1349 };
1350 
1351 struct attribute_group blk_trace_attr_group = {
1352 	.name  = "trace",
1353 	.attrs = blk_trace_attrs,
1354 };
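/*
 * With this group attached as the "trace" directory of a block device's
 * sysfs node, tracing can be driven without the ioctl interface at all.
 * A hedged shell sketch (device and partition names are illustrative,
 * and the output is read through the "blk" ftrace plugin):
 *
 *	echo blk > /sys/kernel/debug/tracing/current_tracer
 *	echo read,write,issue,complete > /sys/block/sdX/sdX1/trace/act_mask
 *	echo 1 > /sys/block/sdX/sdX1/trace/enable
 *	cat /sys/kernel/debug/tracing/trace_pipe
 *	echo 0 > /sys/block/sdX/sdX1/trace/enable
 */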
1355 
1356 static const struct {
1357 	int mask;
1358 	const char *str;
1359 } mask_maps[] = {
1360 	{ BLK_TC_READ,		"read"		},
1361 	{ BLK_TC_WRITE,		"write"		},
1362 	{ BLK_TC_BARRIER,	"barrier"	},
1363 	{ BLK_TC_SYNC,		"sync"		},
1364 	{ BLK_TC_QUEUE,		"queue"		},
1365 	{ BLK_TC_REQUEUE,	"requeue"	},
1366 	{ BLK_TC_ISSUE,		"issue"		},
1367 	{ BLK_TC_COMPLETE,	"complete"	},
1368 	{ BLK_TC_FS,		"fs"		},
1369 	{ BLK_TC_PC,		"pc"		},
1370 	{ BLK_TC_AHEAD,		"ahead"		},
1371 	{ BLK_TC_META,		"meta"		},
1372 	{ BLK_TC_DISCARD,	"discard"	},
1373 	{ BLK_TC_DRV_DATA,	"drv_data"	},
1374 };
1375 
1376 static int blk_trace_str2mask(const char *str)
1377 {
1378 	int i;
1379 	int mask = 0;
1380 	char *buf, *s, *token;
1381 
1382 	buf = kstrdup(str, GFP_KERNEL);
1383 	if (buf == NULL)
1384 		return -ENOMEM;
1385 	s = strstrip(buf);
1386 
1387 	while (1) {
1388 		token = strsep(&s, ",");
1389 		if (token == NULL)
1390 			break;
1391 
1392 		if (*token == '\0')
1393 			continue;
1394 
1395 		for (i = 0; i < ARRAY_SIZE(mask_maps); i++) {
1396 			if (strcasecmp(token, mask_maps[i].str) == 0) {
1397 				mask |= mask_maps[i].mask;
1398 				break;
1399 			}
1400 		}
1401 		if (i == ARRAY_SIZE(mask_maps)) {
1402 			mask = -EINVAL;
1403 			break;
1404 		}
1405 	}
1406 	kfree(buf);
1407 
1408 	return mask;
1409 }
1410 
1411 static ssize_t blk_trace_mask2str(char *buf, int mask)
1412 {
1413 	int i;
1414 	char *p = buf;
1415 
1416 	for (i = 0; i < ARRAY_SIZE(mask_maps); i++) {
1417 		if (mask & mask_maps[i].mask) {
1418 			p += sprintf(p, "%s%s",
1419 				    (p == buf) ? "" : ",", mask_maps[i].str);
1420 		}
1421 	}
1422 	*p++ = '\n';
1423 
1424 	return p - buf;
1425 }
1426 
1427 static struct request_queue *blk_trace_get_queue(struct block_device *bdev)
1428 {
1429 	if (bdev->bd_disk == NULL)
1430 		return NULL;
1431 
1432 	return bdev_get_queue(bdev);
1433 }
1434 
1435 static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
1436 					 struct device_attribute *attr,
1437 					 char *buf)
1438 {
1439 	struct hd_struct *p = dev_to_part(dev);
1440 	struct request_queue *q;
1441 	struct block_device *bdev;
1442 	ssize_t ret = -ENXIO;
1443 
1444 	lock_kernel();
1445 	bdev = bdget(part_devt(p));
1446 	if (bdev == NULL)
1447 		goto out_unlock_kernel;
1448 
1449 	q = blk_trace_get_queue(bdev);
1450 	if (q == NULL)
1451 		goto out_bdput;
1452 
1453 	mutex_lock(&bdev->bd_mutex);
1454 
1455 	if (attr == &dev_attr_enable) {
1456 		ret = sprintf(buf, "%u\n", !!q->blk_trace);
1457 		goto out_unlock_bdev;
1458 	}
1459 
1460 	if (q->blk_trace == NULL)
1461 		ret = sprintf(buf, "disabled\n");
1462 	else if (attr == &dev_attr_act_mask)
1463 		ret = blk_trace_mask2str(buf, q->blk_trace->act_mask);
1464 	else if (attr == &dev_attr_pid)
1465 		ret = sprintf(buf, "%u\n", q->blk_trace->pid);
1466 	else if (attr == &dev_attr_start_lba)
1467 		ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba);
1468 	else if (attr == &dev_attr_end_lba)
1469 		ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba);
1470 
1471 out_unlock_bdev:
1472 	mutex_unlock(&bdev->bd_mutex);
1473 out_bdput:
1474 	bdput(bdev);
1475 out_unlock_kernel:
1476 	unlock_kernel();
1477 	return ret;
1478 }
1479 
1480 static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1481 					  struct device_attribute *attr,
1482 					  const char *buf, size_t count)
1483 {
1484 	struct block_device *bdev;
1485 	struct request_queue *q;
1486 	struct hd_struct *p;
1487 	u64 value;
1488 	ssize_t ret = -EINVAL;
1489 
1490 	if (count == 0)
1491 		goto out;
1492 
1493 	if (attr == &dev_attr_act_mask) {
1494 		if (sscanf(buf, "%llx", &value) != 1) {
1495 			/* Assume it is a list of trace category names */
1496 			ret = blk_trace_str2mask(buf);
1497 			if (ret < 0)
1498 				goto out;
1499 			value = ret;
1500 		}
1501 	} else if (sscanf(buf, "%llu", &value) != 1)
1502 		goto out;
1503 
1504 	ret = -ENXIO;
1505 
1506 	lock_kernel();
1507 	p = dev_to_part(dev);
1508 	bdev = bdget(part_devt(p));
1509 	if (bdev == NULL)
1510 		goto out_unlock_kernel;
1511 
1512 	q = blk_trace_get_queue(bdev);
1513 	if (q == NULL)
1514 		goto out_bdput;
1515 
1516 	mutex_lock(&bdev->bd_mutex);
1517 
1518 	if (attr == &dev_attr_enable) {
1519 		if (value)
1520 			ret = blk_trace_setup_queue(q, bdev->bd_dev);
1521 		else
1522 			ret = blk_trace_remove_queue(q);
1523 		goto out_unlock_bdev;
1524 	}
1525 
1526 	ret = 0;
1527 	if (q->blk_trace == NULL)
1528 		ret = blk_trace_setup_queue(q, bdev->bd_dev);
1529 
1530 	if (ret == 0) {
1531 		if (attr == &dev_attr_act_mask)
1532 			q->blk_trace->act_mask = value;
1533 		else if (attr == &dev_attr_pid)
1534 			q->blk_trace->pid = value;
1535 		else if (attr == &dev_attr_start_lba)
1536 			q->blk_trace->start_lba = value;
1537 		else if (attr == &dev_attr_end_lba)
1538 			q->blk_trace->end_lba = value;
1539 	}
1540 
1541 out_unlock_bdev:
1542 	mutex_unlock(&bdev->bd_mutex);
1543 out_bdput:
1544 	bdput(bdev);
1545 out_unlock_kernel:
1546 	unlock_kernel();
1547 out:
1548 	return ret ? ret : count;
1549 }
1550 
1551