/*
 * trace event based perf event profiling/tracing
 *
 * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com>
 * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com>
 */

#include <linux/module.h>
#include <linux/kprobes.h>
#include "trace.h"

static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];

/*
 * Force it to be aligned to unsigned long to avoid misaligned accesses
 * surprises
 */
typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
        perf_trace_t;

/* Count the events in use (per event id, not per instance) */
static int total_ref_count;

static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
                                 struct perf_event *p_event)
{
        /* The ftrace function trace is allowed only for root. */
        if (ftrace_event_is_function(tp_event) &&
            perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
                return -EPERM;

        /* No tracing, just counting, so no obvious leak */
        if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
                return 0;

        /* Some events are ok to be traced by non-root users... */
        if (p_event->attach_state == PERF_ATTACH_TASK) {
                if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY)
                        return 0;
        }

        /*
         * ...otherwise raw tracepoint data can be a severe data leak,
         * only allow root to have these.
         */
        if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
                return -EPERM;

        return 0;
}

static int perf_trace_event_reg(struct ftrace_event_call *tp_event,
                                struct perf_event *p_event)
{
        struct hlist_head __percpu *list;
        int ret = -ENOMEM;
        int cpu;

        p_event->tp_event = tp_event;
        if (tp_event->perf_refcount++ > 0)
                return 0;

        list = alloc_percpu(struct hlist_head);
        if (!list)
                goto fail;

        for_each_possible_cpu(cpu)
                INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));

        tp_event->perf_events = list;

        if (!total_ref_count) {
                char __percpu *buf;
                int i;

                for (i = 0; i < PERF_NR_CONTEXTS; i++) {
                        buf = (char __percpu *)alloc_percpu(perf_trace_t);
                        if (!buf)
                                goto fail;

                        perf_trace_buf[i] = buf;
                }
        }

        ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL);
        if (ret)
                goto fail;

        total_ref_count++;
        return 0;

fail:
        if (!total_ref_count) {
                int i;

                for (i = 0; i < PERF_NR_CONTEXTS; i++) {
                        free_percpu(perf_trace_buf[i]);
                        perf_trace_buf[i] = NULL;
                }
        }

        if (!--tp_event->perf_refcount) {
                free_percpu(tp_event->perf_events);
                tp_event->perf_events = NULL;
        }

        return ret;
}
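
/*
 * perf_trace_buf[] provides one PERF_MAX_TRACE_SIZE per-cpu scratch buffer
 * for each recursion context (PERF_NR_CONTEXTS of them). The buffers are
 * shared by all perf-traced events: they are allocated on the first
 * registration above and freed once total_ref_count drops back to zero
 * in perf_trace_event_unreg() below.
 */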

static void perf_trace_event_unreg(struct perf_event *p_event)
{
        struct ftrace_event_call *tp_event = p_event->tp_event;
        int i;

        if (--tp_event->perf_refcount > 0)
                goto out;

        tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL);

        /*
         * Ensure our callback won't be called anymore. The buffers
         * will be freed after that.
         */
        tracepoint_synchronize_unregister();

        free_percpu(tp_event->perf_events);
        tp_event->perf_events = NULL;

        if (!--total_ref_count) {
                for (i = 0; i < PERF_NR_CONTEXTS; i++) {
                        free_percpu(perf_trace_buf[i]);
                        perf_trace_buf[i] = NULL;
                }
        }
out:
        module_put(tp_event->mod);
}

static int perf_trace_event_open(struct perf_event *p_event)
{
        struct ftrace_event_call *tp_event = p_event->tp_event;
        return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event);
}

static void perf_trace_event_close(struct perf_event *p_event)
{
        struct ftrace_event_call *tp_event = p_event->tp_event;
        tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event);
}

static int perf_trace_event_init(struct ftrace_event_call *tp_event,
                                 struct perf_event *p_event)
{
        int ret;

        ret = perf_trace_event_perm(tp_event, p_event);
        if (ret)
                return ret;

        ret = perf_trace_event_reg(tp_event, p_event);
        if (ret)
                return ret;

        ret = perf_trace_event_open(p_event);
        if (ret) {
                perf_trace_event_unreg(p_event);
                return ret;
        }

        return 0;
}

int perf_trace_init(struct perf_event *p_event)
{
        struct ftrace_event_call *tp_event;
        int event_id = p_event->attr.config;
        int ret = -EINVAL;

        mutex_lock(&event_mutex);
        list_for_each_entry(tp_event, &ftrace_events, list) {
                if (tp_event->event.type == event_id &&
                    tp_event->class && tp_event->class->reg &&
                    try_module_get(tp_event->mod)) {
                        ret = perf_trace_event_init(tp_event, p_event);
                        if (ret)
                                module_put(tp_event->mod);
                        break;
                }
        }
        mutex_unlock(&event_mutex);

        return ret;
}

void perf_trace_destroy(struct perf_event *p_event)
{
        mutex_lock(&event_mutex);
        perf_trace_event_close(p_event);
        perf_trace_event_unreg(p_event);
        mutex_unlock(&event_mutex);
}
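
/*
 * perf_trace_add()/perf_trace_del() are the tracepoint PMU's ->add()/->del()
 * callbacks: perf calls them when the event is scheduled in or out on a CPU,
 * and they link the event into (or unlink it from) the per-cpu hlist that
 * the tracepoint probes walk when they fire.
 */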

int perf_trace_add(struct perf_event *p_event, int flags)
{
        struct ftrace_event_call *tp_event = p_event->tp_event;
        struct hlist_head __percpu *pcpu_list;
        struct hlist_head *list;

        pcpu_list = tp_event->perf_events;
        if (WARN_ON_ONCE(!pcpu_list))
                return -EINVAL;

        if (!(flags & PERF_EF_START))
                p_event->hw.state = PERF_HES_STOPPED;

        list = this_cpu_ptr(pcpu_list);
        hlist_add_head_rcu(&p_event->hlist_entry, list);

        return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);
}

void perf_trace_del(struct perf_event *p_event, int flags)
{
        struct ftrace_event_call *tp_event = p_event->tp_event;
        hlist_del_rcu(&p_event->hlist_entry);
        tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
}

__kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
                                       struct pt_regs *regs, int *rctxp)
{
        struct trace_entry *entry;
        unsigned long flags;
        char *raw_data;
        int pc;

        BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));

        if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
                      "perf buffer not large enough"))
                return NULL;

        pc = preempt_count();

        *rctxp = perf_swevent_get_recursion_context();
        if (*rctxp < 0)
                return NULL;

        raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]);

        /* zero the dead bytes from align to not leak stack to user */
        memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));

        entry = (struct trace_entry *)raw_data;
        local_save_flags(flags);
        tracing_generic_entry_update(entry, flags, pc);
        entry->type = type;

        return raw_data;
}
EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);

#ifdef CONFIG_FUNCTION_TRACER
static void
perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
                          struct ftrace_ops *ops, struct pt_regs *pt_regs)
{
        struct ftrace_entry *entry;
        struct hlist_head *head;
        struct pt_regs regs;
        int rctx;

        head = this_cpu_ptr(event_function.perf_events);
        if (hlist_empty(head))
                return;

#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
                    sizeof(u64)) - sizeof(u32))

        BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE);

        perf_fetch_caller_regs(&regs);

        entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx);
        if (!entry)
                return;

        entry->ip = ip;
        entry->parent_ip = parent_ip;
        perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0,
                              1, &regs, head, NULL);

#undef ENTRY_SIZE
}

static int perf_ftrace_function_register(struct perf_event *event)
{
        struct ftrace_ops *ops = &event->ftrace_ops;

        ops->flags |= FTRACE_OPS_FL_CONTROL;
        ops->func = perf_ftrace_function_call;
        return register_ftrace_function(ops);
}

static int perf_ftrace_function_unregister(struct perf_event *event)
{
        struct ftrace_ops *ops = &event->ftrace_ops;
        int ret = unregister_ftrace_function(ops);
        ftrace_free_filter(ops);
        return ret;
}

static void perf_ftrace_function_enable(struct perf_event *event)
{
        ftrace_function_local_enable(&event->ftrace_ops);
}

static void perf_ftrace_function_disable(struct perf_event *event)
{
        ftrace_function_local_disable(&event->ftrace_ops);
}

int perf_ftrace_event_register(struct ftrace_event_call *call,
                               enum trace_reg type, void *data)
{
        switch (type) {
        case TRACE_REG_REGISTER:
        case TRACE_REG_UNREGISTER:
                break;
        case TRACE_REG_PERF_REGISTER:
        case TRACE_REG_PERF_UNREGISTER:
                return 0;
        case TRACE_REG_PERF_OPEN:
                return perf_ftrace_function_register(data);
        case TRACE_REG_PERF_CLOSE:
                return perf_ftrace_function_unregister(data);
        case TRACE_REG_PERF_ADD:
                perf_ftrace_function_enable(data);
                return 0;
        case TRACE_REG_PERF_DEL:
                perf_ftrace_function_disable(data);
                return 0;
        }

        return -EINVAL;
}
#endif /* CONFIG_FUNCTION_TRACER */
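
/*
 * Usage sketch (userspace, illustrative only): perf_trace_init() above is
 * reached when a tracepoint event is opened via perf_event_open(2), with
 * attr.config set to the event id read from
 * <tracefs>/events/<subsys>/<event>/id. Roughly:
 *
 *      struct perf_event_attr attr = {
 *              .type           = PERF_TYPE_TRACEPOINT,
 *              .size           = sizeof(attr),
 *              .config         = id,
 *              .sample_type    = PERF_SAMPLE_RAW,
 *      };
 *      int fd = syscall(__NR_perf_event_open, &attr, pid, cpu, -1, 0);
 *
 * Requesting PERF_SAMPLE_RAW is what subjects the caller to the checks in
 * perf_trace_event_perm(); without it the event only counts and no raw
 * tracepoint data is exposed. Error handling and the id lookup are omitted.
 */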