xref: /openbmc/linux/net/core/drop_monitor.c (revision 63dc02bd)
/*
 * Monitoring code for network dropped packet alerts
 *
 * Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com>
 */

#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/string.h>
#include <linux/if_arp.h>
#include <linux/inetdevice.h>
#include <linux/inet.h>
#include <linux/interrupt.h>
#include <linux/netpoll.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <linux/netlink.h>
#include <linux/net_dropmon.h>
#include <linux/percpu.h>
#include <linux/timer.h>
#include <linux/bitops.h>
#include <linux/slab.h>
#include <net/genetlink.h>
#include <net/netevent.h>

#include <trace/events/skb.h>
#include <trace/events/napi.h>

#include <asm/unaligned.h>

#define TRACE_ON 1
#define TRACE_OFF 0

static void send_dm_alert(struct work_struct *unused);

/*
 * Globals: the current trace on/off state and the mutex that
 * serializes changes to it and to the hardware stats list.  The
 * per-CPU data below carries the pending alerts and the work
 * handles that send them up via netlink.
 */
static int trace_state = TRACE_OFF;
static DEFINE_MUTEX(trace_state_mutex);

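/*
 * Per-CPU state: the alert skb currently being filled in
 * (RCU-protected and swapped out by reset_per_cpu_data()), the
 * remaining budget of drop hits it may record (dm_hit_count),
 * the hysteresis timer that defers sending, and the work item
 * that ships the alert from its owning CPU.
 */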
struct per_cpu_dm_data {
	struct work_struct dm_alert_work;
	struct sk_buff __rcu *skb;
	atomic_t dm_hit_count;
	struct timer_list send_timer;
	int cpu;
};

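/*
 * One entry per tracked net_device: when we last sampled it and
 * the rx_dropped count we saw then, so the napi_poll hook can
 * report hardware drops as the counter moves.
 */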
struct dm_hw_stat_delta {
	struct net_device *dev;
	unsigned long last_rx;
	struct list_head list;
	struct rcu_head rcu;
	unsigned long last_drop_val;
};

static struct genl_family net_drop_monitor_family = {
	.id             = GENL_ID_GENERATE,
	.hdrsize        = 0,
	.name           = "NET_DM",
	.version        = 2,
	.maxattr        = NET_DM_CMD_MAX,
};

static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);

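/*
 * Tunables: dm_hit_limit bounds how many drop points one alert
 * can carry (and how many hits we count before discarding),
 * dm_delay is the hysteresis delay in seconds before an alert is
 * sent, and dm_hw_check_delta is the minimum interval between
 * hardware drop checks for a given device.
 */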
static int dm_hit_limit = 64;
static int dm_delay = 1;
static unsigned long dm_hw_check_delta = 2*HZ;
static LIST_HEAD(hw_stats_list);

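/*
 * Allocate a fresh alert message sized for a full set of drop
 * points, swap it in as this CPU's pending skb, wait out any
 * tracepoint callbacks still touching the old one, and rearm the
 * hit budget.  On allocation failure the alert work is
 * rescheduled so the swap can be retried later.
 */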
static void reset_per_cpu_data(struct per_cpu_dm_data *data)
{
	size_t al;
	struct net_dm_alert_msg *msg;
	struct nlattr *nla;
	struct sk_buff *skb;
	struct sk_buff *oskb = rcu_dereference_protected(data->skb, 1);

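	/*
	 * Room for the fixed alert header, dm_hit_limit drop points,
	 * and the netlink attribute that wraps them.
	 */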
	al = sizeof(struct net_dm_alert_msg);
	al += dm_hit_limit * sizeof(struct net_dm_drop_point);
	al += sizeof(struct nlattr);

	skb = genlmsg_new(al, GFP_KERNEL);

	if (skb) {
		genlmsg_put(skb, 0, 0, &net_drop_monitor_family,
				0, NET_DM_CMD_ALERT);
		nla = nla_reserve(skb, NLA_UNSPEC,
				  sizeof(struct net_dm_alert_msg));
		msg = nla_data(nla);
		memset(msg, 0, al);
	} else {
		/* No memory now; have the alert worker retry the reset */
		schedule_work_on(data->cpu, &data->dm_alert_work);
	}

	/*
	 * Don't need to lock this, since we are guaranteed to only
	 * run this on a single cpu at a time.
	 * Note also that we only update data->skb if the old and new skb
	 * pointers don't match.  This ensures that we don't continually call
	 * synchronize_rcu if we repeatedly fail to alloc a new netlink message.
	 */
	if (skb != oskb) {
		rcu_assign_pointer(data->skb, skb);

		synchronize_rcu();

		atomic_set(&data->dm_hit_count, dm_hit_limit);
	}
}

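/*
 * Work handler: detach the filled-in alert skb, install a fresh
 * one via reset_per_cpu_data(), and multicast the old one to
 * NET_DM_GRP_ALERT listeners.  It is always scheduled on the CPU
 * that owns the per-CPU data, hence the WARN_ON_ONCE.
 */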
static void send_dm_alert(struct work_struct *unused)
{
	struct sk_buff *skb;
	struct per_cpu_dm_data *data = &get_cpu_var(dm_cpu_data);

	WARN_ON_ONCE(data->cpu != smp_processor_id());

	/*
	 * Grab the skb we're about to send
	 */
	skb = rcu_dereference_protected(data->skb, 1);

	/*
	 * Replace it with a new one
	 */
	reset_per_cpu_data(data);

	/*
	 * Ship it!
	 */
	if (skb)
		genlmsg_multicast(skb, 0, NET_DM_GRP_ALERT, GFP_KERNEL);

	put_cpu_var(dm_cpu_data);
}

/*
 * This is the timer function to delay the sending of an alert
 * in the event that more drops will arrive during the
 * hysteresis period.  Note that it operates under the timer interrupt,
 * so we don't need to disable preemption here.
 */
static void sched_send_work(unsigned long unused)
{
	struct per_cpu_dm_data *data = &get_cpu_var(dm_cpu_data);

	schedule_work_on(smp_processor_id(), &data->dm_alert_work);

	put_cpu_var(dm_cpu_data);
}

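/*
 * Core accounting path, called from the kfree_skb and napi_poll
 * tracepoint hooks: consume one unit of the hit budget, then
 * either bump the counter of an already-recorded drop location
 * or append a new drop point for it, arming the hysteresis timer
 * so the alert goes out once the quiet period expires.
 */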
static void trace_drop_common(struct sk_buff *skb, void *location)
{
	struct net_dm_alert_msg *msg;
	struct nlmsghdr *nlh;
	struct nlattr *nla;
	int i;
	struct sk_buff *dskb;
	struct per_cpu_dm_data *data = &get_cpu_var(dm_cpu_data);

	rcu_read_lock();
	dskb = rcu_dereference(data->skb);

	if (!dskb)
		goto out;

	if (!atomic_add_unless(&data->dm_hit_count, -1, 0)) {
		/*
		 * we're already at zero, discard this hit
		 */
		goto out;
	}

	nlh = (struct nlmsghdr *)dskb->data;
	nla = genlmsg_data(nlmsg_data(nlh));
	msg = nla_data(nla);
	for (i = 0; i < msg->entries; i++) {
		if (!memcmp(&location, msg->points[i].pc, sizeof(void *))) {
			msg->points[i].count++;
			atomic_inc(&data->dm_hit_count);
			goto out;
		}
	}

	/*
	 * We need to create a new entry
	 */
	__nla_reserve_nohdr(dskb, sizeof(struct net_dm_drop_point));
	nla->nla_len += NLA_ALIGN(sizeof(struct net_dm_drop_point));
	memcpy(msg->points[msg->entries].pc, &location, sizeof(void *));
	msg->points[msg->entries].count = 1;
	msg->entries++;

	if (!timer_pending(&data->send_timer)) {
		data->send_timer.expires = jiffies + dm_delay * HZ;
		add_timer_on(&data->send_timer, smp_processor_id());
	}

out:
	rcu_read_unlock();
	put_cpu_var(dm_cpu_data);
}

static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location)
{
	trace_drop_common(skb, location);
}

static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi)
{
	struct dm_hw_stat_delta *new_stat;

	/*
	 * Don't check napi structures with no associated device
	 */
	if (!napi->dev)
		return;

	rcu_read_lock();
	list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
		/*
		 * only add a note to our monitor buffer if:
		 * 1) this is the dev we received on
		 * 2) it's been longer than dm_hw_check_delta since last_rx
		 * 3) our rx_dropped count has gone up
		 */
		if ((new_stat->dev == napi->dev) &&
		    (time_after(jiffies, new_stat->last_rx + dm_hw_check_delta)) &&
		    (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) {
			trace_drop_common(NULL, NULL);
			new_stat->last_drop_val = napi->dev->stats.rx_dropped;
			new_stat->last_rx = jiffies;
			break;
		}
	}
	rcu_read_unlock();
}

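/*
 * Flip tracing on or off under trace_state_mutex: attach or
 * detach the kfree_skb and napi_poll tracepoint probes, and on
 * the way down reap list entries whose device has already gone
 * away.  Returns -EAGAIN if we are already in the requested
 * state.
 */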
static int set_all_monitor_traces(int state)
{
	int rc = 0;
	struct dm_hw_stat_delta *new_stat = NULL;
	struct dm_hw_stat_delta *temp;

	mutex_lock(&trace_state_mutex);

	if (state == trace_state) {
		rc = -EAGAIN;
		goto out_unlock;
	}

	switch (state) {
	case TRACE_ON:
		rc |= register_trace_kfree_skb(trace_kfree_skb_hit, NULL);
		rc |= register_trace_napi_poll(trace_napi_poll_hit, NULL);
		break;
	case TRACE_OFF:
		rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit, NULL);
		rc |= unregister_trace_napi_poll(trace_napi_poll_hit, NULL);

		tracepoint_synchronize_unregister();

		/*
		 * Clean the device list
		 */
		list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) {
			if (new_stat->dev == NULL) {
				list_del_rcu(&new_stat->list);
				kfree_rcu(new_stat, rcu);
			}
		}
		break;
	default:
		rc = 1;
		break;
	}

	if (!rc)
		trace_state = state;
	else
		rc = -EINPROGRESS;

out_unlock:
	mutex_unlock(&trace_state_mutex);

	return rc;
}

static int net_dm_cmd_config(struct sk_buff *skb,
			struct genl_info *info)
{
	return -ENOTSUPP;
}

static int net_dm_cmd_trace(struct sk_buff *skb,
			struct genl_info *info)
{
	switch (info->genlhdr->cmd) {
	case NET_DM_CMD_START:
		return set_all_monitor_traces(TRACE_ON);
	case NET_DM_CMD_STOP:
		return set_all_monitor_traces(TRACE_OFF);
	}

	return -ENOTSUPP;
}

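/*
 * Netdevice notifier: add a stats-delta entry when a device
 * registers, and clear its dev pointer on unregister so the RCU
 * readers above never chase a stale device.  The entry itself is
 * freed immediately only while tracing is off; otherwise it stays
 * on the list until set_all_monitor_traces(TRACE_OFF) reaps it.
 */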
static int dropmon_net_event(struct notifier_block *ev_block,
			unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;
	struct dm_hw_stat_delta *new_stat = NULL;
	struct dm_hw_stat_delta *tmp;

	switch (event) {
	case NETDEV_REGISTER:
		new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL);

		if (!new_stat)
			goto out;

		new_stat->dev = dev;
		new_stat->last_rx = jiffies;
		mutex_lock(&trace_state_mutex);
		list_add_rcu(&new_stat->list, &hw_stats_list);
		mutex_unlock(&trace_state_mutex);
		break;
	case NETDEV_UNREGISTER:
		mutex_lock(&trace_state_mutex);
		list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) {
			if (new_stat->dev == dev) {
				new_stat->dev = NULL;
				if (trace_state == TRACE_OFF) {
					list_del_rcu(&new_stat->list);
					kfree_rcu(new_stat, rcu);
					break;
				}
			}
		}
		mutex_unlock(&trace_state_mutex);
		break;
	}
out:
	return NOTIFY_DONE;
}

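/*
 * Generic netlink command table.  Userspace (the dropwatch
 * utility, for example) drives the monitor with NET_DM_CMD_START
 * and NET_DM_CMD_STOP requests to the "NET_DM" family and listens
 * for NET_DM_CMD_ALERT multicasts; NET_DM_CMD_CONFIG is accepted
 * but not implemented here.
 */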
static struct genl_ops dropmon_ops[] = {
	{
		.cmd = NET_DM_CMD_CONFIG,
		.doit = net_dm_cmd_config,
	},
	{
		.cmd = NET_DM_CMD_START,
		.doit = net_dm_cmd_trace,
	},
	{
		.cmd = NET_DM_CMD_STOP,
		.doit = net_dm_cmd_trace,
	},
};

static struct notifier_block dropmon_net_notifier = {
	.notifier_call = dropmon_net_event
};

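/*
 * Register the generic netlink family and the netdevice notifier,
 * then prime each present CPU with an empty alert skb, its send
 * timer, and its alert work item.  Runs as a late initcall so the
 * facilities it depends on are already up.
 */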
static int __init init_net_drop_monitor(void)
{
	struct per_cpu_dm_data *data;
	int cpu, rc;

	printk(KERN_INFO "Initializing network drop monitor service\n");

	if (sizeof(void *) > 8) {
		printk(KERN_ERR "Unable to store program counters on this arch, Drop monitor failed\n");
		return -ENOSPC;
	}

	rc = genl_register_family_with_ops(&net_drop_monitor_family,
					   dropmon_ops,
					   ARRAY_SIZE(dropmon_ops));
	if (rc) {
		printk(KERN_ERR "Could not create drop monitor netlink family\n");
		return rc;
	}

	rc = register_netdevice_notifier(&dropmon_net_notifier);
	if (rc < 0) {
		printk(KERN_CRIT "Failed to register netdevice notifier\n");
		goto out_unreg;
	}

	rc = 0;

	for_each_present_cpu(cpu) {
		data = &per_cpu(dm_cpu_data, cpu);
		data->cpu = cpu;
		INIT_WORK(&data->dm_alert_work, send_dm_alert);
		init_timer(&data->send_timer);
		data->send_timer.data = cpu;
		data->send_timer.function = sched_send_work;
		reset_per_cpu_data(data);
	}

	goto out;

out_unreg:
	genl_unregister_family(&net_drop_monitor_family);
out:
	return rc;
}

late_initcall(init_net_drop_monitor);