xref: /openbmc/linux/net/xdp/xdp_umem.c (revision 31af04cd)
// SPDX-License-Identifier: GPL-2.0
/* XDP user-space packet buffer
 * Copyright(c) 2018 Intel Corporation.
 */

#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/bpf.h>
#include <linux/mm.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

#include "xdp_umem.h"
#include "xsk_queue.h"

#define XDP_UMEM_MIN_CHUNK_SIZE 2048

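/* Add an XDP socket to the umem's list of bound sockets. The list is
 * traversed under RCU on the datapath, so insertion uses list_add_rcu()
 * and is serialized against other writers by xsk_list_lock.
 */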
void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
{
	unsigned long flags;

	spin_lock_irqsave(&umem->xsk_list_lock, flags);
	list_add_rcu(&xs->list, &umem->xsk_list);
	spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
}

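/* Remove an XDP socket from the umem's socket list. RCU readers may
 * still see the entry until a grace period has elapsed.
 */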
void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
{
	unsigned long flags;

	spin_lock_irqsave(&umem->xsk_list_lock, flags);
	list_del_rcu(&xs->list);
	spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
}

/* The umem is stored both in the _rx struct and the _tx struct as we do
 * not know if the device has more tx queues than rx, or vice versa.
 * This might also change during run time.
 */
static int xdp_reg_umem_at_qid(struct net_device *dev, struct xdp_umem *umem,
			       u16 queue_id)
{
	if (queue_id >= max_t(unsigned int,
			      dev->real_num_rx_queues,
			      dev->real_num_tx_queues))
		return -EINVAL;

	if (queue_id < dev->real_num_rx_queues)
		dev->_rx[queue_id].umem = umem;
	if (queue_id < dev->real_num_tx_queues)
		dev->_tx[queue_id].umem = umem;

	return 0;
}

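/* Return the umem registered for the given queue id, or NULL if there
 * is none. The rx queues are checked before the tx queues.
 */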
struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
				       u16 queue_id)
{
	if (queue_id < dev->real_num_rx_queues)
		return dev->_rx[queue_id].umem;
	if (queue_id < dev->real_num_tx_queues)
		return dev->_tx[queue_id].umem;

	return NULL;
}

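/* Clear any umem pointers previously registered for this queue id. */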
static void xdp_clear_umem_at_qid(struct net_device *dev, u16 queue_id)
{
	if (queue_id < dev->real_num_rx_queues)
		dev->_rx[queue_id].umem = NULL;
	if (queue_id < dev->real_num_tx_queues)
		dev->_tx[queue_id].umem = NULL;
}

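/* Bind a umem to a device and queue id. Copy mode only records the
 * association. Zero-copy mode additionally requires the driver to
 * implement ndo_bpf and ndo_xsk_async_xmit and to accept the
 * XDP_SETUP_XSK_UMEM command; if that setup fails and zero-copy was
 * not explicitly requested, we fall back to copy mode.
 */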
int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
			u16 queue_id, u16 flags)
{
	bool force_zc, force_copy;
	struct netdev_bpf bpf;
	int err = 0;

	force_zc = flags & XDP_ZEROCOPY;
	force_copy = flags & XDP_COPY;

	if (force_zc && force_copy)
		return -EINVAL;

	rtnl_lock();
	if (xdp_get_umem_from_qid(dev, queue_id)) {
		err = -EBUSY;
		goto out_rtnl_unlock;
	}

	err = xdp_reg_umem_at_qid(dev, umem, queue_id);
	if (err)
		goto out_rtnl_unlock;

	umem->dev = dev;
	umem->queue_id = queue_id;
	if (force_copy)
		/* For copy-mode, we are done. */
		goto out_rtnl_unlock;

	if (!dev->netdev_ops->ndo_bpf ||
	    !dev->netdev_ops->ndo_xsk_async_xmit) {
		err = -EOPNOTSUPP;
		goto err_unreg_umem;
	}

	bpf.command = XDP_SETUP_XSK_UMEM;
	bpf.xsk.umem = umem;
	bpf.xsk.queue_id = queue_id;

	err = dev->netdev_ops->ndo_bpf(dev, &bpf);
	if (err)
		goto err_unreg_umem;
	rtnl_unlock();

	dev_hold(dev);
	umem->zc = true;
	return 0;

err_unreg_umem:
	xdp_clear_umem_at_qid(dev, queue_id);
	if (!force_zc)
		err = 0; /* fallback to copy mode */
out_rtnl_unlock:
	rtnl_unlock();
	return err;
}

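/* Undo xdp_umem_assign_dev(): ask the driver to tear down its
 * zero-copy umem state (if any), clear the per-queue umem pointers and
 * drop the device reference taken for zero-copy mode.
 */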
static void xdp_umem_clear_dev(struct xdp_umem *umem)
{
	struct netdev_bpf bpf;
	int err;

	if (umem->zc) {
		bpf.command = XDP_SETUP_XSK_UMEM;
		bpf.xsk.umem = NULL;
		bpf.xsk.queue_id = umem->queue_id;

		rtnl_lock();
		err = umem->dev->netdev_ops->ndo_bpf(umem->dev, &bpf);
		rtnl_unlock();

		if (err)
			WARN(1, "failed to disable umem!\n");
	}

	if (umem->dev) {
		rtnl_lock();
		xdp_clear_umem_at_qid(umem->dev, umem->queue_id);
		rtnl_unlock();
	}

	if (umem->zc) {
		dev_put(umem->dev);
		umem->zc = false;
	}
}

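/* Release the pages pinned by xdp_umem_pin_pages(). The pages are
 * marked dirty since user space and the device may have written to
 * them.
 */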
static void xdp_umem_unpin_pages(struct xdp_umem *umem)
{
	unsigned int i;

	for (i = 0; i < umem->npgs; i++) {
		struct page *page = umem->pgs[i];

		set_page_dirty_lock(page);
		put_page(page);
	}

	kfree(umem->pgs);
	umem->pgs = NULL;
}

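/* Return the pinned pages to the user's RLIMIT_MEMLOCK accounting. */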
static void xdp_umem_unaccount_pages(struct xdp_umem *umem)
{
	if (umem->user) {
		atomic_long_sub(umem->npgs, &umem->user->locked_vm);
		free_uid(umem->user);
	}
}

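/* Final teardown of a umem: unbind it from the device, destroy the
 * fill and completion queues, unpin and unaccount the user pages, and
 * free the umem itself.
 */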
static void xdp_umem_release(struct xdp_umem *umem)
{
	struct task_struct *task;
	struct mm_struct *mm;

	xdp_umem_clear_dev(umem);

	if (umem->fq) {
		xskq_destroy(umem->fq);
		umem->fq = NULL;
	}

	if (umem->cq) {
		xskq_destroy(umem->cq);
		umem->cq = NULL;
	}

	xsk_reuseq_destroy(umem);

	xdp_umem_unpin_pages(umem);

	task = get_pid_task(umem->pid, PIDTYPE_PID);
	put_pid(umem->pid);
	if (!task)
		goto out;
	mm = get_task_mm(task);
	put_task_struct(task);
	if (!mm)
		goto out;

	mmput(mm);
	kfree(umem->pages);
	umem->pages = NULL;

	xdp_umem_unaccount_pages(umem);
out:
	kfree(umem);
}

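/* Work item that performs the actual umem release in process context. */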
static void xdp_umem_release_deferred(struct work_struct *work)
{
	struct xdp_umem *umem = container_of(work, struct xdp_umem, work);

	xdp_umem_release(umem);
}

void xdp_get_umem(struct xdp_umem *umem)
{
	refcount_inc(&umem->users);
}

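/* Drop a reference to the umem. The final put defers the release to a
 * workqueue, since xdp_umem_release() may sleep (rtnl lock, mmput).
 */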
void xdp_put_umem(struct xdp_umem *umem)
{
	if (!umem)
		return;

	if (refcount_dec_and_test(&umem->users)) {
		INIT_WORK(&umem->work, xdp_umem_release_deferred);
		schedule_work(&umem->work);
	}
}

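/* Pin the user memory area with get_user_pages() so that it can be
 * accessed without faulting. On a partial pin the pages that were
 * pinned are released again and an error is returned.
 */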
static int xdp_umem_pin_pages(struct xdp_umem *umem)
{
	unsigned int gup_flags = FOLL_WRITE;
	long npgs;
	int err;

	umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs),
			    GFP_KERNEL | __GFP_NOWARN);
	if (!umem->pgs)
		return -ENOMEM;

	down_write(&current->mm->mmap_sem);
	npgs = get_user_pages(umem->address, umem->npgs,
			      gup_flags, &umem->pgs[0], NULL);
	up_write(&current->mm->mmap_sem);

	if (npgs != umem->npgs) {
		if (npgs >= 0) {
			umem->npgs = npgs;
			err = -ENOMEM;
			goto out_pin;
		}
		err = npgs;
		goto out_pgs;
	}
	return 0;

out_pin:
	xdp_umem_unpin_pages(umem);
out_pgs:
	kfree(umem->pgs);
	umem->pgs = NULL;
	return err;
}

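/* Charge the pinned pages against the user's RLIMIT_MEMLOCK limit,
 * unless the caller has CAP_IPC_LOCK. The cmpxchg loop deals with
 * concurrent updates of the user's locked_vm counter.
 */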
static int xdp_umem_account_pages(struct xdp_umem *umem)
{
	unsigned long lock_limit, new_npgs, old_npgs;

	if (capable(CAP_IPC_LOCK))
		return 0;

	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	umem->user = get_uid(current_user());

	do {
		old_npgs = atomic_long_read(&umem->user->locked_vm);
		new_npgs = old_npgs + umem->npgs;
		if (new_npgs > lock_limit) {
			free_uid(umem->user);
			umem->user = NULL;
			return -ENOBUFS;
		}
	} while (atomic_long_cmpxchg(&umem->user->locked_vm, old_npgs,
				     new_npgs) != old_npgs);
	return 0;
}

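/* Validate an XDP_UMEM_REG request and set up the umem: check chunk
 * size, alignment and headroom, account and pin the user pages, and
 * build the kernel-side array of page addresses.
 */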
static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
{
	u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
	unsigned int chunks, chunks_per_page;
	u64 addr = mr->addr, size = mr->len;
	int size_chk, err, i;

	if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {
		/* Strictly speaking we could support this, if:
		 * - huge pages, or
		 * - using an IOMMU, or
		 * - making sure the memory area is consecutive
		 * but for now, we simply say "computer says no".
		 */
		return -EINVAL;
	}

	if (!is_power_of_2(chunk_size))
		return -EINVAL;

	if (!PAGE_ALIGNED(addr)) {
		/* For simplicity, the memory area has to be page size
		 * aligned. This might change in the future.
		 */
		return -EINVAL;
	}

	if ((addr + size) < addr)
		return -EINVAL;

	chunks = (unsigned int)div_u64(size, chunk_size);
	if (chunks == 0)
		return -EINVAL;

	chunks_per_page = PAGE_SIZE / chunk_size;
	if (chunks < chunks_per_page || chunks % chunks_per_page)
		return -EINVAL;

	headroom = ALIGN(headroom, 64);

	size_chk = chunk_size - headroom - XDP_PACKET_HEADROOM;
	if (size_chk < 0)
		return -EINVAL;

	umem->pid = get_task_pid(current, PIDTYPE_PID);
	umem->address = (unsigned long)addr;
	umem->chunk_mask = ~((u64)chunk_size - 1);
	umem->size = size;
	umem->headroom = headroom;
	umem->chunk_size_nohr = chunk_size - headroom;
	umem->npgs = size / PAGE_SIZE;
	umem->pgs = NULL;
	umem->user = NULL;
	INIT_LIST_HEAD(&umem->xsk_list);
	spin_lock_init(&umem->xsk_list_lock);

	refcount_set(&umem->users, 1);

	err = xdp_umem_account_pages(umem);
	if (err)
		goto out;

	err = xdp_umem_pin_pages(umem);
	if (err)
		goto out_account;

	umem->pages = kcalloc(umem->npgs, sizeof(*umem->pages), GFP_KERNEL);
	if (!umem->pages) {
		err = -ENOMEM;
		goto out_account;
	}

	for (i = 0; i < umem->npgs; i++)
		umem->pages[i].addr = page_address(umem->pgs[i]);

	return 0;

out_account:
	xdp_umem_unaccount_pages(umem);
out:
	put_pid(umem->pid);
	return err;
}

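/* Allocate a umem and register the user memory described by @mr.
 * Returns an ERR_PTR() on failure.
 */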
struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr)
{
	struct xdp_umem *umem;
	int err;

	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
	if (!umem)
		return ERR_PTR(-ENOMEM);

	err = xdp_umem_reg(umem, mr);
	if (err) {
		kfree(umem);
		return ERR_PTR(err);
	}

	return umem;
}

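/* Return true once both the fill ring and the completion ring have
 * been created for this umem, which is required before it can be
 * bound.
 */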
bool xdp_umem_validate_queues(struct xdp_umem *umem)
{
	return umem->fq && umem->cq;
}
415