xref: /openbmc/linux/net/smc/smc_core.c (revision d2ba09c1)
// SPDX-License-Identifier: GPL-2.0
/*
 *  Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 *  Basic Transport Functions exploiting Infiniband API
 *
 *  Copyright IBM Corp. 2016
 *
 *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 */

#include <linux/socket.h>
#include <linux/if_vlan.h>
#include <linux/random.h>
#include <linux/workqueue.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <rdma/ib_verbs.h>

#include "smc.h"
#include "smc_clc.h"
#include "smc_core.h"
#include "smc_ib.h"
#include "smc_wr.h"
#include "smc_llc.h"
#include "smc_cdc.h"
#include "smc_close.h"

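/* free delays are in jiffies; the client delay is slightly longer than the
 * server's 10 minutes, so the server side always gives up its link group
 * first and the two sides cannot get out of sync
 */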
#define SMC_LGR_NUM_INCR		256
#define SMC_LGR_FREE_DELAY_SERV		(600 * HZ)
#define SMC_LGR_FREE_DELAY_CLNT		(SMC_LGR_FREE_DELAY_SERV + 10)

static struct smc_lgr_list smc_lgr_list = {	/* established link groups */
	.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
	.list = LIST_HEAD_INIT(smc_lgr_list.list),
	.num = 0,
};

static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
			 struct smc_buf_desc *buf_desc);

static void smc_lgr_schedule_free_work(struct smc_link_group *lgr)
{
	/* client link group creation always follows the server link group
	 * creation. For the client, use a somewhat higher removal delay;
	 * otherwise there is a risk of out-of-sync link groups.
	 */
	mod_delayed_work(system_wq, &lgr->free_work,
			 lgr->role == SMC_CLNT ? SMC_LGR_FREE_DELAY_CLNT :
						 SMC_LGR_FREE_DELAY_SERV);
}

/* Register connection's alert token in our lookup structure.
 * To use rbtrees we have to implement our own insert core.
 * Requires @conns_lock
 * @conn	connection to register
 */
static void smc_lgr_add_alert_token(struct smc_connection *conn)
{
	struct rb_node **link, *parent = NULL;
	u32 token = conn->alert_token_local;

	link = &conn->lgr->conns_all.rb_node;
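	/* standard rbtree descent: go left while the new token is smaller
	 * than the current node's token, right otherwise, until a free
	 * leaf position is found
	 */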
	while (*link) {
		struct smc_connection *cur = rb_entry(*link,
					struct smc_connection, alert_node);

		parent = *link;
		if (cur->alert_token_local > token)
			link = &parent->rb_left;
		else
			link = &parent->rb_right;
	}
	/* Put the new node there */
	rb_link_node(&conn->alert_node, parent, link);
	rb_insert_color(&conn->alert_node, &conn->lgr->conns_all);
}

/* Register connection in link group by assigning an alert token
 * registered in a search tree.
 * Requires @conns_lock
 * Note that '0' is a reserved value and not assigned.
 */
static void smc_lgr_register_conn(struct smc_connection *conn)
{
	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
	static atomic_t nexttoken = ATOMIC_INIT(0);

	/* find a new alert_token_local value not yet used by some connection
	 * in this link group
	 */
	sock_hold(&smc->sk); /* sock_put in smc_lgr_unregister_conn() */
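	/* a wrapped counter yields 0, which the loop condition rejects, so
	 * the reserved token value 0 is never handed out
	 */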
	while (!conn->alert_token_local) {
		conn->alert_token_local = atomic_inc_return(&nexttoken);
		if (smc_lgr_find_conn(conn->alert_token_local, conn->lgr))
			conn->alert_token_local = 0;
	}
	smc_lgr_add_alert_token(conn);
	conn->lgr->conns_num++;
}

/* Unregister connection and reset the alert token of the given connection
 */
static void __smc_lgr_unregister_conn(struct smc_connection *conn)
{
	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
	struct smc_link_group *lgr = conn->lgr;

	rb_erase(&conn->alert_node, &lgr->conns_all);
	lgr->conns_num--;
	conn->alert_token_local = 0;
	conn->lgr = NULL;
	sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */
}

/* Unregister connection and trigger lgr freeing if applicable
 */
static void smc_lgr_unregister_conn(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;
	int reduced = 0;

	write_lock_bh(&lgr->conns_lock);
	if (conn->alert_token_local) {
		reduced = 1;
		__smc_lgr_unregister_conn(conn);
	}
	write_unlock_bh(&lgr->conns_lock);
	if (!reduced || lgr->conns_num)
		return;
	smc_lgr_schedule_free_work(lgr);
}

static void smc_lgr_free_work(struct work_struct *work)
{
	struct smc_link_group *lgr = container_of(to_delayed_work(work),
						  struct smc_link_group,
						  free_work);
	bool conns;

	spin_lock_bh(&smc_lgr_list.lock);
	if (list_empty(&lgr->list))
		goto free;
	read_lock_bh(&lgr->conns_lock);
	conns = RB_EMPTY_ROOT(&lgr->conns_all);
	read_unlock_bh(&lgr->conns_lock);
	if (!conns) { /* number of lgr connections is no longer zero */
		spin_unlock_bh(&smc_lgr_list.lock);
		return;
	}
	list_del_init(&lgr->list); /* remove from smc_lgr_list */
free:
	spin_unlock_bh(&smc_lgr_list.lock);
	if (!delayed_work_pending(&lgr->free_work)) {
		if (lgr->lnk[SMC_SINGLE_LINK].state != SMC_LNK_INACTIVE)
			smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
		smc_lgr_free(lgr);
	}
}

/* create a new SMC link group */
static int smc_lgr_create(struct smc_sock *smc,
			  struct smc_ib_device *smcibdev, u8 ibport,
			  char *peer_systemid, unsigned short vlan_id)
{
	struct smc_link_group *lgr;
	struct smc_link *lnk;
	u8 rndvec[3];
	int rc = 0;
	int i;

	lgr = kzalloc(sizeof(*lgr), GFP_KERNEL);
	if (!lgr) {
		rc = -ENOMEM;
		goto out;
	}
	lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
	lgr->sync_err = 0;
	memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN);
	lgr->vlan_id = vlan_id;
	rwlock_init(&lgr->sndbufs_lock);
	rwlock_init(&lgr->rmbs_lock);
	for (i = 0; i < SMC_RMBE_SIZES; i++) {
		INIT_LIST_HEAD(&lgr->sndbufs[i]);
		INIT_LIST_HEAD(&lgr->rmbs[i]);
	}
	smc_lgr_list.num += SMC_LGR_NUM_INCR;
	memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE);
	INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work);
	lgr->conns_all = RB_ROOT;

	lnk = &lgr->lnk[SMC_SINGLE_LINK];
	/* initialize link */
	lnk->state = SMC_LNK_ACTIVATING;
	lnk->link_id = SMC_SINGLE_LINK;
	lnk->smcibdev = smcibdev;
	lnk->ibport = ibport;
	lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu;
	if (!smcibdev->initialized)
		smc_ib_setup_per_ibdev(smcibdev);
	get_random_bytes(rndvec, sizeof(rndvec));
	lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16);
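	/* InfiniBand packet sequence numbers are 24 bits wide, so the
	 * initial PSN is assembled from three random bytes
	 */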
	rc = smc_llc_link_init(lnk);
	if (rc)
		goto free_lgr;
	rc = smc_wr_alloc_link_mem(lnk);
	if (rc)
		goto clear_llc_lnk;
	rc = smc_ib_create_protection_domain(lnk);
	if (rc)
		goto free_link_mem;
	rc = smc_ib_create_queue_pair(lnk);
	if (rc)
		goto dealloc_pd;
	rc = smc_wr_create_link(lnk);
	if (rc)
		goto destroy_qp;

	smc->conn.lgr = lgr;
	rwlock_init(&lgr->conns_lock);
	spin_lock_bh(&smc_lgr_list.lock);
	list_add(&lgr->list, &smc_lgr_list.list);
	spin_unlock_bh(&smc_lgr_list.lock);
	return 0;

destroy_qp:
	smc_ib_destroy_queue_pair(lnk);
dealloc_pd:
	smc_ib_dealloc_protection_domain(lnk);
free_link_mem:
	smc_wr_free_link_mem(lnk);
clear_llc_lnk:
	smc_llc_link_clear(lnk);
free_lgr:
	kfree(lgr);
out:
	return rc;
}

static void smc_buf_unuse(struct smc_connection *conn)
{
	if (conn->sndbuf_desc)
		conn->sndbuf_desc->used = 0;
	if (conn->rmb_desc) {
		if (!conn->rmb_desc->regerr) {
			conn->rmb_desc->reused = 1;
			conn->rmb_desc->used = 0;
		} else {
			/* buf registration failed, reuse not possible */
			struct smc_link_group *lgr = conn->lgr;

			write_lock_bh(&lgr->rmbs_lock);
			list_del(&conn->rmb_desc->list);
			write_unlock_bh(&lgr->rmbs_lock);

			smc_buf_free(lgr, true, conn->rmb_desc);
		}
	}
}

/* remove a finished connection from its link group */
void smc_conn_free(struct smc_connection *conn)
{
	if (!conn->lgr)
		return;
	smc_cdc_tx_dismiss_slots(conn);
	smc_lgr_unregister_conn(conn);
	smc_buf_unuse(conn);
}

static void smc_link_clear(struct smc_link *lnk)
{
	lnk->peer_qpn = 0;
	smc_llc_link_clear(lnk);
	smc_ib_modify_qp_reset(lnk);
	smc_wr_free_link(lnk);
	smc_ib_destroy_queue_pair(lnk);
	smc_ib_dealloc_protection_domain(lnk);
	smc_wr_free_link_mem(lnk);
}

static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
			 struct smc_buf_desc *buf_desc)
{
	struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];

	if (is_rmb) {
		if (buf_desc->mr_rx[SMC_SINGLE_LINK])
			smc_ib_put_memory_region(
					buf_desc->mr_rx[SMC_SINGLE_LINK]);
		smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc,
				    DMA_FROM_DEVICE);
	} else {
		smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc,
				    DMA_TO_DEVICE);
	}
	sg_free_table(&buf_desc->sgt[SMC_SINGLE_LINK]);
	if (buf_desc->pages)
		__free_pages(buf_desc->pages, buf_desc->order);
	kfree(buf_desc);
}

static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb)
{
	struct smc_buf_desc *buf_desc, *bf_desc;
	struct list_head *buf_list;
	int i;

	for (i = 0; i < SMC_RMBE_SIZES; i++) {
		if (is_rmb)
			buf_list = &lgr->rmbs[i];
		else
			buf_list = &lgr->sndbufs[i];
		list_for_each_entry_safe(buf_desc, bf_desc, buf_list,
					 list) {
			list_del(&buf_desc->list);
			smc_buf_free(lgr, is_rmb, buf_desc);
		}
	}
}

static void smc_lgr_free_bufs(struct smc_link_group *lgr)
{
	/* free send buffers */
	__smc_lgr_free_bufs(lgr, false);
	/* free rmbs */
	__smc_lgr_free_bufs(lgr, true);
}

/* remove a link group */
void smc_lgr_free(struct smc_link_group *lgr)
{
	smc_lgr_free_bufs(lgr);
	smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]);
	kfree(lgr);
}

void smc_lgr_forget(struct smc_link_group *lgr)
{
	spin_lock_bh(&smc_lgr_list.lock);
	/* do not use this link group for new connections */
	if (!list_empty(&lgr->list))
		list_del_init(&lgr->list);
	spin_unlock_bh(&smc_lgr_list.lock);
}

/* terminate link group abnormally */
void smc_lgr_terminate(struct smc_link_group *lgr)
{
	struct smc_connection *conn;
	struct smc_sock *smc;
	struct rb_node *node;

	if (lgr->terminating)
		return;	/* lgr already terminating */
	lgr->terminating = 1;
	smc_lgr_forget(lgr);
	smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);

	write_lock_bh(&lgr->conns_lock);
	node = rb_first(&lgr->conns_all);
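	/* tear down all connections: each pass removes the leftmost node,
	 * flags a peer abort and hands the socket to the close worker; the
	 * lock is dropped around scheduling, so restart from rb_first()
	 */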
	while (node) {
		conn = rb_entry(node, struct smc_connection, alert_node);
		smc = container_of(conn, struct smc_sock, conn);
		sock_hold(&smc->sk); /* sock_put in close work */
		conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
		__smc_lgr_unregister_conn(conn);
		write_unlock_bh(&lgr->conns_lock);
		if (!schedule_work(&conn->close_work))
			sock_put(&smc->sk);
		write_lock_bh(&lgr->conns_lock);
		node = rb_first(&lgr->conns_all);
	}
	write_unlock_bh(&lgr->conns_lock);
	wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait);
	smc_lgr_schedule_free_work(lgr);
}

/* Called when IB port is terminated */
void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport)
{
	struct smc_link_group *lgr, *l;

	list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
		if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev &&
		    lgr->lnk[SMC_SINGLE_LINK].ibport == ibport)
			smc_lgr_terminate(lgr);
	}
}

/* Determine vlan of internal TCP socket.
 * @vlan_id: address to store the determined vlan id into
 */
static int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id)
{
	struct dst_entry *dst = sk_dst_get(clcsock->sk);
	struct net_device *ndev;
	int i, nest_lvl, rc = 0;

	*vlan_id = 0;
	if (!dst) {
		rc = -ENOTCONN;
		goto out;
	}
	if (!dst->dev) {
		rc = -ENODEV;
		goto out_rel;
	}

	ndev = dst->dev;
	if (is_vlan_dev(ndev)) {
		*vlan_id = vlan_dev_vlan_id(ndev);
		goto out_rel;
	}

	rtnl_lock();
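	/* the route's device is not a vlan device itself; walk its chain of
	 * lower devices (e.g. a bond stacked on a vlan) looking for one
	 */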
	nest_lvl = dev_get_nest_level(ndev);
	for (i = 0; i < nest_lvl; i++) {
		struct list_head *lower = &ndev->adj_list.lower;

		if (list_empty(lower))
			break;
		lower = lower->next;
		ndev = (struct net_device *)netdev_lower_get_next(ndev, &lower);
		if (is_vlan_dev(ndev)) {
			*vlan_id = vlan_dev_vlan_id(ndev);
			break;
		}
	}
	rtnl_unlock();

out_rel:
	dst_release(dst);
out:
	return rc;
}

/* determine the link gid matching the vlan id of the link group */
static int smc_link_determine_gid(struct smc_link_group *lgr)
{
	struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
	struct ib_gid_attr gattr;
	union ib_gid gid;
	int i;

	if (!lgr->vlan_id) {
		lnk->gid = lnk->smcibdev->gid[lnk->ibport - 1];
		return 0;
	}

	for (i = 0; i < lnk->smcibdev->pattr[lnk->ibport - 1].gid_tbl_len;
	     i++) {
		if (ib_query_gid(lnk->smcibdev->ibdev, lnk->ibport, i, &gid,
				 &gattr))
			continue;
		if (gattr.ndev) {
			if (is_vlan_dev(gattr.ndev) &&
			    vlan_dev_vlan_id(gattr.ndev) == lgr->vlan_id) {
				lnk->gid = gid;
				dev_put(gattr.ndev);
				return 0;
			}
			dev_put(gattr.ndev);
		}
	}
	return -ENODEV;
}

/* create a new SMC connection (and a new link group if necessary) */
int smc_conn_create(struct smc_sock *smc,
		    struct smc_ib_device *smcibdev, u8 ibport,
		    struct smc_clc_msg_local *lcl, int srv_first_contact)
{
	struct smc_connection *conn = &smc->conn;
	int local_contact = SMC_FIRST_CONTACT;
	struct smc_link_group *lgr;
	unsigned short vlan_id;
	enum smc_lgr_role role;
	int rc = 0;

	role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
	rc = smc_vlan_by_tcpsk(smc->clcsock, &vlan_id);
	if (rc)
		return rc;

	if ((role == SMC_CLNT) && srv_first_contact)
		/* create new link group as well */
		goto create;

	/* determine if an existing link group can be reused */
	spin_lock_bh(&smc_lgr_list.lock);
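	/* a link group is only reusable if peer system id, gid and mac all
	 * match, role and vlan id agree, it has no sync error, and - on the
	 * server side - it still has a free RMB slot
	 */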
	list_for_each_entry(lgr, &smc_lgr_list.list, list) {
		write_lock_bh(&lgr->conns_lock);
		if (!memcmp(lgr->peer_systemid, lcl->id_for_peer,
			    SMC_SYSTEMID_LEN) &&
		    !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid,
			    SMC_GID_SIZE) &&
		    !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac,
			    sizeof(lcl->mac)) &&
		    !lgr->sync_err &&
		    (lgr->role == role) &&
		    (lgr->vlan_id == vlan_id) &&
		    ((role == SMC_CLNT) ||
		     (lgr->conns_num < SMC_RMBS_PER_LGR_MAX))) {
			/* link group found */
			local_contact = SMC_REUSE_CONTACT;
			conn->lgr = lgr;
			smc_lgr_register_conn(conn); /* add smc conn to lgr */
			write_unlock_bh(&lgr->conns_lock);
			break;
		}
		write_unlock_bh(&lgr->conns_lock);
	}
	spin_unlock_bh(&smc_lgr_list.lock);

	if (role == SMC_CLNT && !srv_first_contact &&
	    (local_contact == SMC_FIRST_CONTACT)) {
		/* the server reuses a link group, but the client wants a
		 * new one; send an out_of_sync decline, reason:
		 * synchronization error
		 */
		return -ENOLINK;
	}

create:
	if (local_contact == SMC_FIRST_CONTACT) {
		rc = smc_lgr_create(smc, smcibdev, ibport,
				    lcl->id_for_peer, vlan_id);
		if (rc)
			goto out;
		smc_lgr_register_conn(conn); /* add smc conn to lgr */
		rc = smc_link_determine_gid(conn->lgr);
	}
	conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
	conn->local_tx_ctrl.len = SMC_WR_TX_SIZE;
#ifndef KERNEL_HAS_ATOMIC64
	spin_lock_init(&conn->acurs_lock);
#endif

out:
	return rc ? rc : local_contact;
}

/* convert the RMB size into the compressed notation - minimum 16K.
 * In contrast to plain ilog2, this rounds up to the next power of 2,
 * so the socket application gets at least its desired sndbuf / rcvbuf size.
 */
static u8 smc_compress_bufsize(int size)
{
	u8 compressed;

	if (size <= SMC_BUF_MIN_SIZE)
		return 0;

	size = (size - 1) >> 14;
	compressed = ilog2(size) + 1;
	if (compressed >= SMC_RMBE_SIZES)
		compressed = SMC_RMBE_SIZES - 1;
	return compressed;
}

/* convert the RMB size from compressed notation into integer */
int smc_uncompress_bufsize(u8 compressed)
{
	u32 size;

	size = 0x00000001 << (((int)compressed) + 14);
	return (int)size;
}
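
/* example: smc_compress_bufsize(65536) computes ((65536 - 1) >> 14) = 3 and
 * ilog2(3) + 1 = 2, and smc_uncompress_bufsize(2) restores 1 << 16 = 65536;
 * a request for 16385 bytes compresses to 1 and uncompresses to 32768, i.e.
 * sizes are always rounded up
 */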

/* try to reuse a sndbuf or rmb description slot for a certain
 * buffer size; if not available, return NULL
 */
static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize,
					     rwlock_t *lock,
					     struct list_head *buf_list)
{
	struct smc_buf_desc *buf_slot;

	read_lock_bh(lock);
	list_for_each_entry(buf_slot, buf_list, list) {
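		/* cmpxchg() claims a free slot atomically; the read lock
		 * only protects the list walk, not the used flag
		 */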
		if (cmpxchg(&buf_slot->used, 0, 1) == 0) {
			read_unlock_bh(lock);
			return buf_slot;
		}
	}
	read_unlock_bh(lock);
	return NULL;
}

/* one of the conditions for announcing a receiver's current window size is
 * that it "results in a minimum increase in the window size of 10% of the
 * receive buffer space" [RFC7609]
 */
static inline int smc_rmb_wnd_update_limit(int rmbe_size)
{
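	/* e.g. a 64 KiB RMB yields a limit of 6553 bytes, unless
	 * SOCK_MIN_SNDBUF / 2 is smaller
	 */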
	return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
}

static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr,
					       bool is_rmb, int bufsize)
{
	struct smc_buf_desc *buf_desc;
	struct smc_link *lnk;
	int rc;

	/* try to alloc a new buffer */
	buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
	if (!buf_desc)
		return ERR_PTR(-ENOMEM);

	buf_desc->order = get_order(bufsize);
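	/* best-effort high-order allocation: no warnings, no retries and no
	 * dipping into memory reserves, so a failure quickly falls back to
	 * the next smaller buffer size
	 */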
	buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN |
				      __GFP_NOMEMALLOC | __GFP_COMP |
				      __GFP_NORETRY | __GFP_ZERO,
				      buf_desc->order);
	if (!buf_desc->pages) {
		kfree(buf_desc);
		return ERR_PTR(-EAGAIN);
	}
	buf_desc->cpu_addr = (void *)page_address(buf_desc->pages);

	/* build the sg table from the pages */
	lnk = &lgr->lnk[SMC_SINGLE_LINK];
	rc = sg_alloc_table(&buf_desc->sgt[SMC_SINGLE_LINK], 1,
			    GFP_KERNEL);
	if (rc) {
		smc_buf_free(lgr, is_rmb, buf_desc);
		return ERR_PTR(rc);
	}
	sg_set_buf(buf_desc->sgt[SMC_SINGLE_LINK].sgl,
		   buf_desc->cpu_addr, bufsize);

	/* map sg table to DMA address */
	rc = smc_ib_buf_map_sg(lnk->smcibdev, buf_desc,
			       is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
	/* SMC protocol depends on mapping to one DMA address only */
	if (rc != 1) {
		smc_buf_free(lgr, is_rmb, buf_desc);
		return ERR_PTR(-EAGAIN);
	}

	/* create a new memory region for the RMB */
	if (is_rmb) {
		rc = smc_ib_get_memory_region(lnk->roce_pd,
					      IB_ACCESS_REMOTE_WRITE |
					      IB_ACCESS_LOCAL_WRITE,
					      buf_desc);
		if (rc) {
			smc_buf_free(lgr, is_rmb, buf_desc);
			return ERR_PTR(rc);
		}
	}

	buf_desc->len = bufsize;
	return buf_desc;
}

static int __smc_buf_create(struct smc_sock *smc, bool is_rmb)
{
	struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM);
	struct smc_connection *conn = &smc->conn;
	struct smc_link_group *lgr = conn->lgr;
	struct list_head *buf_list;
	int bufsize, bufsize_short;
	int sk_buf_size;
	rwlock_t *lock;

	if (is_rmb)
		/* use socket recv buffer size (w/o overhead) as start value */
		sk_buf_size = smc->sk.sk_rcvbuf / 2;
	else
		/* use socket send buffer size (w/o overhead) as start value */
		sk_buf_size = smc->sk.sk_sndbuf / 2;

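	/* walk the compressed sizes downward: reuse a free slot if one
	 * exists, otherwise allocate; -ENOMEM aborts, any other allocation
	 * error falls back to the next smaller size
	 */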
	for (bufsize_short = smc_compress_bufsize(sk_buf_size);
	     bufsize_short >= 0; bufsize_short--) {

		if (is_rmb) {
			lock = &lgr->rmbs_lock;
			buf_list = &lgr->rmbs[bufsize_short];
		} else {
			lock = &lgr->sndbufs_lock;
			buf_list = &lgr->sndbufs[bufsize_short];
		}
		bufsize = smc_uncompress_bufsize(bufsize_short);
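		/* skip sizes that would need more pages than a single
		 * scatterlist allocation can describe
		 */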
		if ((1 << get_order(bufsize)) > SG_MAX_SINGLE_ALLOC)
			continue;

		/* check for reusable slot in the link group */
		buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list);
		if (buf_desc) {
			memset(buf_desc->cpu_addr, 0, bufsize);
			break; /* found reusable slot */
		}

		buf_desc = smc_new_buf_create(lgr, is_rmb, bufsize);
		if (PTR_ERR(buf_desc) == -ENOMEM)
			break;
		if (IS_ERR(buf_desc))
			continue;

		buf_desc->used = 1;
		write_lock_bh(lock);
		list_add(&buf_desc->list, buf_list);
		write_unlock_bh(lock);
		break; /* found */
	}

	if (IS_ERR(buf_desc))
		return -ENOMEM;

	if (is_rmb) {
		conn->rmb_desc = buf_desc;
		conn->rmbe_size_short = bufsize_short;
		smc->sk.sk_rcvbuf = bufsize * 2;
		atomic_set(&conn->bytes_to_rcv, 0);
		conn->rmbe_update_limit = smc_rmb_wnd_update_limit(bufsize);
	} else {
		conn->sndbuf_desc = buf_desc;
		smc->sk.sk_sndbuf = bufsize * 2;
		atomic_set(&conn->sndbuf_space, bufsize);
	}
	return 0;
}

void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
			       conn->sndbuf_desc, DMA_TO_DEVICE);
}

void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
				  conn->sndbuf_desc, DMA_TO_DEVICE);
}

void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
			       conn->rmb_desc, DMA_FROM_DEVICE);
}

void smc_rmb_sync_sg_for_device(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
				  conn->rmb_desc, DMA_FROM_DEVICE);
}

/* create the send and receive buffer for an SMC socket;
 * receive buffers are called RMBs;
 * (even though the SMC protocol allows more than one RMB-element per RMB,
 * the Linux implementation uses just one RMB-element per RMB, i.e. uses an
 * extra RMB for every connection in a link group)
 */
int smc_buf_create(struct smc_sock *smc)
{
	int rc;

	/* create send buffer */
	rc = __smc_buf_create(smc, false);
	if (rc)
		return rc;
	/* create rmb */
	rc = __smc_buf_create(smc, true);
	if (rc)
		smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc);
	return rc;
}

static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr)
{
	int i;

	for_each_clear_bit(i, lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX) {
		if (!test_and_set_bit(i, lgr->rtokens_used_mask))
			return i;
	}
	return -ENOSPC;
}

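/* An rtoken pairs the peer's rkey with the DMA address of the peer's RMB;
 * RDMA writes into that RMB are addressed with this pair, identified here
 * by the returned array index.
 */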
/* add a new rtoken from peer */
int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey)
{
	u64 dma_addr = be64_to_cpu(nw_vaddr);
	u32 rkey = ntohl(nw_rkey);
	int i;

	for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
		if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) &&
		    (lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr == dma_addr) &&
		    test_bit(i, lgr->rtokens_used_mask)) {
			/* already in list */
			return i;
		}
	}
	i = smc_rmb_reserve_rtoken_idx(lgr);
	if (i < 0)
		return i;
	lgr->rtokens[i][SMC_SINGLE_LINK].rkey = rkey;
	lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = dma_addr;
	return i;
}

/* delete an rtoken */
int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey)
{
	u32 rkey = ntohl(nw_rkey);
	int i;

	for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
		if (lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey &&
		    test_bit(i, lgr->rtokens_used_mask)) {
			lgr->rtokens[i][SMC_SINGLE_LINK].rkey = 0;
			lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = 0;

			clear_bit(i, lgr->rtokens_used_mask);
			return 0;
		}
	}
	return -ENOENT;
}

/* save rkey and dma_addr received from peer during clc handshake */
int smc_rmb_rtoken_handling(struct smc_connection *conn,
			    struct smc_clc_msg_accept_confirm *clc)
{
	conn->rtoken_idx = smc_rtoken_add(conn->lgr, clc->rmb_dma_addr,
					  clc->rmb_rkey);
	if (conn->rtoken_idx < 0)
		return conn->rtoken_idx;
	return 0;
}

/* Called (from smc_exit) when module is removed */
void smc_core_exit(void)
{
	struct smc_link_group *lgr, *lg;
	LIST_HEAD(lgr_freeing_list);

	spin_lock_bh(&smc_lgr_list.lock);
	if (!list_empty(&smc_lgr_list.list))
		list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
	spin_unlock_bh(&smc_lgr_list.lock);
	list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
		list_del_init(&lgr->list);
		smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
		cancel_delayed_work_sync(&lgr->free_work);
		smc_lgr_free(lgr); /* free link group */
	}
}
862