xref: /openbmc/linux/net/smc/smc_wr.c (revision 335f70fa)
1  // SPDX-License-Identifier: GPL-2.0
2  /*
3   * Shared Memory Communications over RDMA (SMC-R) and RoCE
4   *
5   * Work Requests exploiting Infiniband API
6   *
7   * Work requests (WR) posted via ib_post_send or ib_post_recv
8   * are submitted to the RC SQ or RC RQ, respectively
9   * (reliably connected send/receive queues),
10   * and become work queue entries (WQEs).
11   * While an SQ WR/WQE is pending, we track it until transmission completion.
12   * Through a send or receive completion queue (CQ) respectively,
13   * we get completion queue entries (CQEs) [aka work completions (WCs)].
14   * Since the CQ callback is called from IRQ context, we split work by using
15   * bottom halves implemented by tasklets.
16   *
17   * SMC uses this to exchange LLC (link layer control)
18   * and CDC (connection data control) messages.
19   *
20   * Copyright IBM Corp. 2016
21   *
22   * Author(s):  Steffen Maier <maier@linux.vnet.ibm.com>
23   */
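
/*
 * Illustrative sketch of the send path through this file (assumptions: the
 * caller already owns a usable struct smc_link *link, my_tx_handler is a
 * placeholder completion handler, error handling is omitted):
 *
 *	struct smc_wr_tx_pend_priv *priv;
 *	struct smc_wr_buf *wr_buf;
 *	int rc;
 *
 *	rc = smc_wr_tx_get_free_slot(link, my_tx_handler, &wr_buf, NULL, &priv);
 *	if (!rc) {
 *		// build the LLC or CDC message in *wr_buf
 *		rc = smc_wr_tx_send(link, priv);  // post a WQE on the RC SQ
 *	}
 *
 * my_tx_handler() runs later from the send tasklet once the corresponding
 * CQE has been polled (see smc_wr_tx_process_cqe()).
 */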
24  
25  #include <linux/atomic.h>
26  #include <linux/hashtable.h>
27  #include <linux/wait.h>
28  #include <rdma/ib_verbs.h>
29  #include <asm/div64.h>
30  
31  #include "smc.h"
32  #include "smc_wr.h"
33  
34  #define SMC_WR_MAX_POLL_CQE 10	/* max. # of compl. queue elements in 1 poll */
35  
36  #define SMC_WR_RX_HASH_BITS 4
37  static DEFINE_HASHTABLE(smc_wr_rx_hash, SMC_WR_RX_HASH_BITS);
38  static DEFINE_SPINLOCK(smc_wr_rx_hash_lock);
39  
40  struct smc_wr_tx_pend {	/* control data for a pending send request */
41  	u64			wr_id;		/* work request id sent */
42  	smc_wr_tx_handler	handler;
43  	enum ib_wc_status	wc_status;	/* CQE status */
44  	struct smc_link		*link;
45  	u32			idx;
46  	struct smc_wr_tx_pend_priv priv;
47  	u8			compl_requested;
48  };
49  
50  /******************************** send queue *********************************/
51  
52  /*------------------------------- completion --------------------------------*/
53  
54  /* returns true if at least one tx work request is pending on the given link */
55  static inline bool smc_wr_is_tx_pend(struct smc_link *link)
56  {
57  	return !bitmap_empty(link->wr_tx_mask, link->wr_tx_cnt);
58  }
59  
60  /* wait till all pending tx work requests on the given link are completed */
61  void smc_wr_tx_wait_no_pending_sends(struct smc_link *link)
62  {
63  	wait_event(link->wr_tx_wait, !smc_wr_is_tx_pend(link));
64  }
65  
66  static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id)
67  {
68  	u32 i;
69  
70  	for (i = 0; i < link->wr_tx_cnt; i++) {
71  		if (link->wr_tx_pends[i].wr_id == wr_id)
72  			return i;
73  	}
74  	return link->wr_tx_cnt;
75  }
76  
77  static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
78  {
79  	struct smc_wr_tx_pend pnd_snd;
80  	struct smc_link *link;
81  	u32 pnd_snd_idx;
82  
83  	link = wc->qp->qp_context;
84  
85  	if (wc->opcode == IB_WC_REG_MR) {
86  		if (wc->status)
87  			link->wr_reg_state = FAILED;
88  		else
89  			link->wr_reg_state = CONFIRMED;
90  		smc_wr_wakeup_reg_wait(link);
91  		return;
92  	}
93  
94  	pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id);
95  	if (pnd_snd_idx == link->wr_tx_cnt) {
96  		if (link->lgr->smc_version != SMC_V2 ||
97  		    link->wr_tx_v2_pend->wr_id != wc->wr_id)
98  			return;
99  		link->wr_tx_v2_pend->wc_status = wc->status;
100  		memcpy(&pnd_snd, link->wr_tx_v2_pend, sizeof(pnd_snd));
101  		/* clear the full struct smc_wr_tx_pend including .priv */
102  		memset(link->wr_tx_v2_pend, 0,
103  		       sizeof(*link->wr_tx_v2_pend));
104  		memset(link->lgr->wr_tx_buf_v2, 0,
105  		       sizeof(*link->lgr->wr_tx_buf_v2));
106  	} else {
107  		link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status;
108  		if (link->wr_tx_pends[pnd_snd_idx].compl_requested)
109  			complete(&link->wr_tx_compl[pnd_snd_idx]);
110  		memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx],
111  		       sizeof(pnd_snd));
112  		/* clear the full struct smc_wr_tx_pend including .priv */
113  		memset(&link->wr_tx_pends[pnd_snd_idx], 0,
114  		       sizeof(link->wr_tx_pends[pnd_snd_idx]));
115  		memset(&link->wr_tx_bufs[pnd_snd_idx], 0,
116  		       sizeof(link->wr_tx_bufs[pnd_snd_idx]));
117  		if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask))
118  			return;
119  	}
120  
121  	if (wc->status) {
122  		if (link->lgr->smc_version == SMC_V2) {
123  			memset(link->wr_tx_v2_pend, 0,
124  			       sizeof(*link->wr_tx_v2_pend));
125  			memset(link->lgr->wr_tx_buf_v2, 0,
126  			       sizeof(*link->lgr->wr_tx_buf_v2));
127  		}
128  		/* terminate link */
129  		smcr_link_down_cond_sched(link);
130  	}
131  	if (pnd_snd.handler)
132  		pnd_snd.handler(&pnd_snd.priv, link, wc->status);
133  	wake_up(&link->wr_tx_wait);
134  }
135  
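/* Bottom half for send completions. The CQ is drained in batches of
 * SMC_WR_MAX_POLL_CQE; during the first pass it is re-armed with
 * IB_CQ_REPORT_MISSED_EVENTS, and a second full pass ("goto again") catches
 * completions that may have slipped in around the re-arm. The receive
 * tasklet below follows the same pattern.
 */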
136  static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t)
137  {
138  	struct smc_ib_device *dev = from_tasklet(dev, t, send_tasklet);
139  	struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
140  	int i = 0, rc;
141  	int polled = 0;
142  
143  again:
144  	polled++;
145  	do {
146  		memset(&wc, 0, sizeof(wc));
147  		rc = ib_poll_cq(dev->roce_cq_send, SMC_WR_MAX_POLL_CQE, wc);
148  		if (polled == 1) {
149  			ib_req_notify_cq(dev->roce_cq_send,
150  					 IB_CQ_NEXT_COMP |
151  					 IB_CQ_REPORT_MISSED_EVENTS);
152  		}
153  		if (!rc)
154  			break;
155  		for (i = 0; i < rc; i++)
156  			smc_wr_tx_process_cqe(&wc[i]);
157  	} while (rc > 0);
158  	if (polled == 1)
159  		goto again;
160  }
161  
162  void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
163  {
164  	struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
165  
166  	tasklet_schedule(&dev->send_tasklet);
167  }
168  
169  /*---------------------------- request submission ---------------------------*/
170  
171  static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx)
172  {
173  	*idx = link->wr_tx_cnt;
174  	if (!smc_link_sendable(link))
175  		return -ENOLINK;
176  	for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) {
177  		if (!test_and_set_bit(*idx, link->wr_tx_mask))
178  			return 0;
179  	}
180  	*idx = link->wr_tx_cnt;
181  	return -EBUSY;
182  }
183  
184  /**
185   * smc_wr_tx_get_free_slot() - return a buffer for message assembly
186   *			and set up tracking of the pending transmit
187   * @link:		Pointer to smc_link used to later send the message.
188   * @handler:		Send completion handler function pointer.
189   * @wr_buf:		Out value returns pointer to message buffer.
190   * @wr_rdma_buf:	Out value returns pointer to rdma work request.
191   * @wr_pend_priv:	Out value returns pointer serving as handler context.
192   *
193   * Return: 0 on success, or -errno on error.
194   */
195  int smc_wr_tx_get_free_slot(struct smc_link *link,
196  			    smc_wr_tx_handler handler,
197  			    struct smc_wr_buf **wr_buf,
198  			    struct smc_rdma_wr **wr_rdma_buf,
199  			    struct smc_wr_tx_pend_priv **wr_pend_priv)
200  {
201  	struct smc_link_group *lgr = smc_get_lgr(link);
202  	struct smc_wr_tx_pend *wr_pend;
203  	u32 idx = link->wr_tx_cnt;
204  	struct ib_send_wr *wr_ib;
205  	u64 wr_id;
206  	int rc;
207  
208  	*wr_buf = NULL;
209  	*wr_pend_priv = NULL;
210  	if (in_softirq() || lgr->terminating) {
211  		rc = smc_wr_tx_get_free_slot_index(link, &idx);
212  		if (rc)
213  			return rc;
214  	} else {
215  		rc = wait_event_interruptible_timeout(
216  			link->wr_tx_wait,
217  			!smc_link_sendable(link) ||
218  			lgr->terminating ||
219  			(smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY),
220  			SMC_WR_TX_WAIT_FREE_SLOT_TIME);
221  		if (!rc) {
222  			/* timeout - terminate link */
223  			smcr_link_down_cond_sched(link);
224  			return -EPIPE;
225  		}
226  		if (idx == link->wr_tx_cnt)
227  			return -EPIPE;
228  	}
229  	wr_id = smc_wr_tx_get_next_wr_id(link);
230  	wr_pend = &link->wr_tx_pends[idx];
231  	wr_pend->wr_id = wr_id;
232  	wr_pend->handler = handler;
233  	wr_pend->link = link;
234  	wr_pend->idx = idx;
235  	wr_ib = &link->wr_tx_ibs[idx];
236  	wr_ib->wr_id = wr_id;
237  	*wr_buf = &link->wr_tx_bufs[idx];
238  	if (wr_rdma_buf)
239  		*wr_rdma_buf = &link->wr_tx_rdmas[idx];
240  	*wr_pend_priv = &wr_pend->priv;
241  	return 0;
242  }
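
/* Note on blocking behaviour of smc_wr_tx_get_free_slot(): in softirq context
 * or while the link group is terminating, slot acquisition fails fast
 * (-EBUSY/-ENOLINK); otherwise the caller may sleep up to
 * SMC_WR_TX_WAIT_FREE_SLOT_TIME, and link teardown is scheduled (-EPIPE) if
 * no slot becomes free in time.
 */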
243  
244  int smc_wr_tx_get_v2_slot(struct smc_link *link,
245  			  smc_wr_tx_handler handler,
246  			  struct smc_wr_v2_buf **wr_buf,
247  			  struct smc_wr_tx_pend_priv **wr_pend_priv)
248  {
249  	struct smc_wr_tx_pend *wr_pend;
250  	struct ib_send_wr *wr_ib;
251  	u64 wr_id;
252  
253  	if (link->wr_tx_v2_pend->idx == link->wr_tx_cnt)
254  		return -EBUSY;
255  
256  	*wr_buf = NULL;
257  	*wr_pend_priv = NULL;
258  	wr_id = smc_wr_tx_get_next_wr_id(link);
259  	wr_pend = link->wr_tx_v2_pend;
260  	wr_pend->wr_id = wr_id;
261  	wr_pend->handler = handler;
262  	wr_pend->link = link;
263  	wr_pend->idx = link->wr_tx_cnt;
264  	wr_ib = link->wr_tx_v2_ib;
265  	wr_ib->wr_id = wr_id;
266  	*wr_buf = link->lgr->wr_tx_buf_v2;
267  	*wr_pend_priv = &wr_pend->priv;
268  	return 0;
269  }
270  
271  int smc_wr_tx_put_slot(struct smc_link *link,
272  		       struct smc_wr_tx_pend_priv *wr_pend_priv)
273  {
274  	struct smc_wr_tx_pend *pend;
275  
276  	pend = container_of(wr_pend_priv, struct smc_wr_tx_pend, priv);
277  	if (pend->idx < link->wr_tx_cnt) {
278  		u32 idx = pend->idx;
279  
280  		/* clear the full struct smc_wr_tx_pend including .priv */
281  		memset(&link->wr_tx_pends[idx], 0,
282  		       sizeof(link->wr_tx_pends[idx]));
283  		memset(&link->wr_tx_bufs[idx], 0,
284  		       sizeof(link->wr_tx_bufs[idx]));
285  		test_and_clear_bit(idx, link->wr_tx_mask);
286  		wake_up(&link->wr_tx_wait);
287  		return 1;
288  	} else if (link->lgr->smc_version == SMC_V2 &&
289  		   pend->idx == link->wr_tx_cnt) {
290  		/* Large v2 buffer */
291  		memset(link->wr_tx_v2_pend, 0,
292  		       sizeof(*link->wr_tx_v2_pend));
293  		memset(link->lgr->wr_tx_buf_v2, 0,
294  		       sizeof(*link->lgr->wr_tx_buf_v2));
295  		return 1;
296  	}
297  
298  	return 0;
299  }
300  
301  /* Send prepared WR slot via ib_post_send.
302   * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer
303   */
304  int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv)
305  {
306  	struct smc_wr_tx_pend *pend;
307  	int rc;
308  
309  	ib_req_notify_cq(link->smcibdev->roce_cq_send,
310  			 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
311  	pend = container_of(priv, struct smc_wr_tx_pend, priv);
312  	rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL);
313  	if (rc) {
314  		smc_wr_tx_put_slot(link, priv);
315  		smcr_link_down_cond_sched(link);
316  	}
317  	return rc;
318  }
319  
320  int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv,
321  		      int len)
322  {
323  	int rc;
324  
325  	link->wr_tx_v2_ib->sg_list[0].length = len;
326  	ib_req_notify_cq(link->smcibdev->roce_cq_send,
327  			 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
328  	rc = ib_post_send(link->roce_qp, link->wr_tx_v2_ib, NULL);
329  	if (rc) {
330  		smc_wr_tx_put_slot(link, priv);
331  		smcr_link_down_cond_sched(link);
332  	}
333  	return rc;
334  }
335  
336  /* Send prepared WR slot via ib_post_send and wait for send completion
337   * notification.
338   * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer
339   */
340  int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv,
341  			unsigned long timeout)
342  {
343  	struct smc_wr_tx_pend *pend;
344  	u32 pnd_idx;
345  	int rc;
346  
347  	pend = container_of(priv, struct smc_wr_tx_pend, priv);
348  	pend->compl_requested = 1;
349  	pnd_idx = pend->idx;
350  	init_completion(&link->wr_tx_compl[pnd_idx]);
351  
352  	rc = smc_wr_tx_send(link, priv);
353  	if (rc)
354  		return rc;
355  	/* wait for completion by smc_wr_tx_process_cqe() */
356  	rc = wait_for_completion_interruptible_timeout(
357  					&link->wr_tx_compl[pnd_idx], timeout);
358  	if (rc <= 0)
359  		rc = -ENODATA;
360  	if (rc > 0)
361  		rc = 0;
362  	return rc;
363  }
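
/* Usage sketch for smc_wr_tx_send_wait() (illustrative; the timeout value and
 * surrounding context are assumptions):
 *
 *	rc = smc_wr_tx_send_wait(link, priv, 2 * HZ);
 *	if (rc == -ENODATA)
 *		;	// no send completion within 2s (or wait interrupted)
 *
 * The completion is signalled from smc_wr_tx_process_cqe() via
 * wr_tx_compl[idx] because pend->compl_requested was set here.
 */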
364  
365  /* Register a memory region and wait for result. */
366  int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr)
367  {
368  	int rc;
369  
370  	ib_req_notify_cq(link->smcibdev->roce_cq_send,
371  			 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
372  	link->wr_reg_state = POSTED;
373  	link->wr_reg.wr.wr_id = (u64)(uintptr_t)mr;
374  	link->wr_reg.mr = mr;
375  	link->wr_reg.key = mr->rkey;
376  	rc = ib_post_send(link->roce_qp, &link->wr_reg.wr, NULL);
377  	if (rc)
378  		return rc;
379  
380  	atomic_inc(&link->wr_reg_refcnt);
381  	rc = wait_event_interruptible_timeout(link->wr_reg_wait,
382  					      (link->wr_reg_state != POSTED),
383  					      SMC_WR_REG_MR_WAIT_TIME);
384  	if (atomic_dec_and_test(&link->wr_reg_refcnt))
385  		wake_up_all(&link->wr_reg_wait);
386  	if (!rc) {
387  		/* timeout - terminate link */
388  		smcr_link_down_cond_sched(link);
389  		return -EPIPE;
390  	}
391  	if (rc == -ERESTARTSYS)
392  		return -EINTR;
393  	switch (link->wr_reg_state) {
394  	case CONFIRMED:
395  		rc = 0;
396  		break;
397  	case FAILED:
398  		rc = -EIO;
399  		break;
400  	case POSTED:
401  		rc = -EPIPE;
402  		break;
403  	}
404  	return rc;
405  }
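
/* Usage sketch for smc_wr_reg_send() (illustrative; mr is assumed to have
 * been prepared by the caller, e.g. via ib_map_mr_sg()):
 *
 *	rc = smc_wr_reg_send(link, mr);
 *	// 0: CONFIRMED, -EIO: FAILED, -EPIPE: timeout/link down, -EINTR: signal
 */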
406  
407  /****************************** receive queue ********************************/
408  
409  int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler)
410  {
411  	struct smc_wr_rx_handler *h_iter;
412  	int rc = 0;
413  
414  	spin_lock(&smc_wr_rx_hash_lock);
415  	hash_for_each_possible(smc_wr_rx_hash, h_iter, list, handler->type) {
416  		if (h_iter->type == handler->type) {
417  			rc = -EEXIST;
418  			goto out_unlock;
419  		}
420  	}
421  	hash_add(smc_wr_rx_hash, &handler->list, handler->type);
422  out_unlock:
423  	spin_unlock(&smc_wr_rx_hash_lock);
424  	return rc;
425  }
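
/* Registration sketch (illustrative; my_rx_fn and MY_MSG_TYPE are
 * placeholders for a real handler function and LLC/CDC message type):
 *
 *	static struct smc_wr_rx_handler my_handler = {
 *		.handler	= my_rx_fn,
 *		.type		= MY_MSG_TYPE,
 *	};
 *
 *	rc = smc_wr_rx_register_handler(&my_handler);	// -EEXIST if type taken
 */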
426  
427  /* Demultiplex a received work request to its handler based on the message type.
428   * Relies on smc_wr_rx_hash having been completely filled before any IB WRs
429   * are posted, and not being modified afterwards, so no locking is needed here.
430   */
431  static inline void smc_wr_rx_demultiplex(struct ib_wc *wc)
432  {
433  	struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
434  	struct smc_wr_rx_handler *handler;
435  	struct smc_wr_rx_hdr *wr_rx;
436  	u64 temp_wr_id;
437  	u32 index;
438  
439  	if (wc->byte_len < sizeof(*wr_rx))
440  		return; /* short message */
441  	temp_wr_id = wc->wr_id;
442  	index = do_div(temp_wr_id, link->wr_rx_cnt);
443  	wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[index];
444  	hash_for_each_possible(smc_wr_rx_hash, handler, list, wr_rx->type) {
445  		if (handler->type == wr_rx->type)
446  			handler->handler(wc, wr_rx);
447  	}
448  }
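
/* Note: smc_wr_rx_post() (smc_wr.h) assigns rx wr_ids so that
 * wr_id % wr_rx_cnt is the index of the receive buffer the HCA filled;
 * the do_div() above computes exactly this remainder to locate
 * wr_rx_bufs[index] before dispatching on its message type.
 */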
449  
450  static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
451  {
452  	struct smc_link *link;
453  	int i;
454  
455  	for (i = 0; i < num; i++) {
456  		link = wc[i].qp->qp_context;
457  		if (wc[i].status == IB_WC_SUCCESS) {
458  			link->wr_rx_tstamp = jiffies;
459  			smc_wr_rx_demultiplex(&wc[i]);
460  			smc_wr_rx_post(link); /* refill WR RX */
461  		} else {
462  			/* handle status errors */
463  			switch (wc[i].status) {
464  			case IB_WC_RETRY_EXC_ERR:
465  			case IB_WC_RNR_RETRY_EXC_ERR:
466  			case IB_WC_WR_FLUSH_ERR:
467  				smcr_link_down_cond_sched(link);
468  				break;
469  			default:
470  				smc_wr_rx_post(link); /* refill WR RX */
471  				break;
472  			}
473  		}
474  	}
475  }
476  
477  static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t)
478  {
479  	struct smc_ib_device *dev = from_tasklet(dev, t, recv_tasklet);
480  	struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
481  	int polled = 0;
482  	int rc;
483  
484  again:
485  	polled++;
486  	do {
487  		memset(&wc, 0, sizeof(wc));
488  		rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc);
489  		if (polled == 1) {
490  			ib_req_notify_cq(dev->roce_cq_recv,
491  					 IB_CQ_SOLICITED_MASK
492  					 | IB_CQ_REPORT_MISSED_EVENTS);
493  		}
494  		if (!rc)
495  			break;
496  		smc_wr_rx_process_cqes(&wc[0], rc);
497  	} while (rc > 0);
498  	if (polled == 1)
499  		goto again;
500  }
501  
502  void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
503  {
504  	struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
505  
506  	tasklet_schedule(&dev->recv_tasklet);
507  }
508  
509  int smc_wr_rx_post_init(struct smc_link *link)
510  {
511  	u32 i;
512  	int rc = 0;
513  
514  	for (i = 0; i < link->wr_rx_cnt; i++)
515  		rc = smc_wr_rx_post(link);
516  	return rc;
517  }
518  
519  /***************************** init, exit, misc ******************************/
520  
521  void smc_wr_remember_qp_attr(struct smc_link *lnk)
522  {
523  	struct ib_qp_attr *attr = &lnk->qp_attr;
524  	struct ib_qp_init_attr init_attr;
525  
526  	memset(attr, 0, sizeof(*attr));
527  	memset(&init_attr, 0, sizeof(init_attr));
528  	ib_query_qp(lnk->roce_qp, attr,
529  		    IB_QP_STATE |
530  		    IB_QP_CUR_STATE |
531  		    IB_QP_PKEY_INDEX |
532  		    IB_QP_PORT |
533  		    IB_QP_QKEY |
534  		    IB_QP_AV |
535  		    IB_QP_PATH_MTU |
536  		    IB_QP_TIMEOUT |
537  		    IB_QP_RETRY_CNT |
538  		    IB_QP_RNR_RETRY |
539  		    IB_QP_RQ_PSN |
540  		    IB_QP_ALT_PATH |
541  		    IB_QP_MIN_RNR_TIMER |
542  		    IB_QP_SQ_PSN |
543  		    IB_QP_PATH_MIG_STATE |
544  		    IB_QP_CAP |
545  		    IB_QP_DEST_QPN,
546  		    &init_attr);
547  
548  	lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT,
549  			       lnk->qp_attr.cap.max_send_wr);
550  	lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3,
551  			       lnk->qp_attr.cap.max_recv_wr);
552  }
553  
554  static void smc_wr_init_sge(struct smc_link *lnk)
555  {
556  	int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
557  	u32 i;
558  
559  	for (i = 0; i < lnk->wr_tx_cnt; i++) {
560  		lnk->wr_tx_sges[i].addr =
561  			lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
562  		lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
563  		lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
564  		lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[0].lkey =
565  			lnk->roce_pd->local_dma_lkey;
566  		lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[1].lkey =
567  			lnk->roce_pd->local_dma_lkey;
568  		lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[0].lkey =
569  			lnk->roce_pd->local_dma_lkey;
570  		lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[1].lkey =
571  			lnk->roce_pd->local_dma_lkey;
572  		lnk->wr_tx_ibs[i].next = NULL;
573  		lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i];
574  		lnk->wr_tx_ibs[i].num_sge = 1;
575  		lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
576  		lnk->wr_tx_ibs[i].send_flags =
577  			IB_SEND_SIGNALED | IB_SEND_SOLICITED;
578  		lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE;
579  		lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.opcode = IB_WR_RDMA_WRITE;
580  		lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.sg_list =
581  			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge;
582  		lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.sg_list =
583  			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge;
584  	}
585  
586  	if (lnk->lgr->smc_version == SMC_V2) {
587  		lnk->wr_tx_v2_sge->addr = lnk->wr_tx_v2_dma_addr;
588  		lnk->wr_tx_v2_sge->length = SMC_WR_BUF_V2_SIZE;
589  		lnk->wr_tx_v2_sge->lkey = lnk->roce_pd->local_dma_lkey;
590  
591  		lnk->wr_tx_v2_ib->next = NULL;
592  		lnk->wr_tx_v2_ib->sg_list = lnk->wr_tx_v2_sge;
593  		lnk->wr_tx_v2_ib->num_sge = 1;
594  		lnk->wr_tx_v2_ib->opcode = IB_WR_SEND;
595  		lnk->wr_tx_v2_ib->send_flags =
596  			IB_SEND_SIGNALED | IB_SEND_SOLICITED;
597  	}
598  
599  	/* With SMC-Rv2 there can be messages larger than SMC_WR_TX_SIZE.
600  	 * Each ib_recv_wr gets two sges; the second one points into a single
601  	 * spillover buffer that is shared by all receive WRs. When a larger
602  	 * message arrives, the content of the first small sge is copied to the
603  	 * beginning of the spillover buffer, allowing easy data mapping.
604  	 */
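	/* Resulting sge layout per receive WR when smc_version == SMC_V2
	 * (illustrative):
	 *	sge[0]: wr_rx_bufs[i], length SMC_WR_TX_SIZE
	 *	sge[1]: wr_rx_buf_v2 + SMC_WR_TX_SIZE (shared by all WRs),
	 *		length SMC_WR_BUF_V2_SIZE - SMC_WR_TX_SIZE
	 */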
605  	for (i = 0; i < lnk->wr_rx_cnt; i++) {
606  		int x = i * sges_per_buf;
607  
608  		lnk->wr_rx_sges[x].addr =
609  			lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE;
610  		lnk->wr_rx_sges[x].length = SMC_WR_TX_SIZE;
611  		lnk->wr_rx_sges[x].lkey = lnk->roce_pd->local_dma_lkey;
612  		if (lnk->lgr->smc_version == SMC_V2) {
613  			lnk->wr_rx_sges[x + 1].addr =
614  					lnk->wr_rx_v2_dma_addr + SMC_WR_TX_SIZE;
615  			lnk->wr_rx_sges[x + 1].length =
616  					SMC_WR_BUF_V2_SIZE - SMC_WR_TX_SIZE;
617  			lnk->wr_rx_sges[x + 1].lkey =
618  					lnk->roce_pd->local_dma_lkey;
619  		}
620  		lnk->wr_rx_ibs[i].next = NULL;
621  		lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[x];
622  		lnk->wr_rx_ibs[i].num_sge = sges_per_buf;
623  	}
624  	lnk->wr_reg.wr.next = NULL;
625  	lnk->wr_reg.wr.num_sge = 0;
626  	lnk->wr_reg.wr.send_flags = IB_SEND_SIGNALED;
627  	lnk->wr_reg.wr.opcode = IB_WR_REG_MR;
628  	lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
629  }
630  
631  void smc_wr_free_link(struct smc_link *lnk)
632  {
633  	struct ib_device *ibdev;
634  
635  	if (!lnk->smcibdev)
636  		return;
637  	ibdev = lnk->smcibdev->ibdev;
638  
639  	smc_wr_wakeup_reg_wait(lnk);
640  	smc_wr_wakeup_tx_wait(lnk);
641  
642  	smc_wr_tx_wait_no_pending_sends(lnk);
643  	wait_event(lnk->wr_reg_wait, (!atomic_read(&lnk->wr_reg_refcnt)));
644  	wait_event(lnk->wr_tx_wait, (!atomic_read(&lnk->wr_tx_refcnt)));
645  
646  	if (lnk->wr_rx_dma_addr) {
647  		ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
648  				    SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
649  				    DMA_FROM_DEVICE);
650  		lnk->wr_rx_dma_addr = 0;
651  	}
652  	if (lnk->wr_rx_v2_dma_addr) {
653  		ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr,
654  				    SMC_WR_BUF_V2_SIZE,
655  				    DMA_FROM_DEVICE);
656  		lnk->wr_rx_v2_dma_addr = 0;
657  	}
658  	if (lnk->wr_tx_dma_addr) {
659  		ib_dma_unmap_single(ibdev, lnk->wr_tx_dma_addr,
660  				    SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
661  				    DMA_TO_DEVICE);
662  		lnk->wr_tx_dma_addr = 0;
663  	}
664  	if (lnk->wr_tx_v2_dma_addr) {
665  		ib_dma_unmap_single(ibdev, lnk->wr_tx_v2_dma_addr,
666  				    SMC_WR_BUF_V2_SIZE,
667  				    DMA_TO_DEVICE);
668  		lnk->wr_tx_v2_dma_addr = 0;
669  	}
670  }
671  
672  void smc_wr_free_lgr_mem(struct smc_link_group *lgr)
673  {
674  	if (lgr->smc_version < SMC_V2)
675  		return;
676  
677  	kfree(lgr->wr_rx_buf_v2);
678  	lgr->wr_rx_buf_v2 = NULL;
679  	kfree(lgr->wr_tx_buf_v2);
680  	lgr->wr_tx_buf_v2 = NULL;
681  }
682  
683  void smc_wr_free_link_mem(struct smc_link *lnk)
684  {
685  	kfree(lnk->wr_tx_v2_ib);
686  	lnk->wr_tx_v2_ib = NULL;
687  	kfree(lnk->wr_tx_v2_sge);
688  	lnk->wr_tx_v2_sge = NULL;
689  	kfree(lnk->wr_tx_v2_pend);
690  	lnk->wr_tx_v2_pend = NULL;
691  	kfree(lnk->wr_tx_compl);
692  	lnk->wr_tx_compl = NULL;
693  	kfree(lnk->wr_tx_pends);
694  	lnk->wr_tx_pends = NULL;
695  	bitmap_free(lnk->wr_tx_mask);
696  	lnk->wr_tx_mask = NULL;
697  	kfree(lnk->wr_tx_sges);
698  	lnk->wr_tx_sges = NULL;
699  	kfree(lnk->wr_tx_rdma_sges);
700  	lnk->wr_tx_rdma_sges = NULL;
701  	kfree(lnk->wr_rx_sges);
702  	lnk->wr_rx_sges = NULL;
703  	kfree(lnk->wr_tx_rdmas);
704  	lnk->wr_tx_rdmas = NULL;
705  	kfree(lnk->wr_rx_ibs);
706  	lnk->wr_rx_ibs = NULL;
707  	kfree(lnk->wr_tx_ibs);
708  	lnk->wr_tx_ibs = NULL;
709  	kfree(lnk->wr_tx_bufs);
710  	lnk->wr_tx_bufs = NULL;
711  	kfree(lnk->wr_rx_bufs);
712  	lnk->wr_rx_bufs = NULL;
713  }
714  
715  int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr)
716  {
717  	if (lgr->smc_version < SMC_V2)
718  		return 0;
719  
720  	lgr->wr_rx_buf_v2 = kzalloc(SMC_WR_BUF_V2_SIZE, GFP_KERNEL);
721  	if (!lgr->wr_rx_buf_v2)
722  		return -ENOMEM;
723  	lgr->wr_tx_buf_v2 = kzalloc(SMC_WR_BUF_V2_SIZE, GFP_KERNEL);
724  	if (!lgr->wr_tx_buf_v2) {
725  		kfree(lgr->wr_rx_buf_v2);
726  		return -ENOMEM;
727  	}
728  	return 0;
729  }
730  
731  int smc_wr_alloc_link_mem(struct smc_link *link)
732  {
733  	int sges_per_buf = link->lgr->smc_version == SMC_V2 ? 2 : 1;
734  
735  	/* allocate link related memory */
736  	link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL);
737  	if (!link->wr_tx_bufs)
738  		goto no_mem;
739  	link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE,
740  				   GFP_KERNEL);
741  	if (!link->wr_rx_bufs)
742  		goto no_mem_wr_tx_bufs;
743  	link->wr_tx_ibs = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_ibs[0]),
744  				  GFP_KERNEL);
745  	if (!link->wr_tx_ibs)
746  		goto no_mem_wr_rx_bufs;
747  	link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3,
748  				  sizeof(link->wr_rx_ibs[0]),
749  				  GFP_KERNEL);
750  	if (!link->wr_rx_ibs)
751  		goto no_mem_wr_tx_ibs;
752  	link->wr_tx_rdmas = kcalloc(SMC_WR_BUF_CNT,
753  				    sizeof(link->wr_tx_rdmas[0]),
754  				    GFP_KERNEL);
755  	if (!link->wr_tx_rdmas)
756  		goto no_mem_wr_rx_ibs;
757  	link->wr_tx_rdma_sges = kcalloc(SMC_WR_BUF_CNT,
758  					sizeof(link->wr_tx_rdma_sges[0]),
759  					GFP_KERNEL);
760  	if (!link->wr_tx_rdma_sges)
761  		goto no_mem_wr_tx_rdmas;
762  	link->wr_tx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_sges[0]),
763  				   GFP_KERNEL);
764  	if (!link->wr_tx_sges)
765  		goto no_mem_wr_tx_rdma_sges;
766  	link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3,
767  				   sizeof(link->wr_rx_sges[0]) * sges_per_buf,
768  				   GFP_KERNEL);
769  	if (!link->wr_rx_sges)
770  		goto no_mem_wr_tx_sges;
771  	link->wr_tx_mask = bitmap_zalloc(SMC_WR_BUF_CNT, GFP_KERNEL);
772  	if (!link->wr_tx_mask)
773  		goto no_mem_wr_rx_sges;
774  	link->wr_tx_pends = kcalloc(SMC_WR_BUF_CNT,
775  				    sizeof(link->wr_tx_pends[0]),
776  				    GFP_KERNEL);
777  	if (!link->wr_tx_pends)
778  		goto no_mem_wr_tx_mask;
779  	link->wr_tx_compl = kcalloc(SMC_WR_BUF_CNT,
780  				    sizeof(link->wr_tx_compl[0]),
781  				    GFP_KERNEL);
782  	if (!link->wr_tx_compl)
783  		goto no_mem_wr_tx_pends;
784  
785  	if (link->lgr->smc_version == SMC_V2) {
786  		link->wr_tx_v2_ib = kzalloc(sizeof(*link->wr_tx_v2_ib),
787  					    GFP_KERNEL);
788  		if (!link->wr_tx_v2_ib)
789  			goto no_mem_tx_compl;
790  		link->wr_tx_v2_sge = kzalloc(sizeof(*link->wr_tx_v2_sge),
791  					     GFP_KERNEL);
792  		if (!link->wr_tx_v2_sge)
793  			goto no_mem_v2_ib;
794  		link->wr_tx_v2_pend = kzalloc(sizeof(*link->wr_tx_v2_pend),
795  					      GFP_KERNEL);
796  		if (!link->wr_tx_v2_pend)
797  			goto no_mem_v2_sge;
798  	}
799  	return 0;
800  
801  no_mem_v2_sge:
802  	kfree(link->wr_tx_v2_sge);
803  no_mem_v2_ib:
804  	kfree(link->wr_tx_v2_ib);
805  no_mem_tx_compl:
806  	kfree(link->wr_tx_compl);
807  no_mem_wr_tx_pends:
808  	kfree(link->wr_tx_pends);
809  no_mem_wr_tx_mask:
810  	kfree(link->wr_tx_mask);
811  no_mem_wr_rx_sges:
812  	kfree(link->wr_rx_sges);
813  no_mem_wr_tx_sges:
814  	kfree(link->wr_tx_sges);
815  no_mem_wr_tx_rdma_sges:
816  	kfree(link->wr_tx_rdma_sges);
817  no_mem_wr_tx_rdmas:
818  	kfree(link->wr_tx_rdmas);
819  no_mem_wr_rx_ibs:
820  	kfree(link->wr_rx_ibs);
821  no_mem_wr_tx_ibs:
822  	kfree(link->wr_tx_ibs);
823  no_mem_wr_rx_bufs:
824  	kfree(link->wr_rx_bufs);
825  no_mem_wr_tx_bufs:
826  	kfree(link->wr_tx_bufs);
827  no_mem:
828  	return -ENOMEM;
829  }
830  
831  void smc_wr_remove_dev(struct smc_ib_device *smcibdev)
832  {
833  	tasklet_kill(&smcibdev->recv_tasklet);
834  	tasklet_kill(&smcibdev->send_tasklet);
835  }
836  
837  void smc_wr_add_dev(struct smc_ib_device *smcibdev)
838  {
839  	tasklet_setup(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn);
840  	tasklet_setup(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn);
841  }
842  
843  int smc_wr_create_link(struct smc_link *lnk)
844  {
845  	struct ib_device *ibdev = lnk->smcibdev->ibdev;
846  	int rc = 0;
847  
848  	smc_wr_tx_set_wr_id(&lnk->wr_tx_id, 0);
849  	lnk->wr_rx_id = 0;
850  	lnk->wr_rx_dma_addr = ib_dma_map_single(
851  		ibdev, lnk->wr_rx_bufs,	SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
852  		DMA_FROM_DEVICE);
853  	if (ib_dma_mapping_error(ibdev, lnk->wr_rx_dma_addr)) {
854  		lnk->wr_rx_dma_addr = 0;
855  		rc = -EIO;
856  		goto out;
857  	}
858  	if (lnk->lgr->smc_version == SMC_V2) {
859  		lnk->wr_rx_v2_dma_addr = ib_dma_map_single(ibdev,
860  			lnk->lgr->wr_rx_buf_v2, SMC_WR_BUF_V2_SIZE,
861  			DMA_FROM_DEVICE);
862  		if (ib_dma_mapping_error(ibdev, lnk->wr_rx_v2_dma_addr)) {
863  			lnk->wr_rx_v2_dma_addr = 0;
864  			rc = -EIO;
865  			goto dma_unmap;
866  		}
867  		lnk->wr_tx_v2_dma_addr = ib_dma_map_single(ibdev,
868  			lnk->lgr->wr_tx_buf_v2, SMC_WR_BUF_V2_SIZE,
869  			DMA_TO_DEVICE);
870  		if (ib_dma_mapping_error(ibdev, lnk->wr_tx_v2_dma_addr)) {
871  			lnk->wr_tx_v2_dma_addr = 0;
872  			rc = -EIO;
873  			goto dma_unmap;
874  		}
875  	}
876  	lnk->wr_tx_dma_addr = ib_dma_map_single(
877  		ibdev, lnk->wr_tx_bufs,	SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
878  		DMA_TO_DEVICE);
879  	if (ib_dma_mapping_error(ibdev, lnk->wr_tx_dma_addr)) {
880  		rc = -EIO;
881  		goto dma_unmap;
882  	}
883  	smc_wr_init_sge(lnk);
884  	bitmap_zero(lnk->wr_tx_mask, SMC_WR_BUF_CNT);
885  	init_waitqueue_head(&lnk->wr_tx_wait);
886  	atomic_set(&lnk->wr_tx_refcnt, 0);
887  	init_waitqueue_head(&lnk->wr_reg_wait);
888  	atomic_set(&lnk->wr_reg_refcnt, 0);
889  	return rc;
890  
891  dma_unmap:
892  	if (lnk->wr_rx_v2_dma_addr) {
893  		ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr,
894  				    SMC_WR_BUF_V2_SIZE,
895  				    DMA_FROM_DEVICE);
896  		lnk->wr_rx_v2_dma_addr = 0;
897  	}
898  	if (lnk->wr_tx_v2_dma_addr) {
899  		ib_dma_unmap_single(ibdev, lnk->wr_tx_v2_dma_addr,
900  				    SMC_WR_BUF_V2_SIZE,
901  				    DMA_TO_DEVICE);
902  		lnk->wr_tx_v2_dma_addr = 0;
903  	}
904  	ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
905  			    SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
906  			    DMA_FROM_DEVICE);
907  	lnk->wr_rx_dma_addr = 0;
908  out:
909  	return rc;
910  }
911