xref: /openbmc/linux/drivers/nvme/target/tcp.c (revision 8ebc80a25f9d9bf7a8e368b266d5b740c485c362)
1  // SPDX-License-Identifier: GPL-2.0
2  /*
3   * NVMe over Fabrics TCP target.
4   * Copyright (c) 2018 Lightbits Labs. All rights reserved.
5   */
6  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7  #include <linux/module.h>
8  #include <linux/init.h>
9  #include <linux/slab.h>
10  #include <linux/err.h>
11  #include <linux/nvme-tcp.h>
12  #include <net/sock.h>
13  #include <net/tcp.h>
14  #include <linux/inet.h>
15  #include <linux/llist.h>
16  #include <crypto/hash.h>
17  #include <trace/events/sock.h>
18  
19  #include "nvmet.h"
20  
21  #define NVMET_TCP_DEF_INLINE_DATA_SIZE	(4 * PAGE_SIZE)
22  #define NVMET_TCP_MAXH2CDATA		0x400000 /* 16M arbitrary limit */
23  
24  static int param_store_val(const char *str, int *val, int min, int max)
25  {
26  	int ret, new_val;
27  
28  	ret = kstrtoint(str, 10, &new_val);
29  	if (ret)
30  		return -EINVAL;
31  
32  	if (new_val < min || new_val > max)
33  		return -EINVAL;
34  
35  	*val = new_val;
36  	return 0;
37  }
38  
39  static int set_params(const char *str, const struct kernel_param *kp)
40  {
41  	return param_store_val(str, kp->arg, 0, INT_MAX);
42  }
43  
44  static const struct kernel_param_ops set_param_ops = {
45  	.set	= set_params,
46  	.get	= param_get_int,
47  };
48  
49  /* Define the socket priority to use for connections where it is desirable
50   * that the NIC consider performing optimized packet processing or filtering.
51   * A non-zero value is sufficient to indicate general consideration of any
52   * possible optimization.  Making it a module param allows for alternative
53   * values that may be unique for some NIC implementations.
54   */
55  static int so_priority;
56  device_param_cb(so_priority, &set_param_ops, &so_priority, 0644);
57  MODULE_PARM_DESC(so_priority, "nvmet tcp socket optimize priority: Default 0");
58  
59  /* Define a time period (in usecs) for which io_work() shall sample an activated
60   * queue before determining it to be idle.  This optional module behavior
61   * can enable NIC solutions that support socket optimized packet processing
62   * using advanced interrupt moderation techniques.
63   */
64  static int idle_poll_period_usecs;
65  device_param_cb(idle_poll_period_usecs, &set_param_ops,
66  		&idle_poll_period_usecs, 0644);
67  MODULE_PARM_DESC(idle_poll_period_usecs,
68  		"nvmet tcp io_work poll till idle time period in usecs: Default 0");
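/*
 * Both parameters above are integers clamped to [0, INT_MAX] by set_params(),
 * and with 0644 permissions they can be changed at runtime through sysfs.
 * Assuming the module loads under its usual nvmet_tcp name, a sketch of how
 * they might be tuned (the values are only illustrative):
 *
 *   echo 6     > /sys/module/nvmet_tcp/parameters/so_priority
 *   echo 50000 > /sys/module/nvmet_tcp/parameters/idle_poll_period_usecs
 */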
69  
70  #define NVMET_TCP_RECV_BUDGET		8
71  #define NVMET_TCP_SEND_BUDGET		8
72  #define NVMET_TCP_IO_WORK_BUDGET	64
73  
74  enum nvmet_tcp_send_state {
75  	NVMET_TCP_SEND_DATA_PDU,
76  	NVMET_TCP_SEND_DATA,
77  	NVMET_TCP_SEND_R2T,
78  	NVMET_TCP_SEND_DDGST,
79  	NVMET_TCP_SEND_RESPONSE
80  };
81  
82  enum nvmet_tcp_recv_state {
83  	NVMET_TCP_RECV_PDU,
84  	NVMET_TCP_RECV_DATA,
85  	NVMET_TCP_RECV_DDGST,
86  	NVMET_TCP_RECV_ERR,
87  };
88  
89  enum {
90  	NVMET_TCP_F_INIT_FAILED = (1 << 0),
91  };
92  
93  struct nvmet_tcp_cmd {
94  	struct nvmet_tcp_queue		*queue;
95  	struct nvmet_req		req;
96  
97  	struct nvme_tcp_cmd_pdu		*cmd_pdu;
98  	struct nvme_tcp_rsp_pdu		*rsp_pdu;
99  	struct nvme_tcp_data_pdu	*data_pdu;
100  	struct nvme_tcp_r2t_pdu		*r2t_pdu;
101  
102  	u32				rbytes_done;
103  	u32				wbytes_done;
104  
105  	u32				pdu_len;
106  	u32				pdu_recv;
107  	int				sg_idx;
108  	struct msghdr			recv_msg;
109  	struct bio_vec			*iov;
110  	u32				flags;
111  
112  	struct list_head		entry;
113  	struct llist_node		lentry;
114  
115  	/* send state */
116  	u32				offset;
117  	struct scatterlist		*cur_sg;
118  	enum nvmet_tcp_send_state	state;
119  
120  	__le32				exp_ddgst;
121  	__le32				recv_ddgst;
122  };
123  
124  enum nvmet_tcp_queue_state {
125  	NVMET_TCP_Q_CONNECTING,
126  	NVMET_TCP_Q_LIVE,
127  	NVMET_TCP_Q_DISCONNECTING,
128  };
129  
130  struct nvmet_tcp_queue {
131  	struct socket		*sock;
132  	struct nvmet_tcp_port	*port;
133  	struct work_struct	io_work;
134  	struct nvmet_cq		nvme_cq;
135  	struct nvmet_sq		nvme_sq;
136  
137  	/* send state */
138  	struct nvmet_tcp_cmd	*cmds;
139  	unsigned int		nr_cmds;
140  	struct list_head	free_list;
141  	struct llist_head	resp_list;
142  	struct list_head	resp_send_list;
143  	int			send_list_len;
144  	struct nvmet_tcp_cmd	*snd_cmd;
145  
146  	/* recv state */
147  	int			offset;
148  	int			left;
149  	enum nvmet_tcp_recv_state rcv_state;
150  	struct nvmet_tcp_cmd	*cmd;
151  	union nvme_tcp_pdu	pdu;
152  
153  	/* digest state */
154  	bool			hdr_digest;
155  	bool			data_digest;
156  	struct ahash_request	*snd_hash;
157  	struct ahash_request	*rcv_hash;
158  
159  	unsigned long           poll_end;
160  
161  	spinlock_t		state_lock;
162  	enum nvmet_tcp_queue_state state;
163  
164  	struct sockaddr_storage	sockaddr;
165  	struct sockaddr_storage	sockaddr_peer;
166  	struct work_struct	release_work;
167  
168  	int			idx;
169  	struct list_head	queue_list;
170  
171  	struct nvmet_tcp_cmd	connect;
172  
173  	struct page_frag_cache	pf_cache;
174  
175  	void (*data_ready)(struct sock *);
176  	void (*state_change)(struct sock *);
177  	void (*write_space)(struct sock *);
178  };
179  
180  struct nvmet_tcp_port {
181  	struct socket		*sock;
182  	struct work_struct	accept_work;
183  	struct nvmet_port	*nport;
184  	struct sockaddr_storage addr;
185  	void (*data_ready)(struct sock *);
186  };
187  
188  static DEFINE_IDA(nvmet_tcp_queue_ida);
189  static LIST_HEAD(nvmet_tcp_queue_list);
190  static DEFINE_MUTEX(nvmet_tcp_queue_mutex);
191  
192  static struct workqueue_struct *nvmet_tcp_wq;
193  static const struct nvmet_fabrics_ops nvmet_tcp_ops;
194  static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c);
195  static void nvmet_tcp_free_cmd_buffers(struct nvmet_tcp_cmd *cmd);
196  
197  static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue,
198  		struct nvmet_tcp_cmd *cmd)
199  {
200  	if (unlikely(!queue->nr_cmds)) {
201  		/* We didn't allocate cmds yet, send 0xffff */
202  		return USHRT_MAX;
203  	}
204  
205  	return cmd - queue->cmds;
206  }
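/*
 * The transfer tag (ttag) carried in R2T PDUs is simply the command's index
 * within queue->cmds; nvmet_tcp_handle_h2c_data_pdu() relies on this to
 * bounds-check the incoming ttag against nr_cmds and index the array directly.
 */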
207  
208  static inline bool nvmet_tcp_has_data_in(struct nvmet_tcp_cmd *cmd)
209  {
210  	return nvme_is_write(cmd->req.cmd) &&
211  		cmd->rbytes_done < cmd->req.transfer_len;
212  }
213  
214  static inline bool nvmet_tcp_need_data_in(struct nvmet_tcp_cmd *cmd)
215  {
216  	return nvmet_tcp_has_data_in(cmd) && !cmd->req.cqe->status;
217  }
218  
219  static inline bool nvmet_tcp_need_data_out(struct nvmet_tcp_cmd *cmd)
220  {
221  	return !nvme_is_write(cmd->req.cmd) &&
222  		cmd->req.transfer_len > 0 &&
223  		!cmd->req.cqe->status;
224  }
225  
226  static inline bool nvmet_tcp_has_inline_data(struct nvmet_tcp_cmd *cmd)
227  {
228  	return nvme_is_write(cmd->req.cmd) && cmd->pdu_len &&
229  		!cmd->rbytes_done;
230  }
231  
232  static inline struct nvmet_tcp_cmd *
233  nvmet_tcp_get_cmd(struct nvmet_tcp_queue *queue)
234  {
235  	struct nvmet_tcp_cmd *cmd;
236  
237  	cmd = list_first_entry_or_null(&queue->free_list,
238  				struct nvmet_tcp_cmd, entry);
239  	if (!cmd)
240  		return NULL;
241  	list_del_init(&cmd->entry);
242  
243  	cmd->rbytes_done = cmd->wbytes_done = 0;
244  	cmd->pdu_len = 0;
245  	cmd->pdu_recv = 0;
246  	cmd->iov = NULL;
247  	cmd->flags = 0;
248  	return cmd;
249  }
250  
251  static inline void nvmet_tcp_put_cmd(struct nvmet_tcp_cmd *cmd)
252  {
253  	if (unlikely(cmd == &cmd->queue->connect))
254  		return;
255  
256  	list_add_tail(&cmd->entry, &cmd->queue->free_list);
257  }
258  
259  static inline int queue_cpu(struct nvmet_tcp_queue *queue)
260  {
261  	return queue->sock->sk->sk_incoming_cpu;
262  }
263  
264  static inline u8 nvmet_tcp_hdgst_len(struct nvmet_tcp_queue *queue)
265  {
266  	return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
267  }
268  
269  static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue)
270  {
271  	return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
272  }
273  
274  static inline void nvmet_tcp_hdgst(struct ahash_request *hash,
275  		void *pdu, size_t len)
276  {
277  	struct scatterlist sg;
278  
279  	sg_init_one(&sg, pdu, len);
280  	ahash_request_set_crypt(hash, &sg, pdu + len, len);
281  	crypto_ahash_digest(hash);
282  }
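/*
 * The header digest is a CRC32C over the PDU header bytes and is written
 * immediately after the header (at pdu + len), which is why the PDU buffers
 * are allocated with room for an extra NVME_TCP_DIGEST_LENGTH bytes.
 */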
283  
284  static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue,
285  	void *pdu, size_t len)
286  {
287  	struct nvme_tcp_hdr *hdr = pdu;
288  	__le32 recv_digest;
289  	__le32 exp_digest;
290  
291  	if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
292  		pr_err("queue %d: header digest enabled but no header digest\n",
293  			queue->idx);
294  		return -EPROTO;
295  	}
296  
297  	recv_digest = *(__le32 *)(pdu + hdr->hlen);
298  	nvmet_tcp_hdgst(queue->rcv_hash, pdu, len);
299  	exp_digest = *(__le32 *)(pdu + hdr->hlen);
300  	if (recv_digest != exp_digest) {
301  		pr_err("queue %d: header digest error: recv %#x expected %#x\n",
302  			queue->idx, le32_to_cpu(recv_digest),
303  			le32_to_cpu(exp_digest));
304  		return -EPROTO;
305  	}
306  
307  	return 0;
308  }
309  
310  static int nvmet_tcp_check_ddgst(struct nvmet_tcp_queue *queue, void *pdu)
311  {
312  	struct nvme_tcp_hdr *hdr = pdu;
313  	u8 digest_len = nvmet_tcp_hdgst_len(queue);
314  	u32 len;
315  
316  	len = le32_to_cpu(hdr->plen) - hdr->hlen -
317  		(hdr->flags & NVME_TCP_F_HDGST ? digest_len : 0);
318  
319  	if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
320  		pr_err("queue %d: data digest flag is cleared\n", queue->idx);
321  		return -EPROTO;
322  	}
323  
324  	return 0;
325  }
326  
327  /* If cmd buffers are NULL, no operation is performed */
328  static void nvmet_tcp_free_cmd_buffers(struct nvmet_tcp_cmd *cmd)
329  {
330  	kfree(cmd->iov);
331  	sgl_free(cmd->req.sg);
332  	cmd->iov = NULL;
333  	cmd->req.sg = NULL;
334  }
335  
336  static void nvmet_tcp_build_pdu_iovec(struct nvmet_tcp_cmd *cmd)
337  {
338  	struct bio_vec *iov = cmd->iov;
339  	struct scatterlist *sg;
340  	u32 length, offset, sg_offset;
341  	int nr_pages;
342  
343  	length = cmd->pdu_len;
344  	nr_pages = DIV_ROUND_UP(length, PAGE_SIZE);
345  	offset = cmd->rbytes_done;
346  	cmd->sg_idx = offset / PAGE_SIZE;
347  	sg_offset = offset % PAGE_SIZE;
348  	sg = &cmd->req.sg[cmd->sg_idx];
349  
350  	while (length) {
351  		u32 iov_len = min_t(u32, length, sg->length - sg_offset);
352  
353  		bvec_set_page(iov, sg_page(sg), iov_len,
354  				sg->offset + sg_offset);
355  
356  		length -= iov_len;
357  		sg = sg_next(sg);
358  		iov++;
359  		sg_offset = 0;
360  	}
361  
362  	iov_iter_bvec(&cmd->recv_msg.msg_iter, ITER_DEST, cmd->iov,
363  		      nr_pages, cmd->pdu_len);
364  }
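/*
 * The receive iovec is rebuilt from cmd->rbytes_done, so when a command's
 * data arrives across several H2CData PDUs the mapping resumes at the right
 * scatterlist entry and page offset instead of starting over at offset 0.
 */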
365  
366  static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue)
367  {
368  	queue->rcv_state = NVMET_TCP_RECV_ERR;
369  	if (queue->nvme_sq.ctrl)
370  		nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
371  	else
372  		kernel_sock_shutdown(queue->sock, SHUT_RDWR);
373  }
374  
375  static void nvmet_tcp_socket_error(struct nvmet_tcp_queue *queue, int status)
376  {
377  	queue->rcv_state = NVMET_TCP_RECV_ERR;
378  	if (status == -EPIPE || status == -ECONNRESET)
379  		kernel_sock_shutdown(queue->sock, SHUT_RDWR);
380  	else
381  		nvmet_tcp_fatal_error(queue);
382  }
383  
384  static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd)
385  {
386  	struct nvme_sgl_desc *sgl = &cmd->req.cmd->common.dptr.sgl;
387  	u32 len = le32_to_cpu(sgl->length);
388  
389  	if (!len)
390  		return 0;
391  
392  	if (sgl->type == ((NVME_SGL_FMT_DATA_DESC << 4) |
393  			  NVME_SGL_FMT_OFFSET)) {
394  		if (!nvme_is_write(cmd->req.cmd))
395  			return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
396  
397  		if (len > cmd->req.port->inline_data_size)
398  			return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
399  		cmd->pdu_len = len;
400  	}
401  	cmd->req.transfer_len += len;
402  
403  	cmd->req.sg = sgl_alloc(len, GFP_KERNEL, &cmd->req.sg_cnt);
404  	if (!cmd->req.sg)
405  		return NVME_SC_INTERNAL;
406  	cmd->cur_sg = cmd->req.sg;
407  
408  	if (nvmet_tcp_has_data_in(cmd)) {
409  		cmd->iov = kmalloc_array(cmd->req.sg_cnt,
410  				sizeof(*cmd->iov), GFP_KERNEL);
411  		if (!cmd->iov)
412  			goto err;
413  	}
414  
415  	return 0;
416  err:
417  	nvmet_tcp_free_cmd_buffers(cmd);
418  	return NVME_SC_INTERNAL;
419  }
420  
421  static void nvmet_tcp_calc_ddgst(struct ahash_request *hash,
422  		struct nvmet_tcp_cmd *cmd)
423  {
424  	ahash_request_set_crypt(hash, cmd->req.sg,
425  		(void *)&cmd->exp_ddgst, cmd->req.transfer_len);
426  	crypto_ahash_digest(hash);
427  }
428  
429  static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd)
430  {
431  	struct nvme_tcp_data_pdu *pdu = cmd->data_pdu;
432  	struct nvmet_tcp_queue *queue = cmd->queue;
433  	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
434  	u8 ddgst = nvmet_tcp_ddgst_len(cmd->queue);
435  
436  	cmd->offset = 0;
437  	cmd->state = NVMET_TCP_SEND_DATA_PDU;
438  
439  	pdu->hdr.type = nvme_tcp_c2h_data;
440  	pdu->hdr.flags = NVME_TCP_F_DATA_LAST | (queue->nvme_sq.sqhd_disabled ?
441  						NVME_TCP_F_DATA_SUCCESS : 0);
442  	pdu->hdr.hlen = sizeof(*pdu);
443  	pdu->hdr.pdo = pdu->hdr.hlen + hdgst;
444  	pdu->hdr.plen =
445  		cpu_to_le32(pdu->hdr.hlen + hdgst +
446  				cmd->req.transfer_len + ddgst);
447  	pdu->command_id = cmd->req.cqe->command_id;
448  	pdu->data_length = cpu_to_le32(cmd->req.transfer_len);
449  	pdu->data_offset = cpu_to_le32(cmd->wbytes_done);
450  
451  	if (queue->data_digest) {
452  		pdu->hdr.flags |= NVME_TCP_F_DDGST;
453  		nvmet_tcp_calc_ddgst(queue->snd_hash, cmd);
454  	}
455  
456  	if (cmd->queue->hdr_digest) {
457  		pdu->hdr.flags |= NVME_TCP_F_HDGST;
458  		nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
459  	}
460  }
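/*
 * The resulting C2HData PDU layout on the wire is:
 *   [data pdu header][header digest, if any][transfer_len bytes][data digest, if any]
 * and hdr.plen above accounts for all four pieces.  The data digest is
 * precomputed here over the full transfer so the send path can stream it
 * right after the payload.
 */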
461  
462  static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd)
463  {
464  	struct nvme_tcp_r2t_pdu *pdu = cmd->r2t_pdu;
465  	struct nvmet_tcp_queue *queue = cmd->queue;
466  	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
467  
468  	cmd->offset = 0;
469  	cmd->state = NVMET_TCP_SEND_R2T;
470  
471  	pdu->hdr.type = nvme_tcp_r2t;
472  	pdu->hdr.flags = 0;
473  	pdu->hdr.hlen = sizeof(*pdu);
474  	pdu->hdr.pdo = 0;
475  	pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
476  
477  	pdu->command_id = cmd->req.cmd->common.command_id;
478  	pdu->ttag = nvmet_tcp_cmd_tag(cmd->queue, cmd);
479  	pdu->r2t_length = cpu_to_le32(cmd->req.transfer_len - cmd->rbytes_done);
480  	pdu->r2t_offset = cpu_to_le32(cmd->rbytes_done);
481  	if (cmd->queue->hdr_digest) {
482  		pdu->hdr.flags |= NVME_TCP_F_HDGST;
483  		nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
484  	}
485  }
486  
487  static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd)
488  {
489  	struct nvme_tcp_rsp_pdu *pdu = cmd->rsp_pdu;
490  	struct nvmet_tcp_queue *queue = cmd->queue;
491  	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
492  
493  	cmd->offset = 0;
494  	cmd->state = NVMET_TCP_SEND_RESPONSE;
495  
496  	pdu->hdr.type = nvme_tcp_rsp;
497  	pdu->hdr.flags = 0;
498  	pdu->hdr.hlen = sizeof(*pdu);
499  	pdu->hdr.pdo = 0;
500  	pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
501  	if (cmd->queue->hdr_digest) {
502  		pdu->hdr.flags |= NVME_TCP_F_HDGST;
503  		nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
504  	}
505  }
506  
507  static void nvmet_tcp_process_resp_list(struct nvmet_tcp_queue *queue)
508  {
509  	struct llist_node *node;
510  	struct nvmet_tcp_cmd *cmd;
511  
512  	for (node = llist_del_all(&queue->resp_list); node; node = node->next) {
513  		cmd = llist_entry(node, struct nvmet_tcp_cmd, lentry);
514  		list_add(&cmd->entry, &queue->resp_send_list);
515  		queue->send_list_len++;
516  	}
517  }
518  
519  static struct nvmet_tcp_cmd *nvmet_tcp_fetch_cmd(struct nvmet_tcp_queue *queue)
520  {
521  	queue->snd_cmd = list_first_entry_or_null(&queue->resp_send_list,
522  				struct nvmet_tcp_cmd, entry);
523  	if (!queue->snd_cmd) {
524  		nvmet_tcp_process_resp_list(queue);
525  		queue->snd_cmd =
526  			list_first_entry_or_null(&queue->resp_send_list,
527  					struct nvmet_tcp_cmd, entry);
528  		if (unlikely(!queue->snd_cmd))
529  			return NULL;
530  	}
531  
532  	list_del_init(&queue->snd_cmd->entry);
533  	queue->send_list_len--;
534  
535  	if (nvmet_tcp_need_data_out(queue->snd_cmd))
536  		nvmet_setup_c2h_data_pdu(queue->snd_cmd);
537  	else if (nvmet_tcp_need_data_in(queue->snd_cmd))
538  		nvmet_setup_r2t_pdu(queue->snd_cmd);
539  	else
540  		nvmet_setup_response_pdu(queue->snd_cmd);
541  
542  	return queue->snd_cmd;
543  }
544  
545  static void nvmet_tcp_queue_response(struct nvmet_req *req)
546  {
547  	struct nvmet_tcp_cmd *cmd =
548  		container_of(req, struct nvmet_tcp_cmd, req);
549  	struct nvmet_tcp_queue	*queue = cmd->queue;
550  	enum nvmet_tcp_recv_state queue_state;
551  	struct nvmet_tcp_cmd *queue_cmd;
552  	struct nvme_sgl_desc *sgl;
553  	u32 len;
554  
555  	/* Pairs with store_release in nvmet_prepare_receive_pdu() */
556  	queue_state = smp_load_acquire(&queue->rcv_state);
557  	queue_cmd = READ_ONCE(queue->cmd);
558  
559  	if (unlikely(cmd == queue_cmd)) {
560  		sgl = &cmd->req.cmd->common.dptr.sgl;
561  		len = le32_to_cpu(sgl->length);
562  
563  		/*
564  		 * Wait for inline data before processing the response.
565  		 * Avoid using helpers, this might happen before
566  		 * nvmet_req_init is completed.
567  		 */
568  		if (queue_state == NVMET_TCP_RECV_PDU &&
569  		    len && len <= cmd->req.port->inline_data_size &&
570  		    nvme_is_write(cmd->req.cmd))
571  			return;
572  	}
573  
574  	llist_add(&cmd->lentry, &queue->resp_list);
575  	queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &cmd->queue->io_work);
576  }
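/*
 * The early return above covers a write command that still owns the receive
 * context (queue->cmd) while its inline data has not been pulled off the
 * socket, typically because nvmet_req_init() failed; completing it now would
 * race with the data still arriving.  Once the data is drained,
 * nvmet_tcp_execute_request() re-queues the response through this function
 * and it is added to resp_list normally.
 */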
577  
578  static void nvmet_tcp_execute_request(struct nvmet_tcp_cmd *cmd)
579  {
580  	if (unlikely(cmd->flags & NVMET_TCP_F_INIT_FAILED))
581  		nvmet_tcp_queue_response(&cmd->req);
582  	else
583  		cmd->req.execute(&cmd->req);
584  }
585  
586  static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd)
587  {
588  	struct msghdr msg = {
589  		.msg_flags = MSG_DONTWAIT | MSG_MORE | MSG_SPLICE_PAGES,
590  	};
591  	struct bio_vec bvec;
592  	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
593  	int left = sizeof(*cmd->data_pdu) - cmd->offset + hdgst;
594  	int ret;
595  
596  	bvec_set_virt(&bvec, (void *)cmd->data_pdu + cmd->offset, left);
597  	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, left);
598  	ret = sock_sendmsg(cmd->queue->sock, &msg);
599  	if (ret <= 0)
600  		return ret;
601  
602  	cmd->offset += ret;
603  	left -= ret;
604  
605  	if (left)
606  		return -EAGAIN;
607  
608  	cmd->state = NVMET_TCP_SEND_DATA;
609  	cmd->offset  = 0;
610  	return 1;
611  }
612  
613  static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
614  {
615  	struct nvmet_tcp_queue *queue = cmd->queue;
616  	int ret;
617  
618  	while (cmd->cur_sg) {
619  		struct msghdr msg = {
620  			.msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES,
621  		};
622  		struct page *page = sg_page(cmd->cur_sg);
623  		struct bio_vec bvec;
624  		u32 left = cmd->cur_sg->length - cmd->offset;
625  
626  		if ((!last_in_batch && cmd->queue->send_list_len) ||
627  		    cmd->wbytes_done + left < cmd->req.transfer_len ||
628  		    queue->data_digest || !queue->nvme_sq.sqhd_disabled)
629  			msg.msg_flags |= MSG_MORE;
630  
631  		bvec_set_page(&bvec, page, left, cmd->offset);
632  		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, left);
633  		ret = sock_sendmsg(cmd->queue->sock, &msg);
634  		if (ret <= 0)
635  			return ret;
636  
637  		cmd->offset += ret;
638  		cmd->wbytes_done += ret;
639  
640  		/* Done with sg? */
641  		if (cmd->offset == cmd->cur_sg->length) {
642  			cmd->cur_sg = sg_next(cmd->cur_sg);
643  			cmd->offset = 0;
644  		}
645  	}
646  
647  	if (queue->data_digest) {
648  		cmd->state = NVMET_TCP_SEND_DDGST;
649  		cmd->offset = 0;
650  	} else {
651  		if (queue->nvme_sq.sqhd_disabled) {
652  			cmd->queue->snd_cmd = NULL;
653  			nvmet_tcp_put_cmd(cmd);
654  		} else {
655  			nvmet_setup_response_pdu(cmd);
656  		}
657  	}
658  
659  	if (queue->nvme_sq.sqhd_disabled)
660  		nvmet_tcp_free_cmd_buffers(cmd);
661  
662  	return 1;
663  
664  }
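/*
 * MSG_MORE stays set while more of this command (remaining data, a data
 * digest or a response capsule) or other queued responses will follow, so
 * the stack can coalesce them into fewer TCP segments.  Only with
 * sqhd_disabled set and no data digest pending can the payload be the last
 * bytes sent for this command, and with sqhd_disabled the command buffers
 * are released as soon as the data is out.
 */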
665  
666  static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd,
667  		bool last_in_batch)
668  {
669  	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, };
670  	struct bio_vec bvec;
671  	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
672  	int left = sizeof(*cmd->rsp_pdu) - cmd->offset + hdgst;
673  	int ret;
674  
675  	if (!last_in_batch && cmd->queue->send_list_len)
676  		msg.msg_flags |= MSG_MORE;
677  	else
678  		msg.msg_flags |= MSG_EOR;
679  
680  	bvec_set_virt(&bvec, (void *)cmd->rsp_pdu + cmd->offset, left);
681  	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, left);
682  	ret = sock_sendmsg(cmd->queue->sock, &msg);
683  	if (ret <= 0)
684  		return ret;
685  	cmd->offset += ret;
686  	left -= ret;
687  
688  	if (left)
689  		return -EAGAIN;
690  
691  	nvmet_tcp_free_cmd_buffers(cmd);
692  	cmd->queue->snd_cmd = NULL;
693  	nvmet_tcp_put_cmd(cmd);
694  	return 1;
695  }
696  
697  static int nvmet_try_send_r2t(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
698  {
699  	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, };
700  	struct bio_vec bvec;
701  	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
702  	int left = sizeof(*cmd->r2t_pdu) - cmd->offset + hdgst;
703  	int ret;
704  
705  	if (!last_in_batch && cmd->queue->send_list_len)
706  		msg.msg_flags |= MSG_MORE;
707  	else
708  		msg.msg_flags |= MSG_EOR;
709  
710  	bvec_set_virt(&bvec, (void *)cmd->r2t_pdu + cmd->offset, left);
711  	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, left);
712  	ret = sock_sendmsg(cmd->queue->sock, &msg);
713  	if (ret <= 0)
714  		return ret;
715  	cmd->offset += ret;
716  	left -= ret;
717  
718  	if (left)
719  		return -EAGAIN;
720  
721  	cmd->queue->snd_cmd = NULL;
722  	return 1;
723  }
724  
725  static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
726  {
727  	struct nvmet_tcp_queue *queue = cmd->queue;
728  	int left = NVME_TCP_DIGEST_LENGTH - cmd->offset;
729  	struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
730  	struct kvec iov = {
731  		.iov_base = (u8 *)&cmd->exp_ddgst + cmd->offset,
732  		.iov_len = left
733  	};
734  	int ret;
735  
736  	if (!last_in_batch && cmd->queue->send_list_len)
737  		msg.msg_flags |= MSG_MORE;
738  	else
739  		msg.msg_flags |= MSG_EOR;
740  
741  	ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
742  	if (unlikely(ret <= 0))
743  		return ret;
744  
745  	cmd->offset += ret;
746  	left -= ret;
747  
748  	if (left)
749  		return -EAGAIN;
750  
751  	if (queue->nvme_sq.sqhd_disabled) {
752  		cmd->queue->snd_cmd = NULL;
753  		nvmet_tcp_put_cmd(cmd);
754  	} else {
755  		nvmet_setup_response_pdu(cmd);
756  	}
757  	return 1;
758  }
759  
760  static int nvmet_tcp_try_send_one(struct nvmet_tcp_queue *queue,
761  		bool last_in_batch)
762  {
763  	struct nvmet_tcp_cmd *cmd = queue->snd_cmd;
764  	int ret = 0;
765  
766  	if (!cmd || queue->state == NVMET_TCP_Q_DISCONNECTING) {
767  		cmd = nvmet_tcp_fetch_cmd(queue);
768  		if (unlikely(!cmd))
769  			return 0;
770  	}
771  
772  	if (cmd->state == NVMET_TCP_SEND_DATA_PDU) {
773  		ret = nvmet_try_send_data_pdu(cmd);
774  		if (ret <= 0)
775  			goto done_send;
776  	}
777  
778  	if (cmd->state == NVMET_TCP_SEND_DATA) {
779  		ret = nvmet_try_send_data(cmd, last_in_batch);
780  		if (ret <= 0)
781  			goto done_send;
782  	}
783  
784  	if (cmd->state == NVMET_TCP_SEND_DDGST) {
785  		ret = nvmet_try_send_ddgst(cmd, last_in_batch);
786  		if (ret <= 0)
787  			goto done_send;
788  	}
789  
790  	if (cmd->state == NVMET_TCP_SEND_R2T) {
791  		ret = nvmet_try_send_r2t(cmd, last_in_batch);
792  		if (ret <= 0)
793  			goto done_send;
794  	}
795  
796  	if (cmd->state == NVMET_TCP_SEND_RESPONSE)
797  		ret = nvmet_try_send_response(cmd, last_in_batch);
798  
799  done_send:
800  	if (ret < 0) {
801  		if (ret == -EAGAIN)
802  			return 0;
803  		return ret;
804  	}
805  
806  	return 1;
807  }
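/*
 * The if-chain above is a deliberate fall-through state machine: a command
 * that just finished NVMET_TCP_SEND_DATA_PDU moves straight on to SEND_DATA
 * in the same call, then to DDGST/R2T/RESPONSE as applicable, so a writable
 * socket can drain a whole command in one pass.  -EAGAIN is normalized to 0
 * so callers treat a full socket as "no progress" rather than an error.
 */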
808  
809  static int nvmet_tcp_try_send(struct nvmet_tcp_queue *queue,
810  		int budget, int *sends)
811  {
812  	int i, ret = 0;
813  
814  	for (i = 0; i < budget; i++) {
815  		ret = nvmet_tcp_try_send_one(queue, i == budget - 1);
816  		if (unlikely(ret < 0)) {
817  			nvmet_tcp_socket_error(queue, ret);
818  			goto done;
819  		} else if (ret == 0) {
820  			break;
821  		}
822  		(*sends)++;
823  	}
824  done:
825  	return ret;
826  }
827  
828  static void nvmet_prepare_receive_pdu(struct nvmet_tcp_queue *queue)
829  {
830  	queue->offset = 0;
831  	queue->left = sizeof(struct nvme_tcp_hdr);
832  	WRITE_ONCE(queue->cmd, NULL);
833  	/* Ensure rcv_state is visible only after queue->cmd is set */
834  	smp_store_release(&queue->rcv_state, NVMET_TCP_RECV_PDU);
835  }
836  
837  static void nvmet_tcp_free_crypto(struct nvmet_tcp_queue *queue)
838  {
839  	struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
840  
841  	ahash_request_free(queue->rcv_hash);
842  	ahash_request_free(queue->snd_hash);
843  	crypto_free_ahash(tfm);
844  }
845  
846  static int nvmet_tcp_alloc_crypto(struct nvmet_tcp_queue *queue)
847  {
848  	struct crypto_ahash *tfm;
849  
850  	tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
851  	if (IS_ERR(tfm))
852  		return PTR_ERR(tfm);
853  
854  	queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
855  	if (!queue->snd_hash)
856  		goto free_tfm;
857  	ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
858  
859  	queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
860  	if (!queue->rcv_hash)
861  		goto free_snd_hash;
862  	ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
863  
864  	return 0;
865  free_snd_hash:
866  	ahash_request_free(queue->snd_hash);
867  free_tfm:
868  	crypto_free_ahash(tfm);
869  	return -ENOMEM;
870  }
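/*
 * NVMe/TCP digests are CRC32C, so a single "crc32c" transform is shared by
 * the send and receive sides with one ahash request per direction.  The
 * CRYPTO_ALG_ASYNC mask restricts the allocation to synchronous
 * implementations, which is why the digest helpers can ignore the return
 * value of crypto_ahash_digest().
 */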
871  
872  
873  static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue)
874  {
875  	struct nvme_tcp_icreq_pdu *icreq = &queue->pdu.icreq;
876  	struct nvme_tcp_icresp_pdu *icresp = &queue->pdu.icresp;
877  	struct msghdr msg = {};
878  	struct kvec iov;
879  	int ret;
880  
881  	if (le32_to_cpu(icreq->hdr.plen) != sizeof(struct nvme_tcp_icreq_pdu)) {
882  		pr_err("bad nvme-tcp pdu length (%d)\n",
883  			le32_to_cpu(icreq->hdr.plen));
884  		nvmet_tcp_fatal_error(queue);
885  		return -EPROTO;
886  	}
887  
888  	if (icreq->pfv != NVME_TCP_PFV_1_0) {
889  		pr_err("queue %d: bad pfv %d\n", queue->idx, icreq->pfv);
890  		return -EPROTO;
891  	}
892  
893  	if (icreq->hpda != 0) {
894  		pr_err("queue %d: unsupported hpda %d\n", queue->idx,
895  			icreq->hpda);
896  		return -EPROTO;
897  	}
898  
899  	queue->hdr_digest = !!(icreq->digest & NVME_TCP_HDR_DIGEST_ENABLE);
900  	queue->data_digest = !!(icreq->digest & NVME_TCP_DATA_DIGEST_ENABLE);
901  	if (queue->hdr_digest || queue->data_digest) {
902  		ret = nvmet_tcp_alloc_crypto(queue);
903  		if (ret)
904  			return ret;
905  	}
906  
907  	memset(icresp, 0, sizeof(*icresp));
908  	icresp->hdr.type = nvme_tcp_icresp;
909  	icresp->hdr.hlen = sizeof(*icresp);
910  	icresp->hdr.pdo = 0;
911  	icresp->hdr.plen = cpu_to_le32(icresp->hdr.hlen);
912  	icresp->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
913  	icresp->maxdata = cpu_to_le32(NVMET_TCP_MAXH2CDATA);
914  	icresp->cpda = 0;
915  	if (queue->hdr_digest)
916  		icresp->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
917  	if (queue->data_digest)
918  		icresp->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
919  
920  	iov.iov_base = icresp;
921  	iov.iov_len = sizeof(*icresp);
922  	ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
923  	if (ret < 0)
924  		return ret; /* queue removal will cleanup */
925  
926  	queue->state = NVMET_TCP_Q_LIVE;
927  	nvmet_prepare_receive_pdu(queue);
928  	return 0;
929  }
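/*
 * This is the NVMe/TCP connection establishment handshake: the host's ICReq
 * fixes the PDU format version, data alignment and digest options, and the
 * target answers with an ICResp that echoes the negotiated digest settings
 * and advertises NVMET_TCP_MAXH2CDATA as maxdata.  Only after the ICResp is
 * sent does the queue become NVMET_TCP_Q_LIVE and start accepting command
 * PDUs.
 */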
930  
931  static void nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue,
932  		struct nvmet_tcp_cmd *cmd, struct nvmet_req *req)
933  {
934  	size_t data_len = le32_to_cpu(req->cmd->common.dptr.sgl.length);
935  	int ret;
936  
937  	/*
938  	 * This command has not been processed yet, hence we are trying to
939  	 * figure out if there is still pending data left to receive. If
940  	 * we don't, we can simply prepare for the next pdu and bail out,
941  	 * otherwise we will need to prepare a buffer and receive the
942  	 * stale data before continuing forward.
943  	 */
944  	if (!nvme_is_write(cmd->req.cmd) || !data_len ||
945  	    data_len > cmd->req.port->inline_data_size) {
946  		nvmet_prepare_receive_pdu(queue);
947  		return;
948  	}
949  
950  	ret = nvmet_tcp_map_data(cmd);
951  	if (unlikely(ret)) {
952  		pr_err("queue %d: failed to map data\n", queue->idx);
953  		nvmet_tcp_fatal_error(queue);
954  		return;
955  	}
956  
957  	queue->rcv_state = NVMET_TCP_RECV_DATA;
958  	nvmet_tcp_build_pdu_iovec(cmd);
959  	cmd->flags |= NVMET_TCP_F_INIT_FAILED;
960  }
961  
962  static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue)
963  {
964  	struct nvme_tcp_data_pdu *data = &queue->pdu.data;
965  	struct nvmet_tcp_cmd *cmd;
966  	unsigned int exp_data_len;
967  
968  	if (likely(queue->nr_cmds)) {
969  		if (unlikely(data->ttag >= queue->nr_cmds)) {
970  			pr_err("queue %d: received out of bound ttag %u, nr_cmds %u\n",
971  				queue->idx, data->ttag, queue->nr_cmds);
972  			nvmet_tcp_fatal_error(queue);
973  			return -EPROTO;
974  		}
975  		cmd = &queue->cmds[data->ttag];
976  	} else {
977  		cmd = &queue->connect;
978  	}
979  
980  	if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) {
981  		pr_err("ttag %u unexpected data offset %u (expected %u)\n",
982  			data->ttag, le32_to_cpu(data->data_offset),
983  			cmd->rbytes_done);
984  		/* FIXME: use path and transport errors */
985  		nvmet_tcp_fatal_error(queue);
986  		return -EPROTO;
987  	}
988  
989  	exp_data_len = le32_to_cpu(data->hdr.plen) -
990  			nvmet_tcp_hdgst_len(queue) -
991  			nvmet_tcp_ddgst_len(queue) -
992  			sizeof(*data);
993  
994  	cmd->pdu_len = le32_to_cpu(data->data_length);
995  	if (unlikely(cmd->pdu_len != exp_data_len ||
996  		     cmd->pdu_len == 0 ||
997  		     cmd->pdu_len > NVMET_TCP_MAXH2CDATA)) {
998  		pr_err("H2CData PDU len %u is invalid\n", cmd->pdu_len);
999  		/* FIXME: use proper transport errors */
1000  		nvmet_tcp_fatal_error(queue);
1001  		return -EPROTO;
1002  	}
1003  	cmd->pdu_recv = 0;
1004  	nvmet_tcp_build_pdu_iovec(cmd);
1005  	queue->cmd = cmd;
1006  	queue->rcv_state = NVMET_TCP_RECV_DATA;
1007  
1008  	return 0;
1009  }
1010  
1011  static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue)
1012  {
1013  	struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
1014  	struct nvme_command *nvme_cmd = &queue->pdu.cmd.cmd;
1015  	struct nvmet_req *req;
1016  	int ret;
1017  
1018  	if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
1019  		if (hdr->type != nvme_tcp_icreq) {
1020  			pr_err("unexpected pdu type (%d) before icreq\n",
1021  				hdr->type);
1022  			nvmet_tcp_fatal_error(queue);
1023  			return -EPROTO;
1024  		}
1025  		return nvmet_tcp_handle_icreq(queue);
1026  	}
1027  
1028  	if (unlikely(hdr->type == nvme_tcp_icreq)) {
1029  		pr_err("queue %d: received icreq pdu in state %d\n",
1030  			queue->idx, queue->state);
1031  		nvmet_tcp_fatal_error(queue);
1032  		return -EPROTO;
1033  	}
1034  
1035  	if (hdr->type == nvme_tcp_h2c_data) {
1036  		ret = nvmet_tcp_handle_h2c_data_pdu(queue);
1037  		if (unlikely(ret))
1038  			return ret;
1039  		return 0;
1040  	}
1041  
1042  	queue->cmd = nvmet_tcp_get_cmd(queue);
1043  	if (unlikely(!queue->cmd)) {
1044  		/* This should never happen */
1045  		pr_err("queue %d: out of commands (%d) send_list_len: %d, opcode: %d\n",
1046  			queue->idx, queue->nr_cmds, queue->send_list_len,
1047  			nvme_cmd->common.opcode);
1048  		nvmet_tcp_fatal_error(queue);
1049  		return -ENOMEM;
1050  	}
1051  
1052  	req = &queue->cmd->req;
1053  	memcpy(req->cmd, nvme_cmd, sizeof(*nvme_cmd));
1054  
1055  	if (unlikely(!nvmet_req_init(req, &queue->nvme_cq,
1056  			&queue->nvme_sq, &nvmet_tcp_ops))) {
1057  		pr_err("failed cmd %p id %d opcode %d, data_len: %d\n",
1058  			req->cmd, req->cmd->common.command_id,
1059  			req->cmd->common.opcode,
1060  			le32_to_cpu(req->cmd->common.dptr.sgl.length));
1061  
1062  		nvmet_tcp_handle_req_failure(queue, queue->cmd, req);
1063  		return 0;
1064  	}
1065  
1066  	ret = nvmet_tcp_map_data(queue->cmd);
1067  	if (unlikely(ret)) {
1068  		pr_err("queue %d: failed to map data\n", queue->idx);
1069  		if (nvmet_tcp_has_inline_data(queue->cmd))
1070  			nvmet_tcp_fatal_error(queue);
1071  		else
1072  			nvmet_req_complete(req, ret);
1073  		ret = -EAGAIN;
1074  		goto out;
1075  	}
1076  
1077  	if (nvmet_tcp_need_data_in(queue->cmd)) {
1078  		if (nvmet_tcp_has_inline_data(queue->cmd)) {
1079  			queue->rcv_state = NVMET_TCP_RECV_DATA;
1080  			nvmet_tcp_build_pdu_iovec(queue->cmd);
1081  			return 0;
1082  		}
1083  		/* send back R2T */
1084  		nvmet_tcp_queue_response(&queue->cmd->req);
1085  		goto out;
1086  	}
1087  
1088  	queue->cmd->req.execute(&queue->cmd->req);
1089  out:
1090  	nvmet_prepare_receive_pdu(queue);
1091  	return ret;
1092  }
1093  
1094  static const u8 nvme_tcp_pdu_sizes[] = {
1095  	[nvme_tcp_icreq]	= sizeof(struct nvme_tcp_icreq_pdu),
1096  	[nvme_tcp_cmd]		= sizeof(struct nvme_tcp_cmd_pdu),
1097  	[nvme_tcp_h2c_data]	= sizeof(struct nvme_tcp_data_pdu),
1098  };
1099  
1100  static inline u8 nvmet_tcp_pdu_size(u8 type)
1101  {
1102  	size_t idx = type;
1103  
1104  	return (idx < ARRAY_SIZE(nvme_tcp_pdu_sizes) &&
1105  		nvme_tcp_pdu_sizes[idx]) ?
1106  			nvme_tcp_pdu_sizes[idx] : 0;
1107  }
1108  
1109  static inline bool nvmet_tcp_pdu_valid(u8 type)
1110  {
1111  	switch (type) {
1112  	case nvme_tcp_icreq:
1113  	case nvme_tcp_cmd:
1114  	case nvme_tcp_h2c_data:
1115  		/* fallthru */
1116  		return true;
1117  	}
1118  
1119  	return false;
1120  }
1121  
1122  static int nvmet_tcp_try_recv_pdu(struct nvmet_tcp_queue *queue)
1123  {
1124  	struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
1125  	int len;
1126  	struct kvec iov;
1127  	struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
1128  
1129  recv:
1130  	iov.iov_base = (void *)&queue->pdu + queue->offset;
1131  	iov.iov_len = queue->left;
1132  	len = kernel_recvmsg(queue->sock, &msg, &iov, 1,
1133  			iov.iov_len, msg.msg_flags);
1134  	if (unlikely(len < 0))
1135  		return len;
1136  
1137  	queue->offset += len;
1138  	queue->left -= len;
1139  	if (queue->left)
1140  		return -EAGAIN;
1141  
1142  	if (queue->offset == sizeof(struct nvme_tcp_hdr)) {
1143  		u8 hdgst = nvmet_tcp_hdgst_len(queue);
1144  
1145  		if (unlikely(!nvmet_tcp_pdu_valid(hdr->type))) {
1146  			pr_err("unexpected pdu type %d\n", hdr->type);
1147  			nvmet_tcp_fatal_error(queue);
1148  			return -EIO;
1149  		}
1150  
1151  		if (unlikely(hdr->hlen != nvmet_tcp_pdu_size(hdr->type))) {
1152  			pr_err("pdu %d bad hlen %d\n", hdr->type, hdr->hlen);
1153  			return -EIO;
1154  		}
1155  
1156  		queue->left = hdr->hlen - queue->offset + hdgst;
1157  		goto recv;
1158  	}
1159  
1160  	if (queue->hdr_digest &&
1161  	    nvmet_tcp_verify_hdgst(queue, &queue->pdu, hdr->hlen)) {
1162  		nvmet_tcp_fatal_error(queue); /* fatal */
1163  		return -EPROTO;
1164  	}
1165  
1166  	if (queue->data_digest &&
1167  	    nvmet_tcp_check_ddgst(queue, &queue->pdu)) {
1168  		nvmet_tcp_fatal_error(queue); /* fatal */
1169  		return -EPROTO;
1170  	}
1171  
1172  	return nvmet_tcp_done_recv_pdu(queue);
1173  }
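/*
 * PDU reception is two-staged: the first pass reads only the common
 * nvme_tcp_hdr (type/flags/hlen/pdo/plen); once hlen plus any header digest
 * is known, queue->left is extended and the code jumps back to the recv
 * label to pull in the rest of the header before verifying digests and
 * dispatching on the PDU type in nvmet_tcp_done_recv_pdu().
 */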
1174  
1175  static void nvmet_tcp_prep_recv_ddgst(struct nvmet_tcp_cmd *cmd)
1176  {
1177  	struct nvmet_tcp_queue *queue = cmd->queue;
1178  
1179  	nvmet_tcp_calc_ddgst(queue->rcv_hash, cmd);
1180  	queue->offset = 0;
1181  	queue->left = NVME_TCP_DIGEST_LENGTH;
1182  	queue->rcv_state = NVMET_TCP_RECV_DDGST;
1183  }
1184  
1185  static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue)
1186  {
1187  	struct nvmet_tcp_cmd  *cmd = queue->cmd;
1188  	int ret;
1189  
1190  	while (msg_data_left(&cmd->recv_msg)) {
1191  		ret = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg,
1192  			cmd->recv_msg.msg_flags);
1193  		if (ret <= 0)
1194  			return ret;
1195  
1196  		cmd->pdu_recv += ret;
1197  		cmd->rbytes_done += ret;
1198  	}
1199  
1200  	if (queue->data_digest) {
1201  		nvmet_tcp_prep_recv_ddgst(cmd);
1202  		return 0;
1203  	}
1204  
1205  	if (cmd->rbytes_done == cmd->req.transfer_len)
1206  		nvmet_tcp_execute_request(cmd);
1207  
1208  	nvmet_prepare_receive_pdu(queue);
1209  	return 0;
1210  }
1211  
1212  static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue)
1213  {
1214  	struct nvmet_tcp_cmd *cmd = queue->cmd;
1215  	int ret;
1216  	struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
1217  	struct kvec iov = {
1218  		.iov_base = (void *)&cmd->recv_ddgst + queue->offset,
1219  		.iov_len = queue->left
1220  	};
1221  
1222  	ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
1223  			iov.iov_len, msg.msg_flags);
1224  	if (unlikely(ret < 0))
1225  		return ret;
1226  
1227  	queue->offset += ret;
1228  	queue->left -= ret;
1229  	if (queue->left)
1230  		return -EAGAIN;
1231  
1232  	if (queue->data_digest && cmd->exp_ddgst != cmd->recv_ddgst) {
1233  		pr_err("queue %d: cmd %d pdu (%d) data digest error: recv %#x expected %#x\n",
1234  			queue->idx, cmd->req.cmd->common.command_id,
1235  			queue->pdu.cmd.hdr.type, le32_to_cpu(cmd->recv_ddgst),
1236  			le32_to_cpu(cmd->exp_ddgst));
1237  		nvmet_req_uninit(&cmd->req);
1238  		nvmet_tcp_free_cmd_buffers(cmd);
1239  		nvmet_tcp_fatal_error(queue);
1240  		ret = -EPROTO;
1241  		goto out;
1242  	}
1243  
1244  	if (cmd->rbytes_done == cmd->req.transfer_len)
1245  		nvmet_tcp_execute_request(cmd);
1246  
1247  	ret = 0;
1248  out:
1249  	nvmet_prepare_receive_pdu(queue);
1250  	return ret;
1251  }
1252  
1253  static int nvmet_tcp_try_recv_one(struct nvmet_tcp_queue *queue)
1254  {
1255  	int result = 0;
1256  
1257  	if (unlikely(queue->rcv_state == NVMET_TCP_RECV_ERR))
1258  		return 0;
1259  
1260  	if (queue->rcv_state == NVMET_TCP_RECV_PDU) {
1261  		result = nvmet_tcp_try_recv_pdu(queue);
1262  		if (result != 0)
1263  			goto done_recv;
1264  	}
1265  
1266  	if (queue->rcv_state == NVMET_TCP_RECV_DATA) {
1267  		result = nvmet_tcp_try_recv_data(queue);
1268  		if (result != 0)
1269  			goto done_recv;
1270  	}
1271  
1272  	if (queue->rcv_state == NVMET_TCP_RECV_DDGST) {
1273  		result = nvmet_tcp_try_recv_ddgst(queue);
1274  		if (result != 0)
1275  			goto done_recv;
1276  	}
1277  
1278  done_recv:
1279  	if (result < 0) {
1280  		if (result == -EAGAIN)
1281  			return 0;
1282  		return result;
1283  	}
1284  	return 1;
1285  }
1286  
1287  static int nvmet_tcp_try_recv(struct nvmet_tcp_queue *queue,
1288  		int budget, int *recvs)
1289  {
1290  	int i, ret = 0;
1291  
1292  	for (i = 0; i < budget; i++) {
1293  		ret = nvmet_tcp_try_recv_one(queue);
1294  		if (unlikely(ret < 0)) {
1295  			nvmet_tcp_socket_error(queue, ret);
1296  			goto done;
1297  		} else if (ret == 0) {
1298  			break;
1299  		}
1300  		(*recvs)++;
1301  	}
1302  done:
1303  	return ret;
1304  }
1305  
1306  static void nvmet_tcp_schedule_release_queue(struct nvmet_tcp_queue *queue)
1307  {
1308  	spin_lock(&queue->state_lock);
1309  	if (queue->state != NVMET_TCP_Q_DISCONNECTING) {
1310  		queue->state = NVMET_TCP_Q_DISCONNECTING;
1311  		queue_work(nvmet_wq, &queue->release_work);
1312  	}
1313  	spin_unlock(&queue->state_lock);
1314  }
1315  
1316  static inline void nvmet_tcp_arm_queue_deadline(struct nvmet_tcp_queue *queue)
1317  {
1318  	queue->poll_end = jiffies + usecs_to_jiffies(idle_poll_period_usecs);
1319  }
1320  
1321  static bool nvmet_tcp_check_queue_deadline(struct nvmet_tcp_queue *queue,
1322  		int ops)
1323  {
1324  	if (!idle_poll_period_usecs)
1325  		return false;
1326  
1327  	if (ops)
1328  		nvmet_tcp_arm_queue_deadline(queue);
1329  
1330  	return !time_after(jiffies, queue->poll_end);
1331  }
1332  
1333  static void nvmet_tcp_io_work(struct work_struct *w)
1334  {
1335  	struct nvmet_tcp_queue *queue =
1336  		container_of(w, struct nvmet_tcp_queue, io_work);
1337  	bool pending;
1338  	int ret, ops = 0;
1339  
1340  	do {
1341  		pending = false;
1342  
1343  		ret = nvmet_tcp_try_recv(queue, NVMET_TCP_RECV_BUDGET, &ops);
1344  		if (ret > 0)
1345  			pending = true;
1346  		else if (ret < 0)
1347  			return;
1348  
1349  		ret = nvmet_tcp_try_send(queue, NVMET_TCP_SEND_BUDGET, &ops);
1350  		if (ret > 0)
1351  			pending = true;
1352  		else if (ret < 0)
1353  			return;
1354  
1355  	} while (pending && ops < NVMET_TCP_IO_WORK_BUDGET);
1356  
1357  	/*
1358  	 * Requeue the worker if idle deadline period is in progress or any
1359  	 * ops activity was recorded during the do-while loop above.
1360  	 */
1361  	if (nvmet_tcp_check_queue_deadline(queue, ops) || pending)
1362  		queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
1363  }
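/*
 * Each io_work invocation interleaves receive and send processing, bounded
 * by NVMET_TCP_RECV_BUDGET and NVMET_TCP_SEND_BUDGET per iteration and by
 * NVMET_TCP_IO_WORK_BUDGET overall, so one busy queue cannot monopolize a
 * workqueue CPU.  With idle_poll_period_usecs set, the work item keeps
 * re-queueing itself until the armed deadline expires even when no ops were
 * recorded, which provides the polling behavior described for that module
 * parameter.
 */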
1364  
1365  static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue,
1366  		struct nvmet_tcp_cmd *c)
1367  {
1368  	u8 hdgst = nvmet_tcp_hdgst_len(queue);
1369  
1370  	c->queue = queue;
1371  	c->req.port = queue->port->nport;
1372  
1373  	c->cmd_pdu = page_frag_alloc(&queue->pf_cache,
1374  			sizeof(*c->cmd_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1375  	if (!c->cmd_pdu)
1376  		return -ENOMEM;
1377  	c->req.cmd = &c->cmd_pdu->cmd;
1378  
1379  	c->rsp_pdu = page_frag_alloc(&queue->pf_cache,
1380  			sizeof(*c->rsp_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1381  	if (!c->rsp_pdu)
1382  		goto out_free_cmd;
1383  	c->req.cqe = &c->rsp_pdu->cqe;
1384  
1385  	c->data_pdu = page_frag_alloc(&queue->pf_cache,
1386  			sizeof(*c->data_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1387  	if (!c->data_pdu)
1388  		goto out_free_rsp;
1389  
1390  	c->r2t_pdu = page_frag_alloc(&queue->pf_cache,
1391  			sizeof(*c->r2t_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1392  	if (!c->r2t_pdu)
1393  		goto out_free_data;
1394  
1395  	c->recv_msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
1396  
1397  	list_add_tail(&c->entry, &queue->free_list);
1398  
1399  	return 0;
1400  out_free_data:
1401  	page_frag_free(c->data_pdu);
1402  out_free_rsp:
1403  	page_frag_free(c->rsp_pdu);
1404  out_free_cmd:
1405  	page_frag_free(c->cmd_pdu);
1406  	return -ENOMEM;
1407  }
1408  
1409  static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c)
1410  {
1411  	page_frag_free(c->r2t_pdu);
1412  	page_frag_free(c->data_pdu);
1413  	page_frag_free(c->rsp_pdu);
1414  	page_frag_free(c->cmd_pdu);
1415  }
1416  
1417  static int nvmet_tcp_alloc_cmds(struct nvmet_tcp_queue *queue)
1418  {
1419  	struct nvmet_tcp_cmd *cmds;
1420  	int i, ret = -EINVAL, nr_cmds = queue->nr_cmds;
1421  
1422  	cmds = kcalloc(nr_cmds, sizeof(struct nvmet_tcp_cmd), GFP_KERNEL);
1423  	if (!cmds)
1424  		goto out;
1425  
1426  	for (i = 0; i < nr_cmds; i++) {
1427  		ret = nvmet_tcp_alloc_cmd(queue, cmds + i);
1428  		if (ret)
1429  			goto out_free;
1430  	}
1431  
1432  	queue->cmds = cmds;
1433  
1434  	return 0;
1435  out_free:
1436  	while (--i >= 0)
1437  		nvmet_tcp_free_cmd(cmds + i);
1438  	kfree(cmds);
1439  out:
1440  	return ret;
1441  }
1442  
1443  static void nvmet_tcp_free_cmds(struct nvmet_tcp_queue *queue)
1444  {
1445  	struct nvmet_tcp_cmd *cmds = queue->cmds;
1446  	int i;
1447  
1448  	for (i = 0; i < queue->nr_cmds; i++)
1449  		nvmet_tcp_free_cmd(cmds + i);
1450  
1451  	nvmet_tcp_free_cmd(&queue->connect);
1452  	kfree(cmds);
1453  }
1454  
1455  static void nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue)
1456  {
1457  	struct socket *sock = queue->sock;
1458  
1459  	write_lock_bh(&sock->sk->sk_callback_lock);
1460  	sock->sk->sk_data_ready =  queue->data_ready;
1461  	sock->sk->sk_state_change = queue->state_change;
1462  	sock->sk->sk_write_space = queue->write_space;
1463  	sock->sk->sk_user_data = NULL;
1464  	write_unlock_bh(&sock->sk->sk_callback_lock);
1465  }
1466  
1467  static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue)
1468  {
1469  	struct nvmet_tcp_cmd *cmd = queue->cmds;
1470  	int i;
1471  
1472  	for (i = 0; i < queue->nr_cmds; i++, cmd++) {
1473  		if (nvmet_tcp_need_data_in(cmd))
1474  			nvmet_req_uninit(&cmd->req);
1475  	}
1476  
1477  	if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect)) {
1478  		/* failed in connect */
1479  		nvmet_req_uninit(&queue->connect.req);
1480  	}
1481  }
1482  
1483  static void nvmet_tcp_free_cmd_data_in_buffers(struct nvmet_tcp_queue *queue)
1484  {
1485  	struct nvmet_tcp_cmd *cmd = queue->cmds;
1486  	int i;
1487  
1488  	for (i = 0; i < queue->nr_cmds; i++, cmd++)
1489  		nvmet_tcp_free_cmd_buffers(cmd);
1490  	nvmet_tcp_free_cmd_buffers(&queue->connect);
1491  }
1492  
1493  static void nvmet_tcp_release_queue_work(struct work_struct *w)
1494  {
1495  	struct page *page;
1496  	struct nvmet_tcp_queue *queue =
1497  		container_of(w, struct nvmet_tcp_queue, release_work);
1498  
1499  	mutex_lock(&nvmet_tcp_queue_mutex);
1500  	list_del_init(&queue->queue_list);
1501  	mutex_unlock(&nvmet_tcp_queue_mutex);
1502  
1503  	nvmet_tcp_restore_socket_callbacks(queue);
1504  	cancel_work_sync(&queue->io_work);
1505  	/* stop accepting incoming data */
1506  	queue->rcv_state = NVMET_TCP_RECV_ERR;
1507  
1508  	nvmet_tcp_uninit_data_in_cmds(queue);
1509  	nvmet_sq_destroy(&queue->nvme_sq);
1510  	cancel_work_sync(&queue->io_work);
1511  	nvmet_tcp_free_cmd_data_in_buffers(queue);
1512  	sock_release(queue->sock);
1513  	nvmet_tcp_free_cmds(queue);
1514  	if (queue->hdr_digest || queue->data_digest)
1515  		nvmet_tcp_free_crypto(queue);
1516  	ida_free(&nvmet_tcp_queue_ida, queue->idx);
1517  
1518  	page = virt_to_head_page(queue->pf_cache.va);
1519  	__page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias);
1520  	kfree(queue);
1521  }
1522  
1523  static void nvmet_tcp_data_ready(struct sock *sk)
1524  {
1525  	struct nvmet_tcp_queue *queue;
1526  
1527  	trace_sk_data_ready(sk);
1528  
1529  	read_lock_bh(&sk->sk_callback_lock);
1530  	queue = sk->sk_user_data;
1531  	if (likely(queue))
1532  		queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
1533  	read_unlock_bh(&sk->sk_callback_lock);
1534  }
1535  
1536  static void nvmet_tcp_write_space(struct sock *sk)
1537  {
1538  	struct nvmet_tcp_queue *queue;
1539  
1540  	read_lock_bh(&sk->sk_callback_lock);
1541  	queue = sk->sk_user_data;
1542  	if (unlikely(!queue))
1543  		goto out;
1544  
1545  	if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
1546  		queue->write_space(sk);
1547  		goto out;
1548  	}
1549  
1550  	if (sk_stream_is_writeable(sk)) {
1551  		clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1552  		queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
1553  	}
1554  out:
1555  	read_unlock_bh(&sk->sk_callback_lock);
1556  }
1557  
1558  static void nvmet_tcp_state_change(struct sock *sk)
1559  {
1560  	struct nvmet_tcp_queue *queue;
1561  
1562  	read_lock_bh(&sk->sk_callback_lock);
1563  	queue = sk->sk_user_data;
1564  	if (!queue)
1565  		goto done;
1566  
1567  	switch (sk->sk_state) {
1568  	case TCP_FIN_WAIT2:
1569  	case TCP_LAST_ACK:
1570  		break;
1571  	case TCP_FIN_WAIT1:
1572  	case TCP_CLOSE_WAIT:
1573  	case TCP_CLOSE:
1574  		/* FALLTHRU */
1575  		nvmet_tcp_schedule_release_queue(queue);
1576  		break;
1577  	default:
1578  		pr_warn("queue %d unhandled state %d\n",
1579  			queue->idx, sk->sk_state);
1580  	}
1581  done:
1582  	read_unlock_bh(&sk->sk_callback_lock);
1583  }
1584  
1585  static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
1586  {
1587  	struct socket *sock = queue->sock;
1588  	struct inet_sock *inet = inet_sk(sock->sk);
1589  	int ret;
1590  
1591  	ret = kernel_getsockname(sock,
1592  		(struct sockaddr *)&queue->sockaddr);
1593  	if (ret < 0)
1594  		return ret;
1595  
1596  	ret = kernel_getpeername(sock,
1597  		(struct sockaddr *)&queue->sockaddr_peer);
1598  	if (ret < 0)
1599  		return ret;
1600  
1601  	/*
1602  	 * Cleanup whatever is sitting in the TCP transmit queue on socket
1603  	 * close. This is done to prevent stale data from being sent should
1604  	 * the network connection be restored before TCP times out.
1605  	 */
1606  	sock_no_linger(sock->sk);
1607  
1608  	if (so_priority > 0)
1609  		sock_set_priority(sock->sk, so_priority);
1610  
1611  	/* Set socket type of service */
1612  	if (inet->rcv_tos > 0)
1613  		ip_sock_set_tos(sock->sk, inet->rcv_tos);
1614  
1615  	ret = 0;
1616  	write_lock_bh(&sock->sk->sk_callback_lock);
1617  	if (sock->sk->sk_state != TCP_ESTABLISHED) {
1618  		/*
1619  		 * If the socket is already closing, don't even start
1620  		 * consuming it
1621  		 */
1622  		ret = -ENOTCONN;
1623  	} else {
1624  		sock->sk->sk_user_data = queue;
1625  		queue->data_ready = sock->sk->sk_data_ready;
1626  		sock->sk->sk_data_ready = nvmet_tcp_data_ready;
1627  		queue->state_change = sock->sk->sk_state_change;
1628  		sock->sk->sk_state_change = nvmet_tcp_state_change;
1629  		queue->write_space = sock->sk->sk_write_space;
1630  		sock->sk->sk_write_space = nvmet_tcp_write_space;
1631  		if (idle_poll_period_usecs)
1632  			nvmet_tcp_arm_queue_deadline(queue);
1633  		queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
1634  	}
1635  	write_unlock_bh(&sock->sk->sk_callback_lock);
1636  
1637  	return ret;
1638  }
1639  
1640  static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
1641  		struct socket *newsock)
1642  {
1643  	struct nvmet_tcp_queue *queue;
1644  	int ret;
1645  
1646  	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
1647  	if (!queue)
1648  		return -ENOMEM;
1649  
1650  	INIT_WORK(&queue->release_work, nvmet_tcp_release_queue_work);
1651  	INIT_WORK(&queue->io_work, nvmet_tcp_io_work);
1652  	queue->sock = newsock;
1653  	queue->port = port;
1654  	queue->nr_cmds = 0;
1655  	spin_lock_init(&queue->state_lock);
1656  	queue->state = NVMET_TCP_Q_CONNECTING;
1657  	INIT_LIST_HEAD(&queue->free_list);
1658  	init_llist_head(&queue->resp_list);
1659  	INIT_LIST_HEAD(&queue->resp_send_list);
1660  
1661  	queue->idx = ida_alloc(&nvmet_tcp_queue_ida, GFP_KERNEL);
1662  	if (queue->idx < 0) {
1663  		ret = queue->idx;
1664  		goto out_free_queue;
1665  	}
1666  
1667  	ret = nvmet_tcp_alloc_cmd(queue, &queue->connect);
1668  	if (ret)
1669  		goto out_ida_remove;
1670  
1671  	ret = nvmet_sq_init(&queue->nvme_sq);
1672  	if (ret)
1673  		goto out_free_connect;
1674  
1675  	nvmet_prepare_receive_pdu(queue);
1676  
1677  	mutex_lock(&nvmet_tcp_queue_mutex);
1678  	list_add_tail(&queue->queue_list, &nvmet_tcp_queue_list);
1679  	mutex_unlock(&nvmet_tcp_queue_mutex);
1680  
1681  	ret = nvmet_tcp_set_queue_sock(queue);
1682  	if (ret)
1683  		goto out_destroy_sq;
1684  
1685  	return 0;
1686  out_destroy_sq:
1687  	mutex_lock(&nvmet_tcp_queue_mutex);
1688  	list_del_init(&queue->queue_list);
1689  	mutex_unlock(&nvmet_tcp_queue_mutex);
1690  	nvmet_sq_destroy(&queue->nvme_sq);
1691  out_free_connect:
1692  	nvmet_tcp_free_cmd(&queue->connect);
1693  out_ida_remove:
1694  	ida_free(&nvmet_tcp_queue_ida, queue->idx);
1695  out_free_queue:
1696  	kfree(queue);
1697  	return ret;
1698  }
1699  
1700  static void nvmet_tcp_accept_work(struct work_struct *w)
1701  {
1702  	struct nvmet_tcp_port *port =
1703  		container_of(w, struct nvmet_tcp_port, accept_work);
1704  	struct socket *newsock;
1705  	int ret;
1706  
1707  	while (true) {
1708  		ret = kernel_accept(port->sock, &newsock, O_NONBLOCK);
1709  		if (ret < 0) {
1710  			if (ret != -EAGAIN)
1711  				pr_warn("failed to accept err=%d\n", ret);
1712  			return;
1713  		}
1714  		ret = nvmet_tcp_alloc_queue(port, newsock);
1715  		if (ret) {
1716  			pr_err("failed to allocate queue\n");
1717  			sock_release(newsock);
1718  		}
1719  	}
1720  }
1721  
1722  static void nvmet_tcp_listen_data_ready(struct sock *sk)
1723  {
1724  	struct nvmet_tcp_port *port;
1725  
1726  	trace_sk_data_ready(sk);
1727  
1728  	read_lock_bh(&sk->sk_callback_lock);
1729  	port = sk->sk_user_data;
1730  	if (!port)
1731  		goto out;
1732  
1733  	if (sk->sk_state == TCP_LISTEN)
1734  		queue_work(nvmet_wq, &port->accept_work);
1735  out:
1736  	read_unlock_bh(&sk->sk_callback_lock);
1737  }
1738  
1739  static int nvmet_tcp_add_port(struct nvmet_port *nport)
1740  {
1741  	struct nvmet_tcp_port *port;
1742  	__kernel_sa_family_t af;
1743  	int ret;
1744  
1745  	port = kzalloc(sizeof(*port), GFP_KERNEL);
1746  	if (!port)
1747  		return -ENOMEM;
1748  
1749  	switch (nport->disc_addr.adrfam) {
1750  	case NVMF_ADDR_FAMILY_IP4:
1751  		af = AF_INET;
1752  		break;
1753  	case NVMF_ADDR_FAMILY_IP6:
1754  		af = AF_INET6;
1755  		break;
1756  	default:
1757  		pr_err("address family %d not supported\n",
1758  				nport->disc_addr.adrfam);
1759  		ret = -EINVAL;
1760  		goto err_port;
1761  	}
1762  
1763  	ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr,
1764  			nport->disc_addr.trsvcid, &port->addr);
1765  	if (ret) {
1766  		pr_err("malformed ip/port passed: %s:%s\n",
1767  			nport->disc_addr.traddr, nport->disc_addr.trsvcid);
1768  		goto err_port;
1769  	}
1770  
1771  	port->nport = nport;
1772  	INIT_WORK(&port->accept_work, nvmet_tcp_accept_work);
1773  	if (port->nport->inline_data_size < 0)
1774  		port->nport->inline_data_size = NVMET_TCP_DEF_INLINE_DATA_SIZE;
1775  
1776  	ret = sock_create(port->addr.ss_family, SOCK_STREAM,
1777  				IPPROTO_TCP, &port->sock);
1778  	if (ret) {
1779  		pr_err("failed to create a socket\n");
1780  		goto err_port;
1781  	}
1782  
1783  	port->sock->sk->sk_user_data = port;
1784  	port->data_ready = port->sock->sk->sk_data_ready;
1785  	port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready;
1786  	sock_set_reuseaddr(port->sock->sk);
1787  	tcp_sock_set_nodelay(port->sock->sk);
1788  	if (so_priority > 0)
1789  		sock_set_priority(port->sock->sk, so_priority);
1790  
1791  	ret = kernel_bind(port->sock, (struct sockaddr *)&port->addr,
1792  			sizeof(port->addr));
1793  	if (ret) {
1794  		pr_err("failed to bind port socket %d\n", ret);
1795  		goto err_sock;
1796  	}
1797  
1798  	ret = kernel_listen(port->sock, 128);
1799  	if (ret) {
1800  		pr_err("failed to listen %d on port sock\n", ret);
1801  		goto err_sock;
1802  	}
1803  
1804  	nport->priv = port;
1805  	pr_info("enabling port %d (%pISpc)\n",
1806  		le16_to_cpu(nport->disc_addr.portid), &port->addr);
1807  
1808  	return 0;
1809  
1810  err_sock:
1811  	sock_release(port->sock);
1812  err_port:
1813  	kfree(port);
1814  	return ret;
1815  }
1816  
1817  static void nvmet_tcp_destroy_port_queues(struct nvmet_tcp_port *port)
1818  {
1819  	struct nvmet_tcp_queue *queue;
1820  
1821  	mutex_lock(&nvmet_tcp_queue_mutex);
1822  	list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
1823  		if (queue->port == port)
1824  			kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1825  	mutex_unlock(&nvmet_tcp_queue_mutex);
1826  }
1827  
1828  static void nvmet_tcp_remove_port(struct nvmet_port *nport)
1829  {
1830  	struct nvmet_tcp_port *port = nport->priv;
1831  
1832  	write_lock_bh(&port->sock->sk->sk_callback_lock);
1833  	port->sock->sk->sk_data_ready = port->data_ready;
1834  	port->sock->sk->sk_user_data = NULL;
1835  	write_unlock_bh(&port->sock->sk->sk_callback_lock);
1836  	cancel_work_sync(&port->accept_work);
1837  	/*
1838  	 * Destroy the remaining queues, which do not belong to any
1839  	 * controller yet.
1840  	 */
1841  	nvmet_tcp_destroy_port_queues(port);
1842  
1843  	sock_release(port->sock);
1844  	kfree(port);
1845  }
1846  
1847  static void nvmet_tcp_delete_ctrl(struct nvmet_ctrl *ctrl)
1848  {
1849  	struct nvmet_tcp_queue *queue;
1850  
1851  	mutex_lock(&nvmet_tcp_queue_mutex);
1852  	list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
1853  		if (queue->nvme_sq.ctrl == ctrl)
1854  			kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1855  	mutex_unlock(&nvmet_tcp_queue_mutex);
1856  }
1857  
1858  static u16 nvmet_tcp_install_queue(struct nvmet_sq *sq)
1859  {
1860  	struct nvmet_tcp_queue *queue =
1861  		container_of(sq, struct nvmet_tcp_queue, nvme_sq);
1862  
1863  	if (sq->qid == 0) {
1864  		/* Let inflight controller teardown complete */
1865  		flush_workqueue(nvmet_wq);
1866  	}
1867  
1868  	queue->nr_cmds = sq->size * 2;
1869  	if (nvmet_tcp_alloc_cmds(queue)) {
1870  		queue->nr_cmds = 0;
1871  		return NVME_SC_INTERNAL;
1872  	}
1873  	return 0;
1874  }
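/*
 * The per-queue command pool is sized to twice the negotiated submission
 * queue depth, presumably to leave headroom for responses that are still in
 * flight on the wire while new commands arrive.  Before this point only the
 * pre-allocated connect command exists, which is why nvmet_tcp_cmd_tag()
 * returns 0xffff while nr_cmds is still zero.
 */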
1875  
1876  static void nvmet_tcp_disc_port_addr(struct nvmet_req *req,
1877  		struct nvmet_port *nport, char *traddr)
1878  {
1879  	struct nvmet_tcp_port *port = nport->priv;
1880  
1881  	if (inet_addr_is_any((struct sockaddr *)&port->addr)) {
1882  		struct nvmet_tcp_cmd *cmd =
1883  			container_of(req, struct nvmet_tcp_cmd, req);
1884  		struct nvmet_tcp_queue *queue = cmd->queue;
1885  
1886  		sprintf(traddr, "%pISc", (struct sockaddr *)&queue->sockaddr);
1887  	} else {
1888  		memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE);
1889  	}
1890  }
1891  
1892  static const struct nvmet_fabrics_ops nvmet_tcp_ops = {
1893  	.owner			= THIS_MODULE,
1894  	.type			= NVMF_TRTYPE_TCP,
1895  	.msdbd			= 1,
1896  	.add_port		= nvmet_tcp_add_port,
1897  	.remove_port		= nvmet_tcp_remove_port,
1898  	.queue_response		= nvmet_tcp_queue_response,
1899  	.delete_ctrl		= nvmet_tcp_delete_ctrl,
1900  	.install_queue		= nvmet_tcp_install_queue,
1901  	.disc_traddr		= nvmet_tcp_disc_port_addr,
1902  };
1903  
1904  static int __init nvmet_tcp_init(void)
1905  {
1906  	int ret;
1907  
1908  	nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq",
1909  				WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
1910  	if (!nvmet_tcp_wq)
1911  		return -ENOMEM;
1912  
1913  	ret = nvmet_register_transport(&nvmet_tcp_ops);
1914  	if (ret)
1915  		goto err;
1916  
1917  	return 0;
1918  err:
1919  	destroy_workqueue(nvmet_tcp_wq);
1920  	return ret;
1921  }
1922  
1923  static void __exit nvmet_tcp_exit(void)
1924  {
1925  	struct nvmet_tcp_queue *queue;
1926  
1927  	nvmet_unregister_transport(&nvmet_tcp_ops);
1928  
1929  	flush_workqueue(nvmet_wq);
1930  	mutex_lock(&nvmet_tcp_queue_mutex);
1931  	list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
1932  		kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1933  	mutex_unlock(&nvmet_tcp_queue_mutex);
1934  	flush_workqueue(nvmet_wq);
1935  
1936  	destroy_workqueue(nvmet_tcp_wq);
1937  	ida_destroy(&nvmet_tcp_queue_ida);
1938  }
1939  
1940  module_init(nvmet_tcp_init);
1941  module_exit(nvmet_tcp_exit);
1942  
1943  MODULE_LICENSE("GPL v2");
1944  MODULE_ALIAS("nvmet-transport-3"); /* 3 == NVMF_TRTYPE_TCP */
1945