xref: /openbmc/linux/drivers/nvme/target/tcp.c (revision 24ce659d)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * NVMe over Fabrics TCP target.
4  * Copyright (c) 2018 Lightbits Labs. All rights reserved.
5  */
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7 #include <linux/module.h>
8 #include <linux/init.h>
9 #include <linux/slab.h>
10 #include <linux/err.h>
11 #include <linux/nvme-tcp.h>
12 #include <net/sock.h>
13 #include <net/tcp.h>
14 #include <linux/inet.h>
15 #include <linux/llist.h>
16 #include <crypto/hash.h>
17 
18 #include "nvmet.h"
19 
20 #define NVMET_TCP_DEF_INLINE_DATA_SIZE	(4 * PAGE_SIZE)
21 
22 /* Define the socket priority to use for connections where it is desirable
23  * that the NIC consider performing optimized packet processing or filtering.
24  * A non-zero value is sufficient to indicate general consideration of any
25  * possible optimization.  Making it a module param allows for alternative
26  * values that may be unique to some NIC implementations.
27  */
28 static int so_priority;
29 module_param(so_priority, int, 0644);
30 MODULE_PARM_DESC(so_priority, "nvmet tcp socket optimization priority");
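/*
 * For example (value chosen arbitrarily), loading the module with
 * "modprobe nvmet-tcp so_priority=6", or writing to
 * /sys/module/nvmet_tcp/parameters/so_priority at runtime, applies
 * SO_PRIORITY to the listening socket and to each subsequently accepted
 * queue socket below.
 */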
31 
32 #define NVMET_TCP_RECV_BUDGET		8
33 #define NVMET_TCP_SEND_BUDGET		8
34 #define NVMET_TCP_IO_WORK_BUDGET	64
35 
36 enum nvmet_tcp_send_state {
37 	NVMET_TCP_SEND_DATA_PDU,
38 	NVMET_TCP_SEND_DATA,
39 	NVMET_TCP_SEND_R2T,
40 	NVMET_TCP_SEND_DDGST,
41 	NVMET_TCP_SEND_RESPONSE
42 };
43 
44 enum nvmet_tcp_recv_state {
45 	NVMET_TCP_RECV_PDU,
46 	NVMET_TCP_RECV_DATA,
47 	NVMET_TCP_RECV_DDGST,
48 	NVMET_TCP_RECV_ERR,
49 };
50 
51 enum {
52 	NVMET_TCP_F_INIT_FAILED = (1 << 0),
53 };
54 
55 struct nvmet_tcp_cmd {
56 	struct nvmet_tcp_queue		*queue;
57 	struct nvmet_req		req;
58 
59 	struct nvme_tcp_cmd_pdu		*cmd_pdu;
60 	struct nvme_tcp_rsp_pdu		*rsp_pdu;
61 	struct nvme_tcp_data_pdu	*data_pdu;
62 	struct nvme_tcp_r2t_pdu		*r2t_pdu;
63 
64 	u32				rbytes_done;
65 	u32				wbytes_done;
66 
67 	u32				pdu_len;
68 	u32				pdu_recv;
69 	int				sg_idx;
70 	int				nr_mapped;
71 	struct msghdr			recv_msg;
72 	struct kvec			*iov;
73 	u32				flags;
74 
75 	struct list_head		entry;
76 	struct llist_node		lentry;
77 
78 	/* send state */
79 	u32				offset;
80 	struct scatterlist		*cur_sg;
81 	enum nvmet_tcp_send_state	state;
82 
83 	__le32				exp_ddgst;
84 	__le32				recv_ddgst;
85 };
86 
87 enum nvmet_tcp_queue_state {
88 	NVMET_TCP_Q_CONNECTING,
89 	NVMET_TCP_Q_LIVE,
90 	NVMET_TCP_Q_DISCONNECTING,
91 };
92 
93 struct nvmet_tcp_queue {
94 	struct socket		*sock;
95 	struct nvmet_tcp_port	*port;
96 	struct work_struct	io_work;
97 	int			cpu;
98 	struct nvmet_cq		nvme_cq;
99 	struct nvmet_sq		nvme_sq;
100 
101 	/* send state */
102 	struct nvmet_tcp_cmd	*cmds;
103 	unsigned int		nr_cmds;
104 	struct list_head	free_list;
105 	struct llist_head	resp_list;
106 	struct list_head	resp_send_list;
107 	int			send_list_len;
108 	struct nvmet_tcp_cmd	*snd_cmd;
109 
110 	/* recv state */
111 	int			offset;
112 	int			left;
113 	enum nvmet_tcp_recv_state rcv_state;
114 	struct nvmet_tcp_cmd	*cmd;
115 	union nvme_tcp_pdu	pdu;
116 
117 	/* digest state */
118 	bool			hdr_digest;
119 	bool			data_digest;
120 	struct ahash_request	*snd_hash;
121 	struct ahash_request	*rcv_hash;
122 
123 	spinlock_t		state_lock;
124 	enum nvmet_tcp_queue_state state;
125 
126 	struct sockaddr_storage	sockaddr;
127 	struct sockaddr_storage	sockaddr_peer;
128 	struct work_struct	release_work;
129 
130 	int			idx;
131 	struct list_head	queue_list;
132 
133 	struct nvmet_tcp_cmd	connect;
134 
135 	struct page_frag_cache	pf_cache;
136 
137 	void (*data_ready)(struct sock *);
138 	void (*state_change)(struct sock *);
139 	void (*write_space)(struct sock *);
140 };
141 
142 struct nvmet_tcp_port {
143 	struct socket		*sock;
144 	struct work_struct	accept_work;
145 	struct nvmet_port	*nport;
146 	struct sockaddr_storage addr;
147 	int			last_cpu;
148 	void (*data_ready)(struct sock *);
149 };
150 
151 static DEFINE_IDA(nvmet_tcp_queue_ida);
152 static LIST_HEAD(nvmet_tcp_queue_list);
153 static DEFINE_MUTEX(nvmet_tcp_queue_mutex);
154 
155 static struct workqueue_struct *nvmet_tcp_wq;
156 static struct nvmet_fabrics_ops nvmet_tcp_ops;
157 static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c);
158 static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd);
159 
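/*
 * The command's tag is simply its index in the queue's cmds array; it is
 * carried in R2T PDUs as the transfer tag (ttag) and echoed back by the
 * host in H2C data PDUs so the command can be looked up again.
 */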
160 static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue,
161 		struct nvmet_tcp_cmd *cmd)
162 {
163 	return cmd - queue->cmds;
164 }
165 
166 static inline bool nvmet_tcp_has_data_in(struct nvmet_tcp_cmd *cmd)
167 {
168 	return nvme_is_write(cmd->req.cmd) &&
169 		cmd->rbytes_done < cmd->req.transfer_len;
170 }
171 
172 static inline bool nvmet_tcp_need_data_in(struct nvmet_tcp_cmd *cmd)
173 {
174 	return nvmet_tcp_has_data_in(cmd) && !cmd->req.cqe->status;
175 }
176 
177 static inline bool nvmet_tcp_need_data_out(struct nvmet_tcp_cmd *cmd)
178 {
179 	return !nvme_is_write(cmd->req.cmd) &&
180 		cmd->req.transfer_len > 0 &&
181 		!cmd->req.cqe->status;
182 }
183 
184 static inline bool nvmet_tcp_has_inline_data(struct nvmet_tcp_cmd *cmd)
185 {
186 	return nvme_is_write(cmd->req.cmd) && cmd->pdu_len &&
187 		!cmd->rbytes_done;
188 }
189 
190 static inline struct nvmet_tcp_cmd *
191 nvmet_tcp_get_cmd(struct nvmet_tcp_queue *queue)
192 {
193 	struct nvmet_tcp_cmd *cmd;
194 
195 	cmd = list_first_entry_or_null(&queue->free_list,
196 				struct nvmet_tcp_cmd, entry);
197 	if (!cmd)
198 		return NULL;
199 	list_del_init(&cmd->entry);
200 
201 	cmd->rbytes_done = cmd->wbytes_done = 0;
202 	cmd->pdu_len = 0;
203 	cmd->pdu_recv = 0;
204 	cmd->iov = NULL;
205 	cmd->flags = 0;
206 	return cmd;
207 }
208 
209 static inline void nvmet_tcp_put_cmd(struct nvmet_tcp_cmd *cmd)
210 {
211 	if (unlikely(cmd == &cmd->queue->connect))
212 		return;
213 
214 	list_add_tail(&cmd->entry, &cmd->queue->free_list);
215 }
216 
217 static inline u8 nvmet_tcp_hdgst_len(struct nvmet_tcp_queue *queue)
218 {
219 	return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
220 }
221 
222 static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue)
223 {
224 	return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
225 }
226 
227 static inline void nvmet_tcp_hdgst(struct ahash_request *hash,
228 		void *pdu, size_t len)
229 {
230 	struct scatterlist sg;
231 
232 	sg_init_one(&sg, pdu, len);
233 	ahash_request_set_crypt(hash, &sg, pdu + len, len);
234 	crypto_ahash_digest(hash);
235 }
236 
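/*
 * Verify the header digest of a received PDU: save the digest the host
 * placed right after the header, recompute crc32c over the header in
 * place and compare the two values.
 */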
237 static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue,
238 	void *pdu, size_t len)
239 {
240 	struct nvme_tcp_hdr *hdr = pdu;
241 	__le32 recv_digest;
242 	__le32 exp_digest;
243 
244 	if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
245 		pr_err("queue %d: header digest enabled but no header digest\n",
246 			queue->idx);
247 		return -EPROTO;
248 	}
249 
250 	recv_digest = *(__le32 *)(pdu + hdr->hlen);
251 	nvmet_tcp_hdgst(queue->rcv_hash, pdu, len);
252 	exp_digest = *(__le32 *)(pdu + hdr->hlen);
253 	if (recv_digest != exp_digest) {
254 		pr_err("queue %d: header digest error: recv %#x expected %#x\n",
255 			queue->idx, le32_to_cpu(recv_digest),
256 			le32_to_cpu(exp_digest));
257 		return -EPROTO;
258 	}
259 
260 	return 0;
261 }
262 
263 static int nvmet_tcp_check_ddgst(struct nvmet_tcp_queue *queue, void *pdu)
264 {
265 	struct nvme_tcp_hdr *hdr = pdu;
266 	u8 digest_len = nvmet_tcp_hdgst_len(queue);
267 	u32 len;
268 
269 	len = le32_to_cpu(hdr->plen) - hdr->hlen -
270 		(hdr->flags & NVME_TCP_F_HDGST ? digest_len : 0);
271 
272 	if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
273 		pr_err("queue %d: data digest flag is cleared\n", queue->idx);
274 		return -EPROTO;
275 	}
276 
277 	return 0;
278 }
279 
280 static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd)
281 {
282 	struct scatterlist *sg;
283 	int i;
284 
285 	sg = &cmd->req.sg[cmd->sg_idx];
286 
287 	for (i = 0; i < cmd->nr_mapped; i++)
288 		kunmap(sg_page(&sg[i]));
289 }
290 
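/*
 * Build a kvec array over the kmap()ed pages of the command's sg list so
 * inline or H2C data can be received straight into the data buffer.  The
 * starting sg entry and intra-page offset are derived from rbytes_done.
 */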
291 static void nvmet_tcp_map_pdu_iovec(struct nvmet_tcp_cmd *cmd)
292 {
293 	struct kvec *iov = cmd->iov;
294 	struct scatterlist *sg;
295 	u32 length, offset, sg_offset;
296 
297 	length = cmd->pdu_len;
298 	cmd->nr_mapped = DIV_ROUND_UP(length, PAGE_SIZE);
299 	offset = cmd->rbytes_done;
300 	cmd->sg_idx = offset / PAGE_SIZE;
301 	sg_offset = offset % PAGE_SIZE;
302 	sg = &cmd->req.sg[cmd->sg_idx];
303 
304 	while (length) {
305 		u32 iov_len = min_t(u32, length, sg->length - sg_offset);
306 
307 		iov->iov_base = kmap(sg_page(sg)) + sg->offset + sg_offset;
308 		iov->iov_len = iov_len;
309 
310 		length -= iov_len;
311 		sg = sg_next(sg);
312 		iov++;
		sg_offset = 0;	/* only the first sg entry may start mid-page */
313 	}
314 
315 	iov_iter_kvec(&cmd->recv_msg.msg_iter, READ, cmd->iov,
316 		cmd->nr_mapped, cmd->pdu_len);
317 }
318 
319 static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue)
320 {
321 	queue->rcv_state = NVMET_TCP_RECV_ERR;
322 	if (queue->nvme_sq.ctrl)
323 		nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
324 	else
325 		kernel_sock_shutdown(queue->sock, SHUT_RDWR);
326 }
327 
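/*
 * Parse the command's SGL descriptor and allocate the data buffer.
 * In-capsule (inline) data is only accepted for write commands that fit
 * within the port's inline_data_size; commands that expect
 * host-to-controller data also get an iovec array for the receive path.
 */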
328 static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd)
329 {
330 	struct nvme_sgl_desc *sgl = &cmd->req.cmd->common.dptr.sgl;
331 	u32 len = le32_to_cpu(sgl->length);
332 
333 	if (!len)
334 		return 0;
335 
336 	if (sgl->type == ((NVME_SGL_FMT_DATA_DESC << 4) |
337 			  NVME_SGL_FMT_OFFSET)) {
338 		if (!nvme_is_write(cmd->req.cmd))
339 			return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
340 
341 		if (len > cmd->req.port->inline_data_size)
342 			return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
343 		cmd->pdu_len = len;
344 	}
345 	cmd->req.transfer_len += len;
346 
347 	cmd->req.sg = sgl_alloc(len, GFP_KERNEL, &cmd->req.sg_cnt);
348 	if (!cmd->req.sg)
349 		return NVME_SC_INTERNAL;
350 	cmd->cur_sg = cmd->req.sg;
351 
352 	if (nvmet_tcp_has_data_in(cmd)) {
353 		cmd->iov = kmalloc_array(cmd->req.sg_cnt,
354 				sizeof(*cmd->iov), GFP_KERNEL);
355 		if (!cmd->iov)
356 			goto err;
357 	}
358 
359 	return 0;
360 err:
361 	sgl_free(cmd->req.sg);
362 	return NVME_SC_INTERNAL;
363 }
364 
365 static void nvmet_tcp_ddgst(struct ahash_request *hash,
366 		struct nvmet_tcp_cmd *cmd)
367 {
368 	ahash_request_set_crypt(hash, cmd->req.sg,
369 		(void *)&cmd->exp_ddgst, cmd->req.transfer_len);
370 	crypto_ahash_digest(hash);
371 }
372 
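/*
 * Prepare a C2H data PDU for controller-to-host data.  If the host
 * disabled SQ head pointer updates, the PDU also carries the SUCCESS flag
 * so that no separate response capsule needs to be sent.
 */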
373 static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd)
374 {
375 	struct nvme_tcp_data_pdu *pdu = cmd->data_pdu;
376 	struct nvmet_tcp_queue *queue = cmd->queue;
377 	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
378 	u8 ddgst = nvmet_tcp_ddgst_len(cmd->queue);
379 
380 	cmd->offset = 0;
381 	cmd->state = NVMET_TCP_SEND_DATA_PDU;
382 
383 	pdu->hdr.type = nvme_tcp_c2h_data;
384 	pdu->hdr.flags = NVME_TCP_F_DATA_LAST | (queue->nvme_sq.sqhd_disabled ?
385 						NVME_TCP_F_DATA_SUCCESS : 0);
386 	pdu->hdr.hlen = sizeof(*pdu);
387 	pdu->hdr.pdo = pdu->hdr.hlen + hdgst;
388 	pdu->hdr.plen =
389 		cpu_to_le32(pdu->hdr.hlen + hdgst +
390 				cmd->req.transfer_len + ddgst);
391 	pdu->command_id = cmd->req.cqe->command_id;
392 	pdu->data_length = cpu_to_le32(cmd->req.transfer_len);
393 	pdu->data_offset = cpu_to_le32(cmd->wbytes_done);
394 
395 	if (queue->data_digest) {
396 		pdu->hdr.flags |= NVME_TCP_F_DDGST;
397 		nvmet_tcp_ddgst(queue->snd_hash, cmd);
398 	}
399 
400 	if (cmd->queue->hdr_digest) {
401 		pdu->hdr.flags |= NVME_TCP_F_HDGST;
402 		nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
403 	}
404 }
405 
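/*
 * Prepare an R2T PDU requesting the remainder of the host-to-controller
 * data, starting at rbytes_done.  The transfer tag identifies this
 * command in the H2C data PDUs that follow.
 */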
406 static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd)
407 {
408 	struct nvme_tcp_r2t_pdu *pdu = cmd->r2t_pdu;
409 	struct nvmet_tcp_queue *queue = cmd->queue;
410 	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
411 
412 	cmd->offset = 0;
413 	cmd->state = NVMET_TCP_SEND_R2T;
414 
415 	pdu->hdr.type = nvme_tcp_r2t;
416 	pdu->hdr.flags = 0;
417 	pdu->hdr.hlen = sizeof(*pdu);
418 	pdu->hdr.pdo = 0;
419 	pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
420 
421 	pdu->command_id = cmd->req.cmd->common.command_id;
422 	pdu->ttag = nvmet_tcp_cmd_tag(cmd->queue, cmd);
423 	pdu->r2t_length = cpu_to_le32(cmd->req.transfer_len - cmd->rbytes_done);
424 	pdu->r2t_offset = cpu_to_le32(cmd->rbytes_done);
425 	if (cmd->queue->hdr_digest) {
426 		pdu->hdr.flags |= NVME_TCP_F_HDGST;
427 		nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
428 	}
429 }
430 
431 static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd)
432 {
433 	struct nvme_tcp_rsp_pdu *pdu = cmd->rsp_pdu;
434 	struct nvmet_tcp_queue *queue = cmd->queue;
435 	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
436 
437 	cmd->offset = 0;
438 	cmd->state = NVMET_TCP_SEND_RESPONSE;
439 
440 	pdu->hdr.type = nvme_tcp_rsp;
441 	pdu->hdr.flags = 0;
442 	pdu->hdr.hlen = sizeof(*pdu);
443 	pdu->hdr.pdo = 0;
444 	pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
445 	if (cmd->queue->hdr_digest) {
446 		pdu->hdr.flags |= NVME_TCP_F_HDGST;
447 		nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
448 	}
449 }
450 
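/*
 * Move responses queued (locklessly) by nvmet_tcp_queue_response() from
 * the llist onto the send list that nvmet_tcp_fetch_cmd() consumes.
 */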
451 static void nvmet_tcp_process_resp_list(struct nvmet_tcp_queue *queue)
452 {
453 	struct llist_node *node;
454 
455 	node = llist_del_all(&queue->resp_list);
456 	if (!node)
457 		return;
458 
459 	while (node) {
460 		struct nvmet_tcp_cmd *cmd = llist_entry(node,
461 					struct nvmet_tcp_cmd, lentry);
462 
463 		list_add(&cmd->entry, &queue->resp_send_list);
464 		node = node->next;
465 		queue->send_list_len++;
466 	}
467 }
468 
469 static struct nvmet_tcp_cmd *nvmet_tcp_fetch_cmd(struct nvmet_tcp_queue *queue)
470 {
471 	queue->snd_cmd = list_first_entry_or_null(&queue->resp_send_list,
472 				struct nvmet_tcp_cmd, entry);
473 	if (!queue->snd_cmd) {
474 		nvmet_tcp_process_resp_list(queue);
475 		queue->snd_cmd =
476 			list_first_entry_or_null(&queue->resp_send_list,
477 					struct nvmet_tcp_cmd, entry);
478 		if (unlikely(!queue->snd_cmd))
479 			return NULL;
480 	}
481 
482 	list_del_init(&queue->snd_cmd->entry);
483 	queue->send_list_len--;
484 
485 	if (nvmet_tcp_need_data_out(queue->snd_cmd))
486 		nvmet_setup_c2h_data_pdu(queue->snd_cmd);
487 	else if (nvmet_tcp_need_data_in(queue->snd_cmd))
488 		nvmet_setup_r2t_pdu(queue->snd_cmd);
489 	else
490 		nvmet_setup_response_pdu(queue->snd_cmd);
491 
492 	return queue->snd_cmd;
493 }
494 
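/*
 * ->queue_response fabrics callback: add the completed command to the
 * queue's lockless response list and kick io_work to transmit it.
 */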
495 static void nvmet_tcp_queue_response(struct nvmet_req *req)
496 {
497 	struct nvmet_tcp_cmd *cmd =
498 		container_of(req, struct nvmet_tcp_cmd, req);
499 	struct nvmet_tcp_queue	*queue = cmd->queue;
500 
501 	llist_add(&cmd->lentry, &queue->resp_list);
502 	queue_work_on(cmd->queue->cpu, nvmet_tcp_wq, &cmd->queue->io_work);
503 }
504 
505 static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd)
506 {
507 	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
508 	int left = sizeof(*cmd->data_pdu) - cmd->offset + hdgst;
509 	int ret;
510 
511 	ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->data_pdu),
512 			offset_in_page(cmd->data_pdu) + cmd->offset,
513 			left, MSG_DONTWAIT | MSG_MORE);
514 	if (ret <= 0)
515 		return ret;
516 
517 	cmd->offset += ret;
518 	left -= ret;
519 
520 	if (left)
521 		return -EAGAIN;
522 
523 	cmd->state = NVMET_TCP_SEND_DATA;
524 	cmd->offset  = 0;
525 	return 1;
526 }
527 
528 static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
529 {
530 	struct nvmet_tcp_queue *queue = cmd->queue;
531 	int ret;
532 
533 	while (cmd->cur_sg) {
534 		struct page *page = sg_page(cmd->cur_sg);
535 		u32 left = cmd->cur_sg->length - cmd->offset;
536 		int flags = MSG_DONTWAIT;
537 
538 		if ((!last_in_batch && cmd->queue->send_list_len) ||
539 		    cmd->wbytes_done + left < cmd->req.transfer_len ||
540 		    queue->data_digest || !queue->nvme_sq.sqhd_disabled)
541 			flags |= MSG_MORE;
542 
543 		ret = kernel_sendpage(cmd->queue->sock, page, cmd->offset,
544 					left, flags);
545 		if (ret <= 0)
546 			return ret;
547 
548 		cmd->offset += ret;
549 		cmd->wbytes_done += ret;
550 
551 		/* Done with sg? */
552 		if (cmd->offset == cmd->cur_sg->length) {
553 			cmd->cur_sg = sg_next(cmd->cur_sg);
554 			cmd->offset = 0;
555 		}
556 	}
557 
558 	if (queue->data_digest) {
559 		cmd->state = NVMET_TCP_SEND_DDGST;
560 		cmd->offset = 0;
561 	} else {
562 		if (queue->nvme_sq.sqhd_disabled) {
563 			cmd->queue->snd_cmd = NULL;
564 			nvmet_tcp_put_cmd(cmd);
565 		} else {
566 			nvmet_setup_response_pdu(cmd);
567 		}
568 	}
569 
570 	if (queue->nvme_sq.sqhd_disabled) {
571 		kfree(cmd->iov);
572 		sgl_free(cmd->req.sg);
573 	}
574 
575 	return 1;
576 
577 }
578 
579 static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd,
580 		bool last_in_batch)
581 {
582 	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
583 	int left = sizeof(*cmd->rsp_pdu) - cmd->offset + hdgst;
584 	int flags = MSG_DONTWAIT;
585 	int ret;
586 
587 	if (!last_in_batch && cmd->queue->send_list_len)
588 		flags |= MSG_MORE;
589 	else
590 		flags |= MSG_EOR;
591 
592 	ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->rsp_pdu),
593 		offset_in_page(cmd->rsp_pdu) + cmd->offset, left, flags);
594 	if (ret <= 0)
595 		return ret;
596 	cmd->offset += ret;
597 	left -= ret;
598 
599 	if (left)
600 		return -EAGAIN;
601 
602 	kfree(cmd->iov);
603 	sgl_free(cmd->req.sg);
604 	cmd->queue->snd_cmd = NULL;
605 	nvmet_tcp_put_cmd(cmd);
606 	return 1;
607 }
608 
609 static int nvmet_try_send_r2t(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
610 {
611 	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
612 	int left = sizeof(*cmd->r2t_pdu) - cmd->offset + hdgst;
613 	int flags = MSG_DONTWAIT;
614 	int ret;
615 
616 	if (!last_in_batch && cmd->queue->send_list_len)
617 		flags |= MSG_MORE;
618 	else
619 		flags |= MSG_EOR;
620 
621 	ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->r2t_pdu),
622 		offset_in_page(cmd->r2t_pdu) + cmd->offset, left, flags);
623 	if (ret <= 0)
624 		return ret;
625 	cmd->offset += ret;
626 	left -= ret;
627 
628 	if (left)
629 		return -EAGAIN;
630 
631 	cmd->queue->snd_cmd = NULL;
632 	return 1;
633 }
634 
635 static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
636 {
637 	struct nvmet_tcp_queue *queue = cmd->queue;
638 	struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
639 	struct kvec iov = {
640 		.iov_base = &cmd->exp_ddgst + cmd->offset,
641 		.iov_len = NVME_TCP_DIGEST_LENGTH - cmd->offset
642 	};
643 	int ret;
644 
645 	if (!last_in_batch && cmd->queue->send_list_len)
646 		msg.msg_flags |= MSG_MORE;
647 
648 	ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
649 	if (unlikely(ret <= 0))
650 		return ret;
651 
652 	cmd->offset += ret;
653 
654 	if (queue->nvme_sq.sqhd_disabled) {
655 		cmd->queue->snd_cmd = NULL;
656 		nvmet_tcp_put_cmd(cmd);
657 	} else {
658 		nvmet_setup_response_pdu(cmd);
659 	}
660 	return 1;
661 }
662 
663 static int nvmet_tcp_try_send_one(struct nvmet_tcp_queue *queue,
664 		bool last_in_batch)
665 {
666 	struct nvmet_tcp_cmd *cmd = queue->snd_cmd;
667 	int ret = 0;
668 
669 	if (!cmd || queue->state == NVMET_TCP_Q_DISCONNECTING) {
670 		cmd = nvmet_tcp_fetch_cmd(queue);
671 		if (unlikely(!cmd))
672 			return 0;
673 	}
674 
675 	if (cmd->state == NVMET_TCP_SEND_DATA_PDU) {
676 		ret = nvmet_try_send_data_pdu(cmd);
677 		if (ret <= 0)
678 			goto done_send;
679 	}
680 
681 	if (cmd->state == NVMET_TCP_SEND_DATA) {
682 		ret = nvmet_try_send_data(cmd, last_in_batch);
683 		if (ret <= 0)
684 			goto done_send;
685 	}
686 
687 	if (cmd->state == NVMET_TCP_SEND_DDGST) {
688 		ret = nvmet_try_send_ddgst(cmd, last_in_batch);
689 		if (ret <= 0)
690 			goto done_send;
691 	}
692 
693 	if (cmd->state == NVMET_TCP_SEND_R2T) {
694 		ret = nvmet_try_send_r2t(cmd, last_in_batch);
695 		if (ret <= 0)
696 			goto done_send;
697 	}
698 
699 	if (cmd->state == NVMET_TCP_SEND_RESPONSE)
700 		ret = nvmet_try_send_response(cmd, last_in_batch);
701 
702 done_send:
703 	if (ret < 0) {
704 		if (ret == -EAGAIN)
705 			return 0;
706 		return ret;
707 	}
708 
709 	return 1;
710 }
711 
712 static int nvmet_tcp_try_send(struct nvmet_tcp_queue *queue,
713 		int budget, int *sends)
714 {
715 	int i, ret = 0;
716 
717 	for (i = 0; i < budget; i++) {
718 		ret = nvmet_tcp_try_send_one(queue, i == budget - 1);
719 		if (ret <= 0)
720 			break;
721 		(*sends)++;
722 	}
723 
724 	return ret;
725 }
726 
727 static void nvmet_prepare_receive_pdu(struct nvmet_tcp_queue *queue)
728 {
729 	queue->offset = 0;
730 	queue->left = sizeof(struct nvme_tcp_hdr);
731 	queue->cmd = NULL;
732 	queue->rcv_state = NVMET_TCP_RECV_PDU;
733 }
734 
735 static void nvmet_tcp_free_crypto(struct nvmet_tcp_queue *queue)
736 {
737 	struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
738 
739 	ahash_request_free(queue->rcv_hash);
740 	ahash_request_free(queue->snd_hash);
741 	crypto_free_ahash(tfm);
742 }
743 
744 static int nvmet_tcp_alloc_crypto(struct nvmet_tcp_queue *queue)
745 {
746 	struct crypto_ahash *tfm;
747 
748 	tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
749 	if (IS_ERR(tfm))
750 		return PTR_ERR(tfm);
751 
752 	queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
753 	if (!queue->snd_hash)
754 		goto free_tfm;
755 	ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
756 
757 	queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
758 	if (!queue->rcv_hash)
759 		goto free_snd_hash;
760 	ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
761 
762 	return 0;
763 free_snd_hash:
764 	ahash_request_free(queue->snd_hash);
765 free_tfm:
766 	crypto_free_ahash(tfm);
767 	return -ENOMEM;
768 }
769 
770 
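/*
 * Handle the connection initialization request: validate the ICReq PDU,
 * negotiate header/data digests and answer with an ICResp before moving
 * the queue to the LIVE state.
 */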
771 static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue)
772 {
773 	struct nvme_tcp_icreq_pdu *icreq = &queue->pdu.icreq;
774 	struct nvme_tcp_icresp_pdu *icresp = &queue->pdu.icresp;
775 	struct msghdr msg = {};
776 	struct kvec iov;
777 	int ret;
778 
779 	if (le32_to_cpu(icreq->hdr.plen) != sizeof(struct nvme_tcp_icreq_pdu)) {
780 		pr_err("bad nvme-tcp pdu length (%d)\n",
781 			le32_to_cpu(icreq->hdr.plen));
782 		nvmet_tcp_fatal_error(queue);
		return -EPROTO;
783 	}
784 
785 	if (icreq->pfv != NVME_TCP_PFV_1_0) {
786 		pr_err("queue %d: bad pfv %d\n", queue->idx, icreq->pfv);
787 		return -EPROTO;
788 	}
789 
790 	if (icreq->hpda != 0) {
791 		pr_err("queue %d: unsupported hpda %d\n", queue->idx,
792 			icreq->hpda);
793 		return -EPROTO;
794 	}
795 
796 	queue->hdr_digest = !!(icreq->digest & NVME_TCP_HDR_DIGEST_ENABLE);
797 	queue->data_digest = !!(icreq->digest & NVME_TCP_DATA_DIGEST_ENABLE);
798 	if (queue->hdr_digest || queue->data_digest) {
799 		ret = nvmet_tcp_alloc_crypto(queue);
800 		if (ret)
801 			return ret;
802 	}
803 
804 	memset(icresp, 0, sizeof(*icresp));
805 	icresp->hdr.type = nvme_tcp_icresp;
806 	icresp->hdr.hlen = sizeof(*icresp);
807 	icresp->hdr.pdo = 0;
808 	icresp->hdr.plen = cpu_to_le32(icresp->hdr.hlen);
809 	icresp->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
810 	icresp->maxdata = cpu_to_le32(0x400000); /* 4M arbitrary limit */
811 	icresp->cpda = 0;
812 	if (queue->hdr_digest)
813 		icresp->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
814 	if (queue->data_digest)
815 		icresp->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
816 
817 	iov.iov_base = icresp;
818 	iov.iov_len = sizeof(*icresp);
819 	ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
820 	if (ret < 0)
821 		goto free_crypto;
822 
823 	queue->state = NVMET_TCP_Q_LIVE;
824 	nvmet_prepare_receive_pdu(queue);
825 	return 0;
826 free_crypto:
827 	if (queue->hdr_digest || queue->data_digest)
828 		nvmet_tcp_free_crypto(queue);
829 	return ret;
830 }
831 
832 static void nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue,
833 		struct nvmet_tcp_cmd *cmd, struct nvmet_req *req)
834 {
835 	size_t data_len = le32_to_cpu(req->cmd->common.dptr.sgl.length);
836 	int ret;
837 
838 	if (!nvme_is_write(cmd->req.cmd) ||
839 	    data_len > cmd->req.port->inline_data_size) {
840 		nvmet_prepare_receive_pdu(queue);
841 		return;
842 	}
843 
844 	ret = nvmet_tcp_map_data(cmd);
845 	if (unlikely(ret)) {
846 		pr_err("queue %d: failed to map data\n", queue->idx);
847 		nvmet_tcp_fatal_error(queue);
848 		return;
849 	}
850 
851 	queue->rcv_state = NVMET_TCP_RECV_DATA;
852 	nvmet_tcp_map_pdu_iovec(cmd);
853 	cmd->flags |= NVMET_TCP_F_INIT_FAILED;
854 }
855 
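/*
 * Handle an incoming H2C data PDU: look the command up by its transfer
 * tag, verify that the data offset matches what has been received so far
 * and switch the queue to the data receive state.
 */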
856 static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue)
857 {
858 	struct nvme_tcp_data_pdu *data = &queue->pdu.data;
859 	struct nvmet_tcp_cmd *cmd;
860 
861 	if (unlikely(data->ttag >= queue->nr_cmds)) {
		pr_err("queue %d: received out of bound ttag %u, nr_cmds %u\n",
			queue->idx, data->ttag, queue->nr_cmds);
		nvmet_tcp_fatal_error(queue);
		return -EPROTO;
	}
	cmd = &queue->cmds[data->ttag];
862 
863 	if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) {
864 		pr_err("ttag %u unexpected data offset %u (expected %u)\n",
865 			data->ttag, le32_to_cpu(data->data_offset),
866 			cmd->rbytes_done);
867 		/* FIXME: use path and transport errors */
868 		nvmet_req_complete(&cmd->req,
869 			NVME_SC_INVALID_FIELD | NVME_SC_DNR);
870 		return -EPROTO;
871 	}
872 
873 	cmd->pdu_len = le32_to_cpu(data->data_length);
874 	cmd->pdu_recv = 0;
875 	nvmet_tcp_map_pdu_iovec(cmd);
876 	queue->cmd = cmd;
877 	queue->rcv_state = NVMET_TCP_RECV_DATA;
878 
879 	return 0;
880 }
881 
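/*
 * A complete PDU header has been received and verified.  ICReq and H2C
 * data PDUs are dispatched directly; for command capsules a new nvmet
 * request is initialized and, depending on the command, inline data is
 * received, an R2T is queued, or the request is executed right away.
 */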
882 static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue)
883 {
884 	struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
885 	struct nvme_command *nvme_cmd = &queue->pdu.cmd.cmd;
886 	struct nvmet_req *req;
887 	int ret;
888 
889 	if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
890 		if (hdr->type != nvme_tcp_icreq) {
891 			pr_err("unexpected pdu type (%d) before icreq\n",
892 				hdr->type);
893 			nvmet_tcp_fatal_error(queue);
894 			return -EPROTO;
895 		}
896 		return nvmet_tcp_handle_icreq(queue);
897 	}
898 
899 	if (hdr->type == nvme_tcp_h2c_data) {
900 		ret = nvmet_tcp_handle_h2c_data_pdu(queue);
901 		if (unlikely(ret))
902 			return ret;
903 		return 0;
904 	}
905 
906 	queue->cmd = nvmet_tcp_get_cmd(queue);
907 	if (unlikely(!queue->cmd)) {
908 		/* This should never happen */
909 		pr_err("queue %d: out of commands (%d) send_list_len: %d, opcode: %d\n",
910 			queue->idx, queue->nr_cmds, queue->send_list_len,
911 			nvme_cmd->common.opcode);
912 		nvmet_tcp_fatal_error(queue);
913 		return -ENOMEM;
914 	}
915 
916 	req = &queue->cmd->req;
917 	memcpy(req->cmd, nvme_cmd, sizeof(*nvme_cmd));
918 
919 	if (unlikely(!nvmet_req_init(req, &queue->nvme_cq,
920 			&queue->nvme_sq, &nvmet_tcp_ops))) {
921 		pr_err("failed cmd %p id %d opcode %d, data_len: %d\n",
922 			req->cmd, req->cmd->common.command_id,
923 			req->cmd->common.opcode,
924 			le32_to_cpu(req->cmd->common.dptr.sgl.length));
925 
926 		nvmet_tcp_handle_req_failure(queue, queue->cmd, req);
927 		return -EAGAIN;
928 	}
929 
930 	ret = nvmet_tcp_map_data(queue->cmd);
931 	if (unlikely(ret)) {
932 		pr_err("queue %d: failed to map data\n", queue->idx);
933 		if (nvmet_tcp_has_inline_data(queue->cmd))
934 			nvmet_tcp_fatal_error(queue);
935 		else
936 			nvmet_req_complete(req, ret);
937 		ret = -EAGAIN;
938 		goto out;
939 	}
940 
941 	if (nvmet_tcp_need_data_in(queue->cmd)) {
942 		if (nvmet_tcp_has_inline_data(queue->cmd)) {
943 			queue->rcv_state = NVMET_TCP_RECV_DATA;
944 			nvmet_tcp_map_pdu_iovec(queue->cmd);
945 			return 0;
946 		}
947 		/* send back R2T */
948 		nvmet_tcp_queue_response(&queue->cmd->req);
949 		goto out;
950 	}
951 
952 	queue->cmd->req.execute(&queue->cmd->req);
953 out:
954 	nvmet_prepare_receive_pdu(queue);
955 	return ret;
956 }
957 
958 static const u8 nvme_tcp_pdu_sizes[] = {
959 	[nvme_tcp_icreq]	= sizeof(struct nvme_tcp_icreq_pdu),
960 	[nvme_tcp_cmd]		= sizeof(struct nvme_tcp_cmd_pdu),
961 	[nvme_tcp_h2c_data]	= sizeof(struct nvme_tcp_data_pdu),
962 };
963 
964 static inline u8 nvmet_tcp_pdu_size(u8 type)
965 {
966 	size_t idx = type;
967 
968 	return (idx < ARRAY_SIZE(nvme_tcp_pdu_sizes) &&
969 		nvme_tcp_pdu_sizes[idx]) ?
970 			nvme_tcp_pdu_sizes[idx] : 0;
971 }
972 
973 static inline bool nvmet_tcp_pdu_valid(u8 type)
974 {
975 	switch (type) {
976 	case nvme_tcp_icreq:
977 	case nvme_tcp_cmd:
978 	case nvme_tcp_h2c_data:
979 		/* fallthru */
980 		return true;
981 	}
982 
983 	return false;
984 }
985 
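/*
 * Receive a PDU header in two steps: first the common header, to learn
 * the PDU type and length, then the remainder including any header
 * digest.  The header digest is verified and the data digest flag is
 * sanity checked before the PDU is processed.
 */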
986 static int nvmet_tcp_try_recv_pdu(struct nvmet_tcp_queue *queue)
987 {
988 	struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
989 	int len;
990 	struct kvec iov;
991 	struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
992 
993 recv:
994 	iov.iov_base = (void *)&queue->pdu + queue->offset;
995 	iov.iov_len = queue->left;
996 	len = kernel_recvmsg(queue->sock, &msg, &iov, 1,
997 			iov.iov_len, msg.msg_flags);
998 	if (unlikely(len < 0))
999 		return len;
1000 
1001 	queue->offset += len;
1002 	queue->left -= len;
1003 	if (queue->left)
1004 		return -EAGAIN;
1005 
1006 	if (queue->offset == sizeof(struct nvme_tcp_hdr)) {
1007 		u8 hdgst = nvmet_tcp_hdgst_len(queue);
1008 
1009 		if (unlikely(!nvmet_tcp_pdu_valid(hdr->type))) {
1010 			pr_err("unexpected pdu type %d\n", hdr->type);
1011 			nvmet_tcp_fatal_error(queue);
1012 			return -EIO;
1013 		}
1014 
1015 		if (unlikely(hdr->hlen != nvmet_tcp_pdu_size(hdr->type))) {
1016 			pr_err("pdu %d bad hlen %d\n", hdr->type, hdr->hlen);
1017 			return -EIO;
1018 		}
1019 
1020 		queue->left = hdr->hlen - queue->offset + hdgst;
1021 		goto recv;
1022 	}
1023 
1024 	if (queue->hdr_digest &&
1025 	    nvmet_tcp_verify_hdgst(queue, &queue->pdu, hdr->hlen)) {
1026 		nvmet_tcp_fatal_error(queue); /* fatal */
1027 		return -EPROTO;
1028 	}
1029 
1030 	if (queue->data_digest &&
1031 	    nvmet_tcp_check_ddgst(queue, &queue->pdu)) {
1032 		nvmet_tcp_fatal_error(queue); /* fatal */
1033 		return -EPROTO;
1034 	}
1035 
1036 	return nvmet_tcp_done_recv_pdu(queue);
1037 }
1038 
1039 static void nvmet_tcp_prep_recv_ddgst(struct nvmet_tcp_cmd *cmd)
1040 {
1041 	struct nvmet_tcp_queue *queue = cmd->queue;
1042 
1043 	nvmet_tcp_ddgst(queue->rcv_hash, cmd);
1044 	queue->offset = 0;
1045 	queue->left = NVME_TCP_DIGEST_LENGTH;
1046 	queue->rcv_state = NVMET_TCP_RECV_DDGST;
1047 }
1048 
1049 static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue)
1050 {
1051 	struct nvmet_tcp_cmd  *cmd = queue->cmd;
1052 	int ret;
1053 
1054 	while (msg_data_left(&cmd->recv_msg)) {
1055 		ret = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg,
1056 			cmd->recv_msg.msg_flags);
1057 		if (ret <= 0)
1058 			return ret;
1059 
1060 		cmd->pdu_recv += ret;
1061 		cmd->rbytes_done += ret;
1062 	}
1063 
1064 	nvmet_tcp_unmap_pdu_iovec(cmd);
1065 
1066 	if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) &&
1067 	    cmd->rbytes_done == cmd->req.transfer_len) {
1068 		if (queue->data_digest) {
1069 			nvmet_tcp_prep_recv_ddgst(cmd);
1070 			return 0;
1071 		}
1072 		cmd->req.execute(&cmd->req);
1073 	}
1074 
1075 	nvmet_prepare_receive_pdu(queue);
1076 	return 0;
1077 }
1078 
1079 static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue)
1080 {
1081 	struct nvmet_tcp_cmd *cmd = queue->cmd;
1082 	int ret;
1083 	struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
1084 	struct kvec iov = {
1085 		.iov_base = (void *)&cmd->recv_ddgst + queue->offset,
1086 		.iov_len = queue->left
1087 	};
1088 
1089 	ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
1090 			iov.iov_len, msg.msg_flags);
1091 	if (unlikely(ret < 0))
1092 		return ret;
1093 
1094 	queue->offset += ret;
1095 	queue->left -= ret;
1096 	if (queue->left)
1097 		return -EAGAIN;
1098 
1099 	if (queue->data_digest && cmd->exp_ddgst != cmd->recv_ddgst) {
1100 		pr_err("queue %d: cmd %d pdu (%d) data digest error: recv %#x expected %#x\n",
1101 			queue->idx, cmd->req.cmd->common.command_id,
1102 			queue->pdu.cmd.hdr.type, le32_to_cpu(cmd->recv_ddgst),
1103 			le32_to_cpu(cmd->exp_ddgst));
1104 		nvmet_tcp_finish_cmd(cmd);
1105 		nvmet_tcp_fatal_error(queue);
1106 		ret = -EPROTO;
1107 		goto out;
1108 	}
1109 
1110 	if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) &&
1111 	    cmd->rbytes_done == cmd->req.transfer_len)
1112 		cmd->req.execute(&cmd->req);
1113 	ret = 0;
1114 out:
1115 	nvmet_prepare_receive_pdu(queue);
1116 	return ret;
1117 }
1118 
1119 static int nvmet_tcp_try_recv_one(struct nvmet_tcp_queue *queue)
1120 {
1121 	int result = 0;
1122 
1123 	if (unlikely(queue->rcv_state == NVMET_TCP_RECV_ERR))
1124 		return 0;
1125 
1126 	if (queue->rcv_state == NVMET_TCP_RECV_PDU) {
1127 		result = nvmet_tcp_try_recv_pdu(queue);
1128 		if (result != 0)
1129 			goto done_recv;
1130 	}
1131 
1132 	if (queue->rcv_state == NVMET_TCP_RECV_DATA) {
1133 		result = nvmet_tcp_try_recv_data(queue);
1134 		if (result != 0)
1135 			goto done_recv;
1136 	}
1137 
1138 	if (queue->rcv_state == NVMET_TCP_RECV_DDGST) {
1139 		result = nvmet_tcp_try_recv_ddgst(queue);
1140 		if (result != 0)
1141 			goto done_recv;
1142 	}
1143 
1144 done_recv:
1145 	if (result < 0) {
1146 		if (result == -EAGAIN)
1147 			return 0;
1148 		return result;
1149 	}
1150 	return 1;
1151 }
1152 
1153 static int nvmet_tcp_try_recv(struct nvmet_tcp_queue *queue,
1154 		int budget, int *recvs)
1155 {
1156 	int i, ret = 0;
1157 
1158 	for (i = 0; i < budget; i++) {
1159 		ret = nvmet_tcp_try_recv_one(queue);
1160 		if (ret <= 0)
1161 			break;
1162 		(*recvs)++;
1163 	}
1164 
1165 	return ret;
1166 }
1167 
1168 static void nvmet_tcp_schedule_release_queue(struct nvmet_tcp_queue *queue)
1169 {
1170 	spin_lock(&queue->state_lock);
1171 	if (queue->state != NVMET_TCP_Q_DISCONNECTING) {
1172 		queue->state = NVMET_TCP_Q_DISCONNECTING;
1173 		schedule_work(&queue->release_work);
1174 	}
1175 	spin_unlock(&queue->state_lock);
1176 }
1177 
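/*
 * Main per-queue worker: alternate between receive and send processing
 * until no more progress is made or the overall budget is exhausted, in
 * which case the work is requeued.  Socket-level failures tear the
 * connection down.
 */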
1178 static void nvmet_tcp_io_work(struct work_struct *w)
1179 {
1180 	struct nvmet_tcp_queue *queue =
1181 		container_of(w, struct nvmet_tcp_queue, io_work);
1182 	bool pending;
1183 	int ret, ops = 0;
1184 
1185 	do {
1186 		pending = false;
1187 
1188 		ret = nvmet_tcp_try_recv(queue, NVMET_TCP_RECV_BUDGET, &ops);
1189 		if (ret > 0) {
1190 			pending = true;
1191 		} else if (ret < 0) {
1192 			if (ret == -EPIPE || ret == -ECONNRESET)
1193 				kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1194 			else
1195 				nvmet_tcp_fatal_error(queue);
1196 			return;
1197 		}
1198 
1199 		ret = nvmet_tcp_try_send(queue, NVMET_TCP_SEND_BUDGET, &ops);
1200 		if (ret > 0) {
1201 			/* transmitted message/data */
1202 			pending = true;
1203 		} else if (ret < 0) {
1204 			if (ret == -EPIPE || ret == -ECONNRESET)
1205 				kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1206 			else
1207 				nvmet_tcp_fatal_error(queue);
1208 			return;
1209 		}
1210 
1211 	} while (pending && ops < NVMET_TCP_IO_WORK_BUDGET);
1212 
1213 	/*
1214 	 * We exhausted our budget, requeue ourselves
1215 	 */
1216 	if (pending)
1217 		queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
1218 }
1219 
1220 static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue,
1221 		struct nvmet_tcp_cmd *c)
1222 {
1223 	u8 hdgst = nvmet_tcp_hdgst_len(queue);
1224 
1225 	c->queue = queue;
1226 	c->req.port = queue->port->nport;
1227 
1228 	c->cmd_pdu = page_frag_alloc(&queue->pf_cache,
1229 			sizeof(*c->cmd_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1230 	if (!c->cmd_pdu)
1231 		return -ENOMEM;
1232 	c->req.cmd = &c->cmd_pdu->cmd;
1233 
1234 	c->rsp_pdu = page_frag_alloc(&queue->pf_cache,
1235 			sizeof(*c->rsp_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1236 	if (!c->rsp_pdu)
1237 		goto out_free_cmd;
1238 	c->req.cqe = &c->rsp_pdu->cqe;
1239 
1240 	c->data_pdu = page_frag_alloc(&queue->pf_cache,
1241 			sizeof(*c->data_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1242 	if (!c->data_pdu)
1243 		goto out_free_rsp;
1244 
1245 	c->r2t_pdu = page_frag_alloc(&queue->pf_cache,
1246 			sizeof(*c->r2t_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1247 	if (!c->r2t_pdu)
1248 		goto out_free_data;
1249 
1250 	c->recv_msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
1251 
1252 	list_add_tail(&c->entry, &queue->free_list);
1253 
1254 	return 0;
1255 out_free_data:
1256 	page_frag_free(c->data_pdu);
1257 out_free_rsp:
1258 	page_frag_free(c->rsp_pdu);
1259 out_free_cmd:
1260 	page_frag_free(c->cmd_pdu);
1261 	return -ENOMEM;
1262 }
1263 
1264 static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c)
1265 {
1266 	page_frag_free(c->r2t_pdu);
1267 	page_frag_free(c->data_pdu);
1268 	page_frag_free(c->rsp_pdu);
1269 	page_frag_free(c->cmd_pdu);
1270 }
1271 
1272 static int nvmet_tcp_alloc_cmds(struct nvmet_tcp_queue *queue)
1273 {
1274 	struct nvmet_tcp_cmd *cmds;
1275 	int i, ret = -EINVAL, nr_cmds = queue->nr_cmds;
1276 
1277 	cmds = kcalloc(nr_cmds, sizeof(struct nvmet_tcp_cmd), GFP_KERNEL);
1278 	if (!cmds)
1279 		goto out;
1280 
1281 	for (i = 0; i < nr_cmds; i++) {
1282 		ret = nvmet_tcp_alloc_cmd(queue, cmds + i);
1283 		if (ret)
1284 			goto out_free;
1285 	}
1286 
1287 	queue->cmds = cmds;
1288 
1289 	return 0;
1290 out_free:
1291 	while (--i >= 0)
1292 		nvmet_tcp_free_cmd(cmds + i);
1293 	kfree(cmds);
1294 out:
1295 	return ret;
1296 }
1297 
1298 static void nvmet_tcp_free_cmds(struct nvmet_tcp_queue *queue)
1299 {
1300 	struct nvmet_tcp_cmd *cmds = queue->cmds;
1301 	int i;
1302 
1303 	for (i = 0; i < queue->nr_cmds; i++)
1304 		nvmet_tcp_free_cmd(cmds + i);
1305 
1306 	nvmet_tcp_free_cmd(&queue->connect);
1307 	kfree(cmds);
1308 }
1309 
1310 static void nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue)
1311 {
1312 	struct socket *sock = queue->sock;
1313 
1314 	write_lock_bh(&sock->sk->sk_callback_lock);
1315 	sock->sk->sk_data_ready =  queue->data_ready;
1316 	sock->sk->sk_state_change = queue->state_change;
1317 	sock->sk->sk_write_space = queue->write_space;
1318 	sock->sk->sk_user_data = NULL;
1319 	write_unlock_bh(&sock->sk->sk_callback_lock);
1320 }
1321 
1322 static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd)
1323 {
1324 	nvmet_req_uninit(&cmd->req);
1325 	nvmet_tcp_unmap_pdu_iovec(cmd);
1326 	kfree(cmd->iov);
1327 	sgl_free(cmd->req.sg);
1328 }
1329 
1330 static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue)
1331 {
1332 	struct nvmet_tcp_cmd *cmd = queue->cmds;
1333 	int i;
1334 
1335 	for (i = 0; i < queue->nr_cmds; i++, cmd++) {
1336 		if (nvmet_tcp_need_data_in(cmd))
1337 			nvmet_tcp_finish_cmd(cmd);
1338 	}
1339 
1340 	if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect)) {
1341 		/* failed in connect */
1342 		nvmet_tcp_finish_cmd(&queue->connect);
1343 	}
1344 }
1345 
1346 static void nvmet_tcp_release_queue_work(struct work_struct *w)
1347 {
1348 	struct nvmet_tcp_queue *queue =
1349 		container_of(w, struct nvmet_tcp_queue, release_work);
1350 
1351 	mutex_lock(&nvmet_tcp_queue_mutex);
1352 	list_del_init(&queue->queue_list);
1353 	mutex_unlock(&nvmet_tcp_queue_mutex);
1354 
1355 	nvmet_tcp_restore_socket_callbacks(queue);
1356 	flush_work(&queue->io_work);
1357 
1358 	nvmet_tcp_uninit_data_in_cmds(queue);
1359 	nvmet_sq_destroy(&queue->nvme_sq);
1360 	cancel_work_sync(&queue->io_work);
1361 	sock_release(queue->sock);
1362 	nvmet_tcp_free_cmds(queue);
1363 	if (queue->hdr_digest || queue->data_digest)
1364 		nvmet_tcp_free_crypto(queue);
1365 	ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx);
1366 
1367 	kfree(queue);
1368 }
1369 
1370 static void nvmet_tcp_data_ready(struct sock *sk)
1371 {
1372 	struct nvmet_tcp_queue *queue;
1373 
1374 	read_lock_bh(&sk->sk_callback_lock);
1375 	queue = sk->sk_user_data;
1376 	if (likely(queue))
1377 		queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
1378 	read_unlock_bh(&sk->sk_callback_lock);
1379 }
1380 
1381 static void nvmet_tcp_write_space(struct sock *sk)
1382 {
1383 	struct nvmet_tcp_queue *queue;
1384 
1385 	read_lock_bh(&sk->sk_callback_lock);
1386 	queue = sk->sk_user_data;
1387 	if (unlikely(!queue))
1388 		goto out;
1389 
1390 	if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
1391 		queue->write_space(sk);
1392 		goto out;
1393 	}
1394 
1395 	if (sk_stream_is_writeable(sk)) {
1396 		clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1397 		queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
1398 	}
1399 out:
1400 	read_unlock_bh(&sk->sk_callback_lock);
1401 }
1402 
1403 static void nvmet_tcp_state_change(struct sock *sk)
1404 {
1405 	struct nvmet_tcp_queue *queue;
1406 
1407 	write_lock_bh(&sk->sk_callback_lock);
1408 	queue = sk->sk_user_data;
1409 	if (!queue)
1410 		goto done;
1411 
1412 	switch (sk->sk_state) {
1413 	case TCP_FIN_WAIT1:
1414 	case TCP_CLOSE_WAIT:
1415 	case TCP_CLOSE:
1416 		/* FALLTHRU */
1417 		sk->sk_user_data = NULL;
1418 		nvmet_tcp_schedule_release_queue(queue);
1419 		break;
1420 	default:
1421 		pr_warn("queue %d unhandled state %d\n",
1422 			queue->idx, sk->sk_state);
1423 	}
1424 done:
1425 	write_unlock_bh(&sk->sk_callback_lock);
1426 }
1427 
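/*
 * Finalize an accepted socket: record the local and peer addresses, set
 * SO_LINGER (and optionally SO_PRIORITY and IP_TOS) and install the
 * nvmet-tcp data_ready/state_change/write_space callbacks.
 */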
1428 static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
1429 {
1430 	struct socket *sock = queue->sock;
1431 	struct inet_sock *inet = inet_sk(sock->sk);
1432 	struct linger sol = { .l_onoff = 1, .l_linger = 0 };
1433 	int ret;
1434 
1435 	ret = kernel_getsockname(sock,
1436 		(struct sockaddr *)&queue->sockaddr);
1437 	if (ret < 0)
1438 		return ret;
1439 
1440 	ret = kernel_getpeername(sock,
1441 		(struct sockaddr *)&queue->sockaddr_peer);
1442 	if (ret < 0)
1443 		return ret;
1444 
1445 	/*
1446 	 * Cleanup whatever is sitting in the TCP transmit queue on socket
1447 	 * close. This is done to prevent stale data from being sent should
1448 	 * the network connection be restored before TCP times out.
1449 	 */
1450 	ret = kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER,
1451 			(char *)&sol, sizeof(sol));
1452 	if (ret)
1453 		return ret;
1454 
1455 	if (so_priority > 0) {
1456 		ret = kernel_setsockopt(sock, SOL_SOCKET, SO_PRIORITY,
1457 				(char *)&so_priority, sizeof(so_priority));
1458 		if (ret)
1459 			return ret;
1460 	}
1461 
1462 	/* Set socket type of service */
1463 	if (inet->rcv_tos > 0) {
1464 		int tos = inet->rcv_tos;
1465 
1466 		ret = kernel_setsockopt(sock, SOL_IP, IP_TOS,
1467 				(char *)&tos, sizeof(tos));
1468 		if (ret)
1469 			return ret;
1470 	}
1471 
1472 	write_lock_bh(&sock->sk->sk_callback_lock);
1473 	sock->sk->sk_user_data = queue;
1474 	queue->data_ready = sock->sk->sk_data_ready;
1475 	sock->sk->sk_data_ready = nvmet_tcp_data_ready;
1476 	queue->state_change = sock->sk->sk_state_change;
1477 	sock->sk->sk_state_change = nvmet_tcp_state_change;
1478 	queue->write_space = sock->sk->sk_write_space;
1479 	sock->sk->sk_write_space = nvmet_tcp_write_space;
1480 	write_unlock_bh(&sock->sk->sk_callback_lock);
1481 
1482 	return 0;
1483 }
1484 
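/*
 * Set up a queue for a freshly accepted socket: allocate the connect
 * command used before the queue size is negotiated, initialize the SQ,
 * pick a CPU round-robin for the io_work and arm the socket callbacks.
 */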
1485 static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
1486 		struct socket *newsock)
1487 {
1488 	struct nvmet_tcp_queue *queue;
1489 	int ret;
1490 
1491 	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
1492 	if (!queue)
1493 		return -ENOMEM;
1494 
1495 	INIT_WORK(&queue->release_work, nvmet_tcp_release_queue_work);
1496 	INIT_WORK(&queue->io_work, nvmet_tcp_io_work);
1497 	queue->sock = newsock;
1498 	queue->port = port;
1499 	queue->nr_cmds = 0;
1500 	spin_lock_init(&queue->state_lock);
1501 	queue->state = NVMET_TCP_Q_CONNECTING;
1502 	INIT_LIST_HEAD(&queue->free_list);
1503 	init_llist_head(&queue->resp_list);
1504 	INIT_LIST_HEAD(&queue->resp_send_list);
1505 
1506 	queue->idx = ida_simple_get(&nvmet_tcp_queue_ida, 0, 0, GFP_KERNEL);
1507 	if (queue->idx < 0) {
1508 		ret = queue->idx;
1509 		goto out_free_queue;
1510 	}
1511 
1512 	ret = nvmet_tcp_alloc_cmd(queue, &queue->connect);
1513 	if (ret)
1514 		goto out_ida_remove;
1515 
1516 	ret = nvmet_sq_init(&queue->nvme_sq);
1517 	if (ret)
1518 		goto out_free_connect;
1519 
1520 	port->last_cpu = cpumask_next_wrap(port->last_cpu,
1521 				cpu_online_mask, -1, false);
1522 	queue->cpu = port->last_cpu;
1523 	nvmet_prepare_receive_pdu(queue);
1524 
1525 	mutex_lock(&nvmet_tcp_queue_mutex);
1526 	list_add_tail(&queue->queue_list, &nvmet_tcp_queue_list);
1527 	mutex_unlock(&nvmet_tcp_queue_mutex);
1528 
1529 	ret = nvmet_tcp_set_queue_sock(queue);
1530 	if (ret)
1531 		goto out_destroy_sq;
1532 
1533 	queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
1534 
1535 	return 0;
1536 out_destroy_sq:
1537 	mutex_lock(&nvmet_tcp_queue_mutex);
1538 	list_del_init(&queue->queue_list);
1539 	mutex_unlock(&nvmet_tcp_queue_mutex);
1540 	nvmet_sq_destroy(&queue->nvme_sq);
1541 out_free_connect:
1542 	nvmet_tcp_free_cmd(&queue->connect);
1543 out_ida_remove:
1544 	ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx);
1545 out_free_queue:
1546 	kfree(queue);
1547 	return ret;
1548 }
1549 
1550 static void nvmet_tcp_accept_work(struct work_struct *w)
1551 {
1552 	struct nvmet_tcp_port *port =
1553 		container_of(w, struct nvmet_tcp_port, accept_work);
1554 	struct socket *newsock;
1555 	int ret;
1556 
1557 	while (true) {
1558 		ret = kernel_accept(port->sock, &newsock, O_NONBLOCK);
1559 		if (ret < 0) {
1560 			if (ret != -EAGAIN)
1561 				pr_warn("failed to accept err=%d\n", ret);
1562 			return;
1563 		}
1564 		ret = nvmet_tcp_alloc_queue(port, newsock);
1565 		if (ret) {
1566 			pr_err("failed to allocate queue\n");
1567 			sock_release(newsock);
1568 		}
1569 	}
1570 }
1571 
1572 static void nvmet_tcp_listen_data_ready(struct sock *sk)
1573 {
1574 	struct nvmet_tcp_port *port;
1575 
1576 	read_lock_bh(&sk->sk_callback_lock);
1577 	port = sk->sk_user_data;
1578 	if (!port)
1579 		goto out;
1580 
1581 	if (sk->sk_state == TCP_LISTEN)
1582 		schedule_work(&port->accept_work);
1583 out:
1584 	read_unlock_bh(&sk->sk_callback_lock);
1585 }
1586 
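/*
 * Create and bind the listening socket for a configured nvmet port and
 * hook its data_ready callback so new connections are picked up by the
 * accept work.
 */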
1587 static int nvmet_tcp_add_port(struct nvmet_port *nport)
1588 {
1589 	struct nvmet_tcp_port *port;
1590 	__kernel_sa_family_t af;
1591 	int opt, ret;
1592 
1593 	port = kzalloc(sizeof(*port), GFP_KERNEL);
1594 	if (!port)
1595 		return -ENOMEM;
1596 
1597 	switch (nport->disc_addr.adrfam) {
1598 	case NVMF_ADDR_FAMILY_IP4:
1599 		af = AF_INET;
1600 		break;
1601 	case NVMF_ADDR_FAMILY_IP6:
1602 		af = AF_INET6;
1603 		break;
1604 	default:
1605 		pr_err("address family %d not supported\n",
1606 				nport->disc_addr.adrfam);
1607 		ret = -EINVAL;
1608 		goto err_port;
1609 	}
1610 
1611 	ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr,
1612 			nport->disc_addr.trsvcid, &port->addr);
1613 	if (ret) {
1614 		pr_err("malformed ip/port passed: %s:%s\n",
1615 			nport->disc_addr.traddr, nport->disc_addr.trsvcid);
1616 		goto err_port;
1617 	}
1618 
1619 	port->nport = nport;
1620 	port->last_cpu = -1;
1621 	INIT_WORK(&port->accept_work, nvmet_tcp_accept_work);
1622 	if (port->nport->inline_data_size < 0)
1623 		port->nport->inline_data_size = NVMET_TCP_DEF_INLINE_DATA_SIZE;
1624 
1625 	ret = sock_create(port->addr.ss_family, SOCK_STREAM,
1626 				IPPROTO_TCP, &port->sock);
1627 	if (ret) {
1628 		pr_err("failed to create a socket\n");
1629 		goto err_port;
1630 	}
1631 
1632 	port->sock->sk->sk_user_data = port;
1633 	port->data_ready = port->sock->sk->sk_data_ready;
1634 	port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready;
1635 
1636 	opt = 1;
1637 	ret = kernel_setsockopt(port->sock, IPPROTO_TCP,
1638 			TCP_NODELAY, (char *)&opt, sizeof(opt));
1639 	if (ret) {
1640 		pr_err("failed to set TCP_NODELAY sock opt %d\n", ret);
1641 		goto err_sock;
1642 	}
1643 
1644 	ret = kernel_setsockopt(port->sock, SOL_SOCKET, SO_REUSEADDR,
1645 			(char *)&opt, sizeof(opt));
1646 	if (ret) {
1647 		pr_err("failed to set SO_REUSEADDR sock opt %d\n", ret);
1648 		goto err_sock;
1649 	}
1650 
1651 	if (so_priority > 0) {
1652 		ret = kernel_setsockopt(port->sock, SOL_SOCKET, SO_PRIORITY,
1653 				(char *)&so_priority, sizeof(so_priority));
1654 		if (ret) {
1655 			pr_err("failed to set SO_PRIORITY sock opt %d\n", ret);
1656 			goto err_sock;
1657 		}
1658 	}
1659 
1660 	ret = kernel_bind(port->sock, (struct sockaddr *)&port->addr,
1661 			sizeof(port->addr));
1662 	if (ret) {
1663 		pr_err("failed to bind port socket %d\n", ret);
1664 		goto err_sock;
1665 	}
1666 
1667 	ret = kernel_listen(port->sock, 128);
1668 	if (ret) {
1669 		pr_err("failed to listen %d on port sock\n", ret);
1670 		goto err_sock;
1671 	}
1672 
1673 	nport->priv = port;
1674 	pr_info("enabling port %d (%pISpc)\n",
1675 		le16_to_cpu(nport->disc_addr.portid), &port->addr);
1676 
1677 	return 0;
1678 
1679 err_sock:
1680 	sock_release(port->sock);
1681 err_port:
1682 	kfree(port);
1683 	return ret;
1684 }
1685 
1686 static void nvmet_tcp_remove_port(struct nvmet_port *nport)
1687 {
1688 	struct nvmet_tcp_port *port = nport->priv;
1689 
1690 	write_lock_bh(&port->sock->sk->sk_callback_lock);
1691 	port->sock->sk->sk_data_ready = port->data_ready;
1692 	port->sock->sk->sk_user_data = NULL;
1693 	write_unlock_bh(&port->sock->sk->sk_callback_lock);
1694 	cancel_work_sync(&port->accept_work);
1695 
1696 	sock_release(port->sock);
1697 	kfree(port);
1698 }
1699 
1700 static void nvmet_tcp_delete_ctrl(struct nvmet_ctrl *ctrl)
1701 {
1702 	struct nvmet_tcp_queue *queue;
1703 
1704 	mutex_lock(&nvmet_tcp_queue_mutex);
1705 	list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
1706 		if (queue->nvme_sq.ctrl == ctrl)
1707 			kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1708 	mutex_unlock(&nvmet_tcp_queue_mutex);
1709 }
1710 
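/*
 * Called once the host's connect command establishes the queue size:
 * allocate the per-queue command array, sized at twice the SQ depth.  For
 * the admin queue, let any in-flight controller teardown finish first.
 */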
1711 static u16 nvmet_tcp_install_queue(struct nvmet_sq *sq)
1712 {
1713 	struct nvmet_tcp_queue *queue =
1714 		container_of(sq, struct nvmet_tcp_queue, nvme_sq);
1715 
1716 	if (sq->qid == 0) {
1717 		/* Let inflight controller teardown complete */
1718 		flush_scheduled_work();
1719 	}
1720 
1721 	queue->nr_cmds = sq->size * 2;
1722 	if (nvmet_tcp_alloc_cmds(queue))
1723 		return NVME_SC_INTERNAL;
1724 	return 0;
1725 }
1726 
1727 static void nvmet_tcp_disc_port_addr(struct nvmet_req *req,
1728 		struct nvmet_port *nport, char *traddr)
1729 {
1730 	struct nvmet_tcp_port *port = nport->priv;
1731 
1732 	if (inet_addr_is_any((struct sockaddr *)&port->addr)) {
1733 		struct nvmet_tcp_cmd *cmd =
1734 			container_of(req, struct nvmet_tcp_cmd, req);
1735 		struct nvmet_tcp_queue *queue = cmd->queue;
1736 
1737 		sprintf(traddr, "%pISc", (struct sockaddr *)&queue->sockaddr);
1738 	} else {
1739 		memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE);
1740 	}
1741 }
1742 
1743 static struct nvmet_fabrics_ops nvmet_tcp_ops = {
1744 	.owner			= THIS_MODULE,
1745 	.type			= NVMF_TRTYPE_TCP,
1746 	.msdbd			= 1,
1747 	.has_keyed_sgls		= 0,
1748 	.add_port		= nvmet_tcp_add_port,
1749 	.remove_port		= nvmet_tcp_remove_port,
1750 	.queue_response		= nvmet_tcp_queue_response,
1751 	.delete_ctrl		= nvmet_tcp_delete_ctrl,
1752 	.install_queue		= nvmet_tcp_install_queue,
1753 	.disc_traddr		= nvmet_tcp_disc_port_addr,
1754 };
1755 
1756 static int __init nvmet_tcp_init(void)
1757 {
1758 	int ret;
1759 
1760 	nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq", WQ_HIGHPRI, 0);
1761 	if (!nvmet_tcp_wq)
1762 		return -ENOMEM;
1763 
1764 	ret = nvmet_register_transport(&nvmet_tcp_ops);
1765 	if (ret)
1766 		goto err;
1767 
1768 	return 0;
1769 err:
1770 	destroy_workqueue(nvmet_tcp_wq);
1771 	return ret;
1772 }
1773 
1774 static void __exit nvmet_tcp_exit(void)
1775 {
1776 	struct nvmet_tcp_queue *queue;
1777 
1778 	nvmet_unregister_transport(&nvmet_tcp_ops);
1779 
1780 	flush_scheduled_work();
1781 	mutex_lock(&nvmet_tcp_queue_mutex);
1782 	list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
1783 		kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1784 	mutex_unlock(&nvmet_tcp_queue_mutex);
1785 	flush_scheduled_work();
1786 
1787 	destroy_workqueue(nvmet_tcp_wq);
1788 }
1789 
1790 module_init(nvmet_tcp_init);
1791 module_exit(nvmet_tcp_exit);
1792 
1793 MODULE_LICENSE("GPL v2");
1794 MODULE_ALIAS("nvmet-transport-3"); /* 3 == NVMF_TRTYPE_TCP */
1795