xref: /openbmc/linux/net/ceph/messenger_v1.c (revision e9adcfec)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/ceph/ceph_debug.h>
3 
4 #include <linux/bvec.h>
5 #include <linux/crc32c.h>
6 #include <linux/net.h>
7 #include <linux/socket.h>
8 #include <net/sock.h>
9 
10 #include <linux/ceph/ceph_features.h>
11 #include <linux/ceph/decode.h>
12 #include <linux/ceph/libceph.h>
13 #include <linux/ceph/messenger.h>
14 
15 /* static tag bytes (protocol control messages) */
16 static char tag_msg = CEPH_MSGR_TAG_MSG;
17 static char tag_ack = CEPH_MSGR_TAG_ACK;
18 static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
19 static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2;
20 
21 /*
22  * If @buf is NULL, discard up to @len bytes.
23  */
24 static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
25 {
26 	struct kvec iov = {buf, len};
27 	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
28 	int r;
29 
30 	if (!buf)
31 		msg.msg_flags |= MSG_TRUNC;
32 
33 	iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, len);
34 	r = sock_recvmsg(sock, &msg, msg.msg_flags);
35 	if (r == -EAGAIN)
36 		r = 0;
37 	return r;
38 }
39 
40 static int ceph_tcp_recvpage(struct socket *sock, struct page *page,
41 		     int page_offset, size_t length)
42 {
43 	struct bio_vec bvec = {
44 		.bv_page = page,
45 		.bv_offset = page_offset,
46 		.bv_len = length
47 	};
48 	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
49 	int r;
50 
51 	BUG_ON(page_offset + length > PAGE_SIZE);
52 	iov_iter_bvec(&msg.msg_iter, ITER_DEST, &bvec, 1, length);
53 	r = sock_recvmsg(sock, &msg, msg.msg_flags);
54 	if (r == -EAGAIN)
55 		r = 0;
56 	return r;
57 }
58 
59 /*
60  * write something.  @more is true if caller will be sending more data
61  * shortly.
62  */
63 static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
64 			    size_t kvlen, size_t len, bool more)
65 {
66 	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
67 	int r;
68 
69 	if (more)
70 		msg.msg_flags |= MSG_MORE;
71 	else
72 		msg.msg_flags |= MSG_EOR;  /* superfluous, but what the hell */
73 
74 	r = kernel_sendmsg(sock, &msg, iov, kvlen, len);
75 	if (r == -EAGAIN)
76 		r = 0;
77 	return r;
78 }
79 
80 /*
81  * @more: either or both of MSG_MORE and MSG_SENDPAGE_NOTLAST
82  */
83 static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
84 			     int offset, size_t size, int more)
85 {
86 	ssize_t (*sendpage)(struct socket *sock, struct page *page,
87 			    int offset, size_t size, int flags);
88 	int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more;
89 	int ret;
90 
91 	/*
92 	 * sendpage cannot properly handle pages with page_count == 0,
93 	 * we need to fall back to sendmsg if that's the case.
94 	 *
95 	 * Same goes for slab pages: skb_can_coalesce() allows
96 	 * coalescing neighboring slab objects into a single frag which
97 	 * triggers one of hardened usercopy checks.
98 	 */
99 	if (sendpage_ok(page))
100 		sendpage = sock->ops->sendpage;
101 	else
102 		sendpage = sock_no_sendpage;
103 
104 	ret = sendpage(sock, page, offset, size, flags);
105 	if (ret == -EAGAIN)
106 		ret = 0;
107 
108 	return ret;
109 }
110 
111 static void con_out_kvec_reset(struct ceph_connection *con)
112 {
113 	BUG_ON(con->v1.out_skip);
114 
115 	con->v1.out_kvec_left = 0;
116 	con->v1.out_kvec_bytes = 0;
117 	con->v1.out_kvec_cur = &con->v1.out_kvec[0];
118 }
119 
120 static void con_out_kvec_add(struct ceph_connection *con,
121 				size_t size, void *data)
122 {
123 	int index = con->v1.out_kvec_left;
124 
125 	BUG_ON(con->v1.out_skip);
126 	BUG_ON(index >= ARRAY_SIZE(con->v1.out_kvec));
127 
128 	con->v1.out_kvec[index].iov_len = size;
129 	con->v1.out_kvec[index].iov_base = data;
130 	con->v1.out_kvec_left++;
131 	con->v1.out_kvec_bytes += size;
132 }
133 
134 /*
135  * Chop off a kvec from the end.  Return residual number of bytes for
136  * that kvec, i.e. how many bytes would have been written if the kvec
137  * hadn't been nuked.
138  */
139 static int con_out_kvec_skip(struct ceph_connection *con)
140 {
141 	int skip = 0;
142 
143 	if (con->v1.out_kvec_bytes > 0) {
144 		skip = con->v1.out_kvec_cur[con->v1.out_kvec_left - 1].iov_len;
145 		BUG_ON(con->v1.out_kvec_bytes < skip);
146 		BUG_ON(!con->v1.out_kvec_left);
147 		con->v1.out_kvec_bytes -= skip;
148 		con->v1.out_kvec_left--;
149 	}
150 
151 	return skip;
152 }
153 
154 static size_t sizeof_footer(struct ceph_connection *con)
155 {
156 	return (con->peer_features & CEPH_FEATURE_MSG_AUTH) ?
157 	    sizeof(struct ceph_msg_footer) :
158 	    sizeof(struct ceph_msg_footer_old);
159 }
160 
161 static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
162 {
163 	/* Initialize data cursor */
164 
165 	ceph_msg_data_cursor_init(&msg->cursor, msg, data_len);
166 }
167 
168 /*
169  * Prepare footer for currently outgoing message, and finish things
170  * off.  Assumes out_kvec* are already valid.. we just add on to the end.
171  */
172 static void prepare_write_message_footer(struct ceph_connection *con)
173 {
174 	struct ceph_msg *m = con->out_msg;
175 
176 	m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE;
177 
178 	dout("prepare_write_message_footer %p\n", con);
179 	con_out_kvec_add(con, sizeof_footer(con), &m->footer);
180 	if (con->peer_features & CEPH_FEATURE_MSG_AUTH) {
181 		if (con->ops->sign_message)
182 			con->ops->sign_message(m);
183 		else
184 			m->footer.sig = 0;
185 	} else {
186 		m->old_footer.flags = m->footer.flags;
187 	}
188 	con->v1.out_more = m->more_to_follow;
189 	con->v1.out_msg_done = true;
190 }
191 
192 /*
193  * Prepare headers for the next outgoing message.
194  */
195 static void prepare_write_message(struct ceph_connection *con)
196 {
197 	struct ceph_msg *m;
198 	u32 crc;
199 
200 	con_out_kvec_reset(con);
201 	con->v1.out_msg_done = false;
202 
203 	/* Sneak an ack in there first?  If we can get it into the same
204 	 * TCP packet that's a good thing. */
205 	if (con->in_seq > con->in_seq_acked) {
206 		con->in_seq_acked = con->in_seq;
207 		con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
208 		con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked);
209 		con_out_kvec_add(con, sizeof(con->v1.out_temp_ack),
210 			&con->v1.out_temp_ack);
211 	}
212 
213 	ceph_con_get_out_msg(con);
214 	m = con->out_msg;
215 
216 	dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n",
217 	     m, con->out_seq, le16_to_cpu(m->hdr.type),
218 	     le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
219 	     m->data_length);
220 	WARN_ON(m->front.iov_len != le32_to_cpu(m->hdr.front_len));
221 	WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len));
222 
223 	/* tag + hdr + front + middle */
224 	con_out_kvec_add(con, sizeof (tag_msg), &tag_msg);
225 	con_out_kvec_add(con, sizeof(con->v1.out_hdr), &con->v1.out_hdr);
226 	con_out_kvec_add(con, m->front.iov_len, m->front.iov_base);
227 
228 	if (m->middle)
229 		con_out_kvec_add(con, m->middle->vec.iov_len,
230 			m->middle->vec.iov_base);
231 
232 	/* fill in hdr crc and finalize hdr */
233 	crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc));
234 	con->out_msg->hdr.crc = cpu_to_le32(crc);
235 	memcpy(&con->v1.out_hdr, &con->out_msg->hdr, sizeof(con->v1.out_hdr));
236 
237 	/* fill in front and middle crc, footer */
238 	crc = crc32c(0, m->front.iov_base, m->front.iov_len);
239 	con->out_msg->footer.front_crc = cpu_to_le32(crc);
240 	if (m->middle) {
241 		crc = crc32c(0, m->middle->vec.iov_base,
242 				m->middle->vec.iov_len);
243 		con->out_msg->footer.middle_crc = cpu_to_le32(crc);
244 	} else
245 		con->out_msg->footer.middle_crc = 0;
246 	dout("%s front_crc %u middle_crc %u\n", __func__,
247 	     le32_to_cpu(con->out_msg->footer.front_crc),
248 	     le32_to_cpu(con->out_msg->footer.middle_crc));
249 	con->out_msg->footer.flags = 0;
250 
251 	/* is there a data payload? */
252 	con->out_msg->footer.data_crc = 0;
253 	if (m->data_length) {
254 		prepare_message_data(con->out_msg, m->data_length);
255 		con->v1.out_more = 1;  /* data + footer will follow */
256 	} else {
257 		/* no, queue up footer too and be done */
258 		prepare_write_message_footer(con);
259 	}
260 
261 	ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
262 }
263 
264 /*
265  * Prepare an ack.
266  */
267 static void prepare_write_ack(struct ceph_connection *con)
268 {
269 	dout("prepare_write_ack %p %llu -> %llu\n", con,
270 	     con->in_seq_acked, con->in_seq);
271 	con->in_seq_acked = con->in_seq;
272 
273 	con_out_kvec_reset(con);
274 
275 	con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
276 
277 	con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked);
278 	con_out_kvec_add(con, sizeof(con->v1.out_temp_ack),
279 			 &con->v1.out_temp_ack);
280 
281 	con->v1.out_more = 1;  /* more will follow.. eventually.. */
282 	ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
283 }
284 
285 /*
286  * Prepare to share the seq during handshake
287  */
288 static void prepare_write_seq(struct ceph_connection *con)
289 {
290 	dout("prepare_write_seq %p %llu -> %llu\n", con,
291 	     con->in_seq_acked, con->in_seq);
292 	con->in_seq_acked = con->in_seq;
293 
294 	con_out_kvec_reset(con);
295 
296 	con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked);
297 	con_out_kvec_add(con, sizeof(con->v1.out_temp_ack),
298 			 &con->v1.out_temp_ack);
299 
300 	ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
301 }
302 
303 /*
304  * Prepare to write keepalive byte.
305  */
306 static void prepare_write_keepalive(struct ceph_connection *con)
307 {
308 	dout("prepare_write_keepalive %p\n", con);
309 	con_out_kvec_reset(con);
310 	if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) {
311 		struct timespec64 now;
312 
313 		ktime_get_real_ts64(&now);
314 		con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2);
315 		ceph_encode_timespec64(&con->v1.out_temp_keepalive2, &now);
316 		con_out_kvec_add(con, sizeof(con->v1.out_temp_keepalive2),
317 				 &con->v1.out_temp_keepalive2);
318 	} else {
319 		con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive);
320 	}
321 	ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
322 }
323 
324 /*
325  * Connection negotiation.
326  */
327 
328 static int get_connect_authorizer(struct ceph_connection *con)
329 {
330 	struct ceph_auth_handshake *auth;
331 	int auth_proto;
332 
333 	if (!con->ops->get_authorizer) {
334 		con->v1.auth = NULL;
335 		con->v1.out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN;
336 		con->v1.out_connect.authorizer_len = 0;
337 		return 0;
338 	}
339 
340 	auth = con->ops->get_authorizer(con, &auth_proto, con->v1.auth_retry);
341 	if (IS_ERR(auth))
342 		return PTR_ERR(auth);
343 
344 	con->v1.auth = auth;
345 	con->v1.out_connect.authorizer_protocol = cpu_to_le32(auth_proto);
346 	con->v1.out_connect.authorizer_len =
347 		cpu_to_le32(auth->authorizer_buf_len);
348 	return 0;
349 }
350 
351 /*
352  * We connected to a peer and are saying hello.
353  */
354 static void prepare_write_banner(struct ceph_connection *con)
355 {
356 	con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER);
357 	con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr),
358 					&con->msgr->my_enc_addr);
359 
360 	con->v1.out_more = 0;
361 	ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
362 }
363 
364 static void __prepare_write_connect(struct ceph_connection *con)
365 {
366 	con_out_kvec_add(con, sizeof(con->v1.out_connect),
367 			 &con->v1.out_connect);
368 	if (con->v1.auth)
369 		con_out_kvec_add(con, con->v1.auth->authorizer_buf_len,
370 				 con->v1.auth->authorizer_buf);
371 
372 	con->v1.out_more = 0;
373 	ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
374 }
375 
376 static int prepare_write_connect(struct ceph_connection *con)
377 {
378 	unsigned int global_seq = ceph_get_global_seq(con->msgr, 0);
379 	int proto;
380 	int ret;
381 
382 	switch (con->peer_name.type) {
383 	case CEPH_ENTITY_TYPE_MON:
384 		proto = CEPH_MONC_PROTOCOL;
385 		break;
386 	case CEPH_ENTITY_TYPE_OSD:
387 		proto = CEPH_OSDC_PROTOCOL;
388 		break;
389 	case CEPH_ENTITY_TYPE_MDS:
390 		proto = CEPH_MDSC_PROTOCOL;
391 		break;
392 	default:
393 		BUG();
394 	}
395 
396 	dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
397 	     con->v1.connect_seq, global_seq, proto);
398 
399 	con->v1.out_connect.features =
400 		cpu_to_le64(from_msgr(con->msgr)->supported_features);
401 	con->v1.out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
402 	con->v1.out_connect.connect_seq = cpu_to_le32(con->v1.connect_seq);
403 	con->v1.out_connect.global_seq = cpu_to_le32(global_seq);
404 	con->v1.out_connect.protocol_version = cpu_to_le32(proto);
405 	con->v1.out_connect.flags = 0;
406 
407 	ret = get_connect_authorizer(con);
408 	if (ret)
409 		return ret;
410 
411 	__prepare_write_connect(con);
412 	return 0;
413 }
414 
415 /*
416  * write as much of pending kvecs to the socket as we can.
417  *  1 -> done
418  *  0 -> socket full, but more to do
419  * <0 -> error
420  */
421 static int write_partial_kvec(struct ceph_connection *con)
422 {
423 	int ret;
424 
425 	dout("write_partial_kvec %p %d left\n", con, con->v1.out_kvec_bytes);
426 	while (con->v1.out_kvec_bytes > 0) {
427 		ret = ceph_tcp_sendmsg(con->sock, con->v1.out_kvec_cur,
428 				       con->v1.out_kvec_left,
429 				       con->v1.out_kvec_bytes,
430 				       con->v1.out_more);
431 		if (ret <= 0)
432 			goto out;
433 		con->v1.out_kvec_bytes -= ret;
434 		if (!con->v1.out_kvec_bytes)
435 			break;            /* done */
436 
437 		/* account for full iov entries consumed */
438 		while (ret >= con->v1.out_kvec_cur->iov_len) {
439 			BUG_ON(!con->v1.out_kvec_left);
440 			ret -= con->v1.out_kvec_cur->iov_len;
441 			con->v1.out_kvec_cur++;
442 			con->v1.out_kvec_left--;
443 		}
444 		/* and for a partially-consumed entry */
445 		if (ret) {
446 			con->v1.out_kvec_cur->iov_len -= ret;
447 			con->v1.out_kvec_cur->iov_base += ret;
448 		}
449 	}
450 	con->v1.out_kvec_left = 0;
451 	ret = 1;
452 out:
453 	dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
454 	     con->v1.out_kvec_bytes, con->v1.out_kvec_left, ret);
455 	return ret;  /* done! */
456 }
457 
458 /*
459  * Write as much message data payload as we can.  If we finish, queue
460  * up the footer.
461  *  1 -> done, footer is now queued in out_kvec[].
462  *  0 -> socket full, but more to do
463  * <0 -> error
464  */
465 static int write_partial_message_data(struct ceph_connection *con)
466 {
467 	struct ceph_msg *msg = con->out_msg;
468 	struct ceph_msg_data_cursor *cursor = &msg->cursor;
469 	bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
470 	int more = MSG_MORE | MSG_SENDPAGE_NOTLAST;
471 	u32 crc;
472 
473 	dout("%s %p msg %p\n", __func__, con, msg);
474 
475 	if (!msg->num_data_items)
476 		return -EINVAL;
477 
478 	/*
479 	 * Iterate through each page that contains data to be
480 	 * written, and send as much as possible for each.
481 	 *
482 	 * If we are calculating the data crc (the default), we will
483 	 * need to map the page.  If we have no pages, they have
484 	 * been revoked, so use the zero page.
485 	 */
486 	crc = do_datacrc ? le32_to_cpu(msg->footer.data_crc) : 0;
487 	while (cursor->total_resid) {
488 		struct page *page;
489 		size_t page_offset;
490 		size_t length;
491 		int ret;
492 
493 		if (!cursor->resid) {
494 			ceph_msg_data_advance(cursor, 0);
495 			continue;
496 		}
497 
498 		page = ceph_msg_data_next(cursor, &page_offset, &length);
499 		if (length == cursor->total_resid)
500 			more = MSG_MORE;
501 		ret = ceph_tcp_sendpage(con->sock, page, page_offset, length,
502 					more);
503 		if (ret <= 0) {
504 			if (do_datacrc)
505 				msg->footer.data_crc = cpu_to_le32(crc);
506 
507 			return ret;
508 		}
509 		if (do_datacrc && cursor->need_crc)
510 			crc = ceph_crc32c_page(crc, page, page_offset, length);
511 		ceph_msg_data_advance(cursor, (size_t)ret);
512 	}
513 
514 	dout("%s %p msg %p done\n", __func__, con, msg);
515 
516 	/* prepare and queue up footer, too */
517 	if (do_datacrc)
518 		msg->footer.data_crc = cpu_to_le32(crc);
519 	else
520 		msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
521 	con_out_kvec_reset(con);
522 	prepare_write_message_footer(con);
523 
524 	return 1;	/* must return > 0 to indicate success */
525 }
526 
527 /*
528  * write some zeros
529  */
530 static int write_partial_skip(struct ceph_connection *con)
531 {
532 	int more = MSG_MORE | MSG_SENDPAGE_NOTLAST;
533 	int ret;
534 
535 	dout("%s %p %d left\n", __func__, con, con->v1.out_skip);
536 	while (con->v1.out_skip > 0) {
537 		size_t size = min(con->v1.out_skip, (int)PAGE_SIZE);
538 
539 		if (size == con->v1.out_skip)
540 			more = MSG_MORE;
541 		ret = ceph_tcp_sendpage(con->sock, ceph_zero_page, 0, size,
542 					more);
543 		if (ret <= 0)
544 			goto out;
545 		con->v1.out_skip -= ret;
546 	}
547 	ret = 1;
548 out:
549 	return ret;
550 }
551 
552 /*
553  * Prepare to read connection handshake, or an ack.
554  */
555 static void prepare_read_banner(struct ceph_connection *con)
556 {
557 	dout("prepare_read_banner %p\n", con);
558 	con->v1.in_base_pos = 0;
559 }
560 
561 static void prepare_read_connect(struct ceph_connection *con)
562 {
563 	dout("prepare_read_connect %p\n", con);
564 	con->v1.in_base_pos = 0;
565 }
566 
567 static void prepare_read_ack(struct ceph_connection *con)
568 {
569 	dout("prepare_read_ack %p\n", con);
570 	con->v1.in_base_pos = 0;
571 }
572 
573 static void prepare_read_seq(struct ceph_connection *con)
574 {
575 	dout("prepare_read_seq %p\n", con);
576 	con->v1.in_base_pos = 0;
577 	con->v1.in_tag = CEPH_MSGR_TAG_SEQ;
578 }
579 
580 static void prepare_read_tag(struct ceph_connection *con)
581 {
582 	dout("prepare_read_tag %p\n", con);
583 	con->v1.in_base_pos = 0;
584 	con->v1.in_tag = CEPH_MSGR_TAG_READY;
585 }
586 
587 static void prepare_read_keepalive_ack(struct ceph_connection *con)
588 {
589 	dout("prepare_read_keepalive_ack %p\n", con);
590 	con->v1.in_base_pos = 0;
591 }
592 
593 /*
594  * Prepare to read a message.
595  */
596 static int prepare_read_message(struct ceph_connection *con)
597 {
598 	dout("prepare_read_message %p\n", con);
599 	BUG_ON(con->in_msg != NULL);
600 	con->v1.in_base_pos = 0;
601 	con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
602 	return 0;
603 }
604 
605 static int read_partial(struct ceph_connection *con,
606 			int end, int size, void *object)
607 {
608 	while (con->v1.in_base_pos < end) {
609 		int left = end - con->v1.in_base_pos;
610 		int have = size - left;
611 		int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
612 		if (ret <= 0)
613 			return ret;
614 		con->v1.in_base_pos += ret;
615 	}
616 	return 1;
617 }
618 
619 /*
620  * Read all or part of the connect-side handshake on a new connection
621  */
622 static int read_partial_banner(struct ceph_connection *con)
623 {
624 	int size;
625 	int end;
626 	int ret;
627 
628 	dout("read_partial_banner %p at %d\n", con, con->v1.in_base_pos);
629 
630 	/* peer's banner */
631 	size = strlen(CEPH_BANNER);
632 	end = size;
633 	ret = read_partial(con, end, size, con->v1.in_banner);
634 	if (ret <= 0)
635 		goto out;
636 
637 	size = sizeof(con->v1.actual_peer_addr);
638 	end += size;
639 	ret = read_partial(con, end, size, &con->v1.actual_peer_addr);
640 	if (ret <= 0)
641 		goto out;
642 	ceph_decode_banner_addr(&con->v1.actual_peer_addr);
643 
644 	size = sizeof(con->v1.peer_addr_for_me);
645 	end += size;
646 	ret = read_partial(con, end, size, &con->v1.peer_addr_for_me);
647 	if (ret <= 0)
648 		goto out;
649 	ceph_decode_banner_addr(&con->v1.peer_addr_for_me);
650 
651 out:
652 	return ret;
653 }
654 
655 static int read_partial_connect(struct ceph_connection *con)
656 {
657 	int size;
658 	int end;
659 	int ret;
660 
661 	dout("read_partial_connect %p at %d\n", con, con->v1.in_base_pos);
662 
663 	size = sizeof(con->v1.in_reply);
664 	end = size;
665 	ret = read_partial(con, end, size, &con->v1.in_reply);
666 	if (ret <= 0)
667 		goto out;
668 
669 	if (con->v1.auth) {
670 		size = le32_to_cpu(con->v1.in_reply.authorizer_len);
671 		if (size > con->v1.auth->authorizer_reply_buf_len) {
672 			pr_err("authorizer reply too big: %d > %zu\n", size,
673 			       con->v1.auth->authorizer_reply_buf_len);
674 			ret = -EINVAL;
675 			goto out;
676 		}
677 
678 		end += size;
679 		ret = read_partial(con, end, size,
680 				   con->v1.auth->authorizer_reply_buf);
681 		if (ret <= 0)
682 			goto out;
683 	}
684 
685 	dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
686 	     con, con->v1.in_reply.tag,
687 	     le32_to_cpu(con->v1.in_reply.connect_seq),
688 	     le32_to_cpu(con->v1.in_reply.global_seq));
689 out:
690 	return ret;
691 }
692 
693 /*
694  * Verify the hello banner looks okay.
695  */
696 static int verify_hello(struct ceph_connection *con)
697 {
698 	if (memcmp(con->v1.in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
699 		pr_err("connect to %s got bad banner\n",
700 		       ceph_pr_addr(&con->peer_addr));
701 		con->error_msg = "protocol error, bad banner";
702 		return -1;
703 	}
704 	return 0;
705 }
706 
707 static int process_banner(struct ceph_connection *con)
708 {
709 	struct ceph_entity_addr *my_addr = &con->msgr->inst.addr;
710 
711 	dout("process_banner on %p\n", con);
712 
713 	if (verify_hello(con) < 0)
714 		return -1;
715 
716 	/*
717 	 * Make sure the other end is who we wanted.  note that the other
718 	 * end may not yet know their ip address, so if it's 0.0.0.0, give
719 	 * them the benefit of the doubt.
720 	 */
721 	if (memcmp(&con->peer_addr, &con->v1.actual_peer_addr,
722 		   sizeof(con->peer_addr)) != 0 &&
723 	    !(ceph_addr_is_blank(&con->v1.actual_peer_addr) &&
724 	      con->v1.actual_peer_addr.nonce == con->peer_addr.nonce)) {
725 		pr_warn("wrong peer, want %s/%u, got %s/%u\n",
726 			ceph_pr_addr(&con->peer_addr),
727 			le32_to_cpu(con->peer_addr.nonce),
728 			ceph_pr_addr(&con->v1.actual_peer_addr),
729 			le32_to_cpu(con->v1.actual_peer_addr.nonce));
730 		con->error_msg = "wrong peer at address";
731 		return -1;
732 	}
733 
734 	/*
735 	 * did we learn our address?
736 	 */
737 	if (ceph_addr_is_blank(my_addr)) {
738 		memcpy(&my_addr->in_addr,
739 		       &con->v1.peer_addr_for_me.in_addr,
740 		       sizeof(con->v1.peer_addr_for_me.in_addr));
741 		ceph_addr_set_port(my_addr, 0);
742 		ceph_encode_my_addr(con->msgr);
743 		dout("process_banner learned my addr is %s\n",
744 		     ceph_pr_addr(my_addr));
745 	}
746 
747 	return 0;
748 }
749 
750 static int process_connect(struct ceph_connection *con)
751 {
752 	u64 sup_feat = from_msgr(con->msgr)->supported_features;
753 	u64 req_feat = from_msgr(con->msgr)->required_features;
754 	u64 server_feat = le64_to_cpu(con->v1.in_reply.features);
755 	int ret;
756 
757 	dout("process_connect on %p tag %d\n", con, con->v1.in_tag);
758 
759 	if (con->v1.auth) {
760 		int len = le32_to_cpu(con->v1.in_reply.authorizer_len);
761 
762 		/*
763 		 * Any connection that defines ->get_authorizer()
764 		 * should also define ->add_authorizer_challenge() and
765 		 * ->verify_authorizer_reply().
766 		 *
767 		 * See get_connect_authorizer().
768 		 */
769 		if (con->v1.in_reply.tag ==
770 				CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) {
771 			ret = con->ops->add_authorizer_challenge(
772 				con, con->v1.auth->authorizer_reply_buf, len);
773 			if (ret < 0)
774 				return ret;
775 
776 			con_out_kvec_reset(con);
777 			__prepare_write_connect(con);
778 			prepare_read_connect(con);
779 			return 0;
780 		}
781 
782 		if (len) {
783 			ret = con->ops->verify_authorizer_reply(con);
784 			if (ret < 0) {
785 				con->error_msg = "bad authorize reply";
786 				return ret;
787 			}
788 		}
789 	}
790 
791 	switch (con->v1.in_reply.tag) {
792 	case CEPH_MSGR_TAG_FEATURES:
793 		pr_err("%s%lld %s feature set mismatch,"
794 		       " my %llx < server's %llx, missing %llx\n",
795 		       ENTITY_NAME(con->peer_name),
796 		       ceph_pr_addr(&con->peer_addr),
797 		       sup_feat, server_feat, server_feat & ~sup_feat);
798 		con->error_msg = "missing required protocol features";
799 		return -1;
800 
801 	case CEPH_MSGR_TAG_BADPROTOVER:
802 		pr_err("%s%lld %s protocol version mismatch,"
803 		       " my %d != server's %d\n",
804 		       ENTITY_NAME(con->peer_name),
805 		       ceph_pr_addr(&con->peer_addr),
806 		       le32_to_cpu(con->v1.out_connect.protocol_version),
807 		       le32_to_cpu(con->v1.in_reply.protocol_version));
808 		con->error_msg = "protocol version mismatch";
809 		return -1;
810 
811 	case CEPH_MSGR_TAG_BADAUTHORIZER:
812 		con->v1.auth_retry++;
813 		dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
814 		     con->v1.auth_retry);
815 		if (con->v1.auth_retry == 2) {
816 			con->error_msg = "connect authorization failure";
817 			return -1;
818 		}
819 		con_out_kvec_reset(con);
820 		ret = prepare_write_connect(con);
821 		if (ret < 0)
822 			return ret;
823 		prepare_read_connect(con);
824 		break;
825 
826 	case CEPH_MSGR_TAG_RESETSESSION:
827 		/*
828 		 * If we connected with a large connect_seq but the peer
829 		 * has no record of a session with us (no connection, or
830 		 * connect_seq == 0), they will send RESETSESION to indicate
831 		 * that they must have reset their session, and may have
832 		 * dropped messages.
833 		 */
834 		dout("process_connect got RESET peer seq %u\n",
835 		     le32_to_cpu(con->v1.in_reply.connect_seq));
836 		pr_info("%s%lld %s session reset\n",
837 			ENTITY_NAME(con->peer_name),
838 			ceph_pr_addr(&con->peer_addr));
839 		ceph_con_reset_session(con);
840 		con_out_kvec_reset(con);
841 		ret = prepare_write_connect(con);
842 		if (ret < 0)
843 			return ret;
844 		prepare_read_connect(con);
845 
846 		/* Tell ceph about it. */
847 		mutex_unlock(&con->mutex);
848 		if (con->ops->peer_reset)
849 			con->ops->peer_reset(con);
850 		mutex_lock(&con->mutex);
851 		if (con->state != CEPH_CON_S_V1_CONNECT_MSG)
852 			return -EAGAIN;
853 		break;
854 
855 	case CEPH_MSGR_TAG_RETRY_SESSION:
856 		/*
857 		 * If we sent a smaller connect_seq than the peer has, try
858 		 * again with a larger value.
859 		 */
860 		dout("process_connect got RETRY_SESSION my seq %u, peer %u\n",
861 		     le32_to_cpu(con->v1.out_connect.connect_seq),
862 		     le32_to_cpu(con->v1.in_reply.connect_seq));
863 		con->v1.connect_seq = le32_to_cpu(con->v1.in_reply.connect_seq);
864 		con_out_kvec_reset(con);
865 		ret = prepare_write_connect(con);
866 		if (ret < 0)
867 			return ret;
868 		prepare_read_connect(con);
869 		break;
870 
871 	case CEPH_MSGR_TAG_RETRY_GLOBAL:
872 		/*
873 		 * If we sent a smaller global_seq than the peer has, try
874 		 * again with a larger value.
875 		 */
876 		dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
877 		     con->v1.peer_global_seq,
878 		     le32_to_cpu(con->v1.in_reply.global_seq));
879 		ceph_get_global_seq(con->msgr,
880 				    le32_to_cpu(con->v1.in_reply.global_seq));
881 		con_out_kvec_reset(con);
882 		ret = prepare_write_connect(con);
883 		if (ret < 0)
884 			return ret;
885 		prepare_read_connect(con);
886 		break;
887 
888 	case CEPH_MSGR_TAG_SEQ:
889 	case CEPH_MSGR_TAG_READY:
890 		if (req_feat & ~server_feat) {
891 			pr_err("%s%lld %s protocol feature mismatch,"
892 			       " my required %llx > server's %llx, need %llx\n",
893 			       ENTITY_NAME(con->peer_name),
894 			       ceph_pr_addr(&con->peer_addr),
895 			       req_feat, server_feat, req_feat & ~server_feat);
896 			con->error_msg = "missing required protocol features";
897 			return -1;
898 		}
899 
900 		WARN_ON(con->state != CEPH_CON_S_V1_CONNECT_MSG);
901 		con->state = CEPH_CON_S_OPEN;
902 		con->v1.auth_retry = 0;    /* we authenticated; clear flag */
903 		con->v1.peer_global_seq =
904 			le32_to_cpu(con->v1.in_reply.global_seq);
905 		con->v1.connect_seq++;
906 		con->peer_features = server_feat;
907 		dout("process_connect got READY gseq %d cseq %d (%d)\n",
908 		     con->v1.peer_global_seq,
909 		     le32_to_cpu(con->v1.in_reply.connect_seq),
910 		     con->v1.connect_seq);
911 		WARN_ON(con->v1.connect_seq !=
912 			le32_to_cpu(con->v1.in_reply.connect_seq));
913 
914 		if (con->v1.in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
915 			ceph_con_flag_set(con, CEPH_CON_F_LOSSYTX);
916 
917 		con->delay = 0;      /* reset backoff memory */
918 
919 		if (con->v1.in_reply.tag == CEPH_MSGR_TAG_SEQ) {
920 			prepare_write_seq(con);
921 			prepare_read_seq(con);
922 		} else {
923 			prepare_read_tag(con);
924 		}
925 		break;
926 
927 	case CEPH_MSGR_TAG_WAIT:
928 		/*
929 		 * If there is a connection race (we are opening
930 		 * connections to each other), one of us may just have
931 		 * to WAIT.  This shouldn't happen if we are the
932 		 * client.
933 		 */
934 		con->error_msg = "protocol error, got WAIT as client";
935 		return -1;
936 
937 	default:
938 		con->error_msg = "protocol error, garbage tag during connect";
939 		return -1;
940 	}
941 	return 0;
942 }
943 
944 /*
945  * read (part of) an ack
946  */
947 static int read_partial_ack(struct ceph_connection *con)
948 {
949 	int size = sizeof(con->v1.in_temp_ack);
950 	int end = size;
951 
952 	return read_partial(con, end, size, &con->v1.in_temp_ack);
953 }
954 
955 /*
956  * We can finally discard anything that's been acked.
957  */
958 static void process_ack(struct ceph_connection *con)
959 {
960 	u64 ack = le64_to_cpu(con->v1.in_temp_ack);
961 
962 	if (con->v1.in_tag == CEPH_MSGR_TAG_ACK)
963 		ceph_con_discard_sent(con, ack);
964 	else
965 		ceph_con_discard_requeued(con, ack);
966 
967 	prepare_read_tag(con);
968 }
969 
970 static int read_partial_message_section(struct ceph_connection *con,
971 					struct kvec *section,
972 					unsigned int sec_len, u32 *crc)
973 {
974 	int ret, left;
975 
976 	BUG_ON(!section);
977 
978 	while (section->iov_len < sec_len) {
979 		BUG_ON(section->iov_base == NULL);
980 		left = sec_len - section->iov_len;
981 		ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
982 				       section->iov_len, left);
983 		if (ret <= 0)
984 			return ret;
985 		section->iov_len += ret;
986 	}
987 	if (section->iov_len == sec_len)
988 		*crc = crc32c(0, section->iov_base, section->iov_len);
989 
990 	return 1;
991 }
992 
993 static int read_partial_msg_data(struct ceph_connection *con)
994 {
995 	struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor;
996 	bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
997 	struct page *page;
998 	size_t page_offset;
999 	size_t length;
1000 	u32 crc = 0;
1001 	int ret;
1002 
1003 	if (do_datacrc)
1004 		crc = con->in_data_crc;
1005 	while (cursor->total_resid) {
1006 		if (!cursor->resid) {
1007 			ceph_msg_data_advance(cursor, 0);
1008 			continue;
1009 		}
1010 
1011 		page = ceph_msg_data_next(cursor, &page_offset, &length);
1012 		ret = ceph_tcp_recvpage(con->sock, page, page_offset, length);
1013 		if (ret <= 0) {
1014 			if (do_datacrc)
1015 				con->in_data_crc = crc;
1016 
1017 			return ret;
1018 		}
1019 
1020 		if (do_datacrc)
1021 			crc = ceph_crc32c_page(crc, page, page_offset, ret);
1022 		ceph_msg_data_advance(cursor, (size_t)ret);
1023 	}
1024 	if (do_datacrc)
1025 		con->in_data_crc = crc;
1026 
1027 	return 1;	/* must return > 0 to indicate success */
1028 }
1029 
1030 static int read_partial_msg_data_bounce(struct ceph_connection *con)
1031 {
1032 	struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor;
1033 	struct page *page;
1034 	size_t off, len;
1035 	u32 crc;
1036 	int ret;
1037 
1038 	if (unlikely(!con->bounce_page)) {
1039 		con->bounce_page = alloc_page(GFP_NOIO);
1040 		if (!con->bounce_page) {
1041 			pr_err("failed to allocate bounce page\n");
1042 			return -ENOMEM;
1043 		}
1044 	}
1045 
1046 	crc = con->in_data_crc;
1047 	while (cursor->total_resid) {
1048 		if (!cursor->resid) {
1049 			ceph_msg_data_advance(cursor, 0);
1050 			continue;
1051 		}
1052 
1053 		page = ceph_msg_data_next(cursor, &off, &len);
1054 		ret = ceph_tcp_recvpage(con->sock, con->bounce_page, 0, len);
1055 		if (ret <= 0) {
1056 			con->in_data_crc = crc;
1057 			return ret;
1058 		}
1059 
1060 		crc = crc32c(crc, page_address(con->bounce_page), ret);
1061 		memcpy_to_page(page, off, page_address(con->bounce_page), ret);
1062 
1063 		ceph_msg_data_advance(cursor, ret);
1064 	}
1065 	con->in_data_crc = crc;
1066 
1067 	return 1;	/* must return > 0 to indicate success */
1068 }
1069 
1070 /*
1071  * read (part of) a message.
1072  */
1073 static int read_partial_message(struct ceph_connection *con)
1074 {
1075 	struct ceph_msg *m = con->in_msg;
1076 	int size;
1077 	int end;
1078 	int ret;
1079 	unsigned int front_len, middle_len, data_len;
1080 	bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
1081 	bool need_sign = (con->peer_features & CEPH_FEATURE_MSG_AUTH);
1082 	u64 seq;
1083 	u32 crc;
1084 
1085 	dout("read_partial_message con %p msg %p\n", con, m);
1086 
1087 	/* header */
1088 	size = sizeof(con->v1.in_hdr);
1089 	end = size;
1090 	ret = read_partial(con, end, size, &con->v1.in_hdr);
1091 	if (ret <= 0)
1092 		return ret;
1093 
1094 	crc = crc32c(0, &con->v1.in_hdr, offsetof(struct ceph_msg_header, crc));
1095 	if (cpu_to_le32(crc) != con->v1.in_hdr.crc) {
1096 		pr_err("read_partial_message bad hdr crc %u != expected %u\n",
1097 		       crc, con->v1.in_hdr.crc);
1098 		return -EBADMSG;
1099 	}
1100 
1101 	front_len = le32_to_cpu(con->v1.in_hdr.front_len);
1102 	if (front_len > CEPH_MSG_MAX_FRONT_LEN)
1103 		return -EIO;
1104 	middle_len = le32_to_cpu(con->v1.in_hdr.middle_len);
1105 	if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN)
1106 		return -EIO;
1107 	data_len = le32_to_cpu(con->v1.in_hdr.data_len);
1108 	if (data_len > CEPH_MSG_MAX_DATA_LEN)
1109 		return -EIO;
1110 
1111 	/* verify seq# */
1112 	seq = le64_to_cpu(con->v1.in_hdr.seq);
1113 	if ((s64)seq - (s64)con->in_seq < 1) {
1114 		pr_info("skipping %s%lld %s seq %lld expected %lld\n",
1115 			ENTITY_NAME(con->peer_name),
1116 			ceph_pr_addr(&con->peer_addr),
1117 			seq, con->in_seq + 1);
1118 		con->v1.in_base_pos = -front_len - middle_len - data_len -
1119 				      sizeof_footer(con);
1120 		con->v1.in_tag = CEPH_MSGR_TAG_READY;
1121 		return 1;
1122 	} else if ((s64)seq - (s64)con->in_seq > 1) {
1123 		pr_err("read_partial_message bad seq %lld expected %lld\n",
1124 		       seq, con->in_seq + 1);
1125 		con->error_msg = "bad message sequence # for incoming message";
1126 		return -EBADE;
1127 	}
1128 
1129 	/* allocate message? */
1130 	if (!con->in_msg) {
1131 		int skip = 0;
1132 
1133 		dout("got hdr type %d front %d data %d\n", con->v1.in_hdr.type,
1134 		     front_len, data_len);
1135 		ret = ceph_con_in_msg_alloc(con, &con->v1.in_hdr, &skip);
1136 		if (ret < 0)
1137 			return ret;
1138 
1139 		BUG_ON((!con->in_msg) ^ skip);
1140 		if (skip) {
1141 			/* skip this message */
1142 			dout("alloc_msg said skip message\n");
1143 			con->v1.in_base_pos = -front_len - middle_len -
1144 					      data_len - sizeof_footer(con);
1145 			con->v1.in_tag = CEPH_MSGR_TAG_READY;
1146 			con->in_seq++;
1147 			return 1;
1148 		}
1149 
1150 		BUG_ON(!con->in_msg);
1151 		BUG_ON(con->in_msg->con != con);
1152 		m = con->in_msg;
1153 		m->front.iov_len = 0;    /* haven't read it yet */
1154 		if (m->middle)
1155 			m->middle->vec.iov_len = 0;
1156 
1157 		/* prepare for data payload, if any */
1158 
1159 		if (data_len)
1160 			prepare_message_data(con->in_msg, data_len);
1161 	}
1162 
1163 	/* front */
1164 	ret = read_partial_message_section(con, &m->front, front_len,
1165 					   &con->in_front_crc);
1166 	if (ret <= 0)
1167 		return ret;
1168 
1169 	/* middle */
1170 	if (m->middle) {
1171 		ret = read_partial_message_section(con, &m->middle->vec,
1172 						   middle_len,
1173 						   &con->in_middle_crc);
1174 		if (ret <= 0)
1175 			return ret;
1176 	}
1177 
1178 	/* (page) data */
1179 	if (data_len) {
1180 		if (!m->num_data_items)
1181 			return -EIO;
1182 
1183 		if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE))
1184 			ret = read_partial_msg_data_bounce(con);
1185 		else
1186 			ret = read_partial_msg_data(con);
1187 		if (ret <= 0)
1188 			return ret;
1189 	}
1190 
1191 	/* footer */
1192 	size = sizeof_footer(con);
1193 	end += size;
1194 	ret = read_partial(con, end, size, &m->footer);
1195 	if (ret <= 0)
1196 		return ret;
1197 
1198 	if (!need_sign) {
1199 		m->footer.flags = m->old_footer.flags;
1200 		m->footer.sig = 0;
1201 	}
1202 
1203 	dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
1204 	     m, front_len, m->footer.front_crc, middle_len,
1205 	     m->footer.middle_crc, data_len, m->footer.data_crc);
1206 
1207 	/* crc ok? */
1208 	if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
1209 		pr_err("read_partial_message %p front crc %u != exp. %u\n",
1210 		       m, con->in_front_crc, m->footer.front_crc);
1211 		return -EBADMSG;
1212 	}
1213 	if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
1214 		pr_err("read_partial_message %p middle crc %u != exp %u\n",
1215 		       m, con->in_middle_crc, m->footer.middle_crc);
1216 		return -EBADMSG;
1217 	}
1218 	if (do_datacrc &&
1219 	    (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
1220 	    con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
1221 		pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
1222 		       con->in_data_crc, le32_to_cpu(m->footer.data_crc));
1223 		return -EBADMSG;
1224 	}
1225 
1226 	if (need_sign && con->ops->check_message_signature &&
1227 	    con->ops->check_message_signature(m)) {
1228 		pr_err("read_partial_message %p signature check failed\n", m);
1229 		return -EBADMSG;
1230 	}
1231 
1232 	return 1; /* done! */
1233 }
1234 
1235 static int read_keepalive_ack(struct ceph_connection *con)
1236 {
1237 	struct ceph_timespec ceph_ts;
1238 	size_t size = sizeof(ceph_ts);
1239 	int ret = read_partial(con, size, size, &ceph_ts);
1240 	if (ret <= 0)
1241 		return ret;
1242 	ceph_decode_timespec64(&con->last_keepalive_ack, &ceph_ts);
1243 	prepare_read_tag(con);
1244 	return 1;
1245 }
1246 
1247 /*
1248  * Read what we can from the socket.
1249  */
1250 int ceph_con_v1_try_read(struct ceph_connection *con)
1251 {
1252 	int ret = -1;
1253 
1254 more:
1255 	dout("try_read start %p state %d\n", con, con->state);
1256 	if (con->state != CEPH_CON_S_V1_BANNER &&
1257 	    con->state != CEPH_CON_S_V1_CONNECT_MSG &&
1258 	    con->state != CEPH_CON_S_OPEN)
1259 		return 0;
1260 
1261 	BUG_ON(!con->sock);
1262 
1263 	dout("try_read tag %d in_base_pos %d\n", con->v1.in_tag,
1264 	     con->v1.in_base_pos);
1265 
1266 	if (con->state == CEPH_CON_S_V1_BANNER) {
1267 		ret = read_partial_banner(con);
1268 		if (ret <= 0)
1269 			goto out;
1270 		ret = process_banner(con);
1271 		if (ret < 0)
1272 			goto out;
1273 
1274 		con->state = CEPH_CON_S_V1_CONNECT_MSG;
1275 
1276 		/*
1277 		 * Received banner is good, exchange connection info.
1278 		 * Do not reset out_kvec, as sending our banner raced
1279 		 * with receiving peer banner after connect completed.
1280 		 */
1281 		ret = prepare_write_connect(con);
1282 		if (ret < 0)
1283 			goto out;
1284 		prepare_read_connect(con);
1285 
1286 		/* Send connection info before awaiting response */
1287 		goto out;
1288 	}
1289 
1290 	if (con->state == CEPH_CON_S_V1_CONNECT_MSG) {
1291 		ret = read_partial_connect(con);
1292 		if (ret <= 0)
1293 			goto out;
1294 		ret = process_connect(con);
1295 		if (ret < 0)
1296 			goto out;
1297 		goto more;
1298 	}
1299 
1300 	WARN_ON(con->state != CEPH_CON_S_OPEN);
1301 
1302 	if (con->v1.in_base_pos < 0) {
1303 		/*
1304 		 * skipping + discarding content.
1305 		 */
1306 		ret = ceph_tcp_recvmsg(con->sock, NULL, -con->v1.in_base_pos);
1307 		if (ret <= 0)
1308 			goto out;
1309 		dout("skipped %d / %d bytes\n", ret, -con->v1.in_base_pos);
1310 		con->v1.in_base_pos += ret;
1311 		if (con->v1.in_base_pos)
1312 			goto more;
1313 	}
1314 	if (con->v1.in_tag == CEPH_MSGR_TAG_READY) {
1315 		/*
1316 		 * what's next?
1317 		 */
1318 		ret = ceph_tcp_recvmsg(con->sock, &con->v1.in_tag, 1);
1319 		if (ret <= 0)
1320 			goto out;
1321 		dout("try_read got tag %d\n", con->v1.in_tag);
1322 		switch (con->v1.in_tag) {
1323 		case CEPH_MSGR_TAG_MSG:
1324 			prepare_read_message(con);
1325 			break;
1326 		case CEPH_MSGR_TAG_ACK:
1327 			prepare_read_ack(con);
1328 			break;
1329 		case CEPH_MSGR_TAG_KEEPALIVE2_ACK:
1330 			prepare_read_keepalive_ack(con);
1331 			break;
1332 		case CEPH_MSGR_TAG_CLOSE:
1333 			ceph_con_close_socket(con);
1334 			con->state = CEPH_CON_S_CLOSED;
1335 			goto out;
1336 		default:
1337 			goto bad_tag;
1338 		}
1339 	}
1340 	if (con->v1.in_tag == CEPH_MSGR_TAG_MSG) {
1341 		ret = read_partial_message(con);
1342 		if (ret <= 0) {
1343 			switch (ret) {
1344 			case -EBADMSG:
1345 				con->error_msg = "bad crc/signature";
1346 				fallthrough;
1347 			case -EBADE:
1348 				ret = -EIO;
1349 				break;
1350 			case -EIO:
1351 				con->error_msg = "io error";
1352 				break;
1353 			}
1354 			goto out;
1355 		}
1356 		if (con->v1.in_tag == CEPH_MSGR_TAG_READY)
1357 			goto more;
1358 		ceph_con_process_message(con);
1359 		if (con->state == CEPH_CON_S_OPEN)
1360 			prepare_read_tag(con);
1361 		goto more;
1362 	}
1363 	if (con->v1.in_tag == CEPH_MSGR_TAG_ACK ||
1364 	    con->v1.in_tag == CEPH_MSGR_TAG_SEQ) {
1365 		/*
1366 		 * the final handshake seq exchange is semantically
1367 		 * equivalent to an ACK
1368 		 */
1369 		ret = read_partial_ack(con);
1370 		if (ret <= 0)
1371 			goto out;
1372 		process_ack(con);
1373 		goto more;
1374 	}
1375 	if (con->v1.in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) {
1376 		ret = read_keepalive_ack(con);
1377 		if (ret <= 0)
1378 			goto out;
1379 		goto more;
1380 	}
1381 
1382 out:
1383 	dout("try_read done on %p ret %d\n", con, ret);
1384 	return ret;
1385 
1386 bad_tag:
1387 	pr_err("try_read bad tag %d\n", con->v1.in_tag);
1388 	con->error_msg = "protocol error, garbage tag";
1389 	ret = -1;
1390 	goto out;
1391 }
1392 
1393 /*
1394  * Write something to the socket.  Called in a worker thread when the
1395  * socket appears to be writeable and we have something ready to send.
1396  */
1397 int ceph_con_v1_try_write(struct ceph_connection *con)
1398 {
1399 	int ret = 1;
1400 
1401 	dout("try_write start %p state %d\n", con, con->state);
1402 	if (con->state != CEPH_CON_S_PREOPEN &&
1403 	    con->state != CEPH_CON_S_V1_BANNER &&
1404 	    con->state != CEPH_CON_S_V1_CONNECT_MSG &&
1405 	    con->state != CEPH_CON_S_OPEN)
1406 		return 0;
1407 
1408 	/* open the socket first? */
1409 	if (con->state == CEPH_CON_S_PREOPEN) {
1410 		BUG_ON(con->sock);
1411 		con->state = CEPH_CON_S_V1_BANNER;
1412 
1413 		con_out_kvec_reset(con);
1414 		prepare_write_banner(con);
1415 		prepare_read_banner(con);
1416 
1417 		BUG_ON(con->in_msg);
1418 		con->v1.in_tag = CEPH_MSGR_TAG_READY;
1419 		dout("try_write initiating connect on %p new state %d\n",
1420 		     con, con->state);
1421 		ret = ceph_tcp_connect(con);
1422 		if (ret < 0) {
1423 			con->error_msg = "connect error";
1424 			goto out;
1425 		}
1426 	}
1427 
1428 more:
1429 	dout("try_write out_kvec_bytes %d\n", con->v1.out_kvec_bytes);
1430 	BUG_ON(!con->sock);
1431 
1432 	/* kvec data queued? */
1433 	if (con->v1.out_kvec_left) {
1434 		ret = write_partial_kvec(con);
1435 		if (ret <= 0)
1436 			goto out;
1437 	}
1438 	if (con->v1.out_skip) {
1439 		ret = write_partial_skip(con);
1440 		if (ret <= 0)
1441 			goto out;
1442 	}
1443 
1444 	/* msg pages? */
1445 	if (con->out_msg) {
1446 		if (con->v1.out_msg_done) {
1447 			ceph_msg_put(con->out_msg);
1448 			con->out_msg = NULL;   /* we're done with this one */
1449 			goto do_next;
1450 		}
1451 
1452 		ret = write_partial_message_data(con);
1453 		if (ret == 1)
1454 			goto more;  /* we need to send the footer, too! */
1455 		if (ret == 0)
1456 			goto out;
1457 		if (ret < 0) {
1458 			dout("try_write write_partial_message_data err %d\n",
1459 			     ret);
1460 			goto out;
1461 		}
1462 	}
1463 
1464 do_next:
1465 	if (con->state == CEPH_CON_S_OPEN) {
1466 		if (ceph_con_flag_test_and_clear(con,
1467 				CEPH_CON_F_KEEPALIVE_PENDING)) {
1468 			prepare_write_keepalive(con);
1469 			goto more;
1470 		}
1471 		/* is anything else pending? */
1472 		if (!list_empty(&con->out_queue)) {
1473 			prepare_write_message(con);
1474 			goto more;
1475 		}
1476 		if (con->in_seq > con->in_seq_acked) {
1477 			prepare_write_ack(con);
1478 			goto more;
1479 		}
1480 	}
1481 
1482 	/* Nothing to do! */
1483 	ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING);
1484 	dout("try_write nothing else to write.\n");
1485 	ret = 0;
1486 out:
1487 	dout("try_write done on %p ret %d\n", con, ret);
1488 	return ret;
1489 }
1490 
1491 void ceph_con_v1_revoke(struct ceph_connection *con)
1492 {
1493 	struct ceph_msg *msg = con->out_msg;
1494 
1495 	WARN_ON(con->v1.out_skip);
1496 	/* footer */
1497 	if (con->v1.out_msg_done) {
1498 		con->v1.out_skip += con_out_kvec_skip(con);
1499 	} else {
1500 		WARN_ON(!msg->data_length);
1501 		con->v1.out_skip += sizeof_footer(con);
1502 	}
1503 	/* data, middle, front */
1504 	if (msg->data_length)
1505 		con->v1.out_skip += msg->cursor.total_resid;
1506 	if (msg->middle)
1507 		con->v1.out_skip += con_out_kvec_skip(con);
1508 	con->v1.out_skip += con_out_kvec_skip(con);
1509 
1510 	dout("%s con %p out_kvec_bytes %d out_skip %d\n", __func__, con,
1511 	     con->v1.out_kvec_bytes, con->v1.out_skip);
1512 }
1513 
1514 void ceph_con_v1_revoke_incoming(struct ceph_connection *con)
1515 {
1516 	unsigned int front_len = le32_to_cpu(con->v1.in_hdr.front_len);
1517 	unsigned int middle_len = le32_to_cpu(con->v1.in_hdr.middle_len);
1518 	unsigned int data_len = le32_to_cpu(con->v1.in_hdr.data_len);
1519 
1520 	/* skip rest of message */
1521 	con->v1.in_base_pos = con->v1.in_base_pos -
1522 			sizeof(struct ceph_msg_header) -
1523 			front_len -
1524 			middle_len -
1525 			data_len -
1526 			sizeof(struct ceph_msg_footer);
1527 
1528 	con->v1.in_tag = CEPH_MSGR_TAG_READY;
1529 	con->in_seq++;
1530 
1531 	dout("%s con %p in_base_pos %d\n", __func__, con, con->v1.in_base_pos);
1532 }
1533 
1534 bool ceph_con_v1_opened(struct ceph_connection *con)
1535 {
1536 	return con->v1.connect_seq;
1537 }
1538 
1539 void ceph_con_v1_reset_session(struct ceph_connection *con)
1540 {
1541 	con->v1.connect_seq = 0;
1542 	con->v1.peer_global_seq = 0;
1543 }
1544 
1545 void ceph_con_v1_reset_protocol(struct ceph_connection *con)
1546 {
1547 	con->v1.out_skip = 0;
1548 }
1549