xref: /openbmc/linux/net/ceph/messenger.c (revision a1ef447deed5f5add172cd83efa48c46cb2b1a0d)
1b2441318SGreg Kroah-Hartman // SPDX-License-Identifier: GPL-2.0
23d14c5d2SYehuda Sadeh #include <linux/ceph/ceph_debug.h>
33d14c5d2SYehuda Sadeh 
43d14c5d2SYehuda Sadeh #include <linux/crc32c.h>
53d14c5d2SYehuda Sadeh #include <linux/ctype.h>
63d14c5d2SYehuda Sadeh #include <linux/highmem.h>
73d14c5d2SYehuda Sadeh #include <linux/inet.h>
83d14c5d2SYehuda Sadeh #include <linux/kthread.h>
93d14c5d2SYehuda Sadeh #include <linux/net.h>
10757856d2SIlya Dryomov #include <linux/nsproxy.h>
11633ee407SIlya Dryomov #include <linux/sched/mm.h>
123d14c5d2SYehuda Sadeh #include <linux/slab.h>
133d14c5d2SYehuda Sadeh #include <linux/socket.h>
143d14c5d2SYehuda Sadeh #include <linux/string.h>
153ebc21f7SAlex Elder #ifdef	CONFIG_BLOCK
163d14c5d2SYehuda Sadeh #include <linux/bio.h>
173ebc21f7SAlex Elder #endif	/* CONFIG_BLOCK */
18ee3b56f2SNoah Watkins #include <linux/dns_resolver.h>
193d14c5d2SYehuda Sadeh #include <net/tcp.h>
2040e0b090SPeilin Ye #include <trace/events/sock.h>
213d14c5d2SYehuda Sadeh 
222b3e0c90SIlya Dryomov #include <linux/ceph/ceph_features.h>
233d14c5d2SYehuda Sadeh #include <linux/ceph/libceph.h>
243d14c5d2SYehuda Sadeh #include <linux/ceph/messenger.h>
253d14c5d2SYehuda Sadeh #include <linux/ceph/decode.h>
263d14c5d2SYehuda Sadeh #include <linux/ceph/pagelist.h>
27bc3b2d7fSPaul Gortmaker #include <linux/export.h>
283d14c5d2SYehuda Sadeh 
293d14c5d2SYehuda Sadeh /*
303d14c5d2SYehuda Sadeh  * Ceph uses the messenger to exchange ceph_msg messages with other
313d14c5d2SYehuda Sadeh  * hosts in the system.  The messenger provides ordered and reliable
323d14c5d2SYehuda Sadeh  * delivery.  We tolerate TCP disconnects by reconnecting (with
333d14c5d2SYehuda Sadeh  * exponential backoff) in the case of a fault (disconnection, bad
343d14c5d2SYehuda Sadeh  * crc, protocol error).  Acks allow sent messages to be discarded by
353d14c5d2SYehuda Sadeh  * the sender.
363d14c5d2SYehuda Sadeh  */
373d14c5d2SYehuda Sadeh 
38bc18f4b1SAlex Elder /*
39bc18f4b1SAlex Elder  * We track the state of the socket on a given connection using
40bc18f4b1SAlex Elder  * values defined below.  The transition to a new socket state is
41bc18f4b1SAlex Elder  * handled by a function which verifies we aren't coming from an
42bc18f4b1SAlex Elder  * unexpected state.
43bc18f4b1SAlex Elder  *
44bc18f4b1SAlex Elder  *      --------
45bc18f4b1SAlex Elder  *      | NEW* |  transient initial state
46bc18f4b1SAlex Elder  *      --------
47bc18f4b1SAlex Elder  *          | con_sock_state_init()
48bc18f4b1SAlex Elder  *          v
49bc18f4b1SAlex Elder  *      ----------
50bc18f4b1SAlex Elder  *      | CLOSED |  initialized, but no socket (and no
51bc18f4b1SAlex Elder  *      ----------  TCP connection)
52bc18f4b1SAlex Elder  *       ^      \
53bc18f4b1SAlex Elder  *       |       \ con_sock_state_connecting()
54bc18f4b1SAlex Elder  *       |        ----------------------
55bc18f4b1SAlex Elder  *       |                              \
56bc18f4b1SAlex Elder  *       + con_sock_state_closed()       \
57fbb85a47SSage Weil  *       |+---------------------------    \
58fbb85a47SSage Weil  *       | \                          \    \
59fbb85a47SSage Weil  *       |  -----------                \    \
60fbb85a47SSage Weil  *       |  | CLOSING |  socket event;  \    \
61fbb85a47SSage Weil  *       |  -----------  await close     \    \
62fbb85a47SSage Weil  *       |       ^                        \   |
63fbb85a47SSage Weil  *       |       |                         \  |
64fbb85a47SSage Weil  *       |       + con_sock_state_closing() \ |
65fbb85a47SSage Weil  *       |      / \                         | |
66fbb85a47SSage Weil  *       |     /   ---------------          | |
67fbb85a47SSage Weil  *       |    /                   \         v v
68bc18f4b1SAlex Elder  *       |   /                    --------------
69bc18f4b1SAlex Elder  *       |  /    -----------------| CONNECTING |  socket created, TCP
70bc18f4b1SAlex Elder  *       |  |   /                 --------------  connect initiated
71bc18f4b1SAlex Elder  *       |  |   | con_sock_state_connected()
72bc18f4b1SAlex Elder  *       |  |   v
73bc18f4b1SAlex Elder  *      -------------
74bc18f4b1SAlex Elder  *      | CONNECTED |  TCP connection established
75bc18f4b1SAlex Elder  *      -------------
76bc18f4b1SAlex Elder  *
77bc18f4b1SAlex Elder  * State values for ceph_connection->sock_state; NEW is assumed to be 0.
78bc18f4b1SAlex Elder  */
79ce2c8903SAlex Elder 
80ce2c8903SAlex Elder #define CON_SOCK_STATE_NEW		0	/* -> CLOSED */
81ce2c8903SAlex Elder #define CON_SOCK_STATE_CLOSED		1	/* -> CONNECTING */
82ce2c8903SAlex Elder #define CON_SOCK_STATE_CONNECTING	2	/* -> CONNECTED or -> CLOSING */
83ce2c8903SAlex Elder #define CON_SOCK_STATE_CONNECTED	3	/* -> CLOSING or -> CLOSED */
84ce2c8903SAlex Elder #define CON_SOCK_STATE_CLOSING		4	/* -> CLOSED */
85ce2c8903SAlex Elder 
con_flag_valid(unsigned long con_flag)86c9ffc77aSAlex Elder static bool con_flag_valid(unsigned long con_flag)
87c9ffc77aSAlex Elder {
88c9ffc77aSAlex Elder 	switch (con_flag) {
893fefd43eSIlya Dryomov 	case CEPH_CON_F_LOSSYTX:
903fefd43eSIlya Dryomov 	case CEPH_CON_F_KEEPALIVE_PENDING:
913fefd43eSIlya Dryomov 	case CEPH_CON_F_WRITE_PENDING:
923fefd43eSIlya Dryomov 	case CEPH_CON_F_SOCK_CLOSED:
933fefd43eSIlya Dryomov 	case CEPH_CON_F_BACKOFF:
94c9ffc77aSAlex Elder 		return true;
95c9ffc77aSAlex Elder 	default:
96c9ffc77aSAlex Elder 		return false;
97c9ffc77aSAlex Elder 	}
98c9ffc77aSAlex Elder }
99c9ffc77aSAlex Elder 
ceph_con_flag_clear(struct ceph_connection * con,unsigned long con_flag)1006503e0b6SIlya Dryomov void ceph_con_flag_clear(struct ceph_connection *con, unsigned long con_flag)
101c9ffc77aSAlex Elder {
102c9ffc77aSAlex Elder 	BUG_ON(!con_flag_valid(con_flag));
103c9ffc77aSAlex Elder 
104c9ffc77aSAlex Elder 	clear_bit(con_flag, &con->flags);
105c9ffc77aSAlex Elder }
106c9ffc77aSAlex Elder 
ceph_con_flag_set(struct ceph_connection * con,unsigned long con_flag)1076503e0b6SIlya Dryomov void ceph_con_flag_set(struct ceph_connection *con, unsigned long con_flag)
108c9ffc77aSAlex Elder {
109c9ffc77aSAlex Elder 	BUG_ON(!con_flag_valid(con_flag));
110c9ffc77aSAlex Elder 
111c9ffc77aSAlex Elder 	set_bit(con_flag, &con->flags);
112c9ffc77aSAlex Elder }
113c9ffc77aSAlex Elder 
ceph_con_flag_test(struct ceph_connection * con,unsigned long con_flag)1146503e0b6SIlya Dryomov bool ceph_con_flag_test(struct ceph_connection *con, unsigned long con_flag)
115c9ffc77aSAlex Elder {
116c9ffc77aSAlex Elder 	BUG_ON(!con_flag_valid(con_flag));
117c9ffc77aSAlex Elder 
118c9ffc77aSAlex Elder 	return test_bit(con_flag, &con->flags);
119c9ffc77aSAlex Elder }
120c9ffc77aSAlex Elder 
ceph_con_flag_test_and_clear(struct ceph_connection * con,unsigned long con_flag)1216503e0b6SIlya Dryomov bool ceph_con_flag_test_and_clear(struct ceph_connection *con,
122c9ffc77aSAlex Elder 				  unsigned long con_flag)
123c9ffc77aSAlex Elder {
124c9ffc77aSAlex Elder 	BUG_ON(!con_flag_valid(con_flag));
125c9ffc77aSAlex Elder 
126c9ffc77aSAlex Elder 	return test_and_clear_bit(con_flag, &con->flags);
127c9ffc77aSAlex Elder }
128c9ffc77aSAlex Elder 
ceph_con_flag_test_and_set(struct ceph_connection * con,unsigned long con_flag)1296503e0b6SIlya Dryomov bool ceph_con_flag_test_and_set(struct ceph_connection *con,
130c9ffc77aSAlex Elder 				unsigned long con_flag)
131c9ffc77aSAlex Elder {
132c9ffc77aSAlex Elder 	BUG_ON(!con_flag_valid(con_flag));
133c9ffc77aSAlex Elder 
134c9ffc77aSAlex Elder 	return test_and_set_bit(con_flag, &con->flags);
135c9ffc77aSAlex Elder }
136c9ffc77aSAlex Elder 
137e3d5d638SAlex Elder /* Slab caches for frequently-allocated structures */
138e3d5d638SAlex Elder 
139e3d5d638SAlex Elder static struct kmem_cache	*ceph_msg_cache;
140e3d5d638SAlex Elder 
1413d14c5d2SYehuda Sadeh #ifdef CONFIG_LOCKDEP
1423d14c5d2SYehuda Sadeh static struct lock_class_key socket_class;
1433d14c5d2SYehuda Sadeh #endif
1443d14c5d2SYehuda Sadeh 
1453d14c5d2SYehuda Sadeh static void queue_con(struct ceph_connection *con);
14637ab77acSIlya Dryomov static void cancel_con(struct ceph_connection *con);
14768931622SIlya Dryomov static void ceph_con_workfn(struct work_struct *);
14893209264SAlex Elder static void con_fault(struct ceph_connection *con);
1493d14c5d2SYehuda Sadeh 
1503d14c5d2SYehuda Sadeh /*
151f64a9317SAlex Elder  * Nicely render a sockaddr as a string.  An array of formatted
152f64a9317SAlex Elder  * strings is used, to approximate reentrancy.
1533d14c5d2SYehuda Sadeh  */
154f64a9317SAlex Elder #define ADDR_STR_COUNT_LOG	5	/* log2(# address strings in array) */
155f64a9317SAlex Elder #define ADDR_STR_COUNT		(1 << ADDR_STR_COUNT_LOG)
156f64a9317SAlex Elder #define ADDR_STR_COUNT_MASK	(ADDR_STR_COUNT - 1)
157f64a9317SAlex Elder #define MAX_ADDR_STR_LEN	64	/* 54 is enough */
158f64a9317SAlex Elder 
159f64a9317SAlex Elder static char addr_str[ADDR_STR_COUNT][MAX_ADDR_STR_LEN];
160f64a9317SAlex Elder static atomic_t addr_str_seq = ATOMIC_INIT(0);
1613d14c5d2SYehuda Sadeh 
162699921d9SIlya Dryomov struct page *ceph_zero_page;		/* used in certain error cases */
16357666519SAlex Elder 
ceph_pr_addr(const struct ceph_entity_addr * addr)164b726ec97SJeff Layton const char *ceph_pr_addr(const struct ceph_entity_addr *addr)
1653d14c5d2SYehuda Sadeh {
1663d14c5d2SYehuda Sadeh 	int i;
1673d14c5d2SYehuda Sadeh 	char *s;
168b726ec97SJeff Layton 	struct sockaddr_storage ss = addr->in_addr; /* align */
169b726ec97SJeff Layton 	struct sockaddr_in *in4 = (struct sockaddr_in *)&ss;
170b726ec97SJeff Layton 	struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)&ss;
1713d14c5d2SYehuda Sadeh 
172f64a9317SAlex Elder 	i = atomic_inc_return(&addr_str_seq) & ADDR_STR_COUNT_MASK;
1733d14c5d2SYehuda Sadeh 	s = addr_str[i];
1743d14c5d2SYehuda Sadeh 
175b726ec97SJeff Layton 	switch (ss.ss_family) {
1763d14c5d2SYehuda Sadeh 	case AF_INET:
177d3c3c0a8SJeff Layton 		snprintf(s, MAX_ADDR_STR_LEN, "(%d)%pI4:%hu",
178d3c3c0a8SJeff Layton 			 le32_to_cpu(addr->type), &in4->sin_addr,
179bd406145SAlex Elder 			 ntohs(in4->sin_port));
1803d14c5d2SYehuda Sadeh 		break;
1813d14c5d2SYehuda Sadeh 
1823d14c5d2SYehuda Sadeh 	case AF_INET6:
183d3c3c0a8SJeff Layton 		snprintf(s, MAX_ADDR_STR_LEN, "(%d)[%pI6c]:%hu",
184d3c3c0a8SJeff Layton 			 le32_to_cpu(addr->type), &in6->sin6_addr,
185bd406145SAlex Elder 			 ntohs(in6->sin6_port));
1863d14c5d2SYehuda Sadeh 		break;
1873d14c5d2SYehuda Sadeh 
1883d14c5d2SYehuda Sadeh 	default:
189d3002b97SAlex Elder 		snprintf(s, MAX_ADDR_STR_LEN, "(unknown sockaddr family %hu)",
190b726ec97SJeff Layton 			 ss.ss_family);
1913d14c5d2SYehuda Sadeh 	}
1923d14c5d2SYehuda Sadeh 
1933d14c5d2SYehuda Sadeh 	return s;
1943d14c5d2SYehuda Sadeh }
1953d14c5d2SYehuda Sadeh EXPORT_SYMBOL(ceph_pr_addr);
1963d14c5d2SYehuda Sadeh 
ceph_encode_my_addr(struct ceph_messenger * msgr)1976503e0b6SIlya Dryomov void ceph_encode_my_addr(struct ceph_messenger *msgr)
1983d14c5d2SYehuda Sadeh {
199cd1a677cSIlya Dryomov 	if (!ceph_msgr2(from_msgr(msgr))) {
200cd1a677cSIlya Dryomov 		memcpy(&msgr->my_enc_addr, &msgr->inst.addr,
201cd1a677cSIlya Dryomov 		       sizeof(msgr->my_enc_addr));
2022c66de56SJeff Layton 		ceph_encode_banner_addr(&msgr->my_enc_addr);
2033d14c5d2SYehuda Sadeh 	}
204cd1a677cSIlya Dryomov }
2053d14c5d2SYehuda Sadeh 
2063d14c5d2SYehuda Sadeh /*
2073d14c5d2SYehuda Sadeh  * work queue for all reading and writing to/from the socket.
2083d14c5d2SYehuda Sadeh  */
209e0f43c94SAlex Elder static struct workqueue_struct *ceph_msgr_wq;
2103d14c5d2SYehuda Sadeh 
ceph_msgr_slab_init(void)211e3d5d638SAlex Elder static int ceph_msgr_slab_init(void)
212e3d5d638SAlex Elder {
213e3d5d638SAlex Elder 	BUG_ON(ceph_msg_cache);
2145ee61e95SGeliang Tang 	ceph_msg_cache = KMEM_CACHE(ceph_msg, 0);
21581b36be4SAlex Elder 	if (!ceph_msg_cache)
21681b36be4SAlex Elder 		return -ENOMEM;
21781b36be4SAlex Elder 
21881b36be4SAlex Elder 	return 0;
219e3d5d638SAlex Elder }
220e3d5d638SAlex Elder 
ceph_msgr_slab_exit(void)221e3d5d638SAlex Elder static void ceph_msgr_slab_exit(void)
222e3d5d638SAlex Elder {
223e3d5d638SAlex Elder 	BUG_ON(!ceph_msg_cache);
224e3d5d638SAlex Elder 	kmem_cache_destroy(ceph_msg_cache);
225e3d5d638SAlex Elder 	ceph_msg_cache = NULL;
226e3d5d638SAlex Elder }
227e3d5d638SAlex Elder 
_ceph_msgr_exit(void)22815417167SAlex Elder static void _ceph_msgr_exit(void)
2296173d1f0SAlex Elder {
230d3002b97SAlex Elder 	if (ceph_msgr_wq) {
2316173d1f0SAlex Elder 		destroy_workqueue(ceph_msgr_wq);
232d3002b97SAlex Elder 		ceph_msgr_wq = NULL;
233d3002b97SAlex Elder 	}
2346173d1f0SAlex Elder 
235699921d9SIlya Dryomov 	BUG_ON(!ceph_zero_page);
236699921d9SIlya Dryomov 	put_page(ceph_zero_page);
237699921d9SIlya Dryomov 	ceph_zero_page = NULL;
238d920ff6fSBenoît Canet 
239d920ff6fSBenoît Canet 	ceph_msgr_slab_exit();
2406173d1f0SAlex Elder }
2416173d1f0SAlex Elder 
ceph_msgr_init(void)24257a35dfbSChengguang Xu int __init ceph_msgr_init(void)
2433d14c5d2SYehuda Sadeh {
244d920ff6fSBenoît Canet 	if (ceph_msgr_slab_init())
245d920ff6fSBenoît Canet 		return -ENOMEM;
246d920ff6fSBenoît Canet 
247699921d9SIlya Dryomov 	BUG_ON(ceph_zero_page);
248699921d9SIlya Dryomov 	ceph_zero_page = ZERO_PAGE(0);
249699921d9SIlya Dryomov 	get_page(ceph_zero_page);
25057666519SAlex Elder 
251f9865f06SIlya Dryomov 	/*
252f9865f06SIlya Dryomov 	 * The number of active work items is limited by the number of
253f9865f06SIlya Dryomov 	 * connections, so leave @max_active at default.
254f9865f06SIlya Dryomov 	 */
255f9865f06SIlya Dryomov 	ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_MEM_RECLAIM, 0);
2566173d1f0SAlex Elder 	if (ceph_msgr_wq)
2576173d1f0SAlex Elder 		return 0;
25857666519SAlex Elder 
2596173d1f0SAlex Elder 	pr_err("msgr_init failed to create workqueue\n");
2606173d1f0SAlex Elder 	_ceph_msgr_exit();
26157666519SAlex Elder 
262d96c9043SSage Weil 	return -ENOMEM;
2633d14c5d2SYehuda Sadeh }
2643d14c5d2SYehuda Sadeh 
ceph_msgr_exit(void)2653d14c5d2SYehuda Sadeh void ceph_msgr_exit(void)
2663d14c5d2SYehuda Sadeh {
26757666519SAlex Elder 	BUG_ON(ceph_msgr_wq == NULL);
26857666519SAlex Elder 
2696173d1f0SAlex Elder 	_ceph_msgr_exit();
2703d14c5d2SYehuda Sadeh }
2713d14c5d2SYehuda Sadeh 
ceph_msgr_flush(void)2723d14c5d2SYehuda Sadeh void ceph_msgr_flush(void)
2733d14c5d2SYehuda Sadeh {
2743d14c5d2SYehuda Sadeh 	flush_workqueue(ceph_msgr_wq);
2753d14c5d2SYehuda Sadeh }
2763d14c5d2SYehuda Sadeh EXPORT_SYMBOL(ceph_msgr_flush);
2773d14c5d2SYehuda Sadeh 
278ce2c8903SAlex Elder /* Connection socket state transition functions */
279ce2c8903SAlex Elder 
con_sock_state_init(struct ceph_connection * con)280ce2c8903SAlex Elder static void con_sock_state_init(struct ceph_connection *con)
281ce2c8903SAlex Elder {
282ce2c8903SAlex Elder 	int old_state;
283ce2c8903SAlex Elder 
284ce2c8903SAlex Elder 	old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED);
285ce2c8903SAlex Elder 	if (WARN_ON(old_state != CON_SOCK_STATE_NEW))
286ce2c8903SAlex Elder 		printk("%s: unexpected old state %d\n", __func__, old_state);
2878007b8d6SSage Weil 	dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
2888007b8d6SSage Weil 	     CON_SOCK_STATE_CLOSED);
289ce2c8903SAlex Elder }
290ce2c8903SAlex Elder 
con_sock_state_connecting(struct ceph_connection * con)291ce2c8903SAlex Elder static void con_sock_state_connecting(struct ceph_connection *con)
292ce2c8903SAlex Elder {
293ce2c8903SAlex Elder 	int old_state;
294ce2c8903SAlex Elder 
295ce2c8903SAlex Elder 	old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTING);
296ce2c8903SAlex Elder 	if (WARN_ON(old_state != CON_SOCK_STATE_CLOSED))
297ce2c8903SAlex Elder 		printk("%s: unexpected old state %d\n", __func__, old_state);
2988007b8d6SSage Weil 	dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
2998007b8d6SSage Weil 	     CON_SOCK_STATE_CONNECTING);
300ce2c8903SAlex Elder }
301ce2c8903SAlex Elder 
con_sock_state_connected(struct ceph_connection * con)302ce2c8903SAlex Elder static void con_sock_state_connected(struct ceph_connection *con)
303ce2c8903SAlex Elder {
304ce2c8903SAlex Elder 	int old_state;
305ce2c8903SAlex Elder 
306ce2c8903SAlex Elder 	old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTED);
307ce2c8903SAlex Elder 	if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING))
308ce2c8903SAlex Elder 		printk("%s: unexpected old state %d\n", __func__, old_state);
3098007b8d6SSage Weil 	dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
3108007b8d6SSage Weil 	     CON_SOCK_STATE_CONNECTED);
311ce2c8903SAlex Elder }
312ce2c8903SAlex Elder 
con_sock_state_closing(struct ceph_connection * con)313ce2c8903SAlex Elder static void con_sock_state_closing(struct ceph_connection *con)
314ce2c8903SAlex Elder {
315ce2c8903SAlex Elder 	int old_state;
316ce2c8903SAlex Elder 
317ce2c8903SAlex Elder 	old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSING);
318ce2c8903SAlex Elder 	if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING &&
319ce2c8903SAlex Elder 			old_state != CON_SOCK_STATE_CONNECTED &&
320ce2c8903SAlex Elder 			old_state != CON_SOCK_STATE_CLOSING))
321ce2c8903SAlex Elder 		printk("%s: unexpected old state %d\n", __func__, old_state);
3228007b8d6SSage Weil 	dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
3238007b8d6SSage Weil 	     CON_SOCK_STATE_CLOSING);
324ce2c8903SAlex Elder }
325ce2c8903SAlex Elder 
con_sock_state_closed(struct ceph_connection * con)326ce2c8903SAlex Elder static void con_sock_state_closed(struct ceph_connection *con)
327ce2c8903SAlex Elder {
328ce2c8903SAlex Elder 	int old_state;
329ce2c8903SAlex Elder 
330ce2c8903SAlex Elder 	old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED);
331ce2c8903SAlex Elder 	if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTED &&
332fbb85a47SSage Weil 		    old_state != CON_SOCK_STATE_CLOSING &&
3338007b8d6SSage Weil 		    old_state != CON_SOCK_STATE_CONNECTING &&
3348007b8d6SSage Weil 		    old_state != CON_SOCK_STATE_CLOSED))
335ce2c8903SAlex Elder 		printk("%s: unexpected old state %d\n", __func__, old_state);
3368007b8d6SSage Weil 	dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
3378007b8d6SSage Weil 	     CON_SOCK_STATE_CLOSED);
338ce2c8903SAlex Elder }
3393d14c5d2SYehuda Sadeh 
3403d14c5d2SYehuda Sadeh /*
3413d14c5d2SYehuda Sadeh  * socket callback functions
3423d14c5d2SYehuda Sadeh  */
3433d14c5d2SYehuda Sadeh 
3443d14c5d2SYehuda Sadeh /* data available on socket, or listen socket received a connect */
ceph_sock_data_ready(struct sock * sk)345676d2369SDavid S. Miller static void ceph_sock_data_ready(struct sock *sk)
3463d14c5d2SYehuda Sadeh {
347bd406145SAlex Elder 	struct ceph_connection *con = sk->sk_user_data;
34840e0b090SPeilin Ye 
34940e0b090SPeilin Ye 	trace_sk_data_ready(sk);
35040e0b090SPeilin Ye 
351a2a32584SGuanjun He 	if (atomic_read(&con->msgr->stopping)) {
352a2a32584SGuanjun He 		return;
353a2a32584SGuanjun He 	}
354bd406145SAlex Elder 
3553d14c5d2SYehuda Sadeh 	if (sk->sk_state != TCP_CLOSE_WAIT) {
35630be780aSIlya Dryomov 		dout("%s %p state = %d, queueing work\n", __func__,
3573d14c5d2SYehuda Sadeh 		     con, con->state);
3583d14c5d2SYehuda Sadeh 		queue_con(con);
3593d14c5d2SYehuda Sadeh 	}
3603d14c5d2SYehuda Sadeh }
3613d14c5d2SYehuda Sadeh 
3623d14c5d2SYehuda Sadeh /* socket has buffer space for writing */
ceph_sock_write_space(struct sock * sk)363327800bdSAlex Elder static void ceph_sock_write_space(struct sock *sk)
3643d14c5d2SYehuda Sadeh {
365d3002b97SAlex Elder 	struct ceph_connection *con = sk->sk_user_data;
3663d14c5d2SYehuda Sadeh 
367182fac26SJim Schutt 	/* only queue to workqueue if there is data we want to write,
368182fac26SJim Schutt 	 * and there is sufficient space in the socket buffer to accept
369327800bdSAlex Elder 	 * more data.  clear SOCK_NOSPACE so that ceph_sock_write_space()
370182fac26SJim Schutt 	 * doesn't get called again until try_write() fills the socket
371182fac26SJim Schutt 	 * buffer. See net/ipv4/tcp_input.c:tcp_check_space()
372182fac26SJim Schutt 	 * and net/core/stream.c:sk_stream_write_space().
373182fac26SJim Schutt 	 */
3746503e0b6SIlya Dryomov 	if (ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING)) {
37564dc6130SEric Dumazet 		if (sk_stream_is_writeable(sk)) {
376327800bdSAlex Elder 			dout("%s %p queueing write work\n", __func__, con);
377182fac26SJim Schutt 			clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
3783d14c5d2SYehuda Sadeh 			queue_con(con);
379182fac26SJim Schutt 		}
3803d14c5d2SYehuda Sadeh 	} else {
381327800bdSAlex Elder 		dout("%s %p nothing to write\n", __func__, con);
3823d14c5d2SYehuda Sadeh 	}
3833d14c5d2SYehuda Sadeh }
3843d14c5d2SYehuda Sadeh 
3853d14c5d2SYehuda Sadeh /* socket's state has changed */
ceph_sock_state_change(struct sock * sk)386327800bdSAlex Elder static void ceph_sock_state_change(struct sock *sk)
3873d14c5d2SYehuda Sadeh {
388bd406145SAlex Elder 	struct ceph_connection *con = sk->sk_user_data;
3893d14c5d2SYehuda Sadeh 
39030be780aSIlya Dryomov 	dout("%s %p state = %d sk_state = %u\n", __func__,
3913d14c5d2SYehuda Sadeh 	     con, con->state, sk->sk_state);
3923d14c5d2SYehuda Sadeh 
3933d14c5d2SYehuda Sadeh 	switch (sk->sk_state) {
3943d14c5d2SYehuda Sadeh 	case TCP_CLOSE:
395327800bdSAlex Elder 		dout("%s TCP_CLOSE\n", __func__);
396df561f66SGustavo A. R. Silva 		fallthrough;
3973d14c5d2SYehuda Sadeh 	case TCP_CLOSE_WAIT:
398327800bdSAlex Elder 		dout("%s TCP_CLOSE_WAIT\n", __func__);
399ce2c8903SAlex Elder 		con_sock_state_closing(con);
4006503e0b6SIlya Dryomov 		ceph_con_flag_set(con, CEPH_CON_F_SOCK_CLOSED);
4013d14c5d2SYehuda Sadeh 		queue_con(con);
4023d14c5d2SYehuda Sadeh 		break;
4033d14c5d2SYehuda Sadeh 	case TCP_ESTABLISHED:
404327800bdSAlex Elder 		dout("%s TCP_ESTABLISHED\n", __func__);
405ce2c8903SAlex Elder 		con_sock_state_connected(con);
4063d14c5d2SYehuda Sadeh 		queue_con(con);
4073d14c5d2SYehuda Sadeh 		break;
408d3002b97SAlex Elder 	default:	/* Everything else is uninteresting */
409d3002b97SAlex Elder 		break;
4103d14c5d2SYehuda Sadeh 	}
4113d14c5d2SYehuda Sadeh }
4123d14c5d2SYehuda Sadeh 
4133d14c5d2SYehuda Sadeh /*
4143d14c5d2SYehuda Sadeh  * set up socket callbacks
4153d14c5d2SYehuda Sadeh  */
set_sock_callbacks(struct socket * sock,struct ceph_connection * con)4163d14c5d2SYehuda Sadeh static void set_sock_callbacks(struct socket *sock,
4173d14c5d2SYehuda Sadeh 			       struct ceph_connection *con)
4183d14c5d2SYehuda Sadeh {
4193d14c5d2SYehuda Sadeh 	struct sock *sk = sock->sk;
420bd406145SAlex Elder 	sk->sk_user_data = con;
421327800bdSAlex Elder 	sk->sk_data_ready = ceph_sock_data_ready;
422327800bdSAlex Elder 	sk->sk_write_space = ceph_sock_write_space;
423327800bdSAlex Elder 	sk->sk_state_change = ceph_sock_state_change;
4243d14c5d2SYehuda Sadeh }
4253d14c5d2SYehuda Sadeh 
4263d14c5d2SYehuda Sadeh 
4273d14c5d2SYehuda Sadeh /*
4283d14c5d2SYehuda Sadeh  * socket helpers
4293d14c5d2SYehuda Sadeh  */
4303d14c5d2SYehuda Sadeh 
4313d14c5d2SYehuda Sadeh /*
4323d14c5d2SYehuda Sadeh  * initiate connection to a remote socket.
4333d14c5d2SYehuda Sadeh  */
ceph_tcp_connect(struct ceph_connection * con)4346503e0b6SIlya Dryomov int ceph_tcp_connect(struct ceph_connection *con)
4353d14c5d2SYehuda Sadeh {
436cede185bSJeff Layton 	struct sockaddr_storage ss = con->peer_addr.in_addr; /* align */
4373d14c5d2SYehuda Sadeh 	struct socket *sock;
438633ee407SIlya Dryomov 	unsigned int noio_flag;
4393d14c5d2SYehuda Sadeh 	int ret;
4403d14c5d2SYehuda Sadeh 
4416503e0b6SIlya Dryomov 	dout("%s con %p peer_addr %s\n", __func__, con,
4426503e0b6SIlya Dryomov 	     ceph_pr_addr(&con->peer_addr));
4433d14c5d2SYehuda Sadeh 	BUG_ON(con->sock);
444633ee407SIlya Dryomov 
445633ee407SIlya Dryomov 	/* sock_create_kern() allocates with GFP_KERNEL */
446633ee407SIlya Dryomov 	noio_flag = memalloc_noio_save();
447cede185bSJeff Layton 	ret = sock_create_kern(read_pnet(&con->msgr->net), ss.ss_family,
448eeb1bd5cSEric W. Biederman 			       SOCK_STREAM, IPPROTO_TCP, &sock);
449633ee407SIlya Dryomov 	memalloc_noio_restore(noio_flag);
4503d14c5d2SYehuda Sadeh 	if (ret)
45141617d0cSAlex Elder 		return ret;
4526d7fdb0aSIlya Dryomov 	sock->sk->sk_allocation = GFP_NOFS;
45398123866SBenjamin Coddington 	sock->sk->sk_use_task_frag = false;
4543d14c5d2SYehuda Sadeh 
4553d14c5d2SYehuda Sadeh #ifdef CONFIG_LOCKDEP
4563d14c5d2SYehuda Sadeh 	lockdep_set_class(&sock->sk->sk_lock, &socket_class);
4573d14c5d2SYehuda Sadeh #endif
4583d14c5d2SYehuda Sadeh 
4593d14c5d2SYehuda Sadeh 	set_sock_callbacks(sock, con);
4603d14c5d2SYehuda Sadeh 
46189a86be0SSage Weil 	con_sock_state_connecting(con);
462*7563cf17SJordan Rife 	ret = kernel_connect(sock, (struct sockaddr *)&ss, sizeof(ss),
4633d14c5d2SYehuda Sadeh 			     O_NONBLOCK);
4643d14c5d2SYehuda Sadeh 	if (ret == -EINPROGRESS) {
4653d14c5d2SYehuda Sadeh 		dout("connect %s EINPROGRESS sk_state = %u\n",
466b726ec97SJeff Layton 		     ceph_pr_addr(&con->peer_addr),
4673d14c5d2SYehuda Sadeh 		     sock->sk->sk_state);
468a5bc3129SAlex Elder 	} else if (ret < 0) {
4693d14c5d2SYehuda Sadeh 		pr_err("connect %s error %d\n",
470b726ec97SJeff Layton 		       ceph_pr_addr(&con->peer_addr), ret);
4713d14c5d2SYehuda Sadeh 		sock_release(sock);
47241617d0cSAlex Elder 		return ret;
473a5bc3129SAlex Elder 	}
47489baaa57SMike Christie 
47512abc5eeSChristoph Hellwig 	if (ceph_test_opt(from_msgr(con->msgr), TCP_NODELAY))
47612abc5eeSChristoph Hellwig 		tcp_sock_set_nodelay(sock->sk);
477ba988f87SChaitanya Huilgol 
478a5bc3129SAlex Elder 	con->sock = sock;
47941617d0cSAlex Elder 	return 0;
4803d14c5d2SYehuda Sadeh }
4813d14c5d2SYehuda Sadeh 
482e5c93883SIlya Dryomov /*
4833d14c5d2SYehuda Sadeh  * Shutdown/close the socket for the given connection.
4843d14c5d2SYehuda Sadeh  */
ceph_con_close_socket(struct ceph_connection * con)4856503e0b6SIlya Dryomov int ceph_con_close_socket(struct ceph_connection *con)
4863d14c5d2SYehuda Sadeh {
4878007b8d6SSage Weil 	int rc = 0;
4883d14c5d2SYehuda Sadeh 
4896503e0b6SIlya Dryomov 	dout("%s con %p sock %p\n", __func__, con, con->sock);
4908007b8d6SSage Weil 	if (con->sock) {
4913d14c5d2SYehuda Sadeh 		rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
4923d14c5d2SYehuda Sadeh 		sock_release(con->sock);
4933d14c5d2SYehuda Sadeh 		con->sock = NULL;
4948007b8d6SSage Weil 	}
495456ea468SAlex Elder 
496456ea468SAlex Elder 	/*
4974a861692SSage Weil 	 * Forcibly clear the SOCK_CLOSED flag.  It gets set
498456ea468SAlex Elder 	 * independent of the connection mutex, and we could have
499456ea468SAlex Elder 	 * received a socket close event before we had the chance to
500456ea468SAlex Elder 	 * shut the socket down.
501456ea468SAlex Elder 	 */
5026503e0b6SIlya Dryomov 	ceph_con_flag_clear(con, CEPH_CON_F_SOCK_CLOSED);
5038007b8d6SSage Weil 
504ce2c8903SAlex Elder 	con_sock_state_closed(con);
5053d14c5d2SYehuda Sadeh 	return rc;
5063d14c5d2SYehuda Sadeh }
5073d14c5d2SYehuda Sadeh 
ceph_con_reset_protocol(struct ceph_connection * con)5083596f4c1SIlya Dryomov static void ceph_con_reset_protocol(struct ceph_connection *con)
5093596f4c1SIlya Dryomov {
5103596f4c1SIlya Dryomov 	dout("%s con %p\n", __func__, con);
5113596f4c1SIlya Dryomov 
5126503e0b6SIlya Dryomov 	ceph_con_close_socket(con);
5133596f4c1SIlya Dryomov 	if (con->in_msg) {
5143596f4c1SIlya Dryomov 		WARN_ON(con->in_msg->con != con);
5153596f4c1SIlya Dryomov 		ceph_msg_put(con->in_msg);
5163596f4c1SIlya Dryomov 		con->in_msg = NULL;
5173596f4c1SIlya Dryomov 	}
5183596f4c1SIlya Dryomov 	if (con->out_msg) {
5193596f4c1SIlya Dryomov 		WARN_ON(con->out_msg->con != con);
5203596f4c1SIlya Dryomov 		ceph_msg_put(con->out_msg);
5213596f4c1SIlya Dryomov 		con->out_msg = NULL;
5223596f4c1SIlya Dryomov 	}
523038b8d1dSIlya Dryomov 	if (con->bounce_page) {
524038b8d1dSIlya Dryomov 		__free_page(con->bounce_page);
525038b8d1dSIlya Dryomov 		con->bounce_page = NULL;
526038b8d1dSIlya Dryomov 	}
5273596f4c1SIlya Dryomov 
528cd1a677cSIlya Dryomov 	if (ceph_msgr2(from_msgr(con->msgr)))
529cd1a677cSIlya Dryomov 		ceph_con_v2_reset_protocol(con);
530cd1a677cSIlya Dryomov 	else
531566050e1SIlya Dryomov 		ceph_con_v1_reset_protocol(con);
5323596f4c1SIlya Dryomov }
5333596f4c1SIlya Dryomov 
5343d14c5d2SYehuda Sadeh /*
5353d14c5d2SYehuda Sadeh  * Reset a connection.  Discard all incoming and outgoing messages
5363d14c5d2SYehuda Sadeh  * and clear *_seq state.
5373d14c5d2SYehuda Sadeh  */
ceph_msg_remove(struct ceph_msg * msg)5383d14c5d2SYehuda Sadeh static void ceph_msg_remove(struct ceph_msg *msg)
5393d14c5d2SYehuda Sadeh {
5403d14c5d2SYehuda Sadeh 	list_del_init(&msg->list_head);
54138941f80SAlex Elder 
5423d14c5d2SYehuda Sadeh 	ceph_msg_put(msg);
5433d14c5d2SYehuda Sadeh }
544cd1a677cSIlya Dryomov 
ceph_msg_remove_list(struct list_head * head)5453d14c5d2SYehuda Sadeh static void ceph_msg_remove_list(struct list_head *head)
5463d14c5d2SYehuda Sadeh {
5473d14c5d2SYehuda Sadeh 	while (!list_empty(head)) {
5483d14c5d2SYehuda Sadeh 		struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
5493d14c5d2SYehuda Sadeh 							list_head);
5503d14c5d2SYehuda Sadeh 		ceph_msg_remove(msg);
5513d14c5d2SYehuda Sadeh 	}
5523d14c5d2SYehuda Sadeh }
5533d14c5d2SYehuda Sadeh 
ceph_con_reset_session(struct ceph_connection * con)5546503e0b6SIlya Dryomov void ceph_con_reset_session(struct ceph_connection *con)
5553d14c5d2SYehuda Sadeh {
5565963c3d0SIlya Dryomov 	dout("%s con %p\n", __func__, con);
5573596f4c1SIlya Dryomov 
5583596f4c1SIlya Dryomov 	WARN_ON(con->in_msg);
5593596f4c1SIlya Dryomov 	WARN_ON(con->out_msg);
5603d14c5d2SYehuda Sadeh 	ceph_msg_remove_list(&con->out_queue);
5613d14c5d2SYehuda Sadeh 	ceph_msg_remove_list(&con->out_sent);
5623d14c5d2SYehuda Sadeh 	con->out_seq = 0;
5633d14c5d2SYehuda Sadeh 	con->in_seq = 0;
5643d14c5d2SYehuda Sadeh 	con->in_seq_acked = 0;
565a3da057bSIlya Dryomov 
566cd1a677cSIlya Dryomov 	if (ceph_msgr2(from_msgr(con->msgr)))
567cd1a677cSIlya Dryomov 		ceph_con_v2_reset_session(con);
568cd1a677cSIlya Dryomov 	else
569566050e1SIlya Dryomov 		ceph_con_v1_reset_session(con);
5703d14c5d2SYehuda Sadeh }
5713d14c5d2SYehuda Sadeh 
5723d14c5d2SYehuda Sadeh /*
5733d14c5d2SYehuda Sadeh  * mark a peer down.  drop any open connections.
5743d14c5d2SYehuda Sadeh  */
ceph_con_close(struct ceph_connection * con)5753d14c5d2SYehuda Sadeh void ceph_con_close(struct ceph_connection *con)
5763d14c5d2SYehuda Sadeh {
5778c50c817SSage Weil 	mutex_lock(&con->mutex);
578b726ec97SJeff Layton 	dout("con_close %p peer %s\n", con, ceph_pr_addr(&con->peer_addr));
5796d7f62bfSIlya Dryomov 	con->state = CEPH_CON_S_CLOSED;
580a5988c49SAlex Elder 
5816503e0b6SIlya Dryomov 	ceph_con_flag_clear(con, CEPH_CON_F_LOSSYTX);  /* so we retry next
5826503e0b6SIlya Dryomov 							  connect */
5836503e0b6SIlya Dryomov 	ceph_con_flag_clear(con, CEPH_CON_F_KEEPALIVE_PENDING);
5846503e0b6SIlya Dryomov 	ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING);
5856503e0b6SIlya Dryomov 	ceph_con_flag_clear(con, CEPH_CON_F_BACKOFF);
586a5988c49SAlex Elder 
5873596f4c1SIlya Dryomov 	ceph_con_reset_protocol(con);
5885963c3d0SIlya Dryomov 	ceph_con_reset_session(con);
58937ab77acSIlya Dryomov 	cancel_con(con);
5903d14c5d2SYehuda Sadeh 	mutex_unlock(&con->mutex);
5913d14c5d2SYehuda Sadeh }
5923d14c5d2SYehuda Sadeh EXPORT_SYMBOL(ceph_con_close);
5933d14c5d2SYehuda Sadeh 
5943d14c5d2SYehuda Sadeh /*
5953d14c5d2SYehuda Sadeh  * Reopen a closed connection, with a new peer address.
5963d14c5d2SYehuda Sadeh  */
ceph_con_open(struct ceph_connection * con,__u8 entity_type,__u64 entity_num,struct ceph_entity_addr * addr)597b7a9e5ddSSage Weil void ceph_con_open(struct ceph_connection *con,
598b7a9e5ddSSage Weil 		   __u8 entity_type, __u64 entity_num,
599b7a9e5ddSSage Weil 		   struct ceph_entity_addr *addr)
6003d14c5d2SYehuda Sadeh {
6015469155fSSage Weil 	mutex_lock(&con->mutex);
602b726ec97SJeff Layton 	dout("con_open %p %s\n", con, ceph_pr_addr(addr));
6038dacc7daSSage Weil 
6046d7f62bfSIlya Dryomov 	WARN_ON(con->state != CEPH_CON_S_CLOSED);
6056d7f62bfSIlya Dryomov 	con->state = CEPH_CON_S_PREOPEN;
606a5988c49SAlex Elder 
607b7a9e5ddSSage Weil 	con->peer_name.type = (__u8) entity_type;
608b7a9e5ddSSage Weil 	con->peer_name.num = cpu_to_le64(entity_num);
609b7a9e5ddSSage Weil 
6103d14c5d2SYehuda Sadeh 	memcpy(&con->peer_addr, addr, sizeof(*addr));
6113d14c5d2SYehuda Sadeh 	con->delay = 0;      /* reset backoff memory */
6125469155fSSage Weil 	mutex_unlock(&con->mutex);
6133d14c5d2SYehuda Sadeh 	queue_con(con);
6143d14c5d2SYehuda Sadeh }
6153d14c5d2SYehuda Sadeh EXPORT_SYMBOL(ceph_con_open);
6163d14c5d2SYehuda Sadeh 
6173d14c5d2SYehuda Sadeh /*
6183d14c5d2SYehuda Sadeh  * return true if this connection ever successfully opened
6193d14c5d2SYehuda Sadeh  */
ceph_con_opened(struct ceph_connection * con)6203d14c5d2SYehuda Sadeh bool ceph_con_opened(struct ceph_connection *con)
6213d14c5d2SYehuda Sadeh {
622cd1a677cSIlya Dryomov 	if (ceph_msgr2(from_msgr(con->msgr)))
623cd1a677cSIlya Dryomov 		return ceph_con_v2_opened(con);
624cd1a677cSIlya Dryomov 
625566050e1SIlya Dryomov 	return ceph_con_v1_opened(con);
6263d14c5d2SYehuda Sadeh }
6273d14c5d2SYehuda Sadeh 
6283d14c5d2SYehuda Sadeh /*
6293d14c5d2SYehuda Sadeh  * initialize a new connection.
6303d14c5d2SYehuda Sadeh  */
ceph_con_init(struct ceph_connection * con,void * private,const struct ceph_connection_operations * ops,struct ceph_messenger * msgr)6311bfd89f4SAlex Elder void ceph_con_init(struct ceph_connection *con, void *private,
6321bfd89f4SAlex Elder 	const struct ceph_connection_operations *ops,
633b7a9e5ddSSage Weil 	struct ceph_messenger *msgr)
6343d14c5d2SYehuda Sadeh {
6353d14c5d2SYehuda Sadeh 	dout("con_init %p\n", con);
6363d14c5d2SYehuda Sadeh 	memset(con, 0, sizeof(*con));
6371bfd89f4SAlex Elder 	con->private = private;
6381bfd89f4SAlex Elder 	con->ops = ops;
6393d14c5d2SYehuda Sadeh 	con->msgr = msgr;
640ce2c8903SAlex Elder 
641ce2c8903SAlex Elder 	con_sock_state_init(con);
642ce2c8903SAlex Elder 
6433d14c5d2SYehuda Sadeh 	mutex_init(&con->mutex);
6443d14c5d2SYehuda Sadeh 	INIT_LIST_HEAD(&con->out_queue);
6453d14c5d2SYehuda Sadeh 	INIT_LIST_HEAD(&con->out_sent);
64668931622SIlya Dryomov 	INIT_DELAYED_WORK(&con->work, ceph_con_workfn);
647a5988c49SAlex Elder 
6486d7f62bfSIlya Dryomov 	con->state = CEPH_CON_S_CLOSED;
6493d14c5d2SYehuda Sadeh }
6503d14c5d2SYehuda Sadeh EXPORT_SYMBOL(ceph_con_init);
6513d14c5d2SYehuda Sadeh 
6523d14c5d2SYehuda Sadeh /*
6533d14c5d2SYehuda Sadeh  * We maintain a global counter to order connection attempts.  Get
6543d14c5d2SYehuda Sadeh  * a unique seq greater than @gt.
6553d14c5d2SYehuda Sadeh  */
ceph_get_global_seq(struct ceph_messenger * msgr,u32 gt)6566503e0b6SIlya Dryomov u32 ceph_get_global_seq(struct ceph_messenger *msgr, u32 gt)
6573d14c5d2SYehuda Sadeh {
6583d14c5d2SYehuda Sadeh 	u32 ret;
6593d14c5d2SYehuda Sadeh 
6603d14c5d2SYehuda Sadeh 	spin_lock(&msgr->global_seq_lock);
6613d14c5d2SYehuda Sadeh 	if (msgr->global_seq < gt)
6623d14c5d2SYehuda Sadeh 		msgr->global_seq = gt;
6633d14c5d2SYehuda Sadeh 	ret = ++msgr->global_seq;
6643d14c5d2SYehuda Sadeh 	spin_unlock(&msgr->global_seq_lock);
6653d14c5d2SYehuda Sadeh 	return ret;
6663d14c5d2SYehuda Sadeh }
6673d14c5d2SYehuda Sadeh 
66802471928SIlya Dryomov /*
66902471928SIlya Dryomov  * Discard messages that have been acked by the server.
67002471928SIlya Dryomov  */
ceph_con_discard_sent(struct ceph_connection * con,u64 ack_seq)6716503e0b6SIlya Dryomov void ceph_con_discard_sent(struct ceph_connection *con, u64 ack_seq)
67202471928SIlya Dryomov {
67302471928SIlya Dryomov 	struct ceph_msg *msg;
67402471928SIlya Dryomov 	u64 seq;
67502471928SIlya Dryomov 
67602471928SIlya Dryomov 	dout("%s con %p ack_seq %llu\n", __func__, con, ack_seq);
67702471928SIlya Dryomov 	while (!list_empty(&con->out_sent)) {
67802471928SIlya Dryomov 		msg = list_first_entry(&con->out_sent, struct ceph_msg,
67902471928SIlya Dryomov 				       list_head);
68002471928SIlya Dryomov 		WARN_ON(msg->needs_out_seq);
68102471928SIlya Dryomov 		seq = le64_to_cpu(msg->hdr.seq);
68202471928SIlya Dryomov 		if (seq > ack_seq)
68302471928SIlya Dryomov 			break;
68402471928SIlya Dryomov 
68502471928SIlya Dryomov 		dout("%s con %p discarding msg %p seq %llu\n", __func__, con,
68602471928SIlya Dryomov 		     msg, seq);
68702471928SIlya Dryomov 		ceph_msg_remove(msg);
68802471928SIlya Dryomov 	}
68902471928SIlya Dryomov }
69002471928SIlya Dryomov 
69102471928SIlya Dryomov /*
69202471928SIlya Dryomov  * Discard messages that have been requeued in con_fault(), up to
69302471928SIlya Dryomov  * reconnect_seq.  This avoids gratuitously resending messages that
69402471928SIlya Dryomov  * the server had received and handled prior to reconnect.
69502471928SIlya Dryomov  */
ceph_con_discard_requeued(struct ceph_connection * con,u64 reconnect_seq)6966503e0b6SIlya Dryomov void ceph_con_discard_requeued(struct ceph_connection *con, u64 reconnect_seq)
69702471928SIlya Dryomov {
69802471928SIlya Dryomov 	struct ceph_msg *msg;
69902471928SIlya Dryomov 	u64 seq;
70002471928SIlya Dryomov 
70102471928SIlya Dryomov 	dout("%s con %p reconnect_seq %llu\n", __func__, con, reconnect_seq);
70202471928SIlya Dryomov 	while (!list_empty(&con->out_queue)) {
70302471928SIlya Dryomov 		msg = list_first_entry(&con->out_queue, struct ceph_msg,
70402471928SIlya Dryomov 				       list_head);
70502471928SIlya Dryomov 		if (msg->needs_out_seq)
70602471928SIlya Dryomov 			break;
70702471928SIlya Dryomov 		seq = le64_to_cpu(msg->hdr.seq);
70802471928SIlya Dryomov 		if (seq > reconnect_seq)
70902471928SIlya Dryomov 			break;
71002471928SIlya Dryomov 
71102471928SIlya Dryomov 		dout("%s con %p discarding msg %p seq %llu\n", __func__, con,
71202471928SIlya Dryomov 		     msg, seq);
71302471928SIlya Dryomov 		ceph_msg_remove(msg);
71402471928SIlya Dryomov 	}
71502471928SIlya Dryomov }
71602471928SIlya Dryomov 
717df6ad1f9SAlex Elder #ifdef CONFIG_BLOCK
7186aaa4511SAlex Elder 
7196aaa4511SAlex Elder /*
7206aaa4511SAlex Elder  * For a bio data item, a piece is whatever remains of the next
7216aaa4511SAlex Elder  * entry in the current bio iovec, or the first entry in the next
7226aaa4511SAlex Elder  * bio in the list.
7236aaa4511SAlex Elder  */
ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor * cursor,size_t length)7248ae4f4f5SAlex Elder static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor,
72525aff7c5SAlex Elder 					size_t length)
7266aaa4511SAlex Elder {
7278ae4f4f5SAlex Elder 	struct ceph_msg_data *data = cursor->data;
7285359a17dSIlya Dryomov 	struct ceph_bio_iter *it = &cursor->bio_iter;
7296aaa4511SAlex Elder 
7305359a17dSIlya Dryomov 	cursor->resid = min_t(size_t, length, data->bio_length);
7315359a17dSIlya Dryomov 	*it = data->bio_pos;
7325359a17dSIlya Dryomov 	if (cursor->resid < it->iter.bi_size)
7335359a17dSIlya Dryomov 		it->iter.bi_size = cursor->resid;
7346aaa4511SAlex Elder 
7355359a17dSIlya Dryomov 	BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter));
7366aaa4511SAlex Elder }
7376aaa4511SAlex Elder 
/*
 * Report the bio segment at the cursor's current position: its page is
 * returned, its offset and length go to @page_offset and @length.
 */
static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor,
						size_t *page_offset,
						size_t *length)
{
	struct ceph_bio_iter *pos = &cursor->bio_iter;
	struct bio_vec seg = bio_iter_iovec(pos->bio, pos->iter);

	*length = seg.bv_len;
	*page_offset = seg.bv_offset;

	return seg.bv_page;
}
7496aaa4511SAlex Elder 
ceph_msg_data_bio_advance(struct ceph_msg_data_cursor * cursor,size_t bytes)7508ae4f4f5SAlex Elder static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor,
7518ae4f4f5SAlex Elder 					size_t bytes)
7526aaa4511SAlex Elder {
7535359a17dSIlya Dryomov 	struct ceph_bio_iter *it = &cursor->bio_iter;
754187df763SIlya Dryomov 	struct page *page = bio_iter_page(it->bio, it->iter);
7556aaa4511SAlex Elder 
7565359a17dSIlya Dryomov 	BUG_ON(bytes > cursor->resid);
7575359a17dSIlya Dryomov 	BUG_ON(bytes > bio_iter_len(it->bio, it->iter));
75825aff7c5SAlex Elder 	cursor->resid -= bytes;
7595359a17dSIlya Dryomov 	bio_advance_iter(it->bio, &it->iter, bytes);
760f38a5181SKent Overstreet 
761da4ab869SJeff Layton 	if (!cursor->resid)
7625359a17dSIlya Dryomov 		return false;   /* no more data */
763f38a5181SKent Overstreet 
764187df763SIlya Dryomov 	if (!bytes || (it->iter.bi_size && it->iter.bi_bvec_done &&
765187df763SIlya Dryomov 		       page == bio_iter_page(it->bio, it->iter)))
7666aaa4511SAlex Elder 		return false;	/* more bytes to process in this segment */
7676aaa4511SAlex Elder 
7685359a17dSIlya Dryomov 	if (!it->iter.bi_size) {
7695359a17dSIlya Dryomov 		it->bio = it->bio->bi_next;
7705359a17dSIlya Dryomov 		it->iter = it->bio->bi_iter;
7715359a17dSIlya Dryomov 		if (cursor->resid < it->iter.bi_size)
7725359a17dSIlya Dryomov 			it->iter.bi_size = cursor->resid;
7730ec1d15eSIlya Dryomov 	}
7746aaa4511SAlex Elder 
7755359a17dSIlya Dryomov 	BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter));
7766aaa4511SAlex Elder 	return true;
7776aaa4511SAlex Elder }
778ea96571fSAlex Elder #endif /* CONFIG_BLOCK */
779df6ad1f9SAlex Elder 
/*
 * Set up the cursor to cover at most @length bytes of a bio_vec array
 * item, starting from the item's own iterator position.
 */
static void ceph_msg_data_bvecs_cursor_init(struct ceph_msg_data_cursor *cursor,
					size_t length)
{
	struct ceph_msg_data *item = cursor->data;
	struct bio_vec *vecs = item->bvec_pos.bvecs;

	cursor->resid = min_t(size_t, length, item->bvec_pos.iter.bi_size);

	/* Copy the item's iterator, then limit it to resid bytes. */
	cursor->bvec_iter = item->bvec_pos.iter;
	cursor->bvec_iter.bi_size = cursor->resid;

	BUG_ON(cursor->resid < bvec_iter_len(vecs, cursor->bvec_iter));
}
792b9e281c2SIlya Dryomov 
/*
 * Report the bio_vec segment at the cursor's current position: its
 * page is returned, its offset and length go to @page_offset and
 * @length.
 */
static struct page *ceph_msg_data_bvecs_next(struct ceph_msg_data_cursor *cursor,
						size_t *page_offset,
						size_t *length)
{
	struct bio_vec *vecs = cursor->data->bvec_pos.bvecs;
	struct bio_vec seg = bvec_iter_bvec(vecs, cursor->bvec_iter);

	*length = seg.bv_len;
	*page_offset = seg.bv_offset;

	return seg.bv_page;
}
804b9e281c2SIlya Dryomov 
ceph_msg_data_bvecs_advance(struct ceph_msg_data_cursor * cursor,size_t bytes)805b9e281c2SIlya Dryomov static bool ceph_msg_data_bvecs_advance(struct ceph_msg_data_cursor *cursor,
806b9e281c2SIlya Dryomov 					size_t bytes)
807b9e281c2SIlya Dryomov {
808b9e281c2SIlya Dryomov 	struct bio_vec *bvecs = cursor->data->bvec_pos.bvecs;
809187df763SIlya Dryomov 	struct page *page = bvec_iter_page(bvecs, cursor->bvec_iter);
810b9e281c2SIlya Dryomov 
811b9e281c2SIlya Dryomov 	BUG_ON(bytes > cursor->resid);
812b9e281c2SIlya Dryomov 	BUG_ON(bytes > bvec_iter_len(bvecs, cursor->bvec_iter));
813b9e281c2SIlya Dryomov 	cursor->resid -= bytes;
814b9e281c2SIlya Dryomov 	bvec_iter_advance(bvecs, &cursor->bvec_iter, bytes);
815b9e281c2SIlya Dryomov 
816da4ab869SJeff Layton 	if (!cursor->resid)
817b9e281c2SIlya Dryomov 		return false;   /* no more data */
818b9e281c2SIlya Dryomov 
819187df763SIlya Dryomov 	if (!bytes || (cursor->bvec_iter.bi_bvec_done &&
820187df763SIlya Dryomov 		       page == bvec_iter_page(bvecs, cursor->bvec_iter)))
821b9e281c2SIlya Dryomov 		return false;	/* more bytes to process in this segment */
822b9e281c2SIlya Dryomov 
823b9e281c2SIlya Dryomov 	BUG_ON(cursor->resid < bvec_iter_len(bvecs, cursor->bvec_iter));
824b9e281c2SIlya Dryomov 	return true;
825b9e281c2SIlya Dryomov }
826b9e281c2SIlya Dryomov 
827fe38a2b6SAlex Elder /*
828e766d7b5SAlex Elder  * For a page array, a piece comes from the first page in the array
829e766d7b5SAlex Elder  * that has not already been fully consumed.
830e766d7b5SAlex Elder  */
ceph_msg_data_pages_cursor_init(struct ceph_msg_data_cursor * cursor,size_t length)8318ae4f4f5SAlex Elder static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data_cursor *cursor,
83225aff7c5SAlex Elder 					size_t length)
833e766d7b5SAlex Elder {
8348ae4f4f5SAlex Elder 	struct ceph_msg_data *data = cursor->data;
835e766d7b5SAlex Elder 	int page_count;
836e766d7b5SAlex Elder 
837e766d7b5SAlex Elder 	BUG_ON(data->type != CEPH_MSG_DATA_PAGES);
838e766d7b5SAlex Elder 
839e766d7b5SAlex Elder 	BUG_ON(!data->pages);
840e766d7b5SAlex Elder 	BUG_ON(!data->length);
841e766d7b5SAlex Elder 
842ca8b3a69SAlex Elder 	cursor->resid = min(length, data->length);
843e766d7b5SAlex Elder 	page_count = calc_pages_for(data->alignment, (u64)data->length);
844e766d7b5SAlex Elder 	cursor->page_offset = data->alignment & ~PAGE_MASK;
845e766d7b5SAlex Elder 	cursor->page_index = 0;
84625aff7c5SAlex Elder 	BUG_ON(page_count > (int)USHRT_MAX);
847e766d7b5SAlex Elder 	cursor->page_count = (unsigned short)page_count;
84856fc5659SAlex Elder 	BUG_ON(length > SIZE_MAX - cursor->page_offset);
849e766d7b5SAlex Elder }
850e766d7b5SAlex Elder 
8518ae4f4f5SAlex Elder static struct page *
ceph_msg_data_pages_next(struct ceph_msg_data_cursor * cursor,size_t * page_offset,size_t * length)8528ae4f4f5SAlex Elder ceph_msg_data_pages_next(struct ceph_msg_data_cursor *cursor,
8538ae4f4f5SAlex Elder 					size_t *page_offset, size_t *length)
854e766d7b5SAlex Elder {
8558ae4f4f5SAlex Elder 	struct ceph_msg_data *data = cursor->data;
856e766d7b5SAlex Elder 
857e766d7b5SAlex Elder 	BUG_ON(data->type != CEPH_MSG_DATA_PAGES);
858e766d7b5SAlex Elder 
859e766d7b5SAlex Elder 	BUG_ON(cursor->page_index >= cursor->page_count);
860e766d7b5SAlex Elder 	BUG_ON(cursor->page_offset >= PAGE_SIZE);
861e766d7b5SAlex Elder 
862e766d7b5SAlex Elder 	*page_offset = cursor->page_offset;
863da4ab869SJeff Layton 	*length = min_t(size_t, cursor->resid, PAGE_SIZE - *page_offset);
864e766d7b5SAlex Elder 	return data->pages[cursor->page_index];
865e766d7b5SAlex Elder }
866e766d7b5SAlex Elder 
ceph_msg_data_pages_advance(struct ceph_msg_data_cursor * cursor,size_t bytes)8678ae4f4f5SAlex Elder static bool ceph_msg_data_pages_advance(struct ceph_msg_data_cursor *cursor,
868e766d7b5SAlex Elder 						size_t bytes)
869e766d7b5SAlex Elder {
8708ae4f4f5SAlex Elder 	BUG_ON(cursor->data->type != CEPH_MSG_DATA_PAGES);
871e766d7b5SAlex Elder 
872e766d7b5SAlex Elder 	BUG_ON(cursor->page_offset + bytes > PAGE_SIZE);
873e766d7b5SAlex Elder 
874e766d7b5SAlex Elder 	/* Advance the cursor page offset */
875e766d7b5SAlex Elder 
876e766d7b5SAlex Elder 	cursor->resid -= bytes;
8775df521b1SAlex Elder 	cursor->page_offset = (cursor->page_offset + bytes) & ~PAGE_MASK;
8785df521b1SAlex Elder 	if (!bytes || cursor->page_offset)
879e766d7b5SAlex Elder 		return false;	/* more bytes to process in the current page */
880e766d7b5SAlex Elder 
881d90deda6SYan, Zheng 	if (!cursor->resid)
882d90deda6SYan, Zheng 		return false;   /* no more data */
883d90deda6SYan, Zheng 
8845df521b1SAlex Elder 	/* Move on to the next page; offset is already at 0 */
885e766d7b5SAlex Elder 
886e766d7b5SAlex Elder 	BUG_ON(cursor->page_index >= cursor->page_count);
887e766d7b5SAlex Elder 	cursor->page_index++;
888e766d7b5SAlex Elder 	return true;
889e766d7b5SAlex Elder }
890e766d7b5SAlex Elder 
891e766d7b5SAlex Elder /*
892dd236fcbSAlex Elder  * For a pagelist, a piece is whatever remains to be consumed in the
893dd236fcbSAlex Elder  * first page in the list, or the front of the next page.
894fe38a2b6SAlex Elder  */
8958ae4f4f5SAlex Elder static void
ceph_msg_data_pagelist_cursor_init(struct ceph_msg_data_cursor * cursor,size_t length)8968ae4f4f5SAlex Elder ceph_msg_data_pagelist_cursor_init(struct ceph_msg_data_cursor *cursor,
89725aff7c5SAlex Elder 					size_t length)
898fe38a2b6SAlex Elder {
8998ae4f4f5SAlex Elder 	struct ceph_msg_data *data = cursor->data;
900fe38a2b6SAlex Elder 	struct ceph_pagelist *pagelist;
901fe38a2b6SAlex Elder 	struct page *page;
902fe38a2b6SAlex Elder 
903dd236fcbSAlex Elder 	BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST);
904fe38a2b6SAlex Elder 
905fe38a2b6SAlex Elder 	pagelist = data->pagelist;
906fe38a2b6SAlex Elder 	BUG_ON(!pagelist);
90725aff7c5SAlex Elder 
90825aff7c5SAlex Elder 	if (!length)
909fe38a2b6SAlex Elder 		return;		/* pagelist can be assigned but empty */
910fe38a2b6SAlex Elder 
911fe38a2b6SAlex Elder 	BUG_ON(list_empty(&pagelist->head));
912fe38a2b6SAlex Elder 	page = list_first_entry(&pagelist->head, struct page, lru);
913fe38a2b6SAlex Elder 
914ca8b3a69SAlex Elder 	cursor->resid = min(length, pagelist->length);
915fe38a2b6SAlex Elder 	cursor->page = page;
916fe38a2b6SAlex Elder 	cursor->offset = 0;
917fe38a2b6SAlex Elder }
918fe38a2b6SAlex Elder 
9198ae4f4f5SAlex Elder static struct page *
ceph_msg_data_pagelist_next(struct ceph_msg_data_cursor * cursor,size_t * page_offset,size_t * length)9208ae4f4f5SAlex Elder ceph_msg_data_pagelist_next(struct ceph_msg_data_cursor *cursor,
9218ae4f4f5SAlex Elder 				size_t *page_offset, size_t *length)
922fe38a2b6SAlex Elder {
9238ae4f4f5SAlex Elder 	struct ceph_msg_data *data = cursor->data;
924fe38a2b6SAlex Elder 	struct ceph_pagelist *pagelist;
925fe38a2b6SAlex Elder 
926fe38a2b6SAlex Elder 	BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST);
927fe38a2b6SAlex Elder 
928fe38a2b6SAlex Elder 	pagelist = data->pagelist;
929fe38a2b6SAlex Elder 	BUG_ON(!pagelist);
930fe38a2b6SAlex Elder 
931fe38a2b6SAlex Elder 	BUG_ON(!cursor->page);
93225aff7c5SAlex Elder 	BUG_ON(cursor->offset + cursor->resid != pagelist->length);
933fe38a2b6SAlex Elder 
9345df521b1SAlex Elder 	/* offset of first page in pagelist is always 0 */
935fe38a2b6SAlex Elder 	*page_offset = cursor->offset & ~PAGE_MASK;
936da4ab869SJeff Layton 	*length = min_t(size_t, cursor->resid, PAGE_SIZE - *page_offset);
9378ae4f4f5SAlex Elder 	return cursor->page;
938fe38a2b6SAlex Elder }
939fe38a2b6SAlex Elder 
ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor * cursor,size_t bytes)9408ae4f4f5SAlex Elder static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor,
941dd236fcbSAlex Elder 						size_t bytes)
942fe38a2b6SAlex Elder {
9438ae4f4f5SAlex Elder 	struct ceph_msg_data *data = cursor->data;
944fe38a2b6SAlex Elder 	struct ceph_pagelist *pagelist;
945fe38a2b6SAlex Elder 
946fe38a2b6SAlex Elder 	BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST);
947fe38a2b6SAlex Elder 
948fe38a2b6SAlex Elder 	pagelist = data->pagelist;
949fe38a2b6SAlex Elder 	BUG_ON(!pagelist);
95025aff7c5SAlex Elder 
95125aff7c5SAlex Elder 	BUG_ON(cursor->offset + cursor->resid != pagelist->length);
952fe38a2b6SAlex Elder 	BUG_ON((cursor->offset & ~PAGE_MASK) + bytes > PAGE_SIZE);
953fe38a2b6SAlex Elder 
954fe38a2b6SAlex Elder 	/* Advance the cursor offset */
955fe38a2b6SAlex Elder 
95625aff7c5SAlex Elder 	cursor->resid -= bytes;
957fe38a2b6SAlex Elder 	cursor->offset += bytes;
9585df521b1SAlex Elder 	/* offset of first page in pagelist is always 0 */
959fe38a2b6SAlex Elder 	if (!bytes || cursor->offset & ~PAGE_MASK)
960fe38a2b6SAlex Elder 		return false;	/* more bytes to process in the current page */
961fe38a2b6SAlex Elder 
962d90deda6SYan, Zheng 	if (!cursor->resid)
963d90deda6SYan, Zheng 		return false;   /* no more data */
964d90deda6SYan, Zheng 
965fe38a2b6SAlex Elder 	/* Move on to the next page */
966fe38a2b6SAlex Elder 
967fe38a2b6SAlex Elder 	BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head));
96817ddc49bSGeliang Tang 	cursor->page = list_next_entry(cursor->page, lru);
969fe38a2b6SAlex Elder 	return true;
970fe38a2b6SAlex Elder }
971fe38a2b6SAlex Elder 
/*
 * Set up the cursor for an iov_iter item: work on a private copy of the
 * item's iterator, truncated to @length, and take resid from what is
 * left in that copy.
 */
static void ceph_msg_data_iter_cursor_init(struct ceph_msg_data_cursor *cursor,
					   size_t length)
{
	struct iov_iter *iter = &cursor->iov_iter;

	*iter = cursor->data->iter;
	iov_iter_truncate(iter, length);

	cursor->lastlen = 0;
	cursor->resid = iov_iter_count(iter);
}
982dee0c5f8SJeff Layton 
/*
 * Report the current piece of an iov_iter item: fetch a single page
 * from the iterator, return it, and supply its offset and the usable
 * length (capped at cursor->resid).
 */
static struct page *ceph_msg_data_iter_next(struct ceph_msg_data_cursor *cursor,
					    size_t *page_offset, size_t *length)
{
	struct iov_iter *iter = &cursor->iov_iter;
	struct page *pg;
	ssize_t got;

	/* Undo whatever the previous call pulled but was not consumed. */
	if (cursor->lastlen)
		iov_iter_revert(iter, cursor->lastlen);

	got = iov_iter_get_pages2(iter, &pg, PAGE_SIZE, 1, page_offset);
	BUG_ON(got < 0);

	cursor->lastlen = got;

	/*
	 * FIXME: The assumption is that the pages represented by the iov_iter
	 *	  are pinned, with the references held by the upper-level
	 *	  callers, or by virtue of being under writeback. Eventually,
	 *	  we'll get an iov_iter_get_pages2 variant that doesn't take
	 *	  page refs. Until then, just put the page ref.
	 */
	VM_BUG_ON_PAGE(!PageWriteback(pg) && page_count(pg) < 2, pg);
	put_page(pg);

	*length = min_t(size_t, got, cursor->resid);
	return pg;
}
1011dee0c5f8SJeff Layton 
ceph_msg_data_iter_advance(struct ceph_msg_data_cursor * cursor,size_t bytes)1012dee0c5f8SJeff Layton static bool ceph_msg_data_iter_advance(struct ceph_msg_data_cursor *cursor,
1013dee0c5f8SJeff Layton 				       size_t bytes)
1014dee0c5f8SJeff Layton {
1015dee0c5f8SJeff Layton 	BUG_ON(bytes > cursor->resid);
1016dee0c5f8SJeff Layton 	cursor->resid -= bytes;
1017dee0c5f8SJeff Layton 
1018dee0c5f8SJeff Layton 	if (bytes < cursor->lastlen) {
1019dee0c5f8SJeff Layton 		cursor->lastlen -= bytes;
1020dee0c5f8SJeff Layton 	} else {
1021dee0c5f8SJeff Layton 		iov_iter_advance(&cursor->iov_iter, bytes - cursor->lastlen);
1022dee0c5f8SJeff Layton 		cursor->lastlen = 0;
1023dee0c5f8SJeff Layton 	}
1024dee0c5f8SJeff Layton 
1025dee0c5f8SJeff Layton 	return cursor->resid;
1026dee0c5f8SJeff Layton }
1027dee0c5f8SJeff Layton 
1028dd236fcbSAlex Elder /*
1029dd236fcbSAlex Elder  * Message data is handled (sent or received) in pieces, where each
1030dd236fcbSAlex Elder  * piece resides on a single page.  The network layer might not
1031dd236fcbSAlex Elder  * consume an entire piece at once.  A data item's cursor keeps
1032dd236fcbSAlex Elder  * track of which piece is next to process and how much remains to
1033dd236fcbSAlex Elder  * be processed in that piece.  It also tracks whether the current
1034dd236fcbSAlex Elder  * piece is the last one in the data item.
1035dd236fcbSAlex Elder  */
__ceph_msg_data_cursor_init(struct ceph_msg_data_cursor * cursor)1036ca8b3a69SAlex Elder static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor)
1037dd236fcbSAlex Elder {
1038ca8b3a69SAlex Elder 	size_t length = cursor->total_resid;
10398ae4f4f5SAlex Elder 
10408ae4f4f5SAlex Elder 	switch (cursor->data->type) {
1041dd236fcbSAlex Elder 	case CEPH_MSG_DATA_PAGELIST:
10428ae4f4f5SAlex Elder 		ceph_msg_data_pagelist_cursor_init(cursor, length);
1043dd236fcbSAlex Elder 		break;
1044e766d7b5SAlex Elder 	case CEPH_MSG_DATA_PAGES:
10458ae4f4f5SAlex Elder 		ceph_msg_data_pages_cursor_init(cursor, length);
1046e766d7b5SAlex Elder 		break;
1047dd236fcbSAlex Elder #ifdef CONFIG_BLOCK
1048dd236fcbSAlex Elder 	case CEPH_MSG_DATA_BIO:
10498ae4f4f5SAlex Elder 		ceph_msg_data_bio_cursor_init(cursor, length);
10506aaa4511SAlex Elder 		break;
1051dd236fcbSAlex Elder #endif /* CONFIG_BLOCK */
1052b9e281c2SIlya Dryomov 	case CEPH_MSG_DATA_BVECS:
1053b9e281c2SIlya Dryomov 		ceph_msg_data_bvecs_cursor_init(cursor, length);
1054b9e281c2SIlya Dryomov 		break;
1055dee0c5f8SJeff Layton 	case CEPH_MSG_DATA_ITER:
1056dee0c5f8SJeff Layton 		ceph_msg_data_iter_cursor_init(cursor, length);
1057dee0c5f8SJeff Layton 		break;
10586aaa4511SAlex Elder 	case CEPH_MSG_DATA_NONE:
1059dd236fcbSAlex Elder 	default:
1060dd236fcbSAlex Elder 		/* BUG(); */
1061dd236fcbSAlex Elder 		break;
1062dd236fcbSAlex Elder 	}
10638ae4f4f5SAlex Elder 	cursor->need_crc = true;
1064dd236fcbSAlex Elder }
1065dd236fcbSAlex Elder 
/*
 * Point @cursor at the first data item of @msg and prepare it to walk
 * @length bytes in total.
 *
 * @length must be non-zero and no larger than msg->data_length, and
 * @msg must have at least one data item; anything else is a BUG.
 */
void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor,
			       struct ceph_msg *msg, size_t length)
{
	BUG_ON(!length);
	BUG_ON(length > msg->data_length);
	BUG_ON(!msg->num_data_items);

	cursor->data = msg->data;
	cursor->total_resid = length;
	cursor->sr_resid = 0;

	__ceph_msg_data_cursor_init(cursor);
}
1079ca8b3a69SAlex Elder 
1080dd236fcbSAlex Elder /*
1081dd236fcbSAlex Elder  * Return the page containing the next piece to process for a given
1082dd236fcbSAlex Elder  * data item, and supply the page offset and length of that piece.
1083dd236fcbSAlex Elder  * Indicate whether this is the last piece in this data item.
1084dd236fcbSAlex Elder  */
ceph_msg_data_next(struct ceph_msg_data_cursor * cursor,size_t * page_offset,size_t * length)10856503e0b6SIlya Dryomov struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor,
1086da4ab869SJeff Layton 				size_t *page_offset, size_t *length)
1087dd236fcbSAlex Elder {
1088dd236fcbSAlex Elder 	struct page *page;
1089dd236fcbSAlex Elder 
10908ae4f4f5SAlex Elder 	switch (cursor->data->type) {
1091dd236fcbSAlex Elder 	case CEPH_MSG_DATA_PAGELIST:
10928ae4f4f5SAlex Elder 		page = ceph_msg_data_pagelist_next(cursor, page_offset, length);
1093dd236fcbSAlex Elder 		break;
1094e766d7b5SAlex Elder 	case CEPH_MSG_DATA_PAGES:
10958ae4f4f5SAlex Elder 		page = ceph_msg_data_pages_next(cursor, page_offset, length);
1096e766d7b5SAlex Elder 		break;
1097dd236fcbSAlex Elder #ifdef CONFIG_BLOCK
1098dd236fcbSAlex Elder 	case CEPH_MSG_DATA_BIO:
10998ae4f4f5SAlex Elder 		page = ceph_msg_data_bio_next(cursor, page_offset, length);
11006aaa4511SAlex Elder 		break;
1101dd236fcbSAlex Elder #endif /* CONFIG_BLOCK */
1102b9e281c2SIlya Dryomov 	case CEPH_MSG_DATA_BVECS:
1103b9e281c2SIlya Dryomov 		page = ceph_msg_data_bvecs_next(cursor, page_offset, length);
1104b9e281c2SIlya Dryomov 		break;
1105dee0c5f8SJeff Layton 	case CEPH_MSG_DATA_ITER:
1106dee0c5f8SJeff Layton 		page = ceph_msg_data_iter_next(cursor, page_offset, length);
1107dee0c5f8SJeff Layton 		break;
11086aaa4511SAlex Elder 	case CEPH_MSG_DATA_NONE:
1109dd236fcbSAlex Elder 	default:
1110dd236fcbSAlex Elder 		page = NULL;
1111dd236fcbSAlex Elder 		break;
1112dd236fcbSAlex Elder 	}
11135359a17dSIlya Dryomov 
1114dd236fcbSAlex Elder 	BUG_ON(!page);
1115dd236fcbSAlex Elder 	BUG_ON(*page_offset + *length > PAGE_SIZE);
1116dd236fcbSAlex Elder 	BUG_ON(!*length);
11175359a17dSIlya Dryomov 	BUG_ON(*length > cursor->resid);
1118dd236fcbSAlex Elder 
1119dd236fcbSAlex Elder 	return page;
1120dd236fcbSAlex Elder }
1121dd236fcbSAlex Elder 
1122dd236fcbSAlex Elder /*
1123dd236fcbSAlex Elder  * Returns true if the result moves the cursor on to the next piece
1124dd236fcbSAlex Elder  * of the data item.
1125dd236fcbSAlex Elder  */
ceph_msg_data_advance(struct ceph_msg_data_cursor * cursor,size_t bytes)11266503e0b6SIlya Dryomov void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes)
1127dd236fcbSAlex Elder {
1128dd236fcbSAlex Elder 	bool new_piece;
1129dd236fcbSAlex Elder 
113025aff7c5SAlex Elder 	BUG_ON(bytes > cursor->resid);
11318ae4f4f5SAlex Elder 	switch (cursor->data->type) {
1132dd236fcbSAlex Elder 	case CEPH_MSG_DATA_PAGELIST:
11338ae4f4f5SAlex Elder 		new_piece = ceph_msg_data_pagelist_advance(cursor, bytes);
1134dd236fcbSAlex Elder 		break;
1135e766d7b5SAlex Elder 	case CEPH_MSG_DATA_PAGES:
11368ae4f4f5SAlex Elder 		new_piece = ceph_msg_data_pages_advance(cursor, bytes);
1137e766d7b5SAlex Elder 		break;
1138dd236fcbSAlex Elder #ifdef CONFIG_BLOCK
1139dd236fcbSAlex Elder 	case CEPH_MSG_DATA_BIO:
11408ae4f4f5SAlex Elder 		new_piece = ceph_msg_data_bio_advance(cursor, bytes);
11416aaa4511SAlex Elder 		break;
1142dd236fcbSAlex Elder #endif /* CONFIG_BLOCK */
1143b9e281c2SIlya Dryomov 	case CEPH_MSG_DATA_BVECS:
1144b9e281c2SIlya Dryomov 		new_piece = ceph_msg_data_bvecs_advance(cursor, bytes);
1145b9e281c2SIlya Dryomov 		break;
1146dee0c5f8SJeff Layton 	case CEPH_MSG_DATA_ITER:
1147dee0c5f8SJeff Layton 		new_piece = ceph_msg_data_iter_advance(cursor, bytes);
1148dee0c5f8SJeff Layton 		break;
11496aaa4511SAlex Elder 	case CEPH_MSG_DATA_NONE:
1150dd236fcbSAlex Elder 	default:
1151dd236fcbSAlex Elder 		BUG();
1152dd236fcbSAlex Elder 		break;
1153dd236fcbSAlex Elder 	}
1154ca8b3a69SAlex Elder 	cursor->total_resid -= bytes;
1155dd236fcbSAlex Elder 
1156ca8b3a69SAlex Elder 	if (!cursor->resid && cursor->total_resid) {
11570d9c1ab3SIlya Dryomov 		cursor->data++;
1158ca8b3a69SAlex Elder 		__ceph_msg_data_cursor_init(cursor);
1159a51b272eSAlex Elder 		new_piece = true;
1160ca8b3a69SAlex Elder 	}
1161a51b272eSAlex Elder 	cursor->need_crc = new_piece;
1162dd236fcbSAlex Elder }
1163dd236fcbSAlex Elder 
/*
 * Fold @length bytes of @page, starting at @page_offset, into the
 * running crc32c value @crc and return the updated value.
 *
 * The mapping is only needed for the duration of the checksum, so a
 * short-lived local mapping is used instead of the deprecated kmap().
 */
u32 ceph_crc32c_page(u32 crc, struct page *page, unsigned int page_offset,
		     unsigned int length)
{
	char *kaddr;

	kaddr = kmap_local_page(page);
	BUG_ON(kaddr == NULL);
	crc = crc32c(crc, kaddr + page_offset, length);
	/* kunmap_local() takes the mapped address, not the page. */
	kunmap_local(kaddr);

	return crc;
}
11763d14c5d2SYehuda Sadeh 
ceph_addr_is_blank(const struct ceph_entity_addr * addr)11776503e0b6SIlya Dryomov bool ceph_addr_is_blank(const struct ceph_entity_addr *addr)
11783d14c5d2SYehuda Sadeh {
1179cede185bSJeff Layton 	struct sockaddr_storage ss = addr->in_addr; /* align */
1180cede185bSJeff Layton 	struct in_addr *addr4 = &((struct sockaddr_in *)&ss)->sin_addr;
1181cede185bSJeff Layton 	struct in6_addr *addr6 = &((struct sockaddr_in6 *)&ss)->sin6_addr;
1182c44bd69cSIlya Dryomov 
1183cede185bSJeff Layton 	switch (ss.ss_family) {
11843d14c5d2SYehuda Sadeh 	case AF_INET:
1185cede185bSJeff Layton 		return addr4->s_addr == htonl(INADDR_ANY);
11863d14c5d2SYehuda Sadeh 	case AF_INET6:
1187c44bd69cSIlya Dryomov 		return ipv6_addr_any(addr6);
1188c44bd69cSIlya Dryomov 	default:
1189c44bd69cSIlya Dryomov 		return true;
11903d14c5d2SYehuda Sadeh 	}
11913d14c5d2SYehuda Sadeh }
11928ff2c64cSIlya Dryomov EXPORT_SYMBOL(ceph_addr_is_blank);
11933d14c5d2SYehuda Sadeh 
ceph_addr_port(const struct ceph_entity_addr * addr)11946503e0b6SIlya Dryomov int ceph_addr_port(const struct ceph_entity_addr *addr)
11953d14c5d2SYehuda Sadeh {
1196cede185bSJeff Layton 	switch (get_unaligned(&addr->in_addr.ss_family)) {
11973d14c5d2SYehuda Sadeh 	case AF_INET:
1198cede185bSJeff Layton 		return ntohs(get_unaligned(&((struct sockaddr_in *)&addr->in_addr)->sin_port));
11993d14c5d2SYehuda Sadeh 	case AF_INET6:
1200cede185bSJeff Layton 		return ntohs(get_unaligned(&((struct sockaddr_in6 *)&addr->in_addr)->sin6_port));
12013d14c5d2SYehuda Sadeh 	}
12023d14c5d2SYehuda Sadeh 	return 0;
12033d14c5d2SYehuda Sadeh }
12043d14c5d2SYehuda Sadeh 
ceph_addr_set_port(struct ceph_entity_addr * addr,int p)12056503e0b6SIlya Dryomov void ceph_addr_set_port(struct ceph_entity_addr *addr, int p)
12063d14c5d2SYehuda Sadeh {
1207cede185bSJeff Layton 	switch (get_unaligned(&addr->in_addr.ss_family)) {
12083d14c5d2SYehuda Sadeh 	case AF_INET:
1209cede185bSJeff Layton 		put_unaligned(htons(p), &((struct sockaddr_in *)&addr->in_addr)->sin_port);
1210a2a79609SSage Weil 		break;
12113d14c5d2SYehuda Sadeh 	case AF_INET6:
1212cede185bSJeff Layton 		put_unaligned(htons(p), &((struct sockaddr_in6 *)&addr->in_addr)->sin6_port);
1213a2a79609SSage Weil 		break;
12143d14c5d2SYehuda Sadeh 	}
12153d14c5d2SYehuda Sadeh }
12163d14c5d2SYehuda Sadeh 
12173d14c5d2SYehuda Sadeh /*
1218ee3b56f2SNoah Watkins  * Unlike other *_pton function semantics, zero indicates success.
1219ee3b56f2SNoah Watkins  */
ceph_pton(const char * str,size_t len,struct ceph_entity_addr * addr,char delim,const char ** ipend)1220cede185bSJeff Layton static int ceph_pton(const char *str, size_t len, struct ceph_entity_addr *addr,
1221ee3b56f2SNoah Watkins 		char delim, const char **ipend)
1222ee3b56f2SNoah Watkins {
1223cede185bSJeff Layton 	memset(&addr->in_addr, 0, sizeof(addr->in_addr));
1224ee3b56f2SNoah Watkins 
1225cede185bSJeff Layton 	if (in4_pton(str, len, (u8 *)&((struct sockaddr_in *)&addr->in_addr)->sin_addr.s_addr, delim, ipend)) {
1226cede185bSJeff Layton 		put_unaligned(AF_INET, &addr->in_addr.ss_family);
1227ee3b56f2SNoah Watkins 		return 0;
1228ee3b56f2SNoah Watkins 	}
1229ee3b56f2SNoah Watkins 
1230cede185bSJeff Layton 	if (in6_pton(str, len, (u8 *)&((struct sockaddr_in6 *)&addr->in_addr)->sin6_addr.s6_addr, delim, ipend)) {
1231cede185bSJeff Layton 		put_unaligned(AF_INET6, &addr->in_addr.ss_family);
1232ee3b56f2SNoah Watkins 		return 0;
1233ee3b56f2SNoah Watkins 	}
1234ee3b56f2SNoah Watkins 
1235ee3b56f2SNoah Watkins 	return -EINVAL;
1236ee3b56f2SNoah Watkins }
1237ee3b56f2SNoah Watkins 
1238ee3b56f2SNoah Watkins /*
1239ee3b56f2SNoah Watkins  * Extract hostname string and resolve using kernel DNS facility.
1240ee3b56f2SNoah Watkins  */
1241ee3b56f2SNoah Watkins #ifdef CONFIG_CEPH_LIB_USE_DNS_RESOLVER
ceph_dns_resolve_name(const char * name,size_t namelen,struct ceph_entity_addr * addr,char delim,const char ** ipend)1242ee3b56f2SNoah Watkins static int ceph_dns_resolve_name(const char *name, size_t namelen,
1243cede185bSJeff Layton 		struct ceph_entity_addr *addr, char delim, const char **ipend)
1244ee3b56f2SNoah Watkins {
1245ee3b56f2SNoah Watkins 	const char *end, *delim_p;
1246ee3b56f2SNoah Watkins 	char *colon_p, *ip_addr = NULL;
1247ee3b56f2SNoah Watkins 	int ip_len, ret;
1248ee3b56f2SNoah Watkins 
1249ee3b56f2SNoah Watkins 	/*
1250ee3b56f2SNoah Watkins 	 * The end of the hostname occurs immediately preceding the delimiter or
1251ee3b56f2SNoah Watkins 	 * the port marker (':') where the delimiter takes precedence.
1252ee3b56f2SNoah Watkins 	 */
1253ee3b56f2SNoah Watkins 	delim_p = memchr(name, delim, namelen);
1254ee3b56f2SNoah Watkins 	colon_p = memchr(name, ':', namelen);
1255ee3b56f2SNoah Watkins 
1256ee3b56f2SNoah Watkins 	if (delim_p && colon_p)
1257ee3b56f2SNoah Watkins 		end = delim_p < colon_p ? delim_p : colon_p;
1258ee3b56f2SNoah Watkins 	else if (!delim_p && colon_p)
1259ee3b56f2SNoah Watkins 		end = colon_p;
1260ee3b56f2SNoah Watkins 	else {
1261ee3b56f2SNoah Watkins 		end = delim_p;
1262ee3b56f2SNoah Watkins 		if (!end) /* case: hostname:/ */
1263ee3b56f2SNoah Watkins 			end = name + namelen;
1264ee3b56f2SNoah Watkins 	}
1265ee3b56f2SNoah Watkins 
1266ee3b56f2SNoah Watkins 	if (end <= name)
1267ee3b56f2SNoah Watkins 		return -EINVAL;
1268ee3b56f2SNoah Watkins 
1269ee3b56f2SNoah Watkins 	/* do dns_resolve upcall */
1270a58946c1SDavid Howells 	ip_len = dns_query(current->nsproxy->net_ns,
1271a58946c1SDavid Howells 			   NULL, name, end - name, NULL, &ip_addr, NULL, false);
1272ee3b56f2SNoah Watkins 	if (ip_len > 0)
1273cede185bSJeff Layton 		ret = ceph_pton(ip_addr, ip_len, addr, -1, NULL);
1274ee3b56f2SNoah Watkins 	else
1275ee3b56f2SNoah Watkins 		ret = -ESRCH;
1276ee3b56f2SNoah Watkins 
1277ee3b56f2SNoah Watkins 	kfree(ip_addr);
1278ee3b56f2SNoah Watkins 
1279ee3b56f2SNoah Watkins 	*ipend = end;
1280ee3b56f2SNoah Watkins 
1281ee3b56f2SNoah Watkins 	pr_info("resolve '%.*s' (ret=%d): %s\n", (int)(end - name), name,
1282b726ec97SJeff Layton 			ret, ret ? "failed" : ceph_pr_addr(addr));
1283ee3b56f2SNoah Watkins 
1284ee3b56f2SNoah Watkins 	return ret;
1285ee3b56f2SNoah Watkins }
1286ee3b56f2SNoah Watkins #else
ceph_dns_resolve_name(const char * name,size_t namelen,struct ceph_entity_addr * addr,char delim,const char ** ipend)1287ee3b56f2SNoah Watkins static inline int ceph_dns_resolve_name(const char *name, size_t namelen,
1288cede185bSJeff Layton 		struct ceph_entity_addr *addr, char delim, const char **ipend)
1289ee3b56f2SNoah Watkins {
1290ee3b56f2SNoah Watkins 	return -EINVAL;
1291ee3b56f2SNoah Watkins }
1292ee3b56f2SNoah Watkins #endif
1293ee3b56f2SNoah Watkins 
1294ee3b56f2SNoah Watkins /*
1295ee3b56f2SNoah Watkins  * Parse a server name (IP or hostname). If a valid IP address is not found
1296ee3b56f2SNoah Watkins  * then try to extract a hostname to resolve using userspace DNS upcall.
1297ee3b56f2SNoah Watkins  */
ceph_parse_server_name(const char * name,size_t namelen,struct ceph_entity_addr * addr,char delim,const char ** ipend)1298ee3b56f2SNoah Watkins static int ceph_parse_server_name(const char *name, size_t namelen,
1299cede185bSJeff Layton 		struct ceph_entity_addr *addr, char delim, const char **ipend)
1300ee3b56f2SNoah Watkins {
1301ee3b56f2SNoah Watkins 	int ret;
1302ee3b56f2SNoah Watkins 
1303cede185bSJeff Layton 	ret = ceph_pton(name, namelen, addr, delim, ipend);
1304ee3b56f2SNoah Watkins 	if (ret)
1305cede185bSJeff Layton 		ret = ceph_dns_resolve_name(name, namelen, addr, delim, ipend);
1306ee3b56f2SNoah Watkins 
1307ee3b56f2SNoah Watkins 	return ret;
1308ee3b56f2SNoah Watkins }
1309ee3b56f2SNoah Watkins 
1310ee3b56f2SNoah Watkins /*
13113d14c5d2SYehuda Sadeh  * Parse an ip[:port] list into an addr array.  Use the default
13123d14c5d2SYehuda Sadeh  * monitor port if a port isn't specified.
13133d14c5d2SYehuda Sadeh  */
ceph_parse_ips(const char * c,const char * end,struct ceph_entity_addr * addr,int max_count,int * count,char delim)13143d14c5d2SYehuda Sadeh int ceph_parse_ips(const char *c, const char *end,
13153d14c5d2SYehuda Sadeh 		   struct ceph_entity_addr *addr,
13162d7c86a8SVenky Shankar 		   int max_count, int *count, char delim)
13173d14c5d2SYehuda Sadeh {
1318ee3b56f2SNoah Watkins 	int i, ret = -EINVAL;
13193d14c5d2SYehuda Sadeh 	const char *p = c;
13203d14c5d2SYehuda Sadeh 
13213d14c5d2SYehuda Sadeh 	dout("parse_ips on '%.*s'\n", (int)(end-c), c);
13223d14c5d2SYehuda Sadeh 	for (i = 0; i < max_count; i++) {
13232d7c86a8SVenky Shankar 		char cur_delim = delim;
13243d14c5d2SYehuda Sadeh 		const char *ipend;
13253d14c5d2SYehuda Sadeh 		int port;
13263d14c5d2SYehuda Sadeh 
13273d14c5d2SYehuda Sadeh 		if (*p == '[') {
13282d7c86a8SVenky Shankar 			cur_delim = ']';
13293d14c5d2SYehuda Sadeh 			p++;
13303d14c5d2SYehuda Sadeh 		}
13313d14c5d2SYehuda Sadeh 
13322d7c86a8SVenky Shankar 		ret = ceph_parse_server_name(p, end - p, &addr[i], cur_delim,
13332d7c86a8SVenky Shankar 					     &ipend);
1334ee3b56f2SNoah Watkins 		if (ret)
13353d14c5d2SYehuda Sadeh 			goto bad;
1336ee3b56f2SNoah Watkins 		ret = -EINVAL;
1337ee3b56f2SNoah Watkins 
13383d14c5d2SYehuda Sadeh 		p = ipend;
13393d14c5d2SYehuda Sadeh 
13402d7c86a8SVenky Shankar 		if (cur_delim == ']') {
13413d14c5d2SYehuda Sadeh 			if (*p != ']') {
13423d14c5d2SYehuda Sadeh 				dout("missing matching ']'\n");
13433d14c5d2SYehuda Sadeh 				goto bad;
13443d14c5d2SYehuda Sadeh 			}
13453d14c5d2SYehuda Sadeh 			p++;
13463d14c5d2SYehuda Sadeh 		}
13473d14c5d2SYehuda Sadeh 
13483d14c5d2SYehuda Sadeh 		/* port? */
13493d14c5d2SYehuda Sadeh 		if (p < end && *p == ':') {
13503d14c5d2SYehuda Sadeh 			port = 0;
13513d14c5d2SYehuda Sadeh 			p++;
13523d14c5d2SYehuda Sadeh 			while (p < end && *p >= '0' && *p <= '9') {
13533d14c5d2SYehuda Sadeh 				port = (port * 10) + (*p - '0');
13543d14c5d2SYehuda Sadeh 				p++;
13553d14c5d2SYehuda Sadeh 			}
1356f48db1e9SIlya Dryomov 			if (port == 0)
1357f48db1e9SIlya Dryomov 				port = CEPH_MON_PORT;
1358f48db1e9SIlya Dryomov 			else if (port > 65535)
13593d14c5d2SYehuda Sadeh 				goto bad;
13603d14c5d2SYehuda Sadeh 		} else {
13613d14c5d2SYehuda Sadeh 			port = CEPH_MON_PORT;
13623d14c5d2SYehuda Sadeh 		}
13633d14c5d2SYehuda Sadeh 
13646503e0b6SIlya Dryomov 		ceph_addr_set_port(&addr[i], port);
1365cd1a677cSIlya Dryomov 		/*
1366cd1a677cSIlya Dryomov 		 * We want the type to be set according to ms_mode
1367cd1a677cSIlya Dryomov 		 * option, but options are normally parsed after mon
1368cd1a677cSIlya Dryomov 		 * addresses.  Rather than complicating parsing, set
1369cd1a677cSIlya Dryomov 		 * to LEGACY and override in build_initial_monmap()
1370cd1a677cSIlya Dryomov 		 * for mon addresses and ceph_messenger_init() for
1371cd1a677cSIlya Dryomov 		 * ip option.
1372cd1a677cSIlya Dryomov 		 */
1373d3c3c0a8SJeff Layton 		addr[i].type = CEPH_ENTITY_ADDR_TYPE_LEGACY;
1374cd1a677cSIlya Dryomov 		addr[i].nonce = 0;
13753d14c5d2SYehuda Sadeh 
13762d7c86a8SVenky Shankar 		dout("%s got %s\n", __func__, ceph_pr_addr(&addr[i]));
13773d14c5d2SYehuda Sadeh 
13783d14c5d2SYehuda Sadeh 		if (p == end)
13793d14c5d2SYehuda Sadeh 			break;
13802d7c86a8SVenky Shankar 		if (*p != delim)
13813d14c5d2SYehuda Sadeh 			goto bad;
13823d14c5d2SYehuda Sadeh 		p++;
13833d14c5d2SYehuda Sadeh 	}
13843d14c5d2SYehuda Sadeh 
13853d14c5d2SYehuda Sadeh 	if (p != end)
13863d14c5d2SYehuda Sadeh 		goto bad;
13873d14c5d2SYehuda Sadeh 
13883d14c5d2SYehuda Sadeh 	if (count)
13893d14c5d2SYehuda Sadeh 		*count = i + 1;
13903d14c5d2SYehuda Sadeh 	return 0;
13913d14c5d2SYehuda Sadeh 
13923d14c5d2SYehuda Sadeh bad:
1393ee3b56f2SNoah Watkins 	return ret;
13943d14c5d2SYehuda Sadeh }
13953d14c5d2SYehuda Sadeh 
13963d14c5d2SYehuda Sadeh /*
13973d14c5d2SYehuda Sadeh  * Process message.  This happens in the worker thread.  The callback should
13983d14c5d2SYehuda Sadeh  * be careful not to do anything that waits on other incoming messages or it
13993d14c5d2SYehuda Sadeh  * may deadlock.
14003d14c5d2SYehuda Sadeh  */
ceph_con_process_message(struct ceph_connection * con)14016503e0b6SIlya Dryomov void ceph_con_process_message(struct ceph_connection *con)
14023d14c5d2SYehuda Sadeh {
1403583d0fefSIlya Dryomov 	struct ceph_msg *msg = con->in_msg;
14043d14c5d2SYehuda Sadeh 
140538941f80SAlex Elder 	BUG_ON(con->in_msg->con != con);
14063d14c5d2SYehuda Sadeh 	con->in_msg = NULL;
14073d14c5d2SYehuda Sadeh 
14083d14c5d2SYehuda Sadeh 	/* if first message, set peer_name */
14093d14c5d2SYehuda Sadeh 	if (con->peer_name.type == 0)
14103d14c5d2SYehuda Sadeh 		con->peer_name = msg->hdr.src;
14113d14c5d2SYehuda Sadeh 
14123d14c5d2SYehuda Sadeh 	con->in_seq++;
14133d14c5d2SYehuda Sadeh 	mutex_unlock(&con->mutex);
14143d14c5d2SYehuda Sadeh 
1415b77f8f0eSIlya Dryomov 	dout("===== %p %llu from %s%lld %d=%s len %d+%d+%d (%u %u %u) =====\n",
14163d14c5d2SYehuda Sadeh 	     msg, le64_to_cpu(msg->hdr.seq),
14173d14c5d2SYehuda Sadeh 	     ENTITY_NAME(msg->hdr.src),
14183d14c5d2SYehuda Sadeh 	     le16_to_cpu(msg->hdr.type),
14193d14c5d2SYehuda Sadeh 	     ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
14203d14c5d2SYehuda Sadeh 	     le32_to_cpu(msg->hdr.front_len),
1421b77f8f0eSIlya Dryomov 	     le32_to_cpu(msg->hdr.middle_len),
14223d14c5d2SYehuda Sadeh 	     le32_to_cpu(msg->hdr.data_len),
14233d14c5d2SYehuda Sadeh 	     con->in_front_crc, con->in_middle_crc, con->in_data_crc);
14243d14c5d2SYehuda Sadeh 	con->ops->dispatch(con, msg);
14253d14c5d2SYehuda Sadeh 
14263d14c5d2SYehuda Sadeh 	mutex_lock(&con->mutex);
14273d14c5d2SYehuda Sadeh }
14283d14c5d2SYehuda Sadeh 
14293d14c5d2SYehuda Sadeh /*
1430802c6d96SAlex Elder  * Atomically queue work on a connection after the specified delay.
1431802c6d96SAlex Elder  * Bump @con reference to avoid races with connection teardown.
1432802c6d96SAlex Elder  * Returns 0 if work was queued, or an error code otherwise.
14333d14c5d2SYehuda Sadeh  */
queue_con_delay(struct ceph_connection * con,unsigned long delay)1434802c6d96SAlex Elder static int queue_con_delay(struct ceph_connection *con, unsigned long delay)
14353d14c5d2SYehuda Sadeh {
14363d14c5d2SYehuda Sadeh 	if (!con->ops->get(con)) {
1437802c6d96SAlex Elder 		dout("%s %p ref count 0\n", __func__, con);
1438802c6d96SAlex Elder 		return -ENOENT;
14393d14c5d2SYehuda Sadeh 	}
14403d14c5d2SYehuda Sadeh 
1441418af5b3SIlya Dryomov 	if (delay >= HZ)
1442418af5b3SIlya Dryomov 		delay = round_jiffies_relative(delay);
1443418af5b3SIlya Dryomov 
14445a5036c8SIlya Dryomov 	dout("%s %p %lu\n", __func__, con, delay);
1445802c6d96SAlex Elder 	if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) {
1446802c6d96SAlex Elder 		dout("%s %p - already queued\n", __func__, con);
14473d14c5d2SYehuda Sadeh 		con->ops->put(con);
1448802c6d96SAlex Elder 		return -EBUSY;
14493d14c5d2SYehuda Sadeh 	}
1450802c6d96SAlex Elder 
1451802c6d96SAlex Elder 	return 0;
1452802c6d96SAlex Elder }
1453802c6d96SAlex Elder 
/*
 * Queue work on @con with no delay.  The outcome of the queueing
 * attempt is deliberately ignored.
 */
static void queue_con(struct ceph_connection *con)
{
	queue_con_delay(con, 0);
}
14583d14c5d2SYehuda Sadeh 
cancel_con(struct ceph_connection * con)145937ab77acSIlya Dryomov static void cancel_con(struct ceph_connection *con)
146037ab77acSIlya Dryomov {
146137ab77acSIlya Dryomov 	if (cancel_delayed_work(&con->work)) {
146237ab77acSIlya Dryomov 		dout("%s %p\n", __func__, con);
146337ab77acSIlya Dryomov 		con->ops->put(con);
146437ab77acSIlya Dryomov 	}
146537ab77acSIlya Dryomov }
146637ab77acSIlya Dryomov 
con_sock_closed(struct ceph_connection * con)14677bb21d68SAlex Elder static bool con_sock_closed(struct ceph_connection *con)
14687bb21d68SAlex Elder {
14696503e0b6SIlya Dryomov 	if (!ceph_con_flag_test_and_clear(con, CEPH_CON_F_SOCK_CLOSED))
14707bb21d68SAlex Elder 		return false;
14717bb21d68SAlex Elder 
14727bb21d68SAlex Elder #define CASE(x)								\
14736d7f62bfSIlya Dryomov 	case CEPH_CON_S_ ## x:						\
14747bb21d68SAlex Elder 		con->error_msg = "socket closed (con state " #x ")";	\
14757bb21d68SAlex Elder 		break;
14767bb21d68SAlex Elder 
14777bb21d68SAlex Elder 	switch (con->state) {
14787bb21d68SAlex Elder 	CASE(CLOSED);
14797bb21d68SAlex Elder 	CASE(PREOPEN);
14806d7f62bfSIlya Dryomov 	CASE(V1_BANNER);
14816d7f62bfSIlya Dryomov 	CASE(V1_CONNECT_MSG);
1482cd1a677cSIlya Dryomov 	CASE(V2_BANNER_PREFIX);
1483cd1a677cSIlya Dryomov 	CASE(V2_BANNER_PAYLOAD);
1484cd1a677cSIlya Dryomov 	CASE(V2_HELLO);
1485cd1a677cSIlya Dryomov 	CASE(V2_AUTH);
1486cd1a677cSIlya Dryomov 	CASE(V2_AUTH_SIGNATURE);
1487cd1a677cSIlya Dryomov 	CASE(V2_SESSION_CONNECT);
1488cd1a677cSIlya Dryomov 	CASE(V2_SESSION_RECONNECT);
14897bb21d68SAlex Elder 	CASE(OPEN);
14907bb21d68SAlex Elder 	CASE(STANDBY);
14917bb21d68SAlex Elder 	default:
14927bb21d68SAlex Elder 		BUG();
14937bb21d68SAlex Elder 	}
14947bb21d68SAlex Elder #undef CASE
14957bb21d68SAlex Elder 
14967bb21d68SAlex Elder 	return true;
14977bb21d68SAlex Elder }
14987bb21d68SAlex Elder 
con_backoff(struct ceph_connection * con)1499f20a39fdSAlex Elder static bool con_backoff(struct ceph_connection *con)
1500f20a39fdSAlex Elder {
1501f20a39fdSAlex Elder 	int ret;
1502f20a39fdSAlex Elder 
15036503e0b6SIlya Dryomov 	if (!ceph_con_flag_test_and_clear(con, CEPH_CON_F_BACKOFF))
1504f20a39fdSAlex Elder 		return false;
1505f20a39fdSAlex Elder 
1506418af5b3SIlya Dryomov 	ret = queue_con_delay(con, con->delay);
1507f20a39fdSAlex Elder 	if (ret) {
1508f20a39fdSAlex Elder 		dout("%s: con %p FAILED to back off %lu\n", __func__,
1509f20a39fdSAlex Elder 			con, con->delay);
1510f20a39fdSAlex Elder 		BUG_ON(ret == -ENOENT);
15116503e0b6SIlya Dryomov 		ceph_con_flag_set(con, CEPH_CON_F_BACKOFF);
1512f20a39fdSAlex Elder 	}
1513f20a39fdSAlex Elder 
1514f20a39fdSAlex Elder 	return true;
1515f20a39fdSAlex Elder }
1516f20a39fdSAlex Elder 
151793209264SAlex Elder /* Finish fault handling; con->mutex must *not* be held here */
151893209264SAlex Elder 
con_fault_finish(struct ceph_connection * con)151993209264SAlex Elder static void con_fault_finish(struct ceph_connection *con)
152093209264SAlex Elder {
1521f6330cc1SIlya Dryomov 	dout("%s %p\n", __func__, con);
1522f6330cc1SIlya Dryomov 
152393209264SAlex Elder 	/*
152493209264SAlex Elder 	 * in case we faulted due to authentication, invalidate our
152593209264SAlex Elder 	 * current tickets so that we can get new ones.
152693209264SAlex Elder 	 */
1527a56dd9bfSIlya Dryomov 	if (con->v1.auth_retry) {
1528a56dd9bfSIlya Dryomov 		dout("auth_retry %d, invalidating\n", con->v1.auth_retry);
1529f6330cc1SIlya Dryomov 		if (con->ops->invalidate_authorizer)
153093209264SAlex Elder 			con->ops->invalidate_authorizer(con);
1531a56dd9bfSIlya Dryomov 		con->v1.auth_retry = 0;
153293209264SAlex Elder 	}
153393209264SAlex Elder 
153493209264SAlex Elder 	if (con->ops->fault)
153593209264SAlex Elder 		con->ops->fault(con);
153693209264SAlex Elder }
153793209264SAlex Elder 
15383d14c5d2SYehuda Sadeh /*
15393d14c5d2SYehuda Sadeh  * Do some work on a connection.  Drop a connection ref when we're done.
15403d14c5d2SYehuda Sadeh  */
ceph_con_workfn(struct work_struct * work)154168931622SIlya Dryomov static void ceph_con_workfn(struct work_struct *work)
15423d14c5d2SYehuda Sadeh {
15433d14c5d2SYehuda Sadeh 	struct ceph_connection *con = container_of(work, struct ceph_connection,
15443d14c5d2SYehuda Sadeh 						   work.work);
154549659416SAlex Elder 	bool fault;
15463d14c5d2SYehuda Sadeh 
15473d14c5d2SYehuda Sadeh 	mutex_lock(&con->mutex);
154849659416SAlex Elder 	while (true) {
154949659416SAlex Elder 		int ret;
155049659416SAlex Elder 
155149659416SAlex Elder 		if ((fault = con_sock_closed(con))) {
1552f20a39fdSAlex Elder 			dout("%s: con %p SOCK_CLOSED\n", __func__, con);
155349659416SAlex Elder 			break;
155460bf8bf8SSage Weil 		}
1555f20a39fdSAlex Elder 		if (con_backoff(con)) {
1556f20a39fdSAlex Elder 			dout("%s: con %p BACKOFF\n", __func__, con);
155749659416SAlex Elder 			break;
155860bf8bf8SSage Weil 		}
15596d7f62bfSIlya Dryomov 		if (con->state == CEPH_CON_S_STANDBY) {
156049659416SAlex Elder 			dout("%s: con %p STANDBY\n", __func__, con);
156149659416SAlex Elder 			break;
1562e00de341SSage Weil 		}
15636d7f62bfSIlya Dryomov 		if (con->state == CEPH_CON_S_CLOSED) {
156449659416SAlex Elder 			dout("%s: con %p CLOSED\n", __func__, con);
15652e8cb100SSage Weil 			BUG_ON(con->sock);
156649659416SAlex Elder 			break;
15673d14c5d2SYehuda Sadeh 		}
15686d7f62bfSIlya Dryomov 		if (con->state == CEPH_CON_S_PREOPEN) {
156949659416SAlex Elder 			dout("%s: con %p PREOPEN\n", __func__, con);
15702e8cb100SSage Weil 			BUG_ON(con->sock);
15713d14c5d2SYehuda Sadeh 		}
15723d14c5d2SYehuda Sadeh 
1573cd1a677cSIlya Dryomov 		if (ceph_msgr2(from_msgr(con->msgr)))
1574cd1a677cSIlya Dryomov 			ret = ceph_con_v2_try_read(con);
1575cd1a677cSIlya Dryomov 		else
1576566050e1SIlya Dryomov 			ret = ceph_con_v1_try_read(con);
15773a140a0dSSage Weil 		if (ret < 0) {
157849659416SAlex Elder 			if (ret == -EAGAIN)
157949659416SAlex Elder 				continue;
158067c64eb7SIlya Dryomov 			if (!con->error_msg)
15813a140a0dSSage Weil 				con->error_msg = "socket error on read";
1582b6e7b6a1SAlex Elder 			fault = true;
158349659416SAlex Elder 			break;
15843a140a0dSSage Weil 		}
15850da5d703SSage Weil 
1586cd1a677cSIlya Dryomov 		if (ceph_msgr2(from_msgr(con->msgr)))
1587cd1a677cSIlya Dryomov 			ret = ceph_con_v2_try_write(con);
1588cd1a677cSIlya Dryomov 		else
1589566050e1SIlya Dryomov 			ret = ceph_con_v1_try_write(con);
15903a140a0dSSage Weil 		if (ret < 0) {
159149659416SAlex Elder 			if (ret == -EAGAIN)
159249659416SAlex Elder 				continue;
159367c64eb7SIlya Dryomov 			if (!con->error_msg)
15943a140a0dSSage Weil 				con->error_msg = "socket error on write";
1595b6e7b6a1SAlex Elder 			fault = true;
15963a140a0dSSage Weil 		}
159749659416SAlex Elder 
159849659416SAlex Elder 		break;	/* If we make it to here, we're done */
159949659416SAlex Elder 	}
1600b6e7b6a1SAlex Elder 	if (fault)
160193209264SAlex Elder 		con_fault(con);
160293209264SAlex Elder 	mutex_unlock(&con->mutex);
1603b6e7b6a1SAlex Elder 
1604b6e7b6a1SAlex Elder 	if (fault)
160593209264SAlex Elder 		con_fault_finish(con);
1606b6e7b6a1SAlex Elder 
1607b6e7b6a1SAlex Elder 	con->ops->put(con);
16083d14c5d2SYehuda Sadeh }
16093d14c5d2SYehuda Sadeh 
16103d14c5d2SYehuda Sadeh /*
16113d14c5d2SYehuda Sadeh  * Generic error/fault handler.  A retry mechanism is used with
16123d14c5d2SYehuda Sadeh  * exponential backoff
16133d14c5d2SYehuda Sadeh  */
con_fault(struct ceph_connection * con)161493209264SAlex Elder static void con_fault(struct ceph_connection *con)
16153d14c5d2SYehuda Sadeh {
161630be780aSIlya Dryomov 	dout("fault %p state %d to peer %s\n",
1617b726ec97SJeff Layton 	     con, con->state, ceph_pr_addr(&con->peer_addr));
16183d14c5d2SYehuda Sadeh 
161967c64eb7SIlya Dryomov 	pr_warn("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
1620b726ec97SJeff Layton 		ceph_pr_addr(&con->peer_addr), con->error_msg);
162167c64eb7SIlya Dryomov 	con->error_msg = NULL;
162267c64eb7SIlya Dryomov 
1623cd1a677cSIlya Dryomov 	WARN_ON(con->state == CEPH_CON_S_STANDBY ||
1624cd1a677cSIlya Dryomov 		con->state == CEPH_CON_S_CLOSED);
16253d14c5d2SYehuda Sadeh 
16263596f4c1SIlya Dryomov 	ceph_con_reset_protocol(con);
16273d14c5d2SYehuda Sadeh 
16286503e0b6SIlya Dryomov 	if (ceph_con_flag_test(con, CEPH_CON_F_LOSSYTX)) {
16298dacc7daSSage Weil 		dout("fault on LOSSYTX channel, marking CLOSED\n");
16306d7f62bfSIlya Dryomov 		con->state = CEPH_CON_S_CLOSED;
163193209264SAlex Elder 		return;
16323b5ede07SSage Weil 	}
16333b5ede07SSage Weil 
16343d14c5d2SYehuda Sadeh 	/* Requeue anything that hasn't been acked */
16353d14c5d2SYehuda Sadeh 	list_splice_init(&con->out_sent, &con->out_queue);
16363d14c5d2SYehuda Sadeh 
1637e76661d0SSage Weil 	/* If there are no messages queued or keepalive pending, place
1638e76661d0SSage Weil 	 * the connection in a STANDBY state */
1639e76661d0SSage Weil 	if (list_empty(&con->out_queue) &&
16406503e0b6SIlya Dryomov 	    !ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)) {
1641e00de341SSage Weil 		dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con);
16426503e0b6SIlya Dryomov 		ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING);
16436d7f62bfSIlya Dryomov 		con->state = CEPH_CON_S_STANDBY;
16443d14c5d2SYehuda Sadeh 	} else {
16453d14c5d2SYehuda Sadeh 		/* retry after a delay. */
16466d7f62bfSIlya Dryomov 		con->state = CEPH_CON_S_PREOPEN;
1647418af5b3SIlya Dryomov 		if (!con->delay) {
16483d14c5d2SYehuda Sadeh 			con->delay = BASE_DELAY_INTERVAL;
1649418af5b3SIlya Dryomov 		} else if (con->delay < MAX_DELAY_INTERVAL) {
16503d14c5d2SYehuda Sadeh 			con->delay *= 2;
1651418af5b3SIlya Dryomov 			if (con->delay > MAX_DELAY_INTERVAL)
1652418af5b3SIlya Dryomov 				con->delay = MAX_DELAY_INTERVAL;
1653418af5b3SIlya Dryomov 		}
16546503e0b6SIlya Dryomov 		ceph_con_flag_set(con, CEPH_CON_F_BACKOFF);
16558618e30bSAlex Elder 		queue_con(con);
16563d14c5d2SYehuda Sadeh 	}
16573d14c5d2SYehuda Sadeh }
16583d14c5d2SYehuda Sadeh 
ceph_messenger_reset_nonce(struct ceph_messenger * msgr)1659120a75eaSYan, Zheng void ceph_messenger_reset_nonce(struct ceph_messenger *msgr)
1660120a75eaSYan, Zheng {
1661120a75eaSYan, Zheng 	u32 nonce = le32_to_cpu(msgr->inst.addr.nonce) + 1000000;
1662120a75eaSYan, Zheng 	msgr->inst.addr.nonce = cpu_to_le32(nonce);
16636503e0b6SIlya Dryomov 	ceph_encode_my_addr(msgr);
1664120a75eaSYan, Zheng }
16653d14c5d2SYehuda Sadeh 
16663d14c5d2SYehuda Sadeh /*
166715d9882cSAlex Elder  * initialize a new messenger instance
16683d14c5d2SYehuda Sadeh  */
ceph_messenger_init(struct ceph_messenger * msgr,struct ceph_entity_addr * myaddr)166915d9882cSAlex Elder void ceph_messenger_init(struct ceph_messenger *msgr,
1670859bff51SIlya Dryomov 			 struct ceph_entity_addr *myaddr)
16713d14c5d2SYehuda Sadeh {
16723d14c5d2SYehuda Sadeh 	spin_lock_init(&msgr->global_seq_lock);
16733d14c5d2SYehuda Sadeh 
1674fd1a154cSIlya Dryomov 	if (myaddr) {
1675fd1a154cSIlya Dryomov 		memcpy(&msgr->inst.addr.in_addr, &myaddr->in_addr,
1676fd1a154cSIlya Dryomov 		       sizeof(msgr->inst.addr.in_addr));
16776503e0b6SIlya Dryomov 		ceph_addr_set_port(&msgr->inst.addr, 0);
1678fd1a154cSIlya Dryomov 	}
16793d14c5d2SYehuda Sadeh 
1680cd1a677cSIlya Dryomov 	/*
1681cd1a677cSIlya Dryomov 	 * Since nautilus, clients are identified using type ANY.
1682cd1a677cSIlya Dryomov 	 * For msgr1, ceph_encode_banner_addr() munges it to NONE.
1683cd1a677cSIlya Dryomov 	 */
1684cd1a677cSIlya Dryomov 	msgr->inst.addr.type = CEPH_ENTITY_ADDR_TYPE_ANY;
1685fd1a154cSIlya Dryomov 
1686fd1a154cSIlya Dryomov 	/* generate a random non-zero nonce */
1687fd1a154cSIlya Dryomov 	do {
1688fd1a154cSIlya Dryomov 		get_random_bytes(&msgr->inst.addr.nonce,
1689fd1a154cSIlya Dryomov 				 sizeof(msgr->inst.addr.nonce));
1690fd1a154cSIlya Dryomov 	} while (!msgr->inst.addr.nonce);
16916503e0b6SIlya Dryomov 	ceph_encode_my_addr(msgr);
16923d14c5d2SYehuda Sadeh 
1693a2a32584SGuanjun He 	atomic_set(&msgr->stopping, 0);
1694757856d2SIlya Dryomov 	write_pnet(&msgr->net, get_net(current->nsproxy->net_ns));
16953d14c5d2SYehuda Sadeh 
169615d9882cSAlex Elder 	dout("%s %p\n", __func__, msgr);
16973d14c5d2SYehuda Sadeh }
16983d14c5d2SYehuda Sadeh 
ceph_messenger_fini(struct ceph_messenger * msgr)1699757856d2SIlya Dryomov void ceph_messenger_fini(struct ceph_messenger *msgr)
1700757856d2SIlya Dryomov {
1701757856d2SIlya Dryomov 	put_net(read_pnet(&msgr->net));
1702757856d2SIlya Dryomov }
1703757856d2SIlya Dryomov 
msg_con_set(struct ceph_msg * msg,struct ceph_connection * con)1704583d0fefSIlya Dryomov static void msg_con_set(struct ceph_msg *msg, struct ceph_connection *con)
1705583d0fefSIlya Dryomov {
1706583d0fefSIlya Dryomov 	if (msg->con)
1707583d0fefSIlya Dryomov 		msg->con->ops->put(msg->con);
1708583d0fefSIlya Dryomov 
1709583d0fefSIlya Dryomov 	msg->con = con ? con->ops->get(con) : NULL;
1710583d0fefSIlya Dryomov 	BUG_ON(msg->con != con);
1711583d0fefSIlya Dryomov }
1712583d0fefSIlya Dryomov 
clear_standby(struct ceph_connection * con)1713e00de341SSage Weil static void clear_standby(struct ceph_connection *con)
1714e00de341SSage Weil {
1715e00de341SSage Weil 	/* come back from STANDBY? */
17166d7f62bfSIlya Dryomov 	if (con->state == CEPH_CON_S_STANDBY) {
1717e00de341SSage Weil 		dout("clear_standby %p and ++connect_seq\n", con);
17186d7f62bfSIlya Dryomov 		con->state = CEPH_CON_S_PREOPEN;
1719a56dd9bfSIlya Dryomov 		con->v1.connect_seq++;
17206503e0b6SIlya Dryomov 		WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING));
17216503e0b6SIlya Dryomov 		WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING));
1722e00de341SSage Weil 	}
1723e00de341SSage Weil }
1724e00de341SSage Weil 
17253d14c5d2SYehuda Sadeh /*
17263d14c5d2SYehuda Sadeh  * Queue up an outgoing message on the given connection.
1727771294feSIlya Dryomov  *
1728771294feSIlya Dryomov  * Consumes a ref on @msg.
17293d14c5d2SYehuda Sadeh  */
ceph_con_send(struct ceph_connection * con,struct ceph_msg * msg)17303d14c5d2SYehuda Sadeh void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
17313d14c5d2SYehuda Sadeh {
1732a59b55a6SSage Weil 	/* set src+dst */
1733a59b55a6SSage Weil 	msg->hdr.src = con->msgr->inst.name;
1734a59b55a6SSage Weil 	BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
1735a59b55a6SSage Weil 	msg->needs_out_seq = true;
1736a59b55a6SSage Weil 
1737a59b55a6SSage Weil 	mutex_lock(&con->mutex);
1738a59b55a6SSage Weil 
17396d7f62bfSIlya Dryomov 	if (con->state == CEPH_CON_S_CLOSED) {
17403d14c5d2SYehuda Sadeh 		dout("con_send %p closed, dropping %p\n", con, msg);
17413d14c5d2SYehuda Sadeh 		ceph_msg_put(msg);
1742a59b55a6SSage Weil 		mutex_unlock(&con->mutex);
17433d14c5d2SYehuda Sadeh 		return;
17443d14c5d2SYehuda Sadeh 	}
17453d14c5d2SYehuda Sadeh 
1746583d0fefSIlya Dryomov 	msg_con_set(msg, con);
17473d14c5d2SYehuda Sadeh 
17483d14c5d2SYehuda Sadeh 	BUG_ON(!list_empty(&msg->list_head));
17493d14c5d2SYehuda Sadeh 	list_add_tail(&msg->list_head, &con->out_queue);
17503d14c5d2SYehuda Sadeh 	dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
17513d14c5d2SYehuda Sadeh 	     ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
17523d14c5d2SYehuda Sadeh 	     ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
17533d14c5d2SYehuda Sadeh 	     le32_to_cpu(msg->hdr.front_len),
17543d14c5d2SYehuda Sadeh 	     le32_to_cpu(msg->hdr.middle_len),
17553d14c5d2SYehuda Sadeh 	     le32_to_cpu(msg->hdr.data_len));
175600650931SSage Weil 
175700650931SSage Weil 	clear_standby(con);
17583d14c5d2SYehuda Sadeh 	mutex_unlock(&con->mutex);
17593d14c5d2SYehuda Sadeh 
17603d14c5d2SYehuda Sadeh 	/* if there wasn't anything waiting to send before, queue
17613d14c5d2SYehuda Sadeh 	 * new work */
17626503e0b6SIlya Dryomov 	if (!ceph_con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING))
17633d14c5d2SYehuda Sadeh 		queue_con(con);
17643d14c5d2SYehuda Sadeh }
17653d14c5d2SYehuda Sadeh EXPORT_SYMBOL(ceph_con_send);
17663d14c5d2SYehuda Sadeh 
17673d14c5d2SYehuda Sadeh /*
17683d14c5d2SYehuda Sadeh  * Revoke a message that was previously queued for send
17693d14c5d2SYehuda Sadeh  */
ceph_msg_revoke(struct ceph_msg * msg)17706740a845SAlex Elder void ceph_msg_revoke(struct ceph_msg *msg)
17713d14c5d2SYehuda Sadeh {
17726740a845SAlex Elder 	struct ceph_connection *con = msg->con;
17736740a845SAlex Elder 
1774583d0fefSIlya Dryomov 	if (!con) {
1775583d0fefSIlya Dryomov 		dout("%s msg %p null con\n", __func__, msg);
17766740a845SAlex Elder 		return;		/* Message not in our possession */
1777583d0fefSIlya Dryomov 	}
17786740a845SAlex Elder 
17793d14c5d2SYehuda Sadeh 	mutex_lock(&con->mutex);
1780566050e1SIlya Dryomov 	if (list_empty(&msg->list_head)) {
1781566050e1SIlya Dryomov 		WARN_ON(con->out_msg == msg);
1782566050e1SIlya Dryomov 		dout("%s con %p msg %p not linked\n", __func__, con, msg);
17833d14c5d2SYehuda Sadeh 		mutex_unlock(&con->mutex);
1784566050e1SIlya Dryomov 		return;
1785566050e1SIlya Dryomov 	}
1786566050e1SIlya Dryomov 
1787566050e1SIlya Dryomov 	dout("%s con %p msg %p was linked\n", __func__, con, msg);
1788566050e1SIlya Dryomov 	msg->hdr.seq = 0;
1789566050e1SIlya Dryomov 	ceph_msg_remove(msg);
1790566050e1SIlya Dryomov 
1791566050e1SIlya Dryomov 	if (con->out_msg == msg) {
1792566050e1SIlya Dryomov 		WARN_ON(con->state != CEPH_CON_S_OPEN);
1793566050e1SIlya Dryomov 		dout("%s con %p msg %p was sending\n", __func__, con, msg);
1794cd1a677cSIlya Dryomov 		if (ceph_msgr2(from_msgr(con->msgr)))
1795cd1a677cSIlya Dryomov 			ceph_con_v2_revoke(con);
1796cd1a677cSIlya Dryomov 		else
1797566050e1SIlya Dryomov 			ceph_con_v1_revoke(con);
1798566050e1SIlya Dryomov 		ceph_msg_put(con->out_msg);
1799566050e1SIlya Dryomov 		con->out_msg = NULL;
1800566050e1SIlya Dryomov 	} else {
1801566050e1SIlya Dryomov 		dout("%s con %p msg %p not current, out_msg %p\n", __func__,
1802566050e1SIlya Dryomov 		     con, msg, con->out_msg);
1803566050e1SIlya Dryomov 	}
1804566050e1SIlya Dryomov 	mutex_unlock(&con->mutex);
1805566050e1SIlya Dryomov }
1806566050e1SIlya Dryomov 
18073d14c5d2SYehuda Sadeh /*
18083d14c5d2SYehuda Sadeh  * Revoke a message that we may be reading data into
18093d14c5d2SYehuda Sadeh  */
ceph_msg_revoke_incoming(struct ceph_msg * msg)18108921d114SAlex Elder void ceph_msg_revoke_incoming(struct ceph_msg *msg)
18113d14c5d2SYehuda Sadeh {
1812583d0fefSIlya Dryomov 	struct ceph_connection *con = msg->con;
18138921d114SAlex Elder 
1814583d0fefSIlya Dryomov 	if (!con) {
18158921d114SAlex Elder 		dout("%s msg %p null con\n", __func__, msg);
18168921d114SAlex Elder 		return;		/* Message not in our possession */
18178921d114SAlex Elder 	}
18188921d114SAlex Elder 
18193d14c5d2SYehuda Sadeh 	mutex_lock(&con->mutex);
18208921d114SAlex Elder 	if (con->in_msg == msg) {
1821566050e1SIlya Dryomov 		WARN_ON(con->state != CEPH_CON_S_OPEN);
1822566050e1SIlya Dryomov 		dout("%s con %p msg %p was recving\n", __func__, con, msg);
1823cd1a677cSIlya Dryomov 		if (ceph_msgr2(from_msgr(con->msgr)))
1824cd1a677cSIlya Dryomov 			ceph_con_v2_revoke_incoming(con);
1825cd1a677cSIlya Dryomov 		else
1826566050e1SIlya Dryomov 			ceph_con_v1_revoke_incoming(con);
18273d14c5d2SYehuda Sadeh 		ceph_msg_put(con->in_msg);
18283d14c5d2SYehuda Sadeh 		con->in_msg = NULL;
18293d14c5d2SYehuda Sadeh 	} else {
1830566050e1SIlya Dryomov 		dout("%s con %p msg %p not current, in_msg %p\n", __func__,
1831566050e1SIlya Dryomov 		     con, msg, con->in_msg);
18323d14c5d2SYehuda Sadeh 	}
18333d14c5d2SYehuda Sadeh 	mutex_unlock(&con->mutex);
18343d14c5d2SYehuda Sadeh }
18353d14c5d2SYehuda Sadeh 
18363d14c5d2SYehuda Sadeh /*
18373d14c5d2SYehuda Sadeh  * Queue a keepalive byte to ensure the tcp connection is alive.
18383d14c5d2SYehuda Sadeh  */
ceph_con_keepalive(struct ceph_connection * con)18393d14c5d2SYehuda Sadeh void ceph_con_keepalive(struct ceph_connection *con)
18403d14c5d2SYehuda Sadeh {
1841e00de341SSage Weil 	dout("con_keepalive %p\n", con);
184200650931SSage Weil 	mutex_lock(&con->mutex);
1843e00de341SSage Weil 	clear_standby(con);
18446503e0b6SIlya Dryomov 	ceph_con_flag_set(con, CEPH_CON_F_KEEPALIVE_PENDING);
184500650931SSage Weil 	mutex_unlock(&con->mutex);
18464aac9228SIlya Dryomov 
18476503e0b6SIlya Dryomov 	if (!ceph_con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING))
18483d14c5d2SYehuda Sadeh 		queue_con(con);
18493d14c5d2SYehuda Sadeh }
18503d14c5d2SYehuda Sadeh EXPORT_SYMBOL(ceph_con_keepalive);
18513d14c5d2SYehuda Sadeh 
ceph_con_keepalive_expired(struct ceph_connection * con,unsigned long interval)18528b9558aaSYan, Zheng bool ceph_con_keepalive_expired(struct ceph_connection *con,
18538b9558aaSYan, Zheng 			       unsigned long interval)
18548b9558aaSYan, Zheng {
18558b9558aaSYan, Zheng 	if (interval > 0 &&
18568b9558aaSYan, Zheng 	    (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2)) {
1857473bd2d7SArnd Bergmann 		struct timespec64 now;
1858473bd2d7SArnd Bergmann 		struct timespec64 ts;
1859473bd2d7SArnd Bergmann 		ktime_get_real_ts64(&now);
1860473bd2d7SArnd Bergmann 		jiffies_to_timespec64(interval, &ts);
1861473bd2d7SArnd Bergmann 		ts = timespec64_add(con->last_keepalive_ack, ts);
1862473bd2d7SArnd Bergmann 		return timespec64_compare(&now, &ts) >= 0;
18638b9558aaSYan, Zheng 	}
18648b9558aaSYan, Zheng 	return false;
18658b9558aaSYan, Zheng }
18668b9558aaSYan, Zheng 
ceph_msg_data_add(struct ceph_msg * msg)18670d9c1ab3SIlya Dryomov static struct ceph_msg_data *ceph_msg_data_add(struct ceph_msg *msg)
186843794509SAlex Elder {
18690d9c1ab3SIlya Dryomov 	BUG_ON(msg->num_data_items >= msg->max_data_items);
18700d9c1ab3SIlya Dryomov 	return &msg->data[msg->num_data_items++];
18716644ed7bSAlex Elder }
18726644ed7bSAlex Elder 
ceph_msg_data_destroy(struct ceph_msg_data * data)18736644ed7bSAlex Elder static void ceph_msg_data_destroy(struct ceph_msg_data *data)
18746644ed7bSAlex Elder {
1875e8862740SIlya Dryomov 	if (data->type == CEPH_MSG_DATA_PAGES && data->own_pages) {
1876e8862740SIlya Dryomov 		int num_pages = calc_pages_for(data->alignment, data->length);
1877e8862740SIlya Dryomov 		ceph_release_page_vector(data->pages, num_pages);
1878e8862740SIlya Dryomov 	} else if (data->type == CEPH_MSG_DATA_PAGELIST) {
18796644ed7bSAlex Elder 		ceph_pagelist_release(data->pagelist);
188043794509SAlex Elder 	}
1881e8862740SIlya Dryomov }
188243794509SAlex Elder 
/*
 * Append a page-vector data item to @msg.
 *
 * @pages must be non-NULL and @length non-zero (BUG otherwise).  Only
 * the within-page part of @alignment is stored.  @own_pages is recorded
 * on the item; the page vector is released on item destruction only
 * when it is set.  msg->data_length grows by @length.
 */
void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages,
			     size_t length, size_t alignment, bool own_pages)
{
	struct ceph_msg_data *item;

	BUG_ON(!pages);
	BUG_ON(!length);

	item = ceph_msg_data_add(msg);
	item->type = CEPH_MSG_DATA_PAGES;
	item->own_pages = own_pages;
	item->alignment = alignment & ~PAGE_MASK;
	item->length = length;
	item->pages = pages;

	msg->data_length += length;
}
EXPORT_SYMBOL(ceph_msg_data_add_pages);
19013d14c5d2SYehuda Sadeh 
ceph_msg_data_add_pagelist(struct ceph_msg * msg,struct ceph_pagelist * pagelist)190290af3602SAlex Elder void ceph_msg_data_add_pagelist(struct ceph_msg *msg,
190327fa8385SAlex Elder 				struct ceph_pagelist *pagelist)
190427fa8385SAlex Elder {
19056644ed7bSAlex Elder 	struct ceph_msg_data *data;
19066644ed7bSAlex Elder 
190707aa1558SAlex Elder 	BUG_ON(!pagelist);
190807aa1558SAlex Elder 	BUG_ON(!pagelist->length);
190927fa8385SAlex Elder 
19100d9c1ab3SIlya Dryomov 	data = ceph_msg_data_add(msg);
19110d9c1ab3SIlya Dryomov 	data->type = CEPH_MSG_DATA_PAGELIST;
191289486833SIlya Dryomov 	refcount_inc(&pagelist->refcnt);
19136644ed7bSAlex Elder 	data->pagelist = pagelist;
19146644ed7bSAlex Elder 
19155240d9f9SAlex Elder 	msg->data_length += pagelist->length;
191627fa8385SAlex Elder }
191790af3602SAlex Elder EXPORT_SYMBOL(ceph_msg_data_add_pagelist);
191827fa8385SAlex Elder 
#ifdef	CONFIG_BLOCK
/*
 * Append a bio data item to @msg.
 *
 * The iterator at @bio_pos is copied into the item, @length is stored
 * as the item's bio_length, and msg->data_length grows by @length.
 */
void ceph_msg_data_add_bio(struct ceph_msg *msg, struct ceph_bio_iter *bio_pos,
			   u32 length)
{
	struct ceph_msg_data *item = ceph_msg_data_add(msg);

	item->type = CEPH_MSG_DATA_BIO;
	item->bio_length = length;
	item->bio_pos = *bio_pos;

	msg->data_length += length;
}
EXPORT_SYMBOL(ceph_msg_data_add_bio);
#endif	/* CONFIG_BLOCK */
193427fa8385SAlex Elder 
ceph_msg_data_add_bvecs(struct ceph_msg * msg,struct ceph_bvec_iter * bvec_pos)1935b9e281c2SIlya Dryomov void ceph_msg_data_add_bvecs(struct ceph_msg *msg,
1936b9e281c2SIlya Dryomov 			     struct ceph_bvec_iter *bvec_pos)
1937b9e281c2SIlya Dryomov {
1938b9e281c2SIlya Dryomov 	struct ceph_msg_data *data;
1939b9e281c2SIlya Dryomov 
19400d9c1ab3SIlya Dryomov 	data = ceph_msg_data_add(msg);
19410d9c1ab3SIlya Dryomov 	data->type = CEPH_MSG_DATA_BVECS;
1942b9e281c2SIlya Dryomov 	data->bvec_pos = *bvec_pos;
1943b9e281c2SIlya Dryomov 
1944b9e281c2SIlya Dryomov 	msg->data_length += bvec_pos->iter.bi_size;
1945b9e281c2SIlya Dryomov }
1946b9e281c2SIlya Dryomov EXPORT_SYMBOL(ceph_msg_data_add_bvecs);
1947b9e281c2SIlya Dryomov 
ceph_msg_data_add_iter(struct ceph_msg * msg,struct iov_iter * iter)1948dee0c5f8SJeff Layton void ceph_msg_data_add_iter(struct ceph_msg *msg,
1949dee0c5f8SJeff Layton 			    struct iov_iter *iter)
1950dee0c5f8SJeff Layton {
1951dee0c5f8SJeff Layton 	struct ceph_msg_data *data;
1952dee0c5f8SJeff Layton 
1953dee0c5f8SJeff Layton 	data = ceph_msg_data_add(msg);
1954dee0c5f8SJeff Layton 	data->type = CEPH_MSG_DATA_ITER;
1955dee0c5f8SJeff Layton 	data->iter = *iter;
1956dee0c5f8SJeff Layton 
1957dee0c5f8SJeff Layton 	msg->data_length += iov_iter_count(&data->iter);
1958dee0c5f8SJeff Layton }
1959dee0c5f8SJeff Layton 
19603d14c5d2SYehuda Sadeh /*
19613d14c5d2SYehuda Sadeh  * construct a new message with given type, size
19623d14c5d2SYehuda Sadeh  * the new msg has a ref count of 1.
19633d14c5d2SYehuda Sadeh  */
ceph_msg_new2(int type,int front_len,int max_data_items,gfp_t flags,bool can_fail)19640d9c1ab3SIlya Dryomov struct ceph_msg *ceph_msg_new2(int type, int front_len, int max_data_items,
19650d9c1ab3SIlya Dryomov 			       gfp_t flags, bool can_fail)
19663d14c5d2SYehuda Sadeh {
19673d14c5d2SYehuda Sadeh 	struct ceph_msg *m;
19683d14c5d2SYehuda Sadeh 
1969e3d5d638SAlex Elder 	m = kmem_cache_zalloc(ceph_msg_cache, flags);
19703d14c5d2SYehuda Sadeh 	if (m == NULL)
19713d14c5d2SYehuda Sadeh 		goto out;
197238941f80SAlex Elder 
19733d14c5d2SYehuda Sadeh 	m->hdr.type = cpu_to_le16(type);
19743d14c5d2SYehuda Sadeh 	m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
19753d14c5d2SYehuda Sadeh 	m->hdr.front_len = cpu_to_le32(front_len);
19763d14c5d2SYehuda Sadeh 
19779516e45bSAlex Elder 	INIT_LIST_HEAD(&m->list_head);
19789516e45bSAlex Elder 	kref_init(&m->kref);
1979ca20892dSHenry C Chang 
19803d14c5d2SYehuda Sadeh 	/* front */
19813d14c5d2SYehuda Sadeh 	if (front_len) {
1982a421ef30SMichal Hocko 		m->front.iov_base = kvmalloc(front_len, flags);
19833d14c5d2SYehuda Sadeh 		if (m->front.iov_base == NULL) {
1984b61c2763SSage Weil 			dout("ceph_msg_new can't allocate %d bytes\n",
19853d14c5d2SYehuda Sadeh 			     front_len);
19863d14c5d2SYehuda Sadeh 			goto out2;
19873d14c5d2SYehuda Sadeh 		}
19883d14c5d2SYehuda Sadeh 	} else {
19893d14c5d2SYehuda Sadeh 		m->front.iov_base = NULL;
19903d14c5d2SYehuda Sadeh 	}
1991f2be82b0SIlya Dryomov 	m->front_alloc_len = m->front.iov_len = front_len;
19923d14c5d2SYehuda Sadeh 
19930d9c1ab3SIlya Dryomov 	if (max_data_items) {
19940d9c1ab3SIlya Dryomov 		m->data = kmalloc_array(max_data_items, sizeof(*m->data),
19950d9c1ab3SIlya Dryomov 					flags);
19960d9c1ab3SIlya Dryomov 		if (!m->data)
19970d9c1ab3SIlya Dryomov 			goto out2;
19980d9c1ab3SIlya Dryomov 
19990d9c1ab3SIlya Dryomov 		m->max_data_items = max_data_items;
20000d9c1ab3SIlya Dryomov 	}
20010d9c1ab3SIlya Dryomov 
20023d14c5d2SYehuda Sadeh 	dout("ceph_msg_new %p front %d\n", m, front_len);
20033d14c5d2SYehuda Sadeh 	return m;
20043d14c5d2SYehuda Sadeh 
20053d14c5d2SYehuda Sadeh out2:
20063d14c5d2SYehuda Sadeh 	ceph_msg_put(m);
20073d14c5d2SYehuda Sadeh out:
2008b61c2763SSage Weil 	if (!can_fail) {
2009b61c2763SSage Weil 		pr_err("msg_new can't create type %d front %d\n", type,
2010b61c2763SSage Weil 		       front_len);
2011f0ed1b7cSSage Weil 		WARN_ON(1);
2012b61c2763SSage Weil 	} else {
2013b61c2763SSage Weil 		dout("msg_new can't create type %d front %d\n", type,
2014b61c2763SSage Weil 		     front_len);
2015b61c2763SSage Weil 	}
20163d14c5d2SYehuda Sadeh 	return NULL;
20173d14c5d2SYehuda Sadeh }
20180d9c1ab3SIlya Dryomov EXPORT_SYMBOL(ceph_msg_new2);
20190d9c1ab3SIlya Dryomov 
/*
 * Construct a new message that has no room for data items; see
 * ceph_msg_new2() for the meaning of the remaining arguments.
 */
struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
			      bool can_fail)
{
	const int no_data_items = 0;

	return ceph_msg_new2(type, front_len, no_data_items, flags, can_fail);
}
EXPORT_SYMBOL(ceph_msg_new);
20263d14c5d2SYehuda Sadeh 
20273d14c5d2SYehuda Sadeh /*
20283d14c5d2SYehuda Sadeh  * Allocate "middle" portion of a message, if it is needed and wasn't
20293d14c5d2SYehuda Sadeh  * allocated by alloc_msg.  This allows us to read a small fixed-size
20303d14c5d2SYehuda Sadeh  * per-type header in the front and then gracefully fail (i.e.,
20313d14c5d2SYehuda Sadeh  * propagate the error to the caller based on info in the front) when
20323d14c5d2SYehuda Sadeh  * the middle is too large.
20333d14c5d2SYehuda Sadeh  */
ceph_alloc_middle(struct ceph_connection * con,struct ceph_msg * msg)20343d14c5d2SYehuda Sadeh static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
20353d14c5d2SYehuda Sadeh {
20363d14c5d2SYehuda Sadeh 	int type = le16_to_cpu(msg->hdr.type);
20373d14c5d2SYehuda Sadeh 	int middle_len = le32_to_cpu(msg->hdr.middle_len);
20383d14c5d2SYehuda Sadeh 
20393d14c5d2SYehuda Sadeh 	dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
20403d14c5d2SYehuda Sadeh 	     ceph_msg_type_name(type), middle_len);
20413d14c5d2SYehuda Sadeh 	BUG_ON(!middle_len);
20423d14c5d2SYehuda Sadeh 	BUG_ON(msg->middle);
20433d14c5d2SYehuda Sadeh 
20443d14c5d2SYehuda Sadeh 	msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
20453d14c5d2SYehuda Sadeh 	if (!msg->middle)
20463d14c5d2SYehuda Sadeh 		return -ENOMEM;
20473d14c5d2SYehuda Sadeh 	return 0;
20483d14c5d2SYehuda Sadeh }
20493d14c5d2SYehuda Sadeh 
20503d14c5d2SYehuda Sadeh /*
20511c20f2d2SAlex Elder  * Allocate a message for receiving an incoming message on a
20521c20f2d2SAlex Elder  * connection, and save the result in con->in_msg.  Uses the
20531c20f2d2SAlex Elder  * connection's private alloc_msg op if available.
20541c20f2d2SAlex Elder  *
20554740a623SSage Weil  * Returns 0 on success, or a negative error code.
20564740a623SSage Weil  *
20574740a623SSage Weil  * On success, if we set *skip = 1:
20584740a623SSage Weil  *  - the next message should be skipped and ignored.
20594740a623SSage Weil  *  - con->in_msg == NULL
20604740a623SSage Weil  * or if we set *skip = 0:
20614740a623SSage Weil  *  - con->in_msg is non-null.
20624740a623SSage Weil  * On error (ENOMEM, EAGAIN, ...),
20634740a623SSage Weil  *  - con->in_msg == NULL
20643d14c5d2SYehuda Sadeh  */
ceph_con_in_msg_alloc(struct ceph_connection * con,struct ceph_msg_header * hdr,int * skip)20656503e0b6SIlya Dryomov int ceph_con_in_msg_alloc(struct ceph_connection *con,
2066fc4c128eSIlya Dryomov 			  struct ceph_msg_header *hdr, int *skip)
20673d14c5d2SYehuda Sadeh {
20683d14c5d2SYehuda Sadeh 	int middle_len = le32_to_cpu(hdr->middle_len);
20691d866d1cSAlex Elder 	struct ceph_msg *msg;
20704740a623SSage Weil 	int ret = 0;
20713d14c5d2SYehuda Sadeh 
20721c20f2d2SAlex Elder 	BUG_ON(con->in_msg != NULL);
207353ded495SAlex Elder 	BUG_ON(!con->ops->alloc_msg);
20743d14c5d2SYehuda Sadeh 
20753d14c5d2SYehuda Sadeh 	mutex_unlock(&con->mutex);
20763d14c5d2SYehuda Sadeh 	msg = con->ops->alloc_msg(con, hdr, skip);
20773d14c5d2SYehuda Sadeh 	mutex_lock(&con->mutex);
20786d7f62bfSIlya Dryomov 	if (con->state != CEPH_CON_S_OPEN) {
20797246240cSSage Weil 		if (msg)
208061399191SSage Weil 			ceph_msg_put(msg);
208161399191SSage Weil 		return -EAGAIN;
20823d14c5d2SYehuda Sadeh 	}
20834137577aSAlex Elder 	if (msg) {
20844137577aSAlex Elder 		BUG_ON(*skip);
2085583d0fefSIlya Dryomov 		msg_con_set(msg, con);
208661399191SSage Weil 		con->in_msg = msg;
20874137577aSAlex Elder 	} else {
20884137577aSAlex Elder 		/*
20894137577aSAlex Elder 		 * Null message pointer means either we should skip
20904137577aSAlex Elder 		 * this message or we couldn't allocate memory.  The
20914137577aSAlex Elder 		 * former is not an error.
20924137577aSAlex Elder 		 */
20934137577aSAlex Elder 		if (*skip)
20944740a623SSage Weil 			return 0;
20954137577aSAlex Elder 
209667c64eb7SIlya Dryomov 		con->error_msg = "error allocating memory for incoming message";
20974740a623SSage Weil 		return -ENOMEM;
20984740a623SSage Weil 	}
2099fc4c128eSIlya Dryomov 	memcpy(&con->in_msg->hdr, hdr, sizeof(*hdr));
21003d14c5d2SYehuda Sadeh 
21011c20f2d2SAlex Elder 	if (middle_len && !con->in_msg->middle) {
21021c20f2d2SAlex Elder 		ret = ceph_alloc_middle(con, con->in_msg);
21033d14c5d2SYehuda Sadeh 		if (ret < 0) {
21041c20f2d2SAlex Elder 			ceph_msg_put(con->in_msg);
21051c20f2d2SAlex Elder 			con->in_msg = NULL;
21063d14c5d2SYehuda Sadeh 		}
21073d14c5d2SYehuda Sadeh 	}
21083d14c5d2SYehuda Sadeh 
21094740a623SSage Weil 	return ret;
21103d14c5d2SYehuda Sadeh }
21113d14c5d2SYehuda Sadeh 
ceph_con_get_out_msg(struct ceph_connection * con)21126503e0b6SIlya Dryomov void ceph_con_get_out_msg(struct ceph_connection *con)
2113771294feSIlya Dryomov {
2114771294feSIlya Dryomov 	struct ceph_msg *msg;
2115771294feSIlya Dryomov 
2116771294feSIlya Dryomov 	BUG_ON(list_empty(&con->out_queue));
2117771294feSIlya Dryomov 	msg = list_first_entry(&con->out_queue, struct ceph_msg, list_head);
2118771294feSIlya Dryomov 	WARN_ON(msg->con != con);
2119771294feSIlya Dryomov 
2120771294feSIlya Dryomov 	/*
2121771294feSIlya Dryomov 	 * Put the message on "sent" list using a ref from ceph_con_send().
2122771294feSIlya Dryomov 	 * It is put when the message is acked or revoked.
2123771294feSIlya Dryomov 	 */
2124771294feSIlya Dryomov 	list_move_tail(&msg->list_head, &con->out_sent);
2125771294feSIlya Dryomov 
2126771294feSIlya Dryomov 	/*
2127771294feSIlya Dryomov 	 * Only assign outgoing seq # if we haven't sent this message
2128771294feSIlya Dryomov 	 * yet.  If it is requeued, resend with it's original seq.
2129771294feSIlya Dryomov 	 */
2130771294feSIlya Dryomov 	if (msg->needs_out_seq) {
2131771294feSIlya Dryomov 		msg->hdr.seq = cpu_to_le64(++con->out_seq);
2132771294feSIlya Dryomov 		msg->needs_out_seq = false;
2133771294feSIlya Dryomov 
2134771294feSIlya Dryomov 		if (con->ops->reencode_message)
2135771294feSIlya Dryomov 			con->ops->reencode_message(msg);
2136771294feSIlya Dryomov 	}
2137771294feSIlya Dryomov 
2138771294feSIlya Dryomov 	/*
2139771294feSIlya Dryomov 	 * Get a ref for out_msg.  It is put when we are done sending the
2140771294feSIlya Dryomov 	 * message or in case of a fault.
2141771294feSIlya Dryomov 	 */
2142771294feSIlya Dryomov 	WARN_ON(con->out_msg);
2143771294feSIlya Dryomov 	con->out_msg = ceph_msg_get(msg);
2144771294feSIlya Dryomov }
21453d14c5d2SYehuda Sadeh 
21463d14c5d2SYehuda Sadeh /*
21473d14c5d2SYehuda Sadeh  * Free a generically kmalloc'd message.
21483d14c5d2SYehuda Sadeh  */
ceph_msg_free(struct ceph_msg * m)21490215e44bSIlya Dryomov static void ceph_msg_free(struct ceph_msg *m)
21503d14c5d2SYehuda Sadeh {
21510215e44bSIlya Dryomov 	dout("%s %p\n", __func__, m);
21524965fc38SIlya Dryomov 	kvfree(m->front.iov_base);
21530d9c1ab3SIlya Dryomov 	kfree(m->data);
2154e3d5d638SAlex Elder 	kmem_cache_free(ceph_msg_cache, m);
21553d14c5d2SYehuda Sadeh }
21563d14c5d2SYehuda Sadeh 
ceph_msg_release(struct kref * kref)21570215e44bSIlya Dryomov static void ceph_msg_release(struct kref *kref)
21583d14c5d2SYehuda Sadeh {
21593d14c5d2SYehuda Sadeh 	struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
21600d9c1ab3SIlya Dryomov 	int i;
21613d14c5d2SYehuda Sadeh 
21620215e44bSIlya Dryomov 	dout("%s %p\n", __func__, m);
21633d14c5d2SYehuda Sadeh 	WARN_ON(!list_empty(&m->list_head));
21643d14c5d2SYehuda Sadeh 
2165583d0fefSIlya Dryomov 	msg_con_set(m, NULL);
2166583d0fefSIlya Dryomov 
21673d14c5d2SYehuda Sadeh 	/* drop middle, data, if any */
21683d14c5d2SYehuda Sadeh 	if (m->middle) {
21693d14c5d2SYehuda Sadeh 		ceph_buffer_put(m->middle);
21703d14c5d2SYehuda Sadeh 		m->middle = NULL;
21713d14c5d2SYehuda Sadeh 	}
21725240d9f9SAlex Elder 
21730d9c1ab3SIlya Dryomov 	for (i = 0; i < m->num_data_items; i++)
21740d9c1ab3SIlya Dryomov 		ceph_msg_data_destroy(&m->data[i]);
21753d14c5d2SYehuda Sadeh 
21763d14c5d2SYehuda Sadeh 	if (m->pool)
21773d14c5d2SYehuda Sadeh 		ceph_msgpool_put(m->pool, m);
21783d14c5d2SYehuda Sadeh 	else
21790215e44bSIlya Dryomov 		ceph_msg_free(m);
21803d14c5d2SYehuda Sadeh }
21810215e44bSIlya Dryomov 
ceph_msg_get(struct ceph_msg * msg)21820215e44bSIlya Dryomov struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
21830215e44bSIlya Dryomov {
21840215e44bSIlya Dryomov 	dout("%s %p (was %d)\n", __func__, msg,
21852c935bc5SPeter Zijlstra 	     kref_read(&msg->kref));
21860215e44bSIlya Dryomov 	kref_get(&msg->kref);
21870215e44bSIlya Dryomov 	return msg;
21880215e44bSIlya Dryomov }
21890215e44bSIlya Dryomov EXPORT_SYMBOL(ceph_msg_get);
21900215e44bSIlya Dryomov 
ceph_msg_put(struct ceph_msg * msg)21910215e44bSIlya Dryomov void ceph_msg_put(struct ceph_msg *msg)
21920215e44bSIlya Dryomov {
21930215e44bSIlya Dryomov 	dout("%s %p (was %d)\n", __func__, msg,
21942c935bc5SPeter Zijlstra 	     kref_read(&msg->kref));
21950215e44bSIlya Dryomov 	kref_put(&msg->kref, ceph_msg_release);
21960215e44bSIlya Dryomov }
21970215e44bSIlya Dryomov EXPORT_SYMBOL(ceph_msg_put);
21983d14c5d2SYehuda Sadeh 
ceph_msg_dump(struct ceph_msg * msg)21993d14c5d2SYehuda Sadeh void ceph_msg_dump(struct ceph_msg *msg)
22003d14c5d2SYehuda Sadeh {
22013cea4c30SIlya Dryomov 	pr_debug("msg_dump %p (front_alloc_len %d length %zd)\n", msg,
22023cea4c30SIlya Dryomov 		 msg->front_alloc_len, msg->data_length);
22033d14c5d2SYehuda Sadeh 	print_hex_dump(KERN_DEBUG, "header: ",
22043d14c5d2SYehuda Sadeh 		       DUMP_PREFIX_OFFSET, 16, 1,
22053d14c5d2SYehuda Sadeh 		       &msg->hdr, sizeof(msg->hdr), true);
22063d14c5d2SYehuda Sadeh 	print_hex_dump(KERN_DEBUG, " front: ",
22073d14c5d2SYehuda Sadeh 		       DUMP_PREFIX_OFFSET, 16, 1,
22083d14c5d2SYehuda Sadeh 		       msg->front.iov_base, msg->front.iov_len, true);
22093d14c5d2SYehuda Sadeh 	if (msg->middle)
22103d14c5d2SYehuda Sadeh 		print_hex_dump(KERN_DEBUG, "middle: ",
22113d14c5d2SYehuda Sadeh 			       DUMP_PREFIX_OFFSET, 16, 1,
22123d14c5d2SYehuda Sadeh 			       msg->middle->vec.iov_base,
22133d14c5d2SYehuda Sadeh 			       msg->middle->vec.iov_len, true);
22143d14c5d2SYehuda Sadeh 	print_hex_dump(KERN_DEBUG, "footer: ",
22153d14c5d2SYehuda Sadeh 		       DUMP_PREFIX_OFFSET, 16, 1,
22163d14c5d2SYehuda Sadeh 		       &msg->footer, sizeof(msg->footer), true);
22173d14c5d2SYehuda Sadeh }
22183d14c5d2SYehuda Sadeh EXPORT_SYMBOL(ceph_msg_dump);
2219