xref: /openbmc/linux/fs/dlm/lock.c (revision c494a447)
// SPDX-License-Identifier: GPL-2.0-only
/******************************************************************************
*******************************************************************************
**
**  Copyright (C) 2005-2010 Red Hat, Inc.  All rights reserved.
**
**
*******************************************************************************
******************************************************************************/

/* Central locking logic has four stages:

   dlm_lock()
   dlm_unlock()

   request_lock(ls, lkb)
   convert_lock(ls, lkb)
   unlock_lock(ls, lkb)
   cancel_lock(ls, lkb)

   _request_lock(r, lkb)
   _convert_lock(r, lkb)
   _unlock_lock(r, lkb)
   _cancel_lock(r, lkb)

   do_request(r, lkb)
   do_convert(r, lkb)
   do_unlock(r, lkb)
   do_cancel(r, lkb)

   Stage 1 (lock, unlock) is mainly about checking input args and
   splitting into one of the four main operations:

       dlm_lock          = request_lock
       dlm_lock+CONVERT  = convert_lock
       dlm_unlock        = unlock_lock
       dlm_unlock+CANCEL = cancel_lock

   Stage 2, xxxx_lock(), just finds and locks the relevant rsb which is
   provided to the next stage.

   Stage 3, _xxxx_lock(), determines if the operation is local or remote.
   When remote, it calls send_xxxx(), when local it calls do_xxxx().

   Stage 4, do_xxxx(), is the guts of the operation.  It manipulates the
   given rsb and lkb and queues callbacks.

   For remote operations, send_xxxx() results in the corresponding do_xxxx()
   function being executed on the remote node.  The connecting send/receive
   calls on local (L) and remote (R) nodes:

   L: send_xxxx()              ->  R: receive_xxxx()
                                   R: do_xxxx()
   L: receive_xxxx_reply()     <-  R: send_xxxx_reply()
*/
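
/* For illustration, a minimal sketch of a hypothetical caller driving
 * stage 1 from outside this file (assuming a lockspace handle from
 * dlm_new_lockspace() and caller-defined my_ast/my_bast callbacks;
 * not part of lock.c):
 *
 *	struct dlm_lksb lksb;
 *	int error;
 *
 *	// dlm_lock -> request_lock -> _request_lock -> do_request
 *	error = dlm_lock(ls, DLM_LOCK_EX, &lksb, 0, "myres", 5, 0,
 *			 my_ast, my_arg, my_bast);
 *
 *	// dlm_lock+CONVERT -> convert_lock -> _convert_lock -> do_convert
 *	error = dlm_lock(ls, DLM_LOCK_PR, &lksb, DLM_LKF_CONVERT, NULL, 0, 0,
 *			 my_ast, my_arg, my_bast);
 *
 *	// dlm_unlock -> unlock_lock; adding DLM_LKF_CANCEL -> cancel_lock
 *	error = dlm_unlock(ls, lksb.sb_lkid, 0, &lksb, my_arg);
 */
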
#include <trace/events/dlm.h>

#include <linux/types.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include "dlm_internal.h"
#include <linux/dlm_device.h>
#include "memory.h"
#include "midcomms.h"
#include "requestqueue.h"
#include "util.h"
#include "dir.h"
#include "member.h"
#include "lockspace.h"
#include "ast.h"
#include "lock.h"
#include "rcom.h"
#include "recover.h"
#include "lvb_table.h"
#include "user.h"
#include "config.h"

static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb);
static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb);
static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb);
static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb);
static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb);
static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode);
static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb);
static int send_remove(struct dlm_rsb *r);
static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
				    struct dlm_message *ms);
static int receive_extralen(struct dlm_message *ms);
static void do_purge(struct dlm_ls *ls, int nodeid, int pid);
static void del_timeout(struct dlm_lkb *lkb);
static void toss_rsb(struct kref *kref);

/*
 * Lock compatibility matrix - thanks Steve
 * UN = Unlocked state. Not really a state, used as a flag
 * PD = Padding. Used to make the matrix a nice power of two in size
 * Other states are the same as the VMS DLM.
 * Usage: matrix[grmode+1][rqmode+1]  (although m[rq+1][gr+1] is the same)
 */

static const int __dlm_compat_matrix[8][8] = {
      /* UN NL CR CW PR PW EX PD */
        {1, 1, 1, 1, 1, 1, 1, 0},       /* UN */
        {1, 1, 1, 1, 1, 1, 1, 0},       /* NL */
        {1, 1, 1, 1, 1, 1, 0, 0},       /* CR */
        {1, 1, 1, 1, 0, 0, 0, 0},       /* CW */
        {1, 1, 1, 0, 1, 0, 0, 0},       /* PR */
        {1, 1, 1, 0, 0, 0, 0, 0},       /* PW */
        {1, 1, 0, 0, 0, 0, 0, 0},       /* EX */
        {0, 0, 0, 0, 0, 0, 0, 0}        /* PD */
};

/*
 * This defines the direction of transfer of LVB data.
 * Granted mode is the row; requested mode is the column.
 * Usage: matrix[grmode+1][rqmode+1]
 * 1 = LVB is returned to the caller
 * 0 = LVB is written to the resource
 * -1 = nothing happens to the LVB
 */

const int dlm_lvb_operations[8][8] = {
        /* UN   NL  CR  CW  PR  PW  EX  PD*/
        {  -1,  1,  1,  1,  1,  1,  1, -1 }, /* UN */
        {  -1,  1,  1,  1,  1,  1,  1,  0 }, /* NL */
        {  -1, -1,  1,  1,  1,  1,  1,  0 }, /* CR */
        {  -1, -1, -1,  1,  1,  1,  1,  0 }, /* CW */
        {  -1, -1, -1, -1,  1,  1,  1,  0 }, /* PR */
        {  -1,  0,  0,  0,  0,  0,  1,  0 }, /* PW */
        {  -1,  0,  0,  0,  0,  0,  0,  0 }, /* EX */
        {  -1,  0,  0,  0,  0,  0,  0,  0 }  /* PD */
};

#define modes_compat(gr, rq) \
	__dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1]

int dlm_modes_compat(int mode1, int mode2)
{
	return __dlm_compat_matrix[mode1 + 1][mode2 + 1];
}
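
/* Example lookups (illustrative): with both granted and requested mode PR,
 * dlm_modes_compat(DLM_LOCK_PR, DLM_LOCK_PR) reads
 * __dlm_compat_matrix[4][4] == 1 (compatible), while PR against EX reads
 * [4][6] == 0 (incompatible).  Similarly,
 * dlm_lvb_operations[DLM_LOCK_PR + 1][DLM_LOCK_EX + 1] == 1: the LVB is
 * returned to the caller on a PR -> EX conversion.
 */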

/*
 * Compatibility matrix for conversions with QUECVT set.
 * Granted mode is the row; requested mode is the column.
 * Usage: matrix[grmode+1][rqmode+1]
 */

static const int __quecvt_compat_matrix[8][8] = {
      /* UN NL CR CW PR PW EX PD */
        {0, 0, 0, 0, 0, 0, 0, 0},       /* UN */
        {0, 0, 1, 1, 1, 1, 1, 0},       /* NL */
        {0, 0, 0, 1, 1, 1, 1, 0},       /* CR */
        {0, 0, 0, 0, 1, 1, 1, 0},       /* CW */
        {0, 0, 0, 1, 0, 1, 1, 0},       /* PR */
        {0, 0, 0, 0, 0, 0, 1, 0},       /* PW */
        {0, 0, 0, 0, 0, 0, 0, 0},       /* EX */
        {0, 0, 0, 0, 0, 0, 0, 0}        /* PD */
};

void dlm_print_lkb(struct dlm_lkb *lkb)
{
	printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x "
	       "sts %d rq %d gr %d wait_type %d wait_nodeid %d seq %llu\n",
	       lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags,
	       lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode,
	       lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_wait_nodeid,
	       (unsigned long long)lkb->lkb_recover_seq);
}

static void dlm_print_rsb(struct dlm_rsb *r)
{
	printk(KERN_ERR "rsb: nodeid %d master %d dir %d flags %lx first %x "
	       "rlc %d name %s\n",
	       r->res_nodeid, r->res_master_nodeid, r->res_dir_nodeid,
	       r->res_flags, r->res_first_lkid, r->res_recover_locks_count,
	       r->res_name);
}

void dlm_dump_rsb(struct dlm_rsb *r)
{
	struct dlm_lkb *lkb;

	dlm_print_rsb(r);

	printk(KERN_ERR "rsb: root_list empty %d recover_list empty %d\n",
	       list_empty(&r->res_root_list), list_empty(&r->res_recover_list));
	printk(KERN_ERR "rsb lookup list\n");
	list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup)
		dlm_print_lkb(lkb);
	printk(KERN_ERR "rsb grant queue:\n");
	list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue)
		dlm_print_lkb(lkb);
	printk(KERN_ERR "rsb convert queue:\n");
	list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue)
		dlm_print_lkb(lkb);
	printk(KERN_ERR "rsb wait queue:\n");
	list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue)
		dlm_print_lkb(lkb);
}

/* Threads cannot use the lockspace while it's being recovered */

static inline void dlm_lock_recovery(struct dlm_ls *ls)
{
	down_read(&ls->ls_in_recovery);
}

void dlm_unlock_recovery(struct dlm_ls *ls)
{
	up_read(&ls->ls_in_recovery);
}

int dlm_lock_recovery_try(struct dlm_ls *ls)
{
	return down_read_trylock(&ls->ls_in_recovery);
}

static inline int can_be_queued(struct dlm_lkb *lkb)
{
	return !(lkb->lkb_exflags & DLM_LKF_NOQUEUE);
}

static inline int force_blocking_asts(struct dlm_lkb *lkb)
{
	return (lkb->lkb_exflags & DLM_LKF_NOQUEUEBAST);
}

static inline int is_demoted(struct dlm_lkb *lkb)
{
	return (lkb->lkb_sbflags & DLM_SBF_DEMOTED);
}

static inline int is_altmode(struct dlm_lkb *lkb)
{
	return (lkb->lkb_sbflags & DLM_SBF_ALTMODE);
}

static inline int is_granted(struct dlm_lkb *lkb)
{
	return (lkb->lkb_status == DLM_LKSTS_GRANTED);
}

static inline int is_remote(struct dlm_rsb *r)
{
	DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r););
	return !!r->res_nodeid;
}

static inline int is_process_copy(struct dlm_lkb *lkb)
{
	return (lkb->lkb_nodeid && !(lkb->lkb_flags & DLM_IFL_MSTCPY));
}

static inline int is_master_copy(struct dlm_lkb *lkb)
{
	return (lkb->lkb_flags & DLM_IFL_MSTCPY) ? 1 : 0;
}

static inline int middle_conversion(struct dlm_lkb *lkb)
{
	if ((lkb->lkb_grmode==DLM_LOCK_PR && lkb->lkb_rqmode==DLM_LOCK_CW) ||
	    (lkb->lkb_rqmode==DLM_LOCK_PR && lkb->lkb_grmode==DLM_LOCK_CW))
		return 1;
	return 0;
}

static inline int down_conversion(struct dlm_lkb *lkb)
{
	return (!middle_conversion(lkb) && lkb->lkb_rqmode < lkb->lkb_grmode);
}

static inline int is_overlap_unlock(struct dlm_lkb *lkb)
{
	return lkb->lkb_flags & DLM_IFL_OVERLAP_UNLOCK;
}

static inline int is_overlap_cancel(struct dlm_lkb *lkb)
{
	return lkb->lkb_flags & DLM_IFL_OVERLAP_CANCEL;
}

static inline int is_overlap(struct dlm_lkb *lkb)
{
	return (lkb->lkb_flags & (DLM_IFL_OVERLAP_UNLOCK |
				  DLM_IFL_OVERLAP_CANCEL));
}

static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
{
	if (is_master_copy(lkb))
		return;

	del_timeout(lkb);

	DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb););

	/* if the operation was a cancel, then return -DLM_ECANCEL; if a
	   timeout caused the cancel, then return -ETIMEDOUT */
	if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_TIMEOUT_CANCEL)) {
		lkb->lkb_flags &= ~DLM_IFL_TIMEOUT_CANCEL;
		rv = -ETIMEDOUT;
	}

	if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_DEADLOCK_CANCEL)) {
		lkb->lkb_flags &= ~DLM_IFL_DEADLOCK_CANCEL;
		rv = -EDEADLK;
	}

	dlm_add_cb(lkb, DLM_CB_CAST, lkb->lkb_grmode, rv, lkb->lkb_sbflags);
}

static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	queue_cast(r, lkb,
		   is_overlap_unlock(lkb) ? -DLM_EUNLOCK : -DLM_ECANCEL);
}

static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
{
	if (is_master_copy(lkb)) {
		send_bast(r, lkb, rqmode);
	} else {
		dlm_add_cb(lkb, DLM_CB_BAST, rqmode, 0, 0);
	}
}

/*
 * Basic operations on rsb's and lkb's
 */

/* This is only called to add a reference when the code already holds
   a valid reference to the rsb, so there's no need for locking. */

static inline void hold_rsb(struct dlm_rsb *r)
{
	kref_get(&r->res_ref);
}

void dlm_hold_rsb(struct dlm_rsb *r)
{
	hold_rsb(r);
}

/* When all references to the rsb are gone it's transferred to
   the tossed list for later disposal. */

static void put_rsb(struct dlm_rsb *r)
{
	struct dlm_ls *ls = r->res_ls;
	uint32_t bucket = r->res_bucket;
	int rv;

	rv = kref_put_lock(&r->res_ref, toss_rsb,
			   &ls->ls_rsbtbl[bucket].lock);
	if (rv)
		spin_unlock(&ls->ls_rsbtbl[bucket].lock);
}

void dlm_put_rsb(struct dlm_rsb *r)
{
	put_rsb(r);
}

static int pre_rsb_struct(struct dlm_ls *ls)
{
	struct dlm_rsb *r1, *r2;
	int count = 0;

	spin_lock(&ls->ls_new_rsb_spin);
	if (ls->ls_new_rsb_count > dlm_config.ci_new_rsb_count / 2) {
		spin_unlock(&ls->ls_new_rsb_spin);
		return 0;
	}
	spin_unlock(&ls->ls_new_rsb_spin);

	r1 = dlm_allocate_rsb(ls);
	r2 = dlm_allocate_rsb(ls);

	spin_lock(&ls->ls_new_rsb_spin);
	if (r1) {
		list_add(&r1->res_hashchain, &ls->ls_new_rsb);
		ls->ls_new_rsb_count++;
	}
	if (r2) {
		list_add(&r2->res_hashchain, &ls->ls_new_rsb);
		ls->ls_new_rsb_count++;
	}
	count = ls->ls_new_rsb_count;
	spin_unlock(&ls->ls_new_rsb_spin);

	if (!count)
		return -ENOMEM;
	return 0;
}

/* If ls->ls_new_rsb is empty, return -EAGAIN so the caller can
   unlock any spinlocks, go back, and call pre_rsb_struct again.
   Otherwise, take an rsb off the list and return it. */

static int get_rsb_struct(struct dlm_ls *ls, char *name, int len,
			  struct dlm_rsb **r_ret)
{
	struct dlm_rsb *r;
	int count;

	spin_lock(&ls->ls_new_rsb_spin);
	if (list_empty(&ls->ls_new_rsb)) {
		count = ls->ls_new_rsb_count;
		spin_unlock(&ls->ls_new_rsb_spin);
		log_debug(ls, "find_rsb retry %d %d %s",
			  count, dlm_config.ci_new_rsb_count, name);
		return -EAGAIN;
	}

	r = list_first_entry(&ls->ls_new_rsb, struct dlm_rsb, res_hashchain);
	list_del(&r->res_hashchain);
	/* Convert the empty list_head to a NULL rb_node for tree usage: */
	memset(&r->res_hashnode, 0, sizeof(struct rb_node));
	ls->ls_new_rsb_count--;
	spin_unlock(&ls->ls_new_rsb_spin);

	r->res_ls = ls;
	r->res_length = len;
	memcpy(r->res_name, name, len);
	mutex_init(&r->res_mutex);

	INIT_LIST_HEAD(&r->res_lookup);
	INIT_LIST_HEAD(&r->res_grantqueue);
	INIT_LIST_HEAD(&r->res_convertqueue);
	INIT_LIST_HEAD(&r->res_waitqueue);
	INIT_LIST_HEAD(&r->res_root_list);
	INIT_LIST_HEAD(&r->res_recover_list);

	*r_ret = r;
	return 0;
}

static int rsb_cmp(struct dlm_rsb *r, const char *name, int nlen)
{
	char maxname[DLM_RESNAME_MAXLEN];

	memset(maxname, 0, DLM_RESNAME_MAXLEN);
	memcpy(maxname, name, nlen);
	return memcmp(r->res_name, maxname, DLM_RESNAME_MAXLEN);
}
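
/* Illustrative: rsb_cmp() zero-pads the search name to DLM_RESNAME_MAXLEN
 * before comparing, so names of different lengths compare as fixed-width
 * keys; e.g. "alpha" (len 5) and "alphabet" (len 8) are distinct keys
 * rather than a prefix match, giving the rb-tree a total order.
 */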

int dlm_search_rsb_tree(struct rb_root *tree, char *name, int len,
			struct dlm_rsb **r_ret)
{
	struct rb_node *node = tree->rb_node;
	struct dlm_rsb *r;
	int rc;

	while (node) {
		r = rb_entry(node, struct dlm_rsb, res_hashnode);
		rc = rsb_cmp(r, name, len);
		if (rc < 0)
			node = node->rb_left;
		else if (rc > 0)
			node = node->rb_right;
		else
			goto found;
	}
	*r_ret = NULL;
	return -EBADR;

 found:
	*r_ret = r;
	return 0;
}

static int rsb_insert(struct dlm_rsb *rsb, struct rb_root *tree)
{
	struct rb_node **newn = &tree->rb_node;
	struct rb_node *parent = NULL;
	int rc;

	while (*newn) {
		struct dlm_rsb *cur = rb_entry(*newn, struct dlm_rsb,
					       res_hashnode);

		parent = *newn;
		rc = rsb_cmp(cur, rsb->res_name, rsb->res_length);
		if (rc < 0)
			newn = &parent->rb_left;
		else if (rc > 0)
			newn = &parent->rb_right;
		else {
			log_print("rsb_insert match");
			dlm_dump_rsb(rsb);
			dlm_dump_rsb(cur);
			return -EEXIST;
		}
	}

	rb_link_node(&rsb->res_hashnode, parent, newn);
	rb_insert_color(&rsb->res_hashnode, tree);
	return 0;
}

/*
 * Find rsb in rsbtbl and potentially create/add one
 *
 * Delaying the release of rsb's has a similar benefit to applications keeping
 * NL locks on an rsb, but without the guarantee that the cached master value
 * will still be valid when the rsb is reused.  Apps aren't always smart enough
 * to keep NL locks on an rsb that they may lock again shortly; this can lead
 * to excessive master lookups and removals if we don't delay the release.
 *
 * Searching for an rsb means looking through both the normal list and toss
 * list.  When found on the toss list the rsb is moved to the normal list with
 * ref count of 1; when found on normal list the ref count is incremented.
 *
 * rsb's on the keep list are being used locally and refcounted.
 * rsb's on the toss list are not being used locally, and are not refcounted.
 *
 * The toss list rsb's were either
 * - previously used locally but not any more (were on keep list, then
 *   moved to toss list when last refcount dropped)
 * - created and put on toss list as a directory record for a lookup
 *   (we are the dir node for the res, but are not using the res right now,
 *   but some other node is)
 *
 * The purpose of find_rsb() is to return a refcounted rsb for local use.
 * So, if the given rsb is on the toss list, it is moved to the keep list
 * before being returned.
 *
 * toss_rsb() happens when all local usage of the rsb is done, i.e. no
 * more refcounts exist, so the rsb is moved from the keep list to the
 * toss list.
 *
 * rsb's on both keep and toss lists are used for doing name-to-master
 * lookups.  rsb's that are in use locally (and being refcounted) are on
 * the keep list, rsb's that are not in use locally (not refcounted) and
 * only exist for name/master lookups are on the toss list.
 *
 * rsb's on the toss list whose dir_nodeid is not local can have stale
 * name/master mappings.  So, remote requests on such rsb's can potentially
 * return with an error, which means the mapping is stale and needs to
 * be updated with a new lookup.  (The idea behind MASTER UNCERTAIN and
 * first_lkid is to keep only a single outstanding request on an rsb
 * while that rsb has a potentially stale master.)
 */
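
/* Keep/toss lifecycle sketched (an illustrative summary of the above):
 *
 *	find_rsb():      toss list -> keep list  (revived, ref count 1)
 *	toss_rsb():      keep list -> toss list  (last local ref dropped)
 *	shrink_bucket(): toss list -> freed      (after ci_toss_secs, with a
 *	                 send_remove to the dir node when we were the master)
 */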

static int find_rsb_dir(struct dlm_ls *ls, char *name, int len,
			uint32_t hash, uint32_t b,
			int dir_nodeid, int from_nodeid,
			unsigned int flags, struct dlm_rsb **r_ret)
{
	struct dlm_rsb *r = NULL;
	int our_nodeid = dlm_our_nodeid();
	int from_local = 0;
	int from_other = 0;
	int from_dir = 0;
	int create = 0;
	int error;

	if (flags & R_RECEIVE_REQUEST) {
		if (from_nodeid == dir_nodeid)
			from_dir = 1;
		else
			from_other = 1;
	} else if (flags & R_REQUEST) {
		from_local = 1;
	}

	/*
	 * flags & R_RECEIVE_RECOVER is from dlm_recover_master_copy, so
	 * from_nodeid has sent us a lock in dlm_recover_locks, believing
	 * we're the new master.  Our local recovery may not have set
	 * res_master_nodeid to our_nodeid yet, so allow either.  Don't
	 * create the rsb; dlm_recover_process_copy() will handle EBADR
	 * by resending.
	 *
	 * If someone sends us a request, we are the dir node, and we do
	 * not find the rsb anywhere, then recreate it.  This happens if
	 * someone sends us a request after we have removed/freed an rsb
	 * from our toss list.  (They sent a request instead of lookup
	 * because they are using an rsb from their toss list.)
	 */

	if (from_local || from_dir ||
	    (from_other && (dir_nodeid == our_nodeid))) {
		create = 1;
	}

 retry:
	if (create) {
		error = pre_rsb_struct(ls);
		if (error < 0)
			goto out;
	}

	spin_lock(&ls->ls_rsbtbl[b].lock);

	error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
	if (error)
		goto do_toss;

	/*
	 * rsb is active, so we can't check master_nodeid without lock_rsb.
	 */

	kref_get(&r->res_ref);
	goto out_unlock;


 do_toss:
	error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
	if (error)
		goto do_new;

	/*
	 * rsb found inactive (master_nodeid may be out of date unless
	 * we are the dir_nodeid or were the master).  No other thread
	 * is using this rsb because it's on the toss list, so we can
	 * look at or update res_master_nodeid without lock_rsb.
	 */

	if ((r->res_master_nodeid != our_nodeid) && from_other) {
		/* our rsb was not master, and another node (not the dir node)
		   has sent us a request */
		log_debug(ls, "find_rsb toss from_other %d master %d dir %d %s",
			  from_nodeid, r->res_master_nodeid, dir_nodeid,
			  r->res_name);
		error = -ENOTBLK;
		goto out_unlock;
	}

	if ((r->res_master_nodeid != our_nodeid) && from_dir) {
		/* don't think this should ever happen */
		log_error(ls, "find_rsb toss from_dir %d master %d",
			  from_nodeid, r->res_master_nodeid);
		dlm_print_rsb(r);
		/* fix it and go on */
		r->res_master_nodeid = our_nodeid;
		r->res_nodeid = 0;
		rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
		r->res_first_lkid = 0;
	}

	if (from_local && (r->res_master_nodeid != our_nodeid)) {
		/* Because we have held no locks on this rsb,
		   res_master_nodeid could have become stale. */
		rsb_set_flag(r, RSB_MASTER_UNCERTAIN);
		r->res_first_lkid = 0;
	}

	rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
	error = rsb_insert(r, &ls->ls_rsbtbl[b].keep);
	goto out_unlock;


 do_new:
	/*
	 * rsb not found
	 */

	if (error == -EBADR && !create)
		goto out_unlock;

	error = get_rsb_struct(ls, name, len, &r);
	if (error == -EAGAIN) {
		spin_unlock(&ls->ls_rsbtbl[b].lock);
		goto retry;
	}
	if (error)
		goto out_unlock;

	r->res_hash = hash;
	r->res_bucket = b;
	r->res_dir_nodeid = dir_nodeid;
	kref_init(&r->res_ref);

	if (from_dir) {
		/* want to see how often this happens */
		log_debug(ls, "find_rsb new from_dir %d recreate %s",
			  from_nodeid, r->res_name);
		r->res_master_nodeid = our_nodeid;
		r->res_nodeid = 0;
		goto out_add;
	}

	if (from_other && (dir_nodeid != our_nodeid)) {
		/* should never happen */
		log_error(ls, "find_rsb new from_other %d dir %d our %d %s",
			  from_nodeid, dir_nodeid, our_nodeid, r->res_name);
		dlm_free_rsb(r);
		r = NULL;
		error = -ENOTBLK;
		goto out_unlock;
	}

	if (from_other) {
		log_debug(ls, "find_rsb new from_other %d dir %d %s",
			  from_nodeid, dir_nodeid, r->res_name);
	}

	if (dir_nodeid == our_nodeid) {
		/* When we are the dir nodeid, we can set the master
		   node immediately */
		r->res_master_nodeid = our_nodeid;
		r->res_nodeid = 0;
	} else {
		/* set_master will send_lookup to dir_nodeid */
		r->res_master_nodeid = 0;
		r->res_nodeid = -1;
	}

 out_add:
	error = rsb_insert(r, &ls->ls_rsbtbl[b].keep);
 out_unlock:
	spin_unlock(&ls->ls_rsbtbl[b].lock);
 out:
	*r_ret = r;
	return error;
}

/* During recovery, other nodes can send us new MSTCPY locks (from
   dlm_recover_locks) before we've made ourselves master (in
   dlm_recover_masters). */

static int find_rsb_nodir(struct dlm_ls *ls, char *name, int len,
			  uint32_t hash, uint32_t b,
			  int dir_nodeid, int from_nodeid,
			  unsigned int flags, struct dlm_rsb **r_ret)
{
	struct dlm_rsb *r = NULL;
	int our_nodeid = dlm_our_nodeid();
	int recover = (flags & R_RECEIVE_RECOVER);
	int error;

 retry:
	error = pre_rsb_struct(ls);
	if (error < 0)
		goto out;

	spin_lock(&ls->ls_rsbtbl[b].lock);

	error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
	if (error)
		goto do_toss;

	/*
	 * rsb is active, so we can't check master_nodeid without lock_rsb.
	 */

	kref_get(&r->res_ref);
	goto out_unlock;


 do_toss:
	error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
	if (error)
		goto do_new;

	/*
	 * rsb found inactive. No other thread is using this rsb because
	 * it's on the toss list, so we can look at or update
	 * res_master_nodeid without lock_rsb.
	 */

	if (!recover && (r->res_master_nodeid != our_nodeid) && from_nodeid) {
		/* our rsb is not master, and another node has sent us a
		   request; this should never happen */
		log_error(ls, "find_rsb toss from_nodeid %d master %d dir %d",
			  from_nodeid, r->res_master_nodeid, dir_nodeid);
		dlm_print_rsb(r);
		error = -ENOTBLK;
		goto out_unlock;
	}

	if (!recover && (r->res_master_nodeid != our_nodeid) &&
	    (dir_nodeid == our_nodeid)) {
		/* our rsb is not master, and we are dir; may as well fix it;
		   this should never happen */
		log_error(ls, "find_rsb toss our %d master %d dir %d",
			  our_nodeid, r->res_master_nodeid, dir_nodeid);
		dlm_print_rsb(r);
		r->res_master_nodeid = our_nodeid;
		r->res_nodeid = 0;
	}

	rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
	error = rsb_insert(r, &ls->ls_rsbtbl[b].keep);
	goto out_unlock;


 do_new:
	/*
	 * rsb not found
	 */

	error = get_rsb_struct(ls, name, len, &r);
	if (error == -EAGAIN) {
		spin_unlock(&ls->ls_rsbtbl[b].lock);
		goto retry;
	}
	if (error)
		goto out_unlock;

	r->res_hash = hash;
	r->res_bucket = b;
	r->res_dir_nodeid = dir_nodeid;
	r->res_master_nodeid = dir_nodeid;
	r->res_nodeid = (dir_nodeid == our_nodeid) ? 0 : dir_nodeid;
	kref_init(&r->res_ref);

	error = rsb_insert(r, &ls->ls_rsbtbl[b].keep);
 out_unlock:
	spin_unlock(&ls->ls_rsbtbl[b].lock);
 out:
	*r_ret = r;
	return error;
}

static int find_rsb(struct dlm_ls *ls, char *name, int len, int from_nodeid,
		    unsigned int flags, struct dlm_rsb **r_ret)
{
	uint32_t hash, b;
	int dir_nodeid;

	if (len > DLM_RESNAME_MAXLEN)
		return -EINVAL;

	hash = jhash(name, len, 0);
	b = hash & (ls->ls_rsbtbl_size - 1);

	dir_nodeid = dlm_hash2nodeid(ls, hash);

	if (dlm_no_directory(ls))
		return find_rsb_nodir(ls, name, len, hash, b, dir_nodeid,
				      from_nodeid, flags, r_ret);
	else
		return find_rsb_dir(ls, name, len, hash, b, dir_nodeid,
				    from_nodeid, flags, r_ret);
}
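
/* Illustrative numbers: the bucket mask assumes ls_rsbtbl_size is a power
 * of two.  With, say, a 1024-bucket table, a name hashing to 0x12345678
 * lands in bucket 0x278 (0x12345678 & 1023), and dlm_hash2nodeid() maps
 * the same hash to the resource's directory node.
 */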

/* we have received a request and found that res_master_nodeid != our_nodeid,
   so we need to return an error or make ourselves the master */

static int validate_master_nodeid(struct dlm_ls *ls, struct dlm_rsb *r,
				  int from_nodeid)
{
	if (dlm_no_directory(ls)) {
		log_error(ls, "find_rsb keep from_nodeid %d master %d dir %d",
			  from_nodeid, r->res_master_nodeid,
			  r->res_dir_nodeid);
		dlm_print_rsb(r);
		return -ENOTBLK;
	}

	if (from_nodeid != r->res_dir_nodeid) {
		/* our rsb is not master, and another node (not the dir node)
		   has sent us a request.  this is much more common when our
		   master_nodeid is zero, so limit debug to non-zero.  */

		if (r->res_master_nodeid) {
			log_debug(ls, "validate master from_other %d master %d "
				  "dir %d first %x %s", from_nodeid,
				  r->res_master_nodeid, r->res_dir_nodeid,
				  r->res_first_lkid, r->res_name);
		}
		return -ENOTBLK;
	} else {
		/* our rsb is not master, but the dir nodeid has sent us a
		   request; this could happen with master 0 / res_nodeid -1 */

		if (r->res_master_nodeid) {
			log_error(ls, "validate master from_dir %d master %d "
				  "first %x %s",
				  from_nodeid, r->res_master_nodeid,
				  r->res_first_lkid, r->res_name);
		}

		r->res_master_nodeid = dlm_our_nodeid();
		r->res_nodeid = 0;
		return 0;
	}
}

static void __dlm_master_lookup(struct dlm_ls *ls, struct dlm_rsb *r, int our_nodeid,
				int from_nodeid, bool toss_list, unsigned int flags,
				int *r_nodeid, int *result)
{
	int fix_master = (flags & DLM_LU_RECOVER_MASTER);
	int from_master = (flags & DLM_LU_RECOVER_DIR);

	if (r->res_dir_nodeid != our_nodeid) {
		/* should not happen, but may as well fix it and carry on */
		log_error(ls, "%s res_dir %d our %d %s", __func__,
			  r->res_dir_nodeid, our_nodeid, r->res_name);
		r->res_dir_nodeid = our_nodeid;
	}

	if (fix_master && dlm_is_removed(ls, r->res_master_nodeid)) {
		/* Recovery uses this function to set a new master when
		 * the previous master failed.  Setting NEW_MASTER will
		 * force dlm_recover_masters to call recover_master on this
		 * rsb even though the res_nodeid is no longer removed.
		 */

		r->res_master_nodeid = from_nodeid;
		r->res_nodeid = from_nodeid;
		rsb_set_flag(r, RSB_NEW_MASTER);

		if (toss_list) {
			/* I don't think we should ever find it on toss list. */
			log_error(ls, "%s fix_master on toss", __func__);
			dlm_dump_rsb(r);
		}
	}

	if (from_master && (r->res_master_nodeid != from_nodeid)) {
		/* this will happen if from_nodeid became master during
		 * a previous recovery cycle, and we aborted the previous
		 * cycle before recovering this master value
		 */

		log_limit(ls, "%s from_master %d master_nodeid %d res_nodeid %d first %x %s",
			  __func__, from_nodeid, r->res_master_nodeid,
			  r->res_nodeid, r->res_first_lkid, r->res_name);

		if (r->res_master_nodeid == our_nodeid) {
			log_error(ls, "from_master %d our_master", from_nodeid);
			dlm_dump_rsb(r);
			goto ret_assign;
		}

		r->res_master_nodeid = from_nodeid;
		r->res_nodeid = from_nodeid;
		rsb_set_flag(r, RSB_NEW_MASTER);
	}

	if (!r->res_master_nodeid) {
		/* this will happen if recovery happens while we're looking
		 * up the master for this rsb
		 */

		log_debug(ls, "%s master 0 to %d first %x %s", __func__,
			  from_nodeid, r->res_first_lkid, r->res_name);
		r->res_master_nodeid = from_nodeid;
		r->res_nodeid = from_nodeid;
	}

	if (!from_master && !fix_master &&
	    (r->res_master_nodeid == from_nodeid)) {
		/* this can happen when the master sends remove, the dir node
		 * finds the rsb on the keep list and ignores the remove,
		 * and the former master sends a lookup
		 */

		log_limit(ls, "%s from master %d flags %x first %x %s",
			  __func__, from_nodeid, flags, r->res_first_lkid,
			  r->res_name);
	}

 ret_assign:
	*r_nodeid = r->res_master_nodeid;
	if (result)
		*result = DLM_LU_MATCH;
}

/*
 * We're the dir node for this res and another node wants to know the
 * master nodeid.  During normal operation (non recovery) this is only
 * called from receive_lookup(); master lookups when the local node is
 * the dir node are done by find_rsb().
 *
 * normal operation, we are the dir node for a resource
 * . _request_lock
 * . set_master
 * . send_lookup
 * . receive_lookup
 * . dlm_master_lookup flags 0
 *
 * recover directory, we are rebuilding dir for all resources
 * . dlm_recover_directory
 * . dlm_rcom_names
 *   remote node sends back the rsb names it is master of and we are dir of
 * . dlm_master_lookup RECOVER_DIR (fix_master 0, from_master 1)
 *   we either create new rsb setting remote node as master, or find existing
 *   rsb and set master to be the remote node.
 *
 * recover masters, we are finding the new master for resources
 * . dlm_recover_masters
 * . recover_master
 * . dlm_send_rcom_lookup
 * . receive_rcom_lookup
 * . dlm_master_lookup RECOVER_MASTER (fix_master 1, from_master 0)
 */
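
/* Illustrative call (hypothetical values): as the dir node handling a
 * normal-operation lookup (flags 0) from node 3 for "myres":
 *
 *	error = dlm_master_lookup(ls, 3, "myres", 5, 0, &r_nodeid, &result);
 *
 * If an rsb for the name already exists, r_nodeid is set to its
 * res_master_nodeid and result to DLM_LU_MATCH; if a new dir record has
 * to be created, node 3 is recorded as master and result is DLM_LU_ADD.
 */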

int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, char *name, int len,
		      unsigned int flags, int *r_nodeid, int *result)
{
	struct dlm_rsb *r = NULL;
	uint32_t hash, b;
	int our_nodeid = dlm_our_nodeid();
	int dir_nodeid, error;

	if (len > DLM_RESNAME_MAXLEN)
		return -EINVAL;

	if (from_nodeid == our_nodeid) {
		log_error(ls, "dlm_master_lookup from our_nodeid %d flags %x",
			  our_nodeid, flags);
		return -EINVAL;
	}

	hash = jhash(name, len, 0);
	b = hash & (ls->ls_rsbtbl_size - 1);

	dir_nodeid = dlm_hash2nodeid(ls, hash);
	if (dir_nodeid != our_nodeid) {
		log_error(ls, "dlm_master_lookup from %d dir %d our %d h %x %d",
			  from_nodeid, dir_nodeid, our_nodeid, hash,
			  ls->ls_num_nodes);
		*r_nodeid = -1;
		return -EINVAL;
	}

 retry:
	error = pre_rsb_struct(ls);
	if (error < 0)
		return error;

	spin_lock(&ls->ls_rsbtbl[b].lock);
	error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
	if (!error) {
		/* because the rsb is active, we need to lock_rsb before
		 * checking/changing res_master_nodeid
		 */

		hold_rsb(r);
		spin_unlock(&ls->ls_rsbtbl[b].lock);
		lock_rsb(r);

		__dlm_master_lookup(ls, r, our_nodeid, from_nodeid, false,
				    flags, r_nodeid, result);

		/* the rsb was active */
		unlock_rsb(r);
		put_rsb(r);

		return 0;
	}

	error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
	if (error)
		goto not_found;

	/* because the rsb is inactive (on toss list), it's not refcounted
	 * and lock_rsb is not used, but is protected by the rsbtbl lock
	 */

	__dlm_master_lookup(ls, r, our_nodeid, from_nodeid, true, flags,
			    r_nodeid, result);

	r->res_toss_time = jiffies;
	/* the rsb was inactive (on toss list) */
	spin_unlock(&ls->ls_rsbtbl[b].lock);

	return 0;

 not_found:
	error = get_rsb_struct(ls, name, len, &r);
	if (error == -EAGAIN) {
		spin_unlock(&ls->ls_rsbtbl[b].lock);
		goto retry;
	}
	if (error)
		goto out_unlock;

	r->res_hash = hash;
	r->res_bucket = b;
	r->res_dir_nodeid = our_nodeid;
	r->res_master_nodeid = from_nodeid;
	r->res_nodeid = from_nodeid;
	kref_init(&r->res_ref);
	r->res_toss_time = jiffies;

	error = rsb_insert(r, &ls->ls_rsbtbl[b].toss);
	if (error) {
		/* should never happen */
		dlm_free_rsb(r);
		spin_unlock(&ls->ls_rsbtbl[b].lock);
		goto retry;
	}

	if (result)
		*result = DLM_LU_ADD;
	*r_nodeid = from_nodeid;
 out_unlock:
	spin_unlock(&ls->ls_rsbtbl[b].lock);
	return error;
}

static void dlm_dump_rsb_hash(struct dlm_ls *ls, uint32_t hash)
{
	struct rb_node *n;
	struct dlm_rsb *r;
	int i;

	for (i = 0; i < ls->ls_rsbtbl_size; i++) {
		spin_lock(&ls->ls_rsbtbl[i].lock);
		for (n = rb_first(&ls->ls_rsbtbl[i].keep); n; n = rb_next(n)) {
			r = rb_entry(n, struct dlm_rsb, res_hashnode);
			if (r->res_hash == hash)
				dlm_dump_rsb(r);
		}
		spin_unlock(&ls->ls_rsbtbl[i].lock);
	}
}

void dlm_dump_rsb_name(struct dlm_ls *ls, char *name, int len)
{
	struct dlm_rsb *r = NULL;
	uint32_t hash, b;
	int error;

	hash = jhash(name, len, 0);
	b = hash & (ls->ls_rsbtbl_size - 1);

	spin_lock(&ls->ls_rsbtbl[b].lock);
	error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
	if (!error)
		goto out_dump;

	error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
	if (error)
		goto out;
 out_dump:
	dlm_dump_rsb(r);
 out:
	spin_unlock(&ls->ls_rsbtbl[b].lock);
}

static void toss_rsb(struct kref *kref)
{
	struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
	struct dlm_ls *ls = r->res_ls;

	DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r););
	kref_init(&r->res_ref);
	rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[r->res_bucket].keep);
	rsb_insert(r, &ls->ls_rsbtbl[r->res_bucket].toss);
	r->res_toss_time = jiffies;
	ls->ls_rsbtbl[r->res_bucket].flags |= DLM_RTF_SHRINK;
	if (r->res_lvbptr) {
		dlm_free_lvb(r->res_lvbptr);
		r->res_lvbptr = NULL;
	}
}

/* See comment for unhold_lkb */

static void unhold_rsb(struct dlm_rsb *r)
{
	int rv;
	rv = kref_put(&r->res_ref, toss_rsb);
	DLM_ASSERT(!rv, dlm_dump_rsb(r););
}

static void kill_rsb(struct kref *kref)
{
	struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);

	/* All work is done after the return from kref_put() so we
	   can release the write_lock before the remove and free. */

	DLM_ASSERT(list_empty(&r->res_lookup), dlm_dump_rsb(r););
	DLM_ASSERT(list_empty(&r->res_grantqueue), dlm_dump_rsb(r););
	DLM_ASSERT(list_empty(&r->res_convertqueue), dlm_dump_rsb(r););
	DLM_ASSERT(list_empty(&r->res_waitqueue), dlm_dump_rsb(r););
	DLM_ASSERT(list_empty(&r->res_root_list), dlm_dump_rsb(r););
	DLM_ASSERT(list_empty(&r->res_recover_list), dlm_dump_rsb(r););
}

/* Attaching/detaching lkb's from rsb's is for rsb reference counting.
   The rsb must exist as long as any lkb's for it do. */

static void attach_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	hold_rsb(r);
	lkb->lkb_resource = r;
}

static void detach_lkb(struct dlm_lkb *lkb)
{
	if (lkb->lkb_resource) {
		put_rsb(lkb->lkb_resource);
		lkb->lkb_resource = NULL;
	}
}

static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret,
		       int start, int end)
{
	struct dlm_lkb *lkb;
	int rv;

	lkb = dlm_allocate_lkb(ls);
	if (!lkb)
		return -ENOMEM;

	lkb->lkb_nodeid = -1;
	lkb->lkb_grmode = DLM_LOCK_IV;
	kref_init(&lkb->lkb_ref);
	INIT_LIST_HEAD(&lkb->lkb_ownqueue);
	INIT_LIST_HEAD(&lkb->lkb_rsb_lookup);
	INIT_LIST_HEAD(&lkb->lkb_time_list);
	INIT_LIST_HEAD(&lkb->lkb_cb_list);
	mutex_init(&lkb->lkb_cb_mutex);
	INIT_WORK(&lkb->lkb_cb_work, dlm_callback_work);

	idr_preload(GFP_NOFS);
	spin_lock(&ls->ls_lkbidr_spin);
	rv = idr_alloc(&ls->ls_lkbidr, lkb, start, end, GFP_NOWAIT);
	if (rv >= 0)
		lkb->lkb_id = rv;
	spin_unlock(&ls->ls_lkbidr_spin);
	idr_preload_end();

	if (rv < 0) {
		log_error(ls, "create_lkb idr error %d", rv);
		dlm_free_lkb(lkb);
		return rv;
	}

	*lkb_ret = lkb;
	return 0;
}

static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
{
	return _create_lkb(ls, lkb_ret, 1, 0);
}

static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret)
{
	struct dlm_lkb *lkb;

	spin_lock(&ls->ls_lkbidr_spin);
	lkb = idr_find(&ls->ls_lkbidr, lkid);
	if (lkb)
		kref_get(&lkb->lkb_ref);
	spin_unlock(&ls->ls_lkbidr_spin);

	*lkb_ret = lkb;
	return lkb ? 0 : -ENOENT;
}

static void kill_lkb(struct kref *kref)
{
	struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref);

	/* All work is done after the return from kref_put() so we
	   can release the write_lock before the detach_lkb */

	DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
}

/* __put_lkb() is used when an lkb may not have an rsb attached to
   it so we need to provide the lockspace explicitly */

static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb)
{
	uint32_t lkid = lkb->lkb_id;
	int rv;

	rv = kref_put_lock(&lkb->lkb_ref, kill_lkb,
			   &ls->ls_lkbidr_spin);
	if (rv) {
		idr_remove(&ls->ls_lkbidr, lkid);
		spin_unlock(&ls->ls_lkbidr_spin);

		detach_lkb(lkb);

		/* for local/process lkbs, lvbptr points to caller's lksb */
		if (lkb->lkb_lvbptr && is_master_copy(lkb))
			dlm_free_lvb(lkb->lkb_lvbptr);
		dlm_free_lkb(lkb);
	}

	return rv;
}

int dlm_put_lkb(struct dlm_lkb *lkb)
{
	struct dlm_ls *ls;

	DLM_ASSERT(lkb->lkb_resource, dlm_print_lkb(lkb););
	DLM_ASSERT(lkb->lkb_resource->res_ls, dlm_print_lkb(lkb););

	ls = lkb->lkb_resource->res_ls;
	return __put_lkb(ls, lkb);
}

/* This is only called to add a reference when the code already holds
   a valid reference to the lkb, so there's no need for locking. */

static inline void hold_lkb(struct dlm_lkb *lkb)
{
	kref_get(&lkb->lkb_ref);
}

/* This is called when we need to remove a reference and are certain
   it's not the last ref.  e.g. del_lkb is always called between a
   find_lkb/put_lkb and is always the inverse of a previous add_lkb.
   put_lkb would work fine, but would involve unnecessary locking */

static inline void unhold_lkb(struct dlm_lkb *lkb)
{
	int rv;
	rv = kref_put(&lkb->lkb_ref, kill_lkb);
	DLM_ASSERT(!rv, dlm_print_lkb(lkb););
}

static void lkb_add_ordered(struct list_head *new, struct list_head *head,
			    int mode)
{
	struct dlm_lkb *lkb = NULL, *iter;

	list_for_each_entry(iter, head, lkb_statequeue)
		if (iter->lkb_rqmode < mode) {
			lkb = iter;
			list_add_tail(new, &iter->lkb_statequeue);
			break;
		}

	if (!lkb)
		list_add_tail(new, head);
}
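
/* Illustrative: lock modes order EX(5) > PW(4) > PR(3) > CW(2) > CR(1) >
 * NL(0), so inserting a PW entry into a queue holding {EX, PR, NL} yields
 * {EX, PW, PR, NL}: the scan stops at the first entry with a lower mode
 * and adds the new one just before it.
 */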

/* add/remove lkb to rsb's grant/convert/wait queue */

static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status)
{
	kref_get(&lkb->lkb_ref);

	DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););

	lkb->lkb_timestamp = ktime_get();

	lkb->lkb_status = status;

	switch (status) {
	case DLM_LKSTS_WAITING:
		if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
			list_add(&lkb->lkb_statequeue, &r->res_waitqueue);
		else
			list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue);
		break;
	case DLM_LKSTS_GRANTED:
		/* convention says granted locks kept in order of grmode */
		lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue,
				lkb->lkb_grmode);
		break;
	case DLM_LKSTS_CONVERT:
		if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
			list_add(&lkb->lkb_statequeue, &r->res_convertqueue);
		else
			list_add_tail(&lkb->lkb_statequeue,
				      &r->res_convertqueue);
		break;
	default:
		DLM_ASSERT(0, dlm_print_lkb(lkb); printk("sts=%d\n", status););
	}
}

static void del_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	lkb->lkb_status = 0;
	list_del(&lkb->lkb_statequeue);
	unhold_lkb(lkb);
}

static void move_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int sts)
{
	hold_lkb(lkb);
	del_lkb(r, lkb);
	add_lkb(r, lkb, sts);
	unhold_lkb(lkb);
}

static int msg_reply_type(int mstype)
{
	switch (mstype) {
	case DLM_MSG_REQUEST:
		return DLM_MSG_REQUEST_REPLY;
	case DLM_MSG_CONVERT:
		return DLM_MSG_CONVERT_REPLY;
	case DLM_MSG_UNLOCK:
		return DLM_MSG_UNLOCK_REPLY;
	case DLM_MSG_CANCEL:
		return DLM_MSG_CANCEL_REPLY;
	case DLM_MSG_LOOKUP:
		return DLM_MSG_LOOKUP_REPLY;
	}
	return -1;
}

static int nodeid_warned(int nodeid, int num_nodes, int *warned)
{
	int i;

	for (i = 0; i < num_nodes; i++) {
		if (!warned[i]) {
			warned[i] = nodeid;
			return 0;
		}
		if (warned[i] == nodeid)
			return 1;
	}
	return 0;
}

void dlm_scan_waiters(struct dlm_ls *ls)
{
	struct dlm_lkb *lkb;
	s64 us;
	s64 debug_maxus = 0;
	u32 debug_scanned = 0;
	u32 debug_expired = 0;
	int num_nodes = 0;
	int *warned = NULL;

	if (!dlm_config.ci_waitwarn_us)
		return;

	mutex_lock(&ls->ls_waiters_mutex);

	list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
		if (!lkb->lkb_wait_time)
			continue;

		debug_scanned++;

		us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_wait_time));

		if (us < dlm_config.ci_waitwarn_us)
			continue;

		lkb->lkb_wait_time = 0;

		debug_expired++;
		if (us > debug_maxus)
			debug_maxus = us;

		if (!num_nodes) {
			num_nodes = ls->ls_num_nodes;
			warned = kcalloc(num_nodes, sizeof(int), GFP_KERNEL);
		}
		if (!warned)
			continue;
		if (nodeid_warned(lkb->lkb_wait_nodeid, num_nodes, warned))
			continue;

		log_error(ls, "waitwarn %x %lld %d us check connection to "
			  "node %d", lkb->lkb_id, (long long)us,
			  dlm_config.ci_waitwarn_us, lkb->lkb_wait_nodeid);
	}
	mutex_unlock(&ls->ls_waiters_mutex);
	kfree(warned);

	if (debug_expired)
		log_debug(ls, "scan_waiters %u warn %u over %d us max %lld us",
			  debug_scanned, debug_expired,
			  dlm_config.ci_waitwarn_us, (long long)debug_maxus);
}

/* add/remove lkb from global waiters list of lkb's waiting for
   a reply from a remote node */

static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid)
{
	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
	int error = 0;

	mutex_lock(&ls->ls_waiters_mutex);

	if (is_overlap_unlock(lkb) ||
	    (is_overlap_cancel(lkb) && (mstype == DLM_MSG_CANCEL))) {
		error = -EINVAL;
		goto out;
	}

	if (lkb->lkb_wait_type || is_overlap_cancel(lkb)) {
		switch (mstype) {
		case DLM_MSG_UNLOCK:
			lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK;
			break;
		case DLM_MSG_CANCEL:
			lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL;
			break;
		default:
			error = -EBUSY;
			goto out;
		}
		lkb->lkb_wait_count++;
		hold_lkb(lkb);

		log_debug(ls, "addwait %x cur %d overlap %d count %d f %x",
			  lkb->lkb_id, lkb->lkb_wait_type, mstype,
			  lkb->lkb_wait_count, lkb->lkb_flags);
		goto out;
	}

	DLM_ASSERT(!lkb->lkb_wait_count,
		   dlm_print_lkb(lkb);
		   printk("wait_count %d\n", lkb->lkb_wait_count););

	lkb->lkb_wait_count++;
	lkb->lkb_wait_type = mstype;
	lkb->lkb_wait_time = ktime_get();
	lkb->lkb_wait_nodeid = to_nodeid; /* for debugging */
	hold_lkb(lkb);
	list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
 out:
	if (error)
		log_error(ls, "addwait error %x %d flags %x %d %d %s",
			  lkb->lkb_id, error, lkb->lkb_flags, mstype,
			  lkb->lkb_wait_type, lkb->lkb_resource->res_name);
	mutex_unlock(&ls->ls_waiters_mutex);
	return error;
}

/* We clear the RESEND flag because we might be taking an lkb off the waiters
   list as part of process_requestqueue (e.g. a lookup that has an optimized
   request reply on the requestqueue) between dlm_recover_waiters_pre() which
   set RESEND and dlm_recover_waiters_post() */

static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype,
				struct dlm_message *ms)
{
	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
	int overlap_done = 0;

	if (is_overlap_unlock(lkb) && (mstype == DLM_MSG_UNLOCK_REPLY)) {
		log_debug(ls, "remwait %x unlock_reply overlap", lkb->lkb_id);
		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
		overlap_done = 1;
		goto out_del;
	}

	if (is_overlap_cancel(lkb) && (mstype == DLM_MSG_CANCEL_REPLY)) {
		log_debug(ls, "remwait %x cancel_reply overlap", lkb->lkb_id);
		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
		overlap_done = 1;
		goto out_del;
	}

	/* Cancel state was preemptively cleared by a successful convert,
	   see next comment, nothing to do. */

	if ((mstype == DLM_MSG_CANCEL_REPLY) &&
	    (lkb->lkb_wait_type != DLM_MSG_CANCEL)) {
		log_debug(ls, "remwait %x cancel_reply wait_type %d",
			  lkb->lkb_id, lkb->lkb_wait_type);
		return -1;
	}

	/* Remove for the convert reply, and preemptively remove for the
	   cancel reply.  A convert has been granted while there's still
	   an outstanding cancel on it (the cancel is moot and the result
	   in the cancel reply should be 0).  We preempt the cancel reply
	   because the app gets the convert result and then can follow up
	   with another op, like convert.  This subsequent op would see the
	   lingering state of the cancel and fail with -EBUSY. */

	if ((mstype == DLM_MSG_CONVERT_REPLY) &&
	    (lkb->lkb_wait_type == DLM_MSG_CONVERT) &&
	    is_overlap_cancel(lkb) && ms && !ms->m_result) {
		log_debug(ls, "remwait %x convert_reply zap overlap_cancel",
			  lkb->lkb_id);
		lkb->lkb_wait_type = 0;
		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
		lkb->lkb_wait_count--;
		unhold_lkb(lkb);
		goto out_del;
	}

	/* N.B. type of reply may not always correspond to type of original
	   msg due to lookup->request optimization, verify others? */

	if (lkb->lkb_wait_type) {
		lkb->lkb_wait_type = 0;
		goto out_del;
	}

	log_error(ls, "remwait error %x remote %d %x msg %d flags %x no wait",
		  lkb->lkb_id, ms ? le32_to_cpu(ms->m_header.h_nodeid) : 0,
		  lkb->lkb_remid, mstype, lkb->lkb_flags);
	return -1;

 out_del:
	/* the force-unlock/cancel has completed and we haven't recvd a reply
	   to the op that was in progress prior to the unlock/cancel; we
	   give up on any reply to the earlier op.  FIXME: not sure when/how
	   this would happen */

	if (overlap_done && lkb->lkb_wait_type) {
		log_error(ls, "remwait error %x reply %d wait_type %d overlap",
			  lkb->lkb_id, mstype, lkb->lkb_wait_type);
		lkb->lkb_wait_count--;
		unhold_lkb(lkb);
		lkb->lkb_wait_type = 0;
	}

	DLM_ASSERT(lkb->lkb_wait_count, dlm_print_lkb(lkb););

	lkb->lkb_flags &= ~DLM_IFL_RESEND;
	lkb->lkb_wait_count--;
	if (!lkb->lkb_wait_count)
		list_del_init(&lkb->lkb_wait_reply);
	unhold_lkb(lkb);
	return 0;
}

static int remove_from_waiters(struct dlm_lkb *lkb, int mstype)
{
	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
	int error;

	mutex_lock(&ls->ls_waiters_mutex);
	error = _remove_from_waiters(lkb, mstype, NULL);
	mutex_unlock(&ls->ls_waiters_mutex);
	return error;
}

/* Handles situations where we might be processing a "fake" or "stub" reply in
   which we can't try to take waiters_mutex again. */

static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms)
{
	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
	int error;

	if (ms->m_flags != cpu_to_le32(DLM_IFL_STUB_MS))
		mutex_lock(&ls->ls_waiters_mutex);
	error = _remove_from_waiters(lkb, le32_to_cpu(ms->m_type), ms);
	if (ms->m_flags != cpu_to_le32(DLM_IFL_STUB_MS))
		mutex_unlock(&ls->ls_waiters_mutex);
	return error;
}

/* If there's an rsb for the same resource being removed, ensure
 * that the remove message is sent before the new lookup message.
 */

#define DLM_WAIT_PENDING_COND(ls, r)		\
	(ls->ls_remove_len &&			\
	 !rsb_cmp(r, ls->ls_remove_name,	\
		  ls->ls_remove_len))

static void wait_pending_remove(struct dlm_rsb *r)
{
	struct dlm_ls *ls = r->res_ls;
 restart:
	spin_lock(&ls->ls_remove_spin);
	if (DLM_WAIT_PENDING_COND(ls, r)) {
		log_debug(ls, "delay lookup for remove dir %d %s",
			  r->res_dir_nodeid, r->res_name);
		spin_unlock(&ls->ls_remove_spin);
		wait_event(ls->ls_remove_wait, !DLM_WAIT_PENDING_COND(ls, r));
		goto restart;
	}
	spin_unlock(&ls->ls_remove_spin);
}

/*
 * ls_remove_spin protects ls_remove_name and ls_remove_len which are
 * read by other threads in wait_pending_remove.  ls_remove_names
 * and ls_remove_lens are only used by the scan thread, so they do
 * not need protection.
 */

static void shrink_bucket(struct dlm_ls *ls, int b)
{
	struct rb_node *n, *next;
	struct dlm_rsb *r;
	char *name;
	int our_nodeid = dlm_our_nodeid();
	int remote_count = 0;
	int need_shrink = 0;
	int i, len, rv;

	memset(&ls->ls_remove_lens, 0, sizeof(int) * DLM_REMOVE_NAMES_MAX);

	spin_lock(&ls->ls_rsbtbl[b].lock);

	if (!(ls->ls_rsbtbl[b].flags & DLM_RTF_SHRINK)) {
		spin_unlock(&ls->ls_rsbtbl[b].lock);
		return;
	}

	for (n = rb_first(&ls->ls_rsbtbl[b].toss); n; n = next) {
		next = rb_next(n);
		r = rb_entry(n, struct dlm_rsb, res_hashnode);

		/* If we're the directory record for this rsb, and
		   we're not the master of it, then we need to wait
		   for the master node to send us a dir remove
		   before removing the dir record. */

		if (!dlm_no_directory(ls) &&
		    (r->res_master_nodeid != our_nodeid) &&
		    (dlm_dir_nodeid(r) == our_nodeid)) {
			continue;
		}

		need_shrink = 1;

		if (!time_after_eq(jiffies, r->res_toss_time +
				   dlm_config.ci_toss_secs * HZ)) {
			continue;
		}

		if (!dlm_no_directory(ls) &&
		    (r->res_master_nodeid == our_nodeid) &&
		    (dlm_dir_nodeid(r) != our_nodeid)) {

			/* We're the master of this rsb but we're not
			   the directory record, so we need to tell the
			   dir node to remove the dir record. */

			ls->ls_remove_lens[remote_count] = r->res_length;
			memcpy(ls->ls_remove_names[remote_count], r->res_name,
			       DLM_RESNAME_MAXLEN);
			remote_count++;

			if (remote_count >= DLM_REMOVE_NAMES_MAX)
				break;
			continue;
		}

		if (!kref_put(&r->res_ref, kill_rsb)) {
			log_error(ls, "tossed rsb in use %s", r->res_name);
			continue;
		}

		rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
		dlm_free_rsb(r);
	}

	if (need_shrink)
		ls->ls_rsbtbl[b].flags |= DLM_RTF_SHRINK;
	else
		ls->ls_rsbtbl[b].flags &= ~DLM_RTF_SHRINK;
	spin_unlock(&ls->ls_rsbtbl[b].lock);

	/*
	 * While searching for rsb's to free, we found some that require
	 * remote removal.  We leave them in place and find them again here
	 * so there is a very small gap between removing them from the toss
	 * list and sending the removal.  Keeping this gap small is
	 * important to keep us (the master node) from being out of sync
	 * with the remote dir node for very long.
	 *
	 * From the time the rsb is removed from toss until just after
	 * send_remove, the rsb name is saved in ls_remove_name.  A new
	 * lookup checks this to ensure that a new lookup message for the
	 * same resource name is not sent just before the remove message.
	 */

	for (i = 0; i < remote_count; i++) {
		name = ls->ls_remove_names[i];
		len = ls->ls_remove_lens[i];

		spin_lock(&ls->ls_rsbtbl[b].lock);
		rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
		if (rv) {
			spin_unlock(&ls->ls_rsbtbl[b].lock);
			log_debug(ls, "remove_name not toss %s", name);
			continue;
		}

		if (r->res_master_nodeid != our_nodeid) {
			spin_unlock(&ls->ls_rsbtbl[b].lock);
			log_debug(ls, "remove_name master %d dir %d our %d %s",
				  r->res_master_nodeid, r->res_dir_nodeid,
				  our_nodeid, name);
			continue;
		}

		if (r->res_dir_nodeid == our_nodeid) {
			/* should never happen */
			spin_unlock(&ls->ls_rsbtbl[b].lock);
			log_error(ls, "remove_name dir %d master %d our %d %s",
				  r->res_dir_nodeid, r->res_master_nodeid,
				  our_nodeid, name);
			continue;
		}

		if (!time_after_eq(jiffies, r->res_toss_time +
				   dlm_config.ci_toss_secs * HZ)) {
			spin_unlock(&ls->ls_rsbtbl[b].lock);
			log_debug(ls, "remove_name toss_time %lu now %lu %s",
				  r->res_toss_time, jiffies, name);
			continue;
		}

		if (!kref_put(&r->res_ref, kill_rsb)) {
			spin_unlock(&ls->ls_rsbtbl[b].lock);
			log_error(ls, "remove_name in use %s", name);
			continue;
		}

		rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);

		/* block lookup of same name until we've sent remove */
		spin_lock(&ls->ls_remove_spin);
		ls->ls_remove_len = len;
		memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN);
		spin_unlock(&ls->ls_remove_spin);
		spin_unlock(&ls->ls_rsbtbl[b].lock);

		send_remove(r);

		/* allow lookup of name again */
		spin_lock(&ls->ls_remove_spin);
		ls->ls_remove_len = 0;
		memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN);
		spin_unlock(&ls->ls_remove_spin);
		wake_up(&ls->ls_remove_wait);

		dlm_free_rsb(r);
	}
}
1832 
1833 void dlm_scan_rsbs(struct dlm_ls *ls)
1834 {
1835 	int i;
1836 
1837 	for (i = 0; i < ls->ls_rsbtbl_size; i++) {
1838 		shrink_bucket(ls, i);
1839 		if (dlm_locking_stopped(ls))
1840 			break;
1841 		cond_resched();
1842 	}
1843 }
1844 
1845 static void add_timeout(struct dlm_lkb *lkb)
1846 {
1847 	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
1848 
1849 	if (is_master_copy(lkb))
1850 		return;
1851 
1852 	if (test_bit(LSFL_TIMEWARN, &ls->ls_flags) &&
1853 	    !(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) {
1854 		lkb->lkb_flags |= DLM_IFL_WATCH_TIMEWARN;
1855 		goto add_it;
1856 	}
1857 	if (lkb->lkb_exflags & DLM_LKF_TIMEOUT)
1858 		goto add_it;
1859 	return;
1860 
1861  add_it:
1862 	DLM_ASSERT(list_empty(&lkb->lkb_time_list), dlm_print_lkb(lkb););
1863 	mutex_lock(&ls->ls_timeout_mutex);
1864 	hold_lkb(lkb);
1865 	list_add_tail(&lkb->lkb_time_list, &ls->ls_timeout);
1866 	mutex_unlock(&ls->ls_timeout_mutex);
1867 }
1868 
1869 static void del_timeout(struct dlm_lkb *lkb)
1870 {
1871 	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
1872 
1873 	mutex_lock(&ls->ls_timeout_mutex);
1874 	if (!list_empty(&lkb->lkb_time_list)) {
1875 		list_del_init(&lkb->lkb_time_list);
1876 		unhold_lkb(lkb);
1877 	}
1878 	mutex_unlock(&ls->ls_timeout_mutex);
1879 }
1880 
1881 /* FIXME: is it safe to look at lkb_exflags, lkb_flags, lkb_timestamp, and
1882    lkb_lksb_timeout without lock_rsb?  Note: we can't lock timeout_mutex
1883    and then lock rsb because of lock ordering in add_timeout.  We may need
1884    to specify some special timeout-related bits in the lkb that are just to
1885    be accessed under the timeout_mutex. */
1886 
1887 void dlm_scan_timeout(struct dlm_ls *ls)
1888 {
1889 	struct dlm_rsb *r;
1890 	struct dlm_lkb *lkb = NULL, *iter;
1891 	int do_cancel, do_warn;
1892 	s64 wait_us;
1893 
1894 	for (;;) {
1895 		if (dlm_locking_stopped(ls))
1896 			break;
1897 
1898 		do_cancel = 0;
1899 		do_warn = 0;
1900 		mutex_lock(&ls->ls_timeout_mutex);
1901 		list_for_each_entry(iter, &ls->ls_timeout, lkb_time_list) {
1902 
1903 			wait_us = ktime_to_us(ktime_sub(ktime_get(),
1904 							iter->lkb_timestamp));
1905 
1906 			if ((iter->lkb_exflags & DLM_LKF_TIMEOUT) &&
1907 			    wait_us >= (iter->lkb_timeout_cs * 10000))
1908 				do_cancel = 1;
1909 
1910 			if ((iter->lkb_flags & DLM_IFL_WATCH_TIMEWARN) &&
1911 			    wait_us >= dlm_config.ci_timewarn_cs * 10000)
1912 				do_warn = 1;
1913 
1914 			if (!do_cancel && !do_warn)
1915 				continue;
1916 			hold_lkb(iter);
1917 			lkb = iter;
1918 			break;
1919 		}
1920 		mutex_unlock(&ls->ls_timeout_mutex);
1921 
1922 		if (!lkb)
1923 			break;
1924 
1925 		r = lkb->lkb_resource;
1926 		hold_rsb(r);
1927 		lock_rsb(r);
1928 
1929 		if (do_warn) {
1930 			/* clear flag so we only warn once */
1931 			lkb->lkb_flags &= ~DLM_IFL_WATCH_TIMEWARN;
1932 			if (!(lkb->lkb_exflags & DLM_LKF_TIMEOUT))
1933 				del_timeout(lkb);
1934 			dlm_timeout_warn(lkb);
1935 		}
1936 
1937 		if (do_cancel) {
1938 			log_debug(ls, "timeout cancel %x node %d %s",
1939 				  lkb->lkb_id, lkb->lkb_nodeid, r->res_name);
1940 			lkb->lkb_flags &= ~DLM_IFL_WATCH_TIMEWARN;
1941 			lkb->lkb_flags |= DLM_IFL_TIMEOUT_CANCEL;
1942 			del_timeout(lkb);
1943 			_cancel_lock(r, lkb);
1944 		}
1945 
1946 		unlock_rsb(r);
1947 		unhold_rsb(r);
1948 		dlm_put_lkb(lkb);
1949 	}
1950 }
1951 
1952 /* This is only called by dlm_recoverd, and we rely on dlm_ls_stop() stopping
1953    dlm_recoverd before checking/setting ls_recover_begin. */
1954 
1955 void dlm_adjust_timeouts(struct dlm_ls *ls)
1956 {
1957 	struct dlm_lkb *lkb;
1958 	u64 adj_us = jiffies_to_usecs(jiffies - ls->ls_recover_begin);
1959 
1960 	ls->ls_recover_begin = 0;
1961 	mutex_lock(&ls->ls_timeout_mutex);
1962 	list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list)
1963 		lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us);
1964 	mutex_unlock(&ls->ls_timeout_mutex);
1965 
1966 	if (!dlm_config.ci_waitwarn_us)
1967 		return;
1968 
1969 	mutex_lock(&ls->ls_waiters_mutex);
1970 	list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
1971 		if (ktime_to_us(lkb->lkb_wait_time))
1972 			lkb->lkb_wait_time = ktime_get();
1973 	}
1974 	mutex_unlock(&ls->ls_waiters_mutex);
1975 }
1976 
1977 /* lkb is master or local copy */
1978 
1979 static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1980 {
1981 	int b, len = r->res_ls->ls_lvblen;
1982 
1983 	/* b=1 lvb returned to caller
1984 	   b=0 lvb written to rsb or invalidated
1985 	   b=-1 do nothing */
1986 
1987 	b =  dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
1988 
1989 	if (b == 1) {
1990 		if (!lkb->lkb_lvbptr)
1991 			return;
1992 
1993 		if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
1994 			return;
1995 
1996 		if (!r->res_lvbptr)
1997 			return;
1998 
1999 		memcpy(lkb->lkb_lvbptr, r->res_lvbptr, len);
2000 		lkb->lkb_lvbseq = r->res_lvbseq;
2001 
2002 	} else if (b == 0) {
2003 		if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
2004 			rsb_set_flag(r, RSB_VALNOTVALID);
2005 			return;
2006 		}
2007 
2008 		if (!lkb->lkb_lvbptr)
2009 			return;
2010 
2011 		if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
2012 			return;
2013 
2014 		if (!r->res_lvbptr)
2015 			r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
2016 
2017 		if (!r->res_lvbptr)
2018 			return;
2019 
2020 		memcpy(r->res_lvbptr, lkb->lkb_lvbptr, len);
2021 		r->res_lvbseq++;
2022 		lkb->lkb_lvbseq = r->res_lvbseq;
2023 		rsb_clear_flag(r, RSB_VALNOTVALID);
2024 	}
2025 
2026 	if (rsb_flag(r, RSB_VALNOTVALID))
2027 		lkb->lkb_sbflags |= DLM_SBF_VALNOTVALID;
2028 }
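
/*
 * Editorial sketch, not part of the original file: the caller's view of
 * the b=1 path above.  A granted read-mode request with DLM_LKF_VALBLK
 * makes set_lvb_lock() copy the resource's value block into the buffer
 * at lksb.sb_lvbptr.  The lockspace handle and the 32-byte lvblen are
 * assumptions for illustration.
 */
#if 0	/* example only */
static char example_lvb[32];	/* must match the lockspace lvblen */
static struct dlm_lksb example_lvb_lksb = { .sb_lvbptr = example_lvb };

static void example_lvb_ast(void *arg)
{
}

static int example_read_lvb(dlm_lockspace_t *ls)
{
	/* on grant, set_lvb_lock() fills example_lvb from res_lvbptr */
	return dlm_lock(ls, DLM_LOCK_PR, &example_lvb_lksb, DLM_LKF_VALBLK,
			"example", 7, 0, example_lvb_ast, &example_lvb_lksb,
			NULL);
}
#endif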
2029 
2030 static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2031 {
2032 	if (lkb->lkb_grmode < DLM_LOCK_PW)
2033 		return;
2034 
2035 	if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
2036 		rsb_set_flag(r, RSB_VALNOTVALID);
2037 		return;
2038 	}
2039 
2040 	if (!lkb->lkb_lvbptr)
2041 		return;
2042 
2043 	if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
2044 		return;
2045 
2046 	if (!r->res_lvbptr)
2047 		r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
2048 
2049 	if (!r->res_lvbptr)
2050 		return;
2051 
2052 	memcpy(r->res_lvbptr, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
2053 	r->res_lvbseq++;
2054 	rsb_clear_flag(r, RSB_VALNOTVALID);
2055 }
2056 
2057 /* lkb is process copy (pc) */
2058 
2059 static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
2060 			    struct dlm_message *ms)
2061 {
2062 	int b;
2063 
2064 	if (!lkb->lkb_lvbptr)
2065 		return;
2066 
2067 	if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
2068 		return;
2069 
2070 	b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
2071 	if (b == 1) {
2072 		int len = receive_extralen(ms);
2073 		if (len > r->res_ls->ls_lvblen)
2074 			len = r->res_ls->ls_lvblen;
2075 		memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
2076 		lkb->lkb_lvbseq = le32_to_cpu(ms->m_lvbseq);
2077 	}
2078 }
2079 
2080 /* Manipulate lkb's on rsb's convert/granted/waiting queues
2081    remove_lock -- used for unlock, removes lkb from granted
2082    revert_lock -- used for cancel, moves lkb from convert to granted
2083    grant_lock  -- used for request and convert, adds lkb to granted or
2084                   moves lkb from convert or waiting to granted
2085 
2086    Each of these is used for master or local copy lkb's.  There is
2087    also a _pc() variation used to make the corresponding change on
2088    a process copy (pc) lkb. */
2089 
2090 static void _remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2091 {
2092 	del_lkb(r, lkb);
2093 	lkb->lkb_grmode = DLM_LOCK_IV;
2094 	/* this unhold undoes the original ref from create_lkb()
2095 	   so this leads to the lkb being freed */
2096 	unhold_lkb(lkb);
2097 }
2098 
2099 static void remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2100 {
2101 	set_lvb_unlock(r, lkb);
2102 	_remove_lock(r, lkb);
2103 }
2104 
2105 static void remove_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
2106 {
2107 	_remove_lock(r, lkb);
2108 }
2109 
2110 /* returns: 0 did nothing
2111 	    1 moved lock to granted
2112 	   -1 removed lock */
2113 
2114 static int revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2115 {
2116 	int rv = 0;
2117 
2118 	lkb->lkb_rqmode = DLM_LOCK_IV;
2119 
2120 	switch (lkb->lkb_status) {
2121 	case DLM_LKSTS_GRANTED:
2122 		break;
2123 	case DLM_LKSTS_CONVERT:
2124 		move_lkb(r, lkb, DLM_LKSTS_GRANTED);
2125 		rv = 1;
2126 		break;
2127 	case DLM_LKSTS_WAITING:
2128 		del_lkb(r, lkb);
2129 		lkb->lkb_grmode = DLM_LOCK_IV;
2130 		/* this unhold undoes the original ref from create_lkb()
2131 		   so this leads to the lkb being freed */
2132 		unhold_lkb(lkb);
2133 		rv = -1;
2134 		break;
2135 	default:
2136 		log_print("invalid status for revert %d", lkb->lkb_status);
2137 	}
2138 	return rv;
2139 }
2140 
2141 static int revert_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
2142 {
2143 	return revert_lock(r, lkb);
2144 }
2145 
2146 static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2147 {
2148 	if (lkb->lkb_grmode != lkb->lkb_rqmode) {
2149 		lkb->lkb_grmode = lkb->lkb_rqmode;
2150 		if (lkb->lkb_status)
2151 			move_lkb(r, lkb, DLM_LKSTS_GRANTED);
2152 		else
2153 			add_lkb(r, lkb, DLM_LKSTS_GRANTED);
2154 	}
2155 
2156 	lkb->lkb_rqmode = DLM_LOCK_IV;
2157 	lkb->lkb_highbast = 0;
2158 }
2159 
2160 static void grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2161 {
2162 	set_lvb_lock(r, lkb);
2163 	_grant_lock(r, lkb);
2164 }
2165 
2166 static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
2167 			  struct dlm_message *ms)
2168 {
2169 	set_lvb_lock_pc(r, lkb, ms);
2170 	_grant_lock(r, lkb);
2171 }
2172 
2173 /* called by grant_pending_locks() which means an async grant message must
2174    be sent to the requesting node in addition to granting the lock if the
2175    lkb belongs to a remote node. */
2176 
2177 static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb)
2178 {
2179 	grant_lock(r, lkb);
2180 	if (is_master_copy(lkb))
2181 		send_grant(r, lkb);
2182 	else
2183 		queue_cast(r, lkb, 0);
2184 }
2185 
2186 /* The special CONVDEADLK, ALTPR and ALTCW flags allow the master to
2187    change the granted/requested modes.  We're munging things accordingly in
2188    the process copy.
2189    CONVDEADLK: our grmode may have been forced down to NL to resolve a
2190    conversion deadlock
2191    ALTPR/ALTCW: our rqmode may have been changed to PR or CW to become
2192    compatible with other granted locks */
2193 
2194 static void munge_demoted(struct dlm_lkb *lkb)
2195 {
2196 	if (lkb->lkb_rqmode == DLM_LOCK_IV || lkb->lkb_grmode == DLM_LOCK_IV) {
2197 		log_print("munge_demoted %x invalid modes gr %d rq %d",
2198 			  lkb->lkb_id, lkb->lkb_grmode, lkb->lkb_rqmode);
2199 		return;
2200 	}
2201 
2202 	lkb->lkb_grmode = DLM_LOCK_NL;
2203 }
2204 
2205 static void munge_altmode(struct dlm_lkb *lkb, struct dlm_message *ms)
2206 {
2207 	if (ms->m_type != cpu_to_le32(DLM_MSG_REQUEST_REPLY) &&
2208 	    ms->m_type != cpu_to_le32(DLM_MSG_GRANT)) {
2209 		log_print("munge_altmode %x invalid reply type %d",
2210 			  lkb->lkb_id, le32_to_cpu(ms->m_type));
2211 		return;
2212 	}
2213 
2214 	if (lkb->lkb_exflags & DLM_LKF_ALTPR)
2215 		lkb->lkb_rqmode = DLM_LOCK_PR;
2216 	else if (lkb->lkb_exflags & DLM_LKF_ALTCW)
2217 		lkb->lkb_rqmode = DLM_LOCK_CW;
2218 	else {
2219 		log_print("munge_altmode invalid exflags %x", lkb->lkb_exflags);
2220 		dlm_print_lkb(lkb);
2221 	}
2222 }
2223 
2224 static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head)
2225 {
2226 	struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb,
2227 					   lkb_statequeue);
2228 	if (lkb->lkb_id == first->lkb_id)
2229 		return 1;
2230 
2231 	return 0;
2232 }
2233 
2234 /* Check if the given lkb conflicts with another lkb on the queue. */
2235 
2236 static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb)
2237 {
2238 	struct dlm_lkb *this;
2239 
2240 	list_for_each_entry(this, head, lkb_statequeue) {
2241 		if (this == lkb)
2242 			continue;
2243 		if (!modes_compat(this, lkb))
2244 			return 1;
2245 	}
2246 	return 0;
2247 }
2248 
2249 /*
2250  * "A conversion deadlock arises with a pair of lock requests in the converting
2251  * queue for one resource.  The granted mode of each lock blocks the requested
2252  * mode of the other lock."
2253  *
2254  * Part 2: if the granted mode of lkb is preventing an earlier lkb in the
2255  * convert queue from being granted, then deadlk/demote lkb.
2256  *
2257  * Example:
2258  * Granted Queue: empty
2259  * Convert Queue: NL->EX (first lock)
2260  *                PR->EX (second lock)
2261  *
2262  * The first lock can't be granted because of the granted mode of the second
2263  * lock and the second lock can't be granted because it's not first in the
2264  * list.  We either cancel lkb's conversion (PR->EX) and return EDEADLK, or we
2265  * demote the granted mode of lkb (from PR to NL) if it has the CONVDEADLK
2266  * flag set and return DEMOTED in the lksb flags.
2267  *
2268  * Originally, this function detected conv-deadlk in a more limited scope:
2269  * - if !modes_compat(lkb1, lkb2) && !modes_compat(lkb2, lkb1), or
2270  * - if lkb1 was the first entry in the queue (not just earlier), and was
2271  *   blocked by the granted mode of lkb2, and there was nothing on the
2272  *   granted queue preventing lkb1 from being granted immediately, i.e.
2273  *   lkb2 was the only thing preventing lkb1 from being granted.
2274  *
2275  * That second condition meant we'd only say there was conv-deadlk if
2276  * resolving it (by demotion) would lead to the first lock on the convert
2277  * queue being granted right away.  It allowed conversion deadlocks to exist
2278  * between locks on the convert queue while they couldn't be granted anyway.
2279  *
2280  * Now, we detect and take action on conversion deadlocks immediately when
2281  * they're created, even if they may not be immediately consequential.  If
2282  * lkb1 exists anywhere in the convert queue and lkb2 comes in with a granted
2283  * mode that would prevent lkb1's conversion from being granted, we do a
2284  * deadlk/demote on lkb2 right away and don't let it onto the convert queue.
2285  * I think this means that the lkb_is_ahead condition below should always
2286  * be zero, i.e. there will never be conv-deadlk between two locks that are
2287  * both already on the convert queue.
2288  */
2289 
2290 static int conversion_deadlock_detect(struct dlm_rsb *r, struct dlm_lkb *lkb2)
2291 {
2292 	struct dlm_lkb *lkb1;
2293 	int lkb_is_ahead = 0;
2294 
2295 	list_for_each_entry(lkb1, &r->res_convertqueue, lkb_statequeue) {
2296 		if (lkb1 == lkb2) {
2297 			lkb_is_ahead = 1;
2298 			continue;
2299 		}
2300 
2301 		if (!lkb_is_ahead) {
2302 			if (!modes_compat(lkb2, lkb1))
2303 				return 1;
2304 		} else {
2305 			if (!modes_compat(lkb2, lkb1) &&
2306 			    !modes_compat(lkb1, lkb2))
2307 				return 1;
2308 		}
2309 	}
2310 	return 0;
2311 }
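
/*
 * Editorial sketch, not part of the original file: producing the deadlock
 * described above from the caller's side.  Two holders of PR both convert
 * to EX; the lockspace, lksbs and ast handler are assumed.
 */
#if 0	/* example only */
static void example_convert_deadlock(dlm_lockspace_t *ls,
				     struct dlm_lksb *lksb1,
				     struct dlm_lksb *lksb2)
{
	/* both locks are already granted in PR; both now request EX */
	dlm_lock(ls, DLM_LOCK_EX, lksb1,
		 DLM_LKF_CONVERT | DLM_LKF_CONVDEADLK,
		 NULL, 0, 0, example_ast, lksb1, NULL);

	/* conversion_deadlock_detect() finds this lock's granted PR
	   blocking lksb1's queued EX conversion; with CONVDEADLK this
	   lock's grmode is demoted to NL and DLM_SBF_DEMOTED is set,
	   rather than the convert failing with -EDEADLK */
	dlm_lock(ls, DLM_LOCK_EX, lksb2,
		 DLM_LKF_CONVERT | DLM_LKF_CONVDEADLK,
		 NULL, 0, 0, example_ast, lksb2, NULL);
}
#endif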
2312 
2313 /*
2314  * Return 1 if the lock can be granted, 0 otherwise.
2315  * Also detect and resolve conversion deadlocks.
2316  *
2317  * lkb is the lock to be granted
2318  *
2319  * now is 1 if the function is being called in the context of the
2320 	 * immediate request; it is 0 if called later, after the lock has been
2321  * queued.
2322  *
2323  * recover is 1 if dlm_recover_grant() is trying to grant conversions
2324  * after recovery.
2325  *
2326  * References are from chapter 6 of "VAXcluster Principles" by Roy Davis
2327  */
2328 
2329 static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now,
2330 			   int recover)
2331 {
2332 	int8_t conv = (lkb->lkb_grmode != DLM_LOCK_IV);
2333 
2334 	/*
2335 	 * 6-10: Version 5.4 introduced an option to address the phenomenon of
2336 	 * a new request for a NL mode lock being blocked.
2337 	 *
2338 	 * 6-11: If the optional EXPEDITE flag is used with the new NL mode
2339 	 * request, then it would be granted.  In essence, the use of this flag
2340 	 * tells the Lock Manager to expedite this request by not considering
2341 	 * what may be in the CONVERTING or WAITING queues...  As of this
2342 	 * writing, the EXPEDITE flag can be used only with new requests for NL
2343 	 * mode locks.  This flag is not valid for conversion requests.
2344 	 *
2345 	 * A shortcut.  Earlier checks return an error if EXPEDITE is used in a
2346 	 * conversion or used with a non-NL requested mode.  We also know an
2347 	 * EXPEDITE request is always granted immediately, so now must always
2348 	 * be 1.  The full condition to grant an expedite request: (now &&
2349 	 * !conv && lkb->rqmode == DLM_LOCK_NL && (flags & EXPEDITE)) can
2350 	 * therefore be shortened to just checking the flag.
2351 	 */
2352 
2353 	if (lkb->lkb_exflags & DLM_LKF_EXPEDITE)
2354 		return 1;
2355 
2356 	/*
2357 	 * A shortcut. Without this, !queue_conflict(grantqueue, lkb) would be
2358 	 * added to the remaining conditions.
2359 	 */
2360 
2361 	if (queue_conflict(&r->res_grantqueue, lkb))
2362 		return 0;
2363 
2364 	/*
2365 	 * 6-3: By default, a conversion request is immediately granted if the
2366 	 * requested mode is compatible with the modes of all other granted
2367 	 * locks
2368 	 */
2369 
2370 	if (queue_conflict(&r->res_convertqueue, lkb))
2371 		return 0;
2372 
2373 	/*
2374 	 * The RECOVER_GRANT flag means dlm_recover_grant() is granting
2375 	 * locks for a recovered rsb, on which lkb's have been rebuilt.
2376 	 * The lkb's may have been rebuilt on the queues in a different
2377 	 * order than they were in on the previous master.  So, granting
2378 	 * queued conversions in order after recovery doesn't make sense
2379 	 * since the order hasn't been preserved anyway.  The new order
2380 	 * could also have created a new "in place" conversion deadlock.
2381 	 * (e.g. old, failed master held granted EX, with PR->EX, NL->EX.
2382 	 * After recovery, there would be no granted locks, and possibly
2383 	 * NL->EX, PR->EX, an in-place conversion deadlock.)  So, after
2384 	 * recovery, grant conversions without considering order.
2385 	 */
2386 
2387 	if (conv && recover)
2388 		return 1;
2389 
2390 	/*
2391 	 * 6-5: But the default algorithm for deciding whether to grant or
2392 	 * queue conversion requests does not by itself guarantee that such
2393 	 * requests are serviced on a "first come first serve" basis.  This, in
2394 	 * turn, can lead to a phenomenon known as "indefinite postponement".
2395 	 *
2396 	 * 6-7: This issue is dealt with by using the optional QUECVT flag with
2397 	 * the system service employed to request a lock conversion.  This flag
2398 	 * forces certain conversion requests to be queued, even if they are
2399 	 * compatible with the granted modes of other locks on the same
2400 	 * resource.  Thus, the use of this flag results in conversion requests
2401 	 * being ordered on a "first come first serve" basis.
2402 	 *
2403 	 * DCT: This condition is all about new conversions being able to occur
2404 	 * "in place" while the lock remains on the granted queue (assuming
2405 	 * nothing else conflicts).  IOW if QUECVT isn't set, a conversion
2406 	 * doesn't _have_ to go onto the convert queue where it's processed in
2407 	 * order.  The "now" variable distinguishes converts being received
2408 	 * and processed for the first time, because once a convert is moved
2409 	 * to the convert queue the condition below applies and fifo granting
2410 	 * is required.
2411 	 */
2412 
2413 	if (now && conv && !(lkb->lkb_exflags & DLM_LKF_QUECVT))
2414 		return 1;
2415 
2416 	/*
2417 	 * Even if the convert is compat with all granted locks,
2418 	 * QUECVT forces it behind other locks on the convert queue.
2419 	 */
2420 
2421 	if (now && conv && (lkb->lkb_exflags & DLM_LKF_QUECVT)) {
2422 		if (list_empty(&r->res_convertqueue))
2423 			return 1;
2424 		else
2425 			return 0;
2426 	}
2427 
2428 	/*
2429 	 * The NOORDER flag is set to avoid the standard vms rules on grant
2430 	 * order.
2431 	 */
2432 
2433 	if (lkb->lkb_exflags & DLM_LKF_NOORDER)
2434 		return 1;
2435 
2436 	/*
2437 	 * 6-3: Once in that queue [CONVERTING], a conversion request cannot be
2438 	 * granted until all other conversion requests ahead of it are granted
2439 	 * and/or canceled.
2440 	 */
2441 
2442 	if (!now && conv && first_in_list(lkb, &r->res_convertqueue))
2443 		return 1;
2444 
2445 	/*
2446 	 * 6-4: By default, a new request is immediately granted only if all
2447 	 * three of the following conditions are satisfied when the request is
2448 	 * issued:
2449 	 * - The queue of ungranted conversion requests for the resource is
2450 	 *   empty.
2451 	 * - The queue of ungranted new requests for the resource is empty.
2452 	 * - The mode of the new request is compatible with the most
2453 	 *   restrictive mode of all granted locks on the resource.
2454 	 */
2455 
2456 	if (now && !conv && list_empty(&r->res_convertqueue) &&
2457 	    list_empty(&r->res_waitqueue))
2458 		return 1;
2459 
2460 	/*
2461 	 * 6-4: Once a lock request is in the queue of ungranted new requests,
2462 	 * it cannot be granted until the queue of ungranted conversion
2463 	 * requests is empty, all ungranted new requests ahead of it are
2464 	 * granted and/or canceled, and it is compatible with the granted mode
2465 	 * of the most restrictive lock granted on the resource.
2466 	 */
2467 
2468 	if (!now && !conv && list_empty(&r->res_convertqueue) &&
2469 	    first_in_list(lkb, &r->res_waitqueue))
2470 		return 1;
2471 
2472 	return 0;
2473 }
2474 
2475 static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now,
2476 			  int recover, int *err)
2477 {
2478 	int rv;
2479 	int8_t alt = 0, rqmode = lkb->lkb_rqmode;
2480 	int8_t is_convert = (lkb->lkb_grmode != DLM_LOCK_IV);
2481 
2482 	if (err)
2483 		*err = 0;
2484 
2485 	rv = _can_be_granted(r, lkb, now, recover);
2486 	if (rv)
2487 		goto out;
2488 
2489 	/*
2490 	 * The CONVDEADLK flag is non-standard and tells the dlm to resolve
2491 	 * conversion deadlocks by demoting grmode to NL; otherwise the dlm
2492 	 * cancels one of the locks.
2493 	 */
2494 
2495 	if (is_convert && can_be_queued(lkb) &&
2496 	    conversion_deadlock_detect(r, lkb)) {
2497 		if (lkb->lkb_exflags & DLM_LKF_CONVDEADLK) {
2498 			lkb->lkb_grmode = DLM_LOCK_NL;
2499 			lkb->lkb_sbflags |= DLM_SBF_DEMOTED;
2500 		} else if (err) {
2501 			*err = -EDEADLK;
2502 		} else {
2503 			log_print("can_be_granted deadlock %x now %d",
2504 				  lkb->lkb_id, now);
2505 			dlm_dump_rsb(r);
2506 		}
2507 		goto out;
2508 	}
2509 
2510 	/*
2511 	 * The ALTPR and ALTCW flags are non-standard and tell the dlm to try
2512 	 * to grant a request in a mode other than the normal rqmode.  It's a
2513 	 * simple way to provide a big optimization to applications that can
2514 	 * use them.
2515 	 */
2516 
2517 	if (rqmode != DLM_LOCK_PR && (lkb->lkb_exflags & DLM_LKF_ALTPR))
2518 		alt = DLM_LOCK_PR;
2519 	else if (rqmode != DLM_LOCK_CW && (lkb->lkb_exflags & DLM_LKF_ALTCW))
2520 		alt = DLM_LOCK_CW;
2521 
2522 	if (alt) {
2523 		lkb->lkb_rqmode = alt;
2524 		rv = _can_be_granted(r, lkb, now, 0);
2525 		if (rv)
2526 			lkb->lkb_sbflags |= DLM_SBF_ALTMODE;
2527 		else
2528 			lkb->lkb_rqmode = rqmode;
2529 	}
2530  out:
2531 	return rv;
2532 }
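
/*
 * Editorial sketch, not part of the original file: the ALTPR fallback in
 * practice.  The caller asks for CW but will accept PR; the lockspace,
 * lksb and ast handler are assumed.
 */
#if 0	/* example only */
static int example_lock_altmode(dlm_lockspace_t *ls, struct dlm_lksb *lksb)
{
	/* if CW can't be granted but PR can, the lock is granted in PR
	   and DLM_SBF_ALTMODE is set in lksb->sb_flags */
	return dlm_lock(ls, DLM_LOCK_CW, lksb, DLM_LKF_ALTPR,
			"example", 7, 0, example_ast, lksb, NULL);
}
#endif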
2533 
2534 /* Returns the highest requested mode of all blocked conversions; sets
2535    cw if there's a blocked conversion to DLM_LOCK_CW. */
2536 
2537 static int grant_pending_convert(struct dlm_rsb *r, int high, int *cw,
2538 				 unsigned int *count)
2539 {
2540 	struct dlm_lkb *lkb, *s;
2541 	int recover = rsb_flag(r, RSB_RECOVER_GRANT);
2542 	int hi, demoted, quit, grant_restart, demote_restart;
2543 	int deadlk;
2544 
2545 	quit = 0;
2546  restart:
2547 	grant_restart = 0;
2548 	demote_restart = 0;
2549 	hi = DLM_LOCK_IV;
2550 
2551 	list_for_each_entry_safe(lkb, s, &r->res_convertqueue, lkb_statequeue) {
2552 		demoted = is_demoted(lkb);
2553 		deadlk = 0;
2554 
2555 		if (can_be_granted(r, lkb, 0, recover, &deadlk)) {
2556 			grant_lock_pending(r, lkb);
2557 			grant_restart = 1;
2558 			if (count)
2559 				(*count)++;
2560 			continue;
2561 		}
2562 
2563 		if (!demoted && is_demoted(lkb)) {
2564 			log_print("WARN: pending demoted %x node %d %s",
2565 				  lkb->lkb_id, lkb->lkb_nodeid, r->res_name);
2566 			demote_restart = 1;
2567 			continue;
2568 		}
2569 
2570 		if (deadlk) {
2571 			/*
2572 			 * If the DLM_LKF_NODLCKWT flag is set and a conversion
2573 			 * deadlock is detected, we queue a blocking AST so the
2574 			 * owner can demote (or cancel) the conversion itself.
2575 			 */
2576 			if (lkb->lkb_exflags & DLM_LKF_NODLCKWT) {
2577 				if (lkb->lkb_highbast < lkb->lkb_rqmode) {
2578 					queue_bast(r, lkb, lkb->lkb_rqmode);
2579 					lkb->lkb_highbast = lkb->lkb_rqmode;
2580 				}
2581 			} else {
2582 				log_print("WARN: pending deadlock %x node %d %s",
2583 					  lkb->lkb_id, lkb->lkb_nodeid,
2584 					  r->res_name);
2585 				dlm_dump_rsb(r);
2586 			}
2587 			continue;
2588 		}
2589 
2590 		hi = max_t(int, lkb->lkb_rqmode, hi);
2591 
2592 		if (cw && lkb->lkb_rqmode == DLM_LOCK_CW)
2593 			*cw = 1;
2594 	}
2595 
2596 	if (grant_restart)
2597 		goto restart;
2598 	if (demote_restart && !quit) {
2599 		quit = 1;
2600 		goto restart;
2601 	}
2602 
2603 	return max_t(int, high, hi);
2604 }
2605 
2606 static int grant_pending_wait(struct dlm_rsb *r, int high, int *cw,
2607 			      unsigned int *count)
2608 {
2609 	struct dlm_lkb *lkb, *s;
2610 
2611 	list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) {
2612 		if (can_be_granted(r, lkb, 0, 0, NULL)) {
2613 			grant_lock_pending(r, lkb);
2614 			if (count)
2615 				(*count)++;
2616 		} else {
2617 			high = max_t(int, lkb->lkb_rqmode, high);
2618 			if (lkb->lkb_rqmode == DLM_LOCK_CW)
2619 				*cw = 1;
2620 		}
2621 	}
2622 
2623 	return high;
2624 }
2625 
2626 /* cw of 1 means there's a lock with a rqmode of DLM_LOCK_CW that's blocked
2627    on either the convert or waiting queue.
2628    high is the largest rqmode of all locks blocked on the convert or
2629    waiting queue. */
2630 
2631 static int lock_requires_bast(struct dlm_lkb *gr, int high, int cw)
2632 {
2633 	if (gr->lkb_grmode == DLM_LOCK_PR && cw) {
2634 		if (gr->lkb_highbast < DLM_LOCK_EX)
2635 			return 1;
2636 		return 0;
2637 	}
2638 
2639 	if (gr->lkb_highbast < high &&
2640 	    !__dlm_compat_matrix[gr->lkb_grmode+1][high+1])
2641 		return 1;
2642 	return 0;
2643 }
2644 
2645 static void grant_pending_locks(struct dlm_rsb *r, unsigned int *count)
2646 {
2647 	struct dlm_lkb *lkb, *s;
2648 	int high = DLM_LOCK_IV;
2649 	int cw = 0;
2650 
2651 	if (!is_master(r)) {
2652 		log_print("grant_pending_locks r nodeid %d", r->res_nodeid);
2653 		dlm_dump_rsb(r);
2654 		return;
2655 	}
2656 
2657 	high = grant_pending_convert(r, high, &cw, count);
2658 	high = grant_pending_wait(r, high, &cw, count);
2659 
2660 	if (high == DLM_LOCK_IV)
2661 		return;
2662 
2663 	/*
2664 	 * If there are locks left on the wait/convert queue then send blocking
2665 	 * ASTs to granted locks based on the largest requested mode (high)
2666 	 * found above.
2667 	 */
2668 
2669 	list_for_each_entry_safe(lkb, s, &r->res_grantqueue, lkb_statequeue) {
2670 		if (lkb->lkb_bastfn && lock_requires_bast(lkb, high, cw)) {
2671 			if (cw && high == DLM_LOCK_PR &&
2672 			    lkb->lkb_grmode == DLM_LOCK_PR)
2673 				queue_bast(r, lkb, DLM_LOCK_CW);
2674 			else
2675 				queue_bast(r, lkb, high);
2676 			lkb->lkb_highbast = high;
2677 		}
2678 	}
2679 }
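
/*
 * Editorial sketch, not part of the original file: the holder's side of
 * the basts queued above.  The handler name and astarg convention are
 * assumed.
 */
#if 0	/* example only */
static void example_bast(void *arg, int mode)
{
	struct dlm_lksb *lksb = arg;

	/* "mode" is the largest blocked rqmode (or DLM_LOCK_CW for the
	   PR-holder special case above); unlock or convert
	   lksb->sb_lkid down to a mode compatible with it */
}
#endif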
2680 
2681 static int modes_require_bast(struct dlm_lkb *gr, struct dlm_lkb *rq)
2682 {
2683 	if ((gr->lkb_grmode == DLM_LOCK_PR && rq->lkb_rqmode == DLM_LOCK_CW) ||
2684 	    (gr->lkb_grmode == DLM_LOCK_CW && rq->lkb_rqmode == DLM_LOCK_PR)) {
2685 		if (gr->lkb_highbast < DLM_LOCK_EX)
2686 			return 1;
2687 		return 0;
2688 	}
2689 
2690 	if (gr->lkb_highbast < rq->lkb_rqmode && !modes_compat(gr, rq))
2691 		return 1;
2692 	return 0;
2693 }
2694 
2695 static void send_bast_queue(struct dlm_rsb *r, struct list_head *head,
2696 			    struct dlm_lkb *lkb)
2697 {
2698 	struct dlm_lkb *gr;
2699 
2700 	list_for_each_entry(gr, head, lkb_statequeue) {
2701 		/* skip self when sending basts to convertqueue */
2702 		if (gr == lkb)
2703 			continue;
2704 		if (gr->lkb_bastfn && modes_require_bast(gr, lkb)) {
2705 			queue_bast(r, gr, lkb->lkb_rqmode);
2706 			gr->lkb_highbast = lkb->lkb_rqmode;
2707 		}
2708 	}
2709 }
2710 
2711 static void send_blocking_asts(struct dlm_rsb *r, struct dlm_lkb *lkb)
2712 {
2713 	send_bast_queue(r, &r->res_grantqueue, lkb);
2714 }
2715 
2716 static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb)
2717 {
2718 	send_bast_queue(r, &r->res_grantqueue, lkb);
2719 	send_bast_queue(r, &r->res_convertqueue, lkb);
2720 }
2721 
2722 /* set_master(r, lkb) -- set the master nodeid of a resource
2723 
2724    The purpose of this function is to set the nodeid field in the given
2725    lkb using the nodeid field in the given rsb.  If the rsb's nodeid is
2726    known, it can just be copied to the lkb and the function will return
2727    0.  If the rsb's nodeid is _not_ known, it needs to be looked up
2728    before it can be copied to the lkb.
2729 
2730    When the rsb nodeid is being looked up remotely, the initial lkb
2731    causing the lookup is kept on the ls_waiters list waiting for the
2732    lookup reply.  Other lkb's waiting for the same rsb lookup are kept
2733    on the rsb's res_lookup list until the master is verified.
2734 
2735    Return values:
2736    0: nodeid is set in rsb/lkb and the caller should go ahead and use it
2737    1: the rsb master is not available and the lkb has been placed on
2738       a wait queue
2739 */
2740 
2741 static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
2742 {
2743 	int our_nodeid = dlm_our_nodeid();
2744 
2745 	if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) {
2746 		rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
2747 		r->res_first_lkid = lkb->lkb_id;
2748 		lkb->lkb_nodeid = r->res_nodeid;
2749 		return 0;
2750 	}
2751 
2752 	if (r->res_first_lkid && r->res_first_lkid != lkb->lkb_id) {
2753 		list_add_tail(&lkb->lkb_rsb_lookup, &r->res_lookup);
2754 		return 1;
2755 	}
2756 
2757 	if (r->res_master_nodeid == our_nodeid) {
2758 		lkb->lkb_nodeid = 0;
2759 		return 0;
2760 	}
2761 
2762 	if (r->res_master_nodeid) {
2763 		lkb->lkb_nodeid = r->res_master_nodeid;
2764 		return 0;
2765 	}
2766 
2767 	if (dlm_dir_nodeid(r) == our_nodeid) {
2768 		/* This is a somewhat unusual case; find_rsb will usually
2769 		   have set res_master_nodeid when dir nodeid is local, but
2770 		   there are cases where we become the dir node after we've
2771 		   passed find_rsb and go through _request_lock again.
2772 		   confirm_master() or process_lookup_list() needs to be
2773 		   called after this. */
2774 		log_debug(r->res_ls, "set_master %x self master %d dir %d %s",
2775 			  lkb->lkb_id, r->res_master_nodeid, r->res_dir_nodeid,
2776 			  r->res_name);
2777 		r->res_master_nodeid = our_nodeid;
2778 		r->res_nodeid = 0;
2779 		lkb->lkb_nodeid = 0;
2780 		return 0;
2781 	}
2782 
2783 	wait_pending_remove(r);
2784 
2785 	r->res_first_lkid = lkb->lkb_id;
2786 	send_lookup(r, lkb);
2787 	return 1;
2788 }
2789 
2790 static void process_lookup_list(struct dlm_rsb *r)
2791 {
2792 	struct dlm_lkb *lkb, *safe;
2793 
2794 	list_for_each_entry_safe(lkb, safe, &r->res_lookup, lkb_rsb_lookup) {
2795 		list_del_init(&lkb->lkb_rsb_lookup);
2796 		_request_lock(r, lkb);
2797 		schedule();
2798 	}
2799 }
2800 
2801 /* confirm_master -- confirm (or deny) an rsb's master nodeid */
2802 
2803 static void confirm_master(struct dlm_rsb *r, int error)
2804 {
2805 	struct dlm_lkb *lkb;
2806 
2807 	if (!r->res_first_lkid)
2808 		return;
2809 
2810 	switch (error) {
2811 	case 0:
2812 	case -EINPROGRESS:
2813 		r->res_first_lkid = 0;
2814 		process_lookup_list(r);
2815 		break;
2816 
2817 	case -EAGAIN:
2818 	case -EBADR:
2819 	case -ENOTBLK:
2820 		/* the remote request failed and won't be retried (it was
2821 		   a NOQUEUE, or has been canceled/unlocked); make a waiting
2822 		   lkb the first_lkid */
2823 
2824 		r->res_first_lkid = 0;
2825 
2826 		if (!list_empty(&r->res_lookup)) {
2827 			lkb = list_entry(r->res_lookup.next, struct dlm_lkb,
2828 					 lkb_rsb_lookup);
2829 			list_del_init(&lkb->lkb_rsb_lookup);
2830 			r->res_first_lkid = lkb->lkb_id;
2831 			_request_lock(r, lkb);
2832 		}
2833 		break;
2834 
2835 	default:
2836 		log_error(r->res_ls, "confirm_master unknown error %d", error);
2837 	}
2838 }
2839 
2840 static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
2841 			 int namelen, unsigned long timeout_cs,
2842 			 void (*ast) (void *astparam),
2843 			 void *astparam,
2844 			 void (*bast) (void *astparam, int mode),
2845 			 struct dlm_args *args)
2846 {
2847 	int rv = -EINVAL;
2848 
2849 	/* check for invalid arg usage */
2850 
2851 	if (mode < 0 || mode > DLM_LOCK_EX)
2852 		goto out;
2853 
2854 	if (!(flags & DLM_LKF_CONVERT) && (namelen > DLM_RESNAME_MAXLEN))
2855 		goto out;
2856 
2857 	if (flags & DLM_LKF_CANCEL)
2858 		goto out;
2859 
2860 	if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT))
2861 		goto out;
2862 
2863 	if (flags & DLM_LKF_CONVDEADLK && !(flags & DLM_LKF_CONVERT))
2864 		goto out;
2865 
2866 	if (flags & DLM_LKF_CONVDEADLK && flags & DLM_LKF_NOQUEUE)
2867 		goto out;
2868 
2869 	if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_CONVERT)
2870 		goto out;
2871 
2872 	if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT)
2873 		goto out;
2874 
2875 	if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE)
2876 		goto out;
2877 
2878 	if (flags & DLM_LKF_EXPEDITE && mode != DLM_LOCK_NL)
2879 		goto out;
2880 
2881 	if (!ast || !lksb)
2882 		goto out;
2883 
2884 	if (flags & DLM_LKF_VALBLK && !lksb->sb_lvbptr)
2885 		goto out;
2886 
2887 	if (flags & DLM_LKF_CONVERT && !lksb->sb_lkid)
2888 		goto out;
2889 
2890 	/* these args will be copied to the lkb in validate_lock_args;
2891 	   it cannot be done now because when converting locks, fields in
2892 	   an active lkb cannot be modified before locking the rsb */
2893 
2894 	args->flags = flags;
2895 	args->astfn = ast;
2896 	args->astparam = astparam;
2897 	args->bastfn = bast;
2898 	args->timeout = timeout_cs;
2899 	args->mode = mode;
2900 	args->lksb = lksb;
2901 	rv = 0;
2902  out:
2903 	return rv;
2904 }
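
/*
 * Editorial sketch, not part of the original file: two calls that fail
 * the argument checks above with -EINVAL before reaching the lock
 * manager proper.  The lockspace, lksb and ast handler are assumed.
 */
#if 0	/* example only */
static void example_bad_args(dlm_lockspace_t *ls, struct dlm_lksb *lksb)
{
	int error;

	/* EXPEDITE is only valid for new NL requests */
	error = dlm_lock(ls, DLM_LOCK_EX, lksb, DLM_LKF_EXPEDITE,
			 "example", 7, 0, example_ast, lksb, NULL);

	/* VALBLK requires lksb->sb_lvbptr to be set */
	lksb->sb_lvbptr = NULL;
	error = dlm_lock(ls, DLM_LOCK_PR, lksb, DLM_LKF_VALBLK,
			 "example", 7, 0, example_ast, lksb, NULL);
}
#endif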
2905 
2906 static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args)
2907 {
2908 	if (flags & ~(DLM_LKF_CANCEL | DLM_LKF_VALBLK | DLM_LKF_IVVALBLK |
2909  		      DLM_LKF_FORCEUNLOCK))
2910 		return -EINVAL;
2911 
2912 	if (flags & DLM_LKF_CANCEL && flags & DLM_LKF_FORCEUNLOCK)
2913 		return -EINVAL;
2914 
2915 	args->flags = flags;
2916 	args->astparam = astarg;
2917 	return 0;
2918 }
2919 
2920 static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
2921 			      struct dlm_args *args)
2922 {
2923 	int rv = -EINVAL;
2924 
2925 	if (args->flags & DLM_LKF_CONVERT) {
2926 		if (lkb->lkb_flags & DLM_IFL_MSTCPY)
2927 			goto out;
2928 
2929 		if (args->flags & DLM_LKF_QUECVT &&
2930 		    !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1])
2931 			goto out;
2932 
2933 		rv = -EBUSY;
2934 		if (lkb->lkb_status != DLM_LKSTS_GRANTED)
2935 			goto out;
2936 
2937 		/* lock not allowed if there's any op in progress */
2938 		if (lkb->lkb_wait_type || lkb->lkb_wait_count)
2939 			goto out;
2940 
2941 		if (is_overlap(lkb))
2942 			goto out;
2943 	}
2944 
2945 	lkb->lkb_exflags = args->flags;
2946 	lkb->lkb_sbflags = 0;
2947 	lkb->lkb_astfn = args->astfn;
2948 	lkb->lkb_astparam = args->astparam;
2949 	lkb->lkb_bastfn = args->bastfn;
2950 	lkb->lkb_rqmode = args->mode;
2951 	lkb->lkb_lksb = args->lksb;
2952 	lkb->lkb_lvbptr = args->lksb->sb_lvbptr;
2953 	lkb->lkb_ownpid = (int) current->pid;
2954 	lkb->lkb_timeout_cs = args->timeout;
2955 	rv = 0;
2956  out:
2957 	if (rv)
2958 		log_debug(ls, "validate_lock_args %d %x %x %x %d %d %s",
2959 			  rv, lkb->lkb_id, lkb->lkb_flags, args->flags,
2960 			  lkb->lkb_status, lkb->lkb_wait_type,
2961 			  lkb->lkb_resource->res_name);
2962 	return rv;
2963 }
2964 
2965 /* when dlm_unlock() sees -EBUSY with CANCEL/FORCEUNLOCK it returns 0
2966    for success */
2967 
2968 /* note: it's valid for lkb_nodeid/res_nodeid to be -1 when we get here
2969    because there may be a lookup in progress and it's valid to do
2970    cancel/unlockf on it */
2971 
2972 static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
2973 {
2974 	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
2975 	int rv = -EINVAL;
2976 
2977 	if (lkb->lkb_flags & DLM_IFL_MSTCPY) {
2978 		log_error(ls, "unlock on MSTCPY %x", lkb->lkb_id);
2979 		dlm_print_lkb(lkb);
2980 		goto out;
2981 	}
2982 
2983 	/* an lkb may still exist even though the lock is EOL'ed due to a
2984 	   cancel, unlock or failed noqueue request; an app can't use these
2985 	   locks; return same error as if the lkid had not been found at all */
2986 
2987 	if (lkb->lkb_flags & DLM_IFL_ENDOFLIFE) {
2988 		log_debug(ls, "unlock on ENDOFLIFE %x", lkb->lkb_id);
2989 		rv = -ENOENT;
2990 		goto out;
2991 	}
2992 
2993 	/* an lkb may be waiting for an rsb lookup to complete where the
2994 	   lookup was initiated by another lock */
2995 
2996 	if (!list_empty(&lkb->lkb_rsb_lookup)) {
2997 		if (args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) {
2998 			log_debug(ls, "unlock on rsb_lookup %x", lkb->lkb_id);
2999 			list_del_init(&lkb->lkb_rsb_lookup);
3000 			queue_cast(lkb->lkb_resource, lkb,
3001 				   args->flags & DLM_LKF_CANCEL ?
3002 				   -DLM_ECANCEL : -DLM_EUNLOCK);
3003 			unhold_lkb(lkb); /* undoes create_lkb() */
3004 		}
3005 		/* caller changes -EBUSY to 0 for CANCEL and FORCEUNLOCK */
3006 		rv = -EBUSY;
3007 		goto out;
3008 	}
3009 
3010 	/* cancel not allowed with another cancel/unlock in progress */
3011 
3012 	if (args->flags & DLM_LKF_CANCEL) {
3013 		if (lkb->lkb_exflags & DLM_LKF_CANCEL)
3014 			goto out;
3015 
3016 		if (is_overlap(lkb))
3017 			goto out;
3018 
3019 		/* don't let scand try to do a cancel */
3020 		del_timeout(lkb);
3021 
3022 		if (lkb->lkb_flags & DLM_IFL_RESEND) {
3023 			lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL;
3024 			rv = -EBUSY;
3025 			goto out;
3026 		}
3027 
3028 		/* there's nothing to cancel */
3029 		if (lkb->lkb_status == DLM_LKSTS_GRANTED &&
3030 		    !lkb->lkb_wait_type) {
3031 			rv = -EBUSY;
3032 			goto out;
3033 		}
3034 
3035 		switch (lkb->lkb_wait_type) {
3036 		case DLM_MSG_LOOKUP:
3037 		case DLM_MSG_REQUEST:
3038 			lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL;
3039 			rv = -EBUSY;
3040 			goto out;
3041 		case DLM_MSG_UNLOCK:
3042 		case DLM_MSG_CANCEL:
3043 			goto out;
3044 		}
3045 		/* add_to_waiters() will set OVERLAP_CANCEL */
3046 		goto out_ok;
3047 	}
3048 
3049 	/* do we need to allow a force-unlock if there's a normal unlock
3050 	   already in progress?  in what conditions could the normal unlock
3051 	   fail such that we'd want to send a force-unlock to be sure? */
3052 
3053 	if (args->flags & DLM_LKF_FORCEUNLOCK) {
3054 		if (lkb->lkb_exflags & DLM_LKF_FORCEUNLOCK)
3055 			goto out;
3056 
3057 		if (is_overlap_unlock(lkb))
3058 			goto out;
3059 
3060 		/* don't let scand try to do a cancel */
3061 		del_timeout(lkb);
3062 
3063 		if (lkb->lkb_flags & DLM_IFL_RESEND) {
3064 			lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK;
3065 			rv = -EBUSY;
3066 			goto out;
3067 		}
3068 
3069 		switch (lkb->lkb_wait_type) {
3070 		case DLM_MSG_LOOKUP:
3071 		case DLM_MSG_REQUEST:
3072 			lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK;
3073 			rv = -EBUSY;
3074 			goto out;
3075 		case DLM_MSG_UNLOCK:
3076 			goto out;
3077 		}
3078 		/* add_to_waiters() will set OVERLAP_UNLOCK */
3079 		goto out_ok;
3080 	}
3081 
3082 	/* normal unlock not allowed if there's any op in progress */
3083 	rv = -EBUSY;
3084 	if (lkb->lkb_wait_type || lkb->lkb_wait_count)
3085 		goto out;
3086 
3087  out_ok:
3088 	/* an overlapping op shouldn't blow away exflags from other op */
3089 	lkb->lkb_exflags |= args->flags;
3090 	lkb->lkb_sbflags = 0;
3091 	lkb->lkb_astparam = args->astparam;
3092 	rv = 0;
3093  out:
3094 	if (rv)
3095 		log_debug(ls, "validate_unlock_args %d %x %x %x %x %d %s", rv,
3096 			  lkb->lkb_id, lkb->lkb_flags, lkb->lkb_exflags,
3097 			  args->flags, lkb->lkb_wait_type,
3098 			  lkb->lkb_resource->res_name);
3099 	return rv;
3100 }
3101 
3102 /*
3103  * Four stage 4 varieties:
3104  * do_request(), do_convert(), do_unlock(), do_cancel()
3105  * These are called on the master node for the given lock, from
3106  * the central locking logic.
3107  */
3108 
3109 static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
3110 {
3111 	int error = 0;
3112 
3113 	if (can_be_granted(r, lkb, 1, 0, NULL)) {
3114 		grant_lock(r, lkb);
3115 		queue_cast(r, lkb, 0);
3116 		goto out;
3117 	}
3118 
3119 	if (can_be_queued(lkb)) {
3120 		error = -EINPROGRESS;
3121 		add_lkb(r, lkb, DLM_LKSTS_WAITING);
3122 		add_timeout(lkb);
3123 		goto out;
3124 	}
3125 
3126 	error = -EAGAIN;
3127 	queue_cast(r, lkb, -EAGAIN);
3128  out:
3129 	return error;
3130 }
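
/*
 * Editorial sketch, not part of the original file: do_request() as seen
 * by a NOQUEUE caller, i.e. a trylock.  can_be_queued() is false for
 * NOQUEUE, so a request that can't be granted immediately completes with
 * -EAGAIN instead of waiting.  The lockspace, lksb and ast are assumed.
 */
#if 0	/* example only */
static int example_trylock(dlm_lockspace_t *ls, struct dlm_lksb *lksb)
{
	/* dlm_lock() maps the synchronous -EAGAIN to 0; the ast then
	   runs with lksb->sb_status == -EAGAIN if the resource was busy */
	return dlm_lock(ls, DLM_LOCK_EX, lksb, DLM_LKF_NOQUEUE,
			"example", 7, 0, example_ast, lksb, NULL);
}
#endif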
3131 
3132 static void do_request_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
3133 			       int error)
3134 {
3135 	switch (error) {
3136 	case -EAGAIN:
3137 		if (force_blocking_asts(lkb))
3138 			send_blocking_asts_all(r, lkb);
3139 		break;
3140 	case -EINPROGRESS:
3141 		send_blocking_asts(r, lkb);
3142 		break;
3143 	}
3144 }
3145 
3146 static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
3147 {
3148 	int error = 0;
3149 	int deadlk = 0;
3150 
3151 	/* changing an existing lock may allow others to be granted */
3152 
3153 	if (can_be_granted(r, lkb, 1, 0, &deadlk)) {
3154 		grant_lock(r, lkb);
3155 		queue_cast(r, lkb, 0);
3156 		goto out;
3157 	}
3158 
3159 	/* can_be_granted() detected that this lock would block in a conversion
3160 	   deadlock, so we leave it on the granted queue and return EDEADLK in
3161 	   the ast for the convert. */
3162 
3163 	if (deadlk && !(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) {
3164 		/* it's left on the granted queue */
3165 		revert_lock(r, lkb);
3166 		queue_cast(r, lkb, -EDEADLK);
3167 		error = -EDEADLK;
3168 		goto out;
3169 	}
3170 
3171 	/* is_demoted() means the can_be_granted() above set the grmode
3172 	   to NL, and left us on the granted queue.  This auto-demotion
3173 	   (due to CONVDEADLK) might mean other locks, and/or this lock, are
3174 	   now grantable.  We have to try to grant other converting locks
3175 	   before we try again to grant this one. */
3176 
3177 	if (is_demoted(lkb)) {
3178 		grant_pending_convert(r, DLM_LOCK_IV, NULL, NULL);
3179 		if (_can_be_granted(r, lkb, 1, 0)) {
3180 			grant_lock(r, lkb);
3181 			queue_cast(r, lkb, 0);
3182 			goto out;
3183 		}
3184 		/* else fall through and move to convert queue */
3185 	}
3186 
3187 	if (can_be_queued(lkb)) {
3188 		error = -EINPROGRESS;
3189 		del_lkb(r, lkb);
3190 		add_lkb(r, lkb, DLM_LKSTS_CONVERT);
3191 		add_timeout(lkb);
3192 		goto out;
3193 	}
3194 
3195 	error = -EAGAIN;
3196 	queue_cast(r, lkb, -EAGAIN);
3197  out:
3198 	return error;
3199 }
3200 
3201 static void do_convert_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
3202 			       int error)
3203 {
3204 	switch (error) {
3205 	case 0:
3206 		grant_pending_locks(r, NULL);
3207 		/* grant_pending_locks also sends basts */
3208 		break;
3209 	case -EAGAIN:
3210 		if (force_blocking_asts(lkb))
3211 			send_blocking_asts_all(r, lkb);
3212 		break;
3213 	case -EINPROGRESS:
3214 		send_blocking_asts(r, lkb);
3215 		break;
3216 	}
3217 }
3218 
3219 static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
3220 {
3221 	remove_lock(r, lkb);
3222 	queue_cast(r, lkb, -DLM_EUNLOCK);
3223 	return -DLM_EUNLOCK;
3224 }
3225 
3226 static void do_unlock_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
3227 			      int error)
3228 {
3229 	grant_pending_locks(r, NULL);
3230 }
3231 
3232 /* returns: 0 did nothing, -DLM_ECANCEL canceled lock */
3233 
3234 static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
3235 {
3236 	int error;
3237 
3238 	error = revert_lock(r, lkb);
3239 	if (error) {
3240 		queue_cast(r, lkb, -DLM_ECANCEL);
3241 		return -DLM_ECANCEL;
3242 	}
3243 	return 0;
3244 }
3245 
3246 static void do_cancel_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
3247 			      int error)
3248 {
3249 	if (error)
3250 		grant_pending_locks(r, NULL);
3251 }
3252 
3253 /*
3254  * Four stage 3 varieties:
3255  * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock()
3256  */
3257 
3258 /* add a new lkb to a possibly new rsb, called by requesting process */
3259 
3260 static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
3261 {
3262 	int error;
3263 
3264 	/* set_master: sets lkb nodeid from r */
3265 
3266 	error = set_master(r, lkb);
3267 	if (error < 0)
3268 		goto out;
3269 	if (error) {
3270 		error = 0;
3271 		goto out;
3272 	}
3273 
3274 	if (is_remote(r)) {
3275 		/* receive_request() calls do_request() on remote node */
3276 		error = send_request(r, lkb);
3277 	} else {
3278 		error = do_request(r, lkb);
3279 		/* for remote locks the request_reply is sent
3280 		   between do_request and do_request_effects */
3281 		do_request_effects(r, lkb, error);
3282 	}
3283  out:
3284 	return error;
3285 }
3286 
3287 /* change some property of an existing lkb, e.g. mode */
3288 
3289 static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
3290 {
3291 	int error;
3292 
3293 	if (is_remote(r)) {
3294 		/* receive_convert() calls do_convert() on remote node */
3295 		error = send_convert(r, lkb);
3296 	} else {
3297 		error = do_convert(r, lkb);
3298 		/* for remote locks the convert_reply is sent
3299 		   between do_convert and do_convert_effects */
3300 		do_convert_effects(r, lkb, error);
3301 	}
3302 
3303 	return error;
3304 }
3305 
3306 /* remove an existing lkb from the granted queue */
3307 
3308 static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
3309 {
3310 	int error;
3311 
3312 	if (is_remote(r)) {
3313 		/* receive_unlock() calls do_unlock() on remote node */
3314 		error = send_unlock(r, lkb);
3315 	} else {
3316 		error = do_unlock(r, lkb);
3317 		/* for remote locks the unlock_reply is sent
3318 		   between do_unlock and do_unlock_effects */
3319 		do_unlock_effects(r, lkb, error);
3320 	}
3321 
3322 	return error;
3323 }
3324 
3325 /* remove an existing lkb from the convert or wait queue */
3326 
3327 static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
3328 {
3329 	int error;
3330 
3331 	if (is_remote(r)) {
3332 		/* receive_cancel() calls do_cancel() on remote node */
3333 		error = send_cancel(r, lkb);
3334 	} else {
3335 		error = do_cancel(r, lkb);
3336 		/* for remote locks the cancel_reply is sent
3337 		   between do_cancel and do_cancel_effects */
3338 		do_cancel_effects(r, lkb, error);
3339 	}
3340 
3341 	return error;
3342 }
3343 
3344 /*
3345  * Four stage 2 varieties:
3346  * request_lock(), convert_lock(), unlock_lock(), cancel_lock()
3347  */
3348 
3349 static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, char *name,
3350 			int len, struct dlm_args *args)
3351 {
3352 	struct dlm_rsb *r;
3353 	int error;
3354 
3355 	error = validate_lock_args(ls, lkb, args);
3356 	if (error)
3357 		return error;
3358 
3359 	error = find_rsb(ls, name, len, 0, R_REQUEST, &r);
3360 	if (error)
3361 		return error;
3362 
3363 	lock_rsb(r);
3364 
3365 	attach_lkb(r, lkb);
3366 	lkb->lkb_lksb->sb_lkid = lkb->lkb_id;
3367 
3368 	error = _request_lock(r, lkb);
3369 
3370 	unlock_rsb(r);
3371 	put_rsb(r);
3372 	return error;
3373 }
3374 
3375 static int convert_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
3376 			struct dlm_args *args)
3377 {
3378 	struct dlm_rsb *r;
3379 	int error;
3380 
3381 	r = lkb->lkb_resource;
3382 
3383 	hold_rsb(r);
3384 	lock_rsb(r);
3385 
3386 	error = validate_lock_args(ls, lkb, args);
3387 	if (error)
3388 		goto out;
3389 
3390 	error = _convert_lock(r, lkb);
3391  out:
3392 	unlock_rsb(r);
3393 	put_rsb(r);
3394 	return error;
3395 }
3396 
3397 static int unlock_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
3398 		       struct dlm_args *args)
3399 {
3400 	struct dlm_rsb *r;
3401 	int error;
3402 
3403 	r = lkb->lkb_resource;
3404 
3405 	hold_rsb(r);
3406 	lock_rsb(r);
3407 
3408 	error = validate_unlock_args(lkb, args);
3409 	if (error)
3410 		goto out;
3411 
3412 	error = _unlock_lock(r, lkb);
3413  out:
3414 	unlock_rsb(r);
3415 	put_rsb(r);
3416 	return error;
3417 }
3418 
3419 static int cancel_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
3420 		       struct dlm_args *args)
3421 {
3422 	struct dlm_rsb *r;
3423 	int error;
3424 
3425 	r = lkb->lkb_resource;
3426 
3427 	hold_rsb(r);
3428 	lock_rsb(r);
3429 
3430 	error = validate_unlock_args(lkb, args);
3431 	if (error)
3432 		goto out;
3433 
3434 	error = _cancel_lock(r, lkb);
3435  out:
3436 	unlock_rsb(r);
3437 	put_rsb(r);
3438 	return error;
3439 }
3440 
3441 /*
3442  * Two stage 1 varieties:  dlm_lock() and dlm_unlock()
3443  */
3444 
3445 int dlm_lock(dlm_lockspace_t *lockspace,
3446 	     int mode,
3447 	     struct dlm_lksb *lksb,
3448 	     uint32_t flags,
3449 	     void *name,
3450 	     unsigned int namelen,
3451 	     uint32_t parent_lkid,
3452 	     void (*ast) (void *astarg),
3453 	     void *astarg,
3454 	     void (*bast) (void *astarg, int mode))
3455 {
3456 	struct dlm_ls *ls;
3457 	struct dlm_lkb *lkb;
3458 	struct dlm_args args;
3459 	int error, convert = flags & DLM_LKF_CONVERT;
3460 
3461 	ls = dlm_find_lockspace_local(lockspace);
3462 	if (!ls)
3463 		return -EINVAL;
3464 
3465 	dlm_lock_recovery(ls);
3466 
3467 	if (convert)
3468 		error = find_lkb(ls, lksb->sb_lkid, &lkb);
3469 	else
3470 		error = create_lkb(ls, &lkb);
3471 
3472 	if (error)
3473 		goto out;
3474 
3475 	trace_dlm_lock_start(ls, lkb, mode, flags);
3476 
3477 	error = set_lock_args(mode, lksb, flags, namelen, 0, ast,
3478 			      astarg, bast, &args);
3479 	if (error)
3480 		goto out_put;
3481 
3482 	if (convert)
3483 		error = convert_lock(ls, lkb, &args);
3484 	else
3485 		error = request_lock(ls, lkb, name, namelen, &args);
3486 
3487 	if (error == -EINPROGRESS)
3488 		error = 0;
3489  out_put:
3490 	trace_dlm_lock_end(ls, lkb, mode, flags, error);
3491 
3492 	if (convert || error)
3493 		__put_lkb(ls, lkb);
3494 	if (error == -EAGAIN || error == -EDEADLK)
3495 		error = 0;
3496  out:
3497 	dlm_unlock_recovery(ls);
3498 	dlm_put_lockspace(ls);
3499 	return error;
3500 }
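
/*
 * Editorial sketch, not part of the original file: minimal in-kernel use
 * of dlm_lock().  A return of 0 means the request was accepted
 * (-EINPROGRESS is folded to 0 above); the final result arrives in
 * sb_status when the completion ast runs.  The lockspace is assumed to
 * come from dlm_new_lockspace() (not shown).
 */
#if 0	/* example only */
static struct dlm_lksb example_lksb;
static DECLARE_COMPLETION(example_granted);

static void example_ast(void *arg)
{
	complete(&example_granted);
}

static int example_take_lock(dlm_lockspace_t *ls)
{
	int error;

	error = dlm_lock(ls, DLM_LOCK_EX, &example_lksb, 0, "example", 7,
			 0, example_ast, &example_lksb, NULL);
	if (error)
		return error;
	wait_for_completion(&example_granted);
	return example_lksb.sb_status;	/* 0 on grant */
}
#endif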
3501 
3502 int dlm_unlock(dlm_lockspace_t *lockspace,
3503 	       uint32_t lkid,
3504 	       uint32_t flags,
3505 	       struct dlm_lksb *lksb,
3506 	       void *astarg)
3507 {
3508 	struct dlm_ls *ls;
3509 	struct dlm_lkb *lkb;
3510 	struct dlm_args args;
3511 	int error;
3512 
3513 	ls = dlm_find_lockspace_local(lockspace);
3514 	if (!ls)
3515 		return -EINVAL;
3516 
3517 	dlm_lock_recovery(ls);
3518 
3519 	error = find_lkb(ls, lkid, &lkb);
3520 	if (error)
3521 		goto out;
3522 
3523 	trace_dlm_unlock_start(ls, lkb, flags);
3524 
3525 	error = set_unlock_args(flags, astarg, &args);
3526 	if (error)
3527 		goto out_put;
3528 
3529 	if (flags & DLM_LKF_CANCEL)
3530 		error = cancel_lock(ls, lkb, &args);
3531 	else
3532 		error = unlock_lock(ls, lkb, &args);
3533 
3534 	if (error == -DLM_EUNLOCK || error == -DLM_ECANCEL)
3535 		error = 0;
3536 	if (error == -EBUSY && (flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)))
3537 		error = 0;
3538  out_put:
3539 	trace_dlm_unlock_end(ls, lkb, flags, error);
3540 
3541 	dlm_put_lkb(lkb);
3542  out:
3543 	dlm_unlock_recovery(ls);
3544 	dlm_put_lockspace(ls);
3545 	return error;
3546 }
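
/*
 * Editorial sketch, not part of the original file: releasing the lock
 * taken in the previous sketch.  The completion ast receives -DLM_EUNLOCK
 * (-DLM_ECANCEL for a cancel), which dlm_unlock() itself reports as 0.
 */
#if 0	/* example only */
static int example_drop_lock(dlm_lockspace_t *ls)
{
	return dlm_unlock(ls, example_lksb.sb_lkid, 0, &example_lksb,
			  &example_lksb);
}
#endif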
3547 
3548 /*
3549  * send/receive routines for remote operations and replies
3550  *
3551  * send_args
3552  * send_common
3553  * send_request			receive_request
3554  * send_convert			receive_convert
3555  * send_unlock			receive_unlock
3556  * send_cancel			receive_cancel
3557  * send_grant			receive_grant
3558  * send_bast			receive_bast
3559  * send_lookup			receive_lookup
3560  * send_remove			receive_remove
3561  *
3562  * 				send_common_reply
3563  * receive_request_reply	send_request_reply
3564  * receive_convert_reply	send_convert_reply
3565  * receive_unlock_reply		send_unlock_reply
3566  * receive_cancel_reply		send_cancel_reply
3567  * receive_lookup_reply		send_lookup_reply
3568  */
3569 
3570 static int _create_message(struct dlm_ls *ls, int mb_len,
3571 			   int to_nodeid, int mstype,
3572 			   struct dlm_message **ms_ret,
3573 			   struct dlm_mhandle **mh_ret)
3574 {
3575 	struct dlm_message *ms;
3576 	struct dlm_mhandle *mh;
3577 	char *mb;
3578 
3579 	/* dlm_midcomms_get_mhandle() gives us a message handle (mh) that
3580 	   we need to pass into dlm_midcomms_commit_mhandle() and a message
3581 	   buffer (mb) that we write our data into */
3582 
3583 	mh = dlm_midcomms_get_mhandle(to_nodeid, mb_len, GFP_NOFS, &mb);
3584 	if (!mh)
3585 		return -ENOBUFS;
3586 
3587 	ms = (struct dlm_message *) mb;
3588 
3589 	ms->m_header.h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
3590 	ms->m_header.u.h_lockspace = cpu_to_le32(ls->ls_global_id);
3591 	ms->m_header.h_nodeid = cpu_to_le32(dlm_our_nodeid());
3592 	ms->m_header.h_length = cpu_to_le16(mb_len);
3593 	ms->m_header.h_cmd = DLM_MSG;
3594 
3595 	ms->m_type = cpu_to_le32(mstype);
3596 
3597 	*mh_ret = mh;
3598 	*ms_ret = ms;
3599 	return 0;
3600 }
3601 
3602 static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
3603 			  int to_nodeid, int mstype,
3604 			  struct dlm_message **ms_ret,
3605 			  struct dlm_mhandle **mh_ret)
3606 {
3607 	int mb_len = sizeof(struct dlm_message);
3608 
3609 	switch (mstype) {
3610 	case DLM_MSG_REQUEST:
3611 	case DLM_MSG_LOOKUP:
3612 	case DLM_MSG_REMOVE:
3613 		mb_len += r->res_length;
3614 		break;
3615 	case DLM_MSG_CONVERT:
3616 	case DLM_MSG_UNLOCK:
3617 	case DLM_MSG_REQUEST_REPLY:
3618 	case DLM_MSG_CONVERT_REPLY:
3619 	case DLM_MSG_GRANT:
3620 		if (lkb && lkb->lkb_lvbptr)
3621 			mb_len += r->res_ls->ls_lvblen;
3622 		break;
3623 	}
3624 
3625 	return _create_message(r->res_ls, mb_len, to_nodeid, mstype,
3626 			       ms_ret, mh_ret);
3627 }
3628 
3629 /* further lowcomms enhancements or alternate implementations may make
3630    the return value from this function useful at some point */
3631 
3632 static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms)
3633 {
3634 	dlm_midcomms_commit_mhandle(mh);
3635 	return 0;
3636 }
3637 
3638 static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb,
3639 		      struct dlm_message *ms)
3640 {
3641 	ms->m_nodeid   = cpu_to_le32(lkb->lkb_nodeid);
3642 	ms->m_pid      = cpu_to_le32(lkb->lkb_ownpid);
3643 	ms->m_lkid     = cpu_to_le32(lkb->lkb_id);
3644 	ms->m_remid    = cpu_to_le32(lkb->lkb_remid);
3645 	ms->m_exflags  = cpu_to_le32(lkb->lkb_exflags);
3646 	ms->m_sbflags  = cpu_to_le32(lkb->lkb_sbflags);
3647 	ms->m_flags    = cpu_to_le32(lkb->lkb_flags);
3648 	ms->m_lvbseq   = cpu_to_le32(lkb->lkb_lvbseq);
3649 	ms->m_status   = cpu_to_le32(lkb->lkb_status);
3650 	ms->m_grmode   = cpu_to_le32(lkb->lkb_grmode);
3651 	ms->m_rqmode   = cpu_to_le32(lkb->lkb_rqmode);
3652 	ms->m_hash     = cpu_to_le32(r->res_hash);
3653 
3654 	/* m_result and m_bastmode are set from function args,
3655 	   not from lkb fields */
3656 
3657 	if (lkb->lkb_bastfn)
3658 		ms->m_asts |= cpu_to_le32(DLM_CB_BAST);
3659 	if (lkb->lkb_astfn)
3660 		ms->m_asts |= cpu_to_le32(DLM_CB_CAST);
3661 
3662 	/* compare with switch in create_message; send_remove() doesn't
3663 	   use send_args() */
3664 
3665 	switch (ms->m_type) {
3666 	case cpu_to_le32(DLM_MSG_REQUEST):
3667 	case cpu_to_le32(DLM_MSG_LOOKUP):
3668 		memcpy(ms->m_extra, r->res_name, r->res_length);
3669 		break;
3670 	case cpu_to_le32(DLM_MSG_CONVERT):
3671 	case cpu_to_le32(DLM_MSG_UNLOCK):
3672 	case cpu_to_le32(DLM_MSG_REQUEST_REPLY):
3673 	case cpu_to_le32(DLM_MSG_CONVERT_REPLY):
3674 	case cpu_to_le32(DLM_MSG_GRANT):
3675 		if (!lkb->lkb_lvbptr)
3676 			break;
3677 		memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
3678 		break;
3679 	}
3680 }
3681 
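/* send_common() is the shared send path for the four main operations
   (request, convert, unlock, cancel).  The lkb is put on the waiters
   list before sending so the eventual reply can be matched back to it;
   if creating or sending the message fails, the waiters entry is
   removed again. */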
3682 static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
3683 {
3684 	struct dlm_message *ms;
3685 	struct dlm_mhandle *mh;
3686 	int to_nodeid, error;
3687 
3688 	to_nodeid = r->res_nodeid;
3689 
3690 	error = add_to_waiters(lkb, mstype, to_nodeid);
3691 	if (error)
3692 		return error;
3693 
3694 	error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
3695 	if (error)
3696 		goto fail;
3697 
3698 	send_args(r, lkb, ms);
3699 
3700 	error = send_message(mh, ms);
3701 	if (error)
3702 		goto fail;
3703 	return 0;
3704 
3705  fail:
3706 	remove_from_waiters(lkb, msg_reply_type(mstype));
3707 	return error;
3708 }
3709 
3710 static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
3711 {
3712 	return send_common(r, lkb, DLM_MSG_REQUEST);
3713 }
3714 
3715 static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
3716 {
3717 	int error;
3718 
3719 	error = send_common(r, lkb, DLM_MSG_CONVERT);
3720 
3721 	/* down conversions go without a reply from the master */
3722 	if (!error && down_conversion(lkb)) {
3723 		remove_from_waiters(lkb, DLM_MSG_CONVERT_REPLY);
3724 		r->res_ls->ls_stub_ms.m_flags = cpu_to_le32(DLM_IFL_STUB_MS);
3725 		r->res_ls->ls_stub_ms.m_type = cpu_to_le32(DLM_MSG_CONVERT_REPLY);
3726 		r->res_ls->ls_stub_ms.m_result = 0;
3727 		__receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms);
3728 	}
3729 
3730 	return error;
3731 }
3732 
3733 /* FIXME: if this lkb is the only lock we hold on the rsb, then set
3734    MASTER_UNCERTAIN to force the next request on the rsb to confirm
3735    that the master is still correct. */
3736 
3737 static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
3738 {
3739 	return send_common(r, lkb, DLM_MSG_UNLOCK);
3740 }
3741 
3742 static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
3743 {
3744 	return send_common(r, lkb, DLM_MSG_CANCEL);
3745 }
3746 
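/* send_grant() is sent by the master to the node holding the lock when
   the lock is granted.  It is unsolicited, so there is no waiters
   bookkeeping and no reply message. */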
3747 static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb)
3748 {
3749 	struct dlm_message *ms;
3750 	struct dlm_mhandle *mh;
3751 	int to_nodeid, error;
3752 
3753 	to_nodeid = lkb->lkb_nodeid;
3754 
3755 	error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh);
3756 	if (error)
3757 		goto out;
3758 
3759 	send_args(r, lkb, ms);
3760 
3761 	ms->m_result = 0;
3762 
3763 	error = send_message(mh, ms);
3764  out:
3765 	return error;
3766 }
3767 
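/* send_bast() notifies the node holding a lock that it is blocking a
   request for the given mode.  Like grant, it is async with no reply. */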
3768 static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode)
3769 {
3770 	struct dlm_message *ms;
3771 	struct dlm_mhandle *mh;
3772 	int to_nodeid, error;
3773 
3774 	to_nodeid = lkb->lkb_nodeid;
3775 
3776 	error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh);
3777 	if (error)
3778 		goto out;
3779 
3780 	send_args(r, lkb, ms);
3781 
3782 	ms->m_bastmode = cpu_to_le32(mode);
3783 
3784 	error = send_message(mh, ms);
3785  out:
3786 	return error;
3787 }
3788 
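/* send_lookup() asks the directory node which node masters this rsb.
   The lkb goes on the waiters list, as in send_common(), so the
   lookup_reply can be matched back to it. */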
3789 static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
3790 {
3791 	struct dlm_message *ms;
3792 	struct dlm_mhandle *mh;
3793 	int to_nodeid, error;
3794 
3795 	to_nodeid = dlm_dir_nodeid(r);
3796 
3797 	error = add_to_waiters(lkb, DLM_MSG_LOOKUP, to_nodeid);
3798 	if (error)
3799 		return error;
3800 
3801 	error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh);
3802 	if (error)
3803 		goto fail;
3804 
3805 	send_args(r, lkb, ms);
3806 
3807 	error = send_message(mh, ms);
3808 	if (error)
3809 		goto fail;
3810 	return 0;
3811 
3812  fail:
3813 	remove_from_waiters(lkb, DLM_MSG_LOOKUP_REPLY);
3814 	return error;
3815 }
3816 
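/* send_remove() tells the directory node to forget an rsb we are no
   longer using.  No lkb is involved and no reply is sent, so the
   message is filled in directly instead of through send_args(). */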
3817 static int send_remove(struct dlm_rsb *r)
3818 {
3819 	struct dlm_message *ms;
3820 	struct dlm_mhandle *mh;
3821 	int to_nodeid, error;
3822 
3823 	to_nodeid = dlm_dir_nodeid(r);
3824 
3825 	error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh);
3826 	if (error)
3827 		goto out;
3828 
3829 	memcpy(ms->m_extra, r->res_name, r->res_length);
3830 	ms->m_hash = cpu_to_le32(r->res_hash);
3831 
3832 	error = send_message(mh, ms);
3833  out:
3834 	return error;
3835 }
3836 
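/* Reply path shared by the four main operations: echo the lkb state
   back with send_args() and carry the do_xxxx() result in m_result. */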
3837 static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
3838 			     int mstype, int rv)
3839 {
3840 	struct dlm_message *ms;
3841 	struct dlm_mhandle *mh;
3842 	int to_nodeid, error;
3843 
3844 	to_nodeid = lkb->lkb_nodeid;
3845 
3846 	error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
3847 	if (error)
3848 		goto out;
3849 
3850 	send_args(r, lkb, ms);
3851 
3852 	ms->m_result = cpu_to_le32(to_dlm_errno(rv));
3853 
3854 	error = send_message(mh, ms);
3855  out:
3856 	return error;
3857 }
3858 
3859 static int send_request_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
3860 {
3861 	return send_common_reply(r, lkb, DLM_MSG_REQUEST_REPLY, rv);
3862 }
3863 
3864 static int send_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
3865 {
3866 	return send_common_reply(r, lkb, DLM_MSG_CONVERT_REPLY, rv);
3867 }
3868 
3869 static int send_unlock_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
3870 {
3871 	return send_common_reply(r, lkb, DLM_MSG_UNLOCK_REPLY, rv);
3872 }
3873 
3874 static int send_cancel_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
3875 {
3876 	return send_common_reply(r, lkb, DLM_MSG_CANCEL_REPLY, rv);
3877 }
3878 
3879 static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in,
3880 			     int ret_nodeid, int rv)
3881 {
3882 	struct dlm_rsb *r = &ls->ls_stub_rsb;
3883 	struct dlm_message *ms;
3884 	struct dlm_mhandle *mh;
3885 	int error, nodeid = le32_to_cpu(ms_in->m_header.h_nodeid);
3886 
3887 	error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh);
3888 	if (error)
3889 		goto out;
3890 
3891 	ms->m_lkid = ms_in->m_lkid;
3892 	ms->m_result = cpu_to_le32(to_dlm_errno(rv));
3893 	ms->m_nodeid = cpu_to_le32(ret_nodeid);
3894 
3895 	error = send_message(mh, ms);
3896  out:
3897 	return error;
3898 }
3899 
3900 /* which args we save from a received message depends heavily on the type
3901    of message, unlike the send side where we can safely send everything about
3902    the lkb for any type of message */
3903 
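/* Only the low 16 bits of lkb_flags travel in messages; the high 16
   bits are node-local state, so the masks below keep our local high
   bits and take the remote's low bits. */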
3904 static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
3905 {
3906 	lkb->lkb_exflags = le32_to_cpu(ms->m_exflags);
3907 	lkb->lkb_sbflags = le32_to_cpu(ms->m_sbflags);
3908 	lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
3909 			  (le32_to_cpu(ms->m_flags) & 0x0000FFFF);
3910 }
3911 
3912 static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
3913 {
3914 	if (ms->m_flags == cpu_to_le32(DLM_IFL_STUB_MS))
3915 		return;
3916 
3917 	lkb->lkb_sbflags = le32_to_cpu(ms->m_sbflags);
3918 	lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
3919 			 (le32_to_cpu(ms->m_flags) & 0x0000FFFF);
3920 }
3921 
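/* The fixed struct dlm_message is followed by a variable-length area
   (m_extra) holding either a resource name or an lvb;
   receive_extralen() returns its size. */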
3922 static int receive_extralen(struct dlm_message *ms)
3923 {
3924 	return (le16_to_cpu(ms->m_header.h_length) -
3925 		sizeof(struct dlm_message));
3926 }
3927 
3928 static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
3929 		       struct dlm_message *ms)
3930 {
3931 	int len;
3932 
3933 	if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
3934 		if (!lkb->lkb_lvbptr)
3935 			lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
3936 		if (!lkb->lkb_lvbptr)
3937 			return -ENOMEM;
3938 		len = receive_extralen(ms);
3939 		if (len > ls->ls_lvblen)
3940 			len = ls->ls_lvblen;
3941 		memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
3942 	}
3943 	return 0;
3944 }
3945 
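/* A master copy lkb only needs to record whether the process copy has
   ast/bast callbacks (send_args() encodes their presence in m_asts), so
   these stand-ins are stored as the callback pointers; they should
   never actually be called. */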
3946 static void fake_bastfn(void *astparam, int mode)
3947 {
3948 	log_print("fake_bastfn should not be called");
3949 }
3950 
3951 static void fake_astfn(void *astparam)
3952 {
3953 	log_print("fake_astfn should not be called");
3954 }
3955 
3956 static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
3957 				struct dlm_message *ms)
3958 {
3959 	lkb->lkb_nodeid = le32_to_cpu(ms->m_header.h_nodeid);
3960 	lkb->lkb_ownpid = le32_to_cpu(ms->m_pid);
3961 	lkb->lkb_remid = le32_to_cpu(ms->m_lkid);
3962 	lkb->lkb_grmode = DLM_LOCK_IV;
3963 	lkb->lkb_rqmode = le32_to_cpu(ms->m_rqmode);
3964 
3965 	lkb->lkb_bastfn = (ms->m_asts & cpu_to_le32(DLM_CB_BAST)) ? &fake_bastfn : NULL;
3966 	lkb->lkb_astfn = (ms->m_asts & cpu_to_le32(DLM_CB_CAST)) ? &fake_astfn : NULL;
3967 
3968 	if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
3969 		/* lkb was just created so there won't be an lvb yet */
3970 		lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
3971 		if (!lkb->lkb_lvbptr)
3972 			return -ENOMEM;
3973 	}
3974 
3975 	return 0;
3976 }
3977 
3978 static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
3979 				struct dlm_message *ms)
3980 {
3981 	if (lkb->lkb_status != DLM_LKSTS_GRANTED)
3982 		return -EBUSY;
3983 
3984 	if (receive_lvb(ls, lkb, ms))
3985 		return -ENOMEM;
3986 
3987 	lkb->lkb_rqmode = le32_to_cpu(ms->m_rqmode);
3988 	lkb->lkb_lvbseq = le32_to_cpu(ms->m_lvbseq);
3989 
3990 	return 0;
3991 }
3992 
3993 static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
3994 			       struct dlm_message *ms)
3995 {
3996 	if (receive_lvb(ls, lkb, ms))
3997 		return -ENOMEM;
3998 	return 0;
3999 }
4000 
4001 /* We fill in the stub-lkb fields with the info that send_xxxx_reply()
4002    uses to send a reply and that the remote end uses to process the reply. */
4003 
4004 static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms)
4005 {
4006 	struct dlm_lkb *lkb = &ls->ls_stub_lkb;
4007 	lkb->lkb_nodeid = le32_to_cpu(ms->m_header.h_nodeid);
4008 	lkb->lkb_remid = le32_to_cpu(ms->m_lkid);
4009 }
4010 
4011 /* This is called after the rsb is locked so that we can safely inspect
4012    fields in the lkb. */
4013 
4014 static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms)
4015 {
4016 	int from = le32_to_cpu(ms->m_header.h_nodeid);
4017 	int error = 0;
4018 
4019 	/* currently mixing of user/kernel locks is not supported */
4020 	if (ms->m_flags & cpu_to_le32(DLM_IFL_USER) &&
4021 	    ~lkb->lkb_flags & DLM_IFL_USER) {
4022 		log_error(lkb->lkb_resource->res_ls,
4023 			  "got user dlm message for a kernel lock");
4024 		error = -EINVAL;
4025 		goto out;
4026 	}
4027 
4028 	switch (ms->m_type) {
4029 	case cpu_to_le32(DLM_MSG_CONVERT):
4030 	case cpu_to_le32(DLM_MSG_UNLOCK):
4031 	case cpu_to_le32(DLM_MSG_CANCEL):
4032 		if (!is_master_copy(lkb) || lkb->lkb_nodeid != from)
4033 			error = -EINVAL;
4034 		break;
4035 
4036 	case cpu_to_le32(DLM_MSG_CONVERT_REPLY):
4037 	case cpu_to_le32(DLM_MSG_UNLOCK_REPLY):
4038 	case cpu_to_le32(DLM_MSG_CANCEL_REPLY):
4039 	case cpu_to_le32(DLM_MSG_GRANT):
4040 	case cpu_to_le32(DLM_MSG_BAST):
4041 		if (!is_process_copy(lkb) || lkb->lkb_nodeid != from)
4042 			error = -EINVAL;
4043 		break;
4044 
4045 	case cpu_to_le32(DLM_MSG_REQUEST_REPLY):
4046 		if (!is_process_copy(lkb))
4047 			error = -EINVAL;
4048 		else if (lkb->lkb_nodeid != -1 && lkb->lkb_nodeid != from)
4049 			error = -EINVAL;
4050 		break;
4051 
4052 	default:
4053 		error = -EINVAL;
4054 	}
4055 
4056 out:
4057 	if (error)
4058 		log_error(lkb->lkb_resource->res_ls,
4059 			  "ignore invalid message %d from %d %x %x %x %d",
4060 			  le32_to_cpu(ms->m_type), from, lkb->lkb_id,
4061 			  lkb->lkb_remid, lkb->lkb_flags, lkb->lkb_nodeid);
4062 	return error;
4063 }
4064 
4065 static void send_repeat_remove(struct dlm_ls *ls, char *ms_name, int len)
4066 {
4067 	char name[DLM_RESNAME_MAXLEN + 1];
4068 	struct dlm_message *ms;
4069 	struct dlm_mhandle *mh;
4070 	struct dlm_rsb *r;
4071 	uint32_t hash, b;
4072 	int rv, dir_nodeid;
4073 
4074 	memset(name, 0, sizeof(name));
4075 	memcpy(name, ms_name, len);
4076 
4077 	hash = jhash(name, len, 0);
4078 	b = hash & (ls->ls_rsbtbl_size - 1);
4079 
4080 	dir_nodeid = dlm_hash2nodeid(ls, hash);
4081 
4082 	log_error(ls, "send_repeat_remove dir %d %s", dir_nodeid, name);
4083 
4084 	spin_lock(&ls->ls_rsbtbl[b].lock);
4085 	rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
4086 	if (!rv) {
4087 		spin_unlock(&ls->ls_rsbtbl[b].lock);
4088 		log_error(ls, "repeat_remove on keep %s", name);
4089 		return;
4090 	}
4091 
4092 	rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
4093 	if (!rv) {
4094 		spin_unlock(&ls->ls_rsbtbl[b].lock);
4095 		log_error(ls, "repeat_remove on toss %s", name);
4096 		return;
4097 	}
4098 
4099 	/* use ls->remove_name2 to avoid conflict with shrink? */
4100 
4101 	spin_lock(&ls->ls_remove_spin);
4102 	ls->ls_remove_len = len;
4103 	memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN);
4104 	spin_unlock(&ls->ls_remove_spin);
4105 	spin_unlock(&ls->ls_rsbtbl[b].lock);
4106 
4107 	rv = _create_message(ls, sizeof(struct dlm_message) + len,
4108 			     dir_nodeid, DLM_MSG_REMOVE, &ms, &mh);
4109 	if (rv)
4110 		goto out;
4111 
4112 	memcpy(ms->m_extra, name, len);
4113 	ms->m_hash = cpu_to_le32(hash);
4114 
4115 	send_message(mh, ms);
4116 
4117 out:
4118 	spin_lock(&ls->ls_remove_spin);
4119 	ls->ls_remove_len = 0;
4120 	memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN);
4121 	spin_unlock(&ls->ls_remove_spin);
4122 	wake_up(&ls->ls_remove_wait);
4123 }
4124 
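/* Master-side handler for a new lock request: create a master-copy
   lkb, find (or recreate) the rsb, run do_request() and send the
   result back in a request_reply. */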
4125 static int receive_request(struct dlm_ls *ls, struct dlm_message *ms)
4126 {
4127 	struct dlm_lkb *lkb;
4128 	struct dlm_rsb *r;
4129 	int from_nodeid;
4130 	int error, namelen = 0;
4131 
4132 	from_nodeid = le32_to_cpu(ms->m_header.h_nodeid);
4133 
4134 	error = create_lkb(ls, &lkb);
4135 	if (error)
4136 		goto fail;
4137 
4138 	receive_flags(lkb, ms);
4139 	lkb->lkb_flags |= DLM_IFL_MSTCPY;
4140 	error = receive_request_args(ls, lkb, ms);
4141 	if (error) {
4142 		__put_lkb(ls, lkb);
4143 		goto fail;
4144 	}
4145 
4146 	/* The dir node is the authority on whether we are the master
4147 	   for this rsb or not, so if the master sends us a request, we should
4148 	   recreate the rsb if we've destroyed it.   This race happens when we
4149 	   send a remove message to the dir node at the same time that the dir
4150 	   node sends us a request for the rsb. */
4151 
4152 	namelen = receive_extralen(ms);
4153 
4154 	error = find_rsb(ls, ms->m_extra, namelen, from_nodeid,
4155 			 R_RECEIVE_REQUEST, &r);
4156 	if (error) {
4157 		__put_lkb(ls, lkb);
4158 		goto fail;
4159 	}
4160 
4161 	lock_rsb(r);
4162 
4163 	if (r->res_master_nodeid != dlm_our_nodeid()) {
4164 		error = validate_master_nodeid(ls, r, from_nodeid);
4165 		if (error) {
4166 			unlock_rsb(r);
4167 			put_rsb(r);
4168 			__put_lkb(ls, lkb);
4169 			goto fail;
4170 		}
4171 	}
4172 
4173 	attach_lkb(r, lkb);
4174 	error = do_request(r, lkb);
4175 	send_request_reply(r, lkb, error);
4176 	do_request_effects(r, lkb, error);
4177 
4178 	unlock_rsb(r);
4179 	put_rsb(r);
4180 
4181 	if (error == -EINPROGRESS)
4182 		error = 0;
4183 	if (error)
4184 		dlm_put_lkb(lkb);
4185 	return 0;
4186 
4187  fail:
4188 	/* TODO: instead of returning ENOTBLK, add the lkb to res_lookup
4189 	   and do this receive_request again from process_lookup_list once
4190 	   we get the lookup reply.  This would avoid many repeated
4191 	   ENOTBLK request failures when the lookup reply designating us
4192 	   as master is delayed. */
4193 
4194 	/* We could repeatedly return -EBADR here if our send_remove() is
4195 	   delayed in being sent/arriving/being processed on the dir node.
4196 	   Another node would repeatedly look up the master, and the dir
4197 	   node would continue returning our nodeid until our send_remove
4198 	   took effect.
4199 
4200 	   We send another remove message in case our previous send_remove
4201 	   was lost/ignored/missed somehow. */
4202 
4203 	if (error != -ENOTBLK) {
4204 		log_limit(ls, "receive_request %x from %d %d",
4205 			  le32_to_cpu(ms->m_lkid), from_nodeid, error);
4206 	}
4207 
4208 	if (namelen && error == -EBADR) {
4209 		send_repeat_remove(ls, ms->m_extra, namelen);
4210 		msleep(1000);
4211 	}
4212 
4213 	setup_stub_lkb(ls, ms);
4214 	send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
4215 	return error;
4216 }
4217 
4218 static int receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
4219 {
4220 	struct dlm_lkb *lkb;
4221 	struct dlm_rsb *r;
4222 	int error, reply = 1;
4223 
4224 	error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
4225 	if (error)
4226 		goto fail;
4227 
4228 	if (lkb->lkb_remid != le32_to_cpu(ms->m_lkid)) {
4229 		log_error(ls, "receive_convert %x remid %x recover_seq %llu "
4230 			  "remote %d %x", lkb->lkb_id, lkb->lkb_remid,
4231 			  (unsigned long long)lkb->lkb_recover_seq,
4232 			  le32_to_cpu(ms->m_header.h_nodeid),
4233 			  le32_to_cpu(ms->m_lkid));
4234 		error = -ENOENT;
4235 		dlm_put_lkb(lkb);
4236 		goto fail;
4237 	}
4238 
4239 	r = lkb->lkb_resource;
4240 
4241 	hold_rsb(r);
4242 	lock_rsb(r);
4243 
4244 	error = validate_message(lkb, ms);
4245 	if (error)
4246 		goto out;
4247 
4248 	receive_flags(lkb, ms);
4249 
4250 	error = receive_convert_args(ls, lkb, ms);
4251 	if (error) {
4252 		send_convert_reply(r, lkb, error);
4253 		goto out;
4254 	}
4255 
4256 	reply = !down_conversion(lkb);
4257 
4258 	error = do_convert(r, lkb);
4259 	if (reply)
4260 		send_convert_reply(r, lkb, error);
4261 	do_convert_effects(r, lkb, error);
4262  out:
4263 	unlock_rsb(r);
4264 	put_rsb(r);
4265 	dlm_put_lkb(lkb);
4266 	return 0;
4267 
4268  fail:
4269 	setup_stub_lkb(ls, ms);
4270 	send_convert_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
4271 	return error;
4272 }
4273 
4274 static int receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
4275 {
4276 	struct dlm_lkb *lkb;
4277 	struct dlm_rsb *r;
4278 	int error;
4279 
4280 	error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
4281 	if (error)
4282 		goto fail;
4283 
4284 	if (lkb->lkb_remid != le32_to_cpu(ms->m_lkid)) {
4285 		log_error(ls, "receive_unlock %x remid %x remote %d %x",
4286 			  lkb->lkb_id, lkb->lkb_remid,
4287 			  le32_to_cpu(ms->m_header.h_nodeid),
4288 			  le32_to_cpu(ms->m_lkid));
4289 		error = -ENOENT;
4290 		dlm_put_lkb(lkb);
4291 		goto fail;
4292 	}
4293 
4294 	r = lkb->lkb_resource;
4295 
4296 	hold_rsb(r);
4297 	lock_rsb(r);
4298 
4299 	error = validate_message(lkb, ms);
4300 	if (error)
4301 		goto out;
4302 
4303 	receive_flags(lkb, ms);
4304 
4305 	error = receive_unlock_args(ls, lkb, ms);
4306 	if (error) {
4307 		send_unlock_reply(r, lkb, error);
4308 		goto out;
4309 	}
4310 
4311 	error = do_unlock(r, lkb);
4312 	send_unlock_reply(r, lkb, error);
4313 	do_unlock_effects(r, lkb, error);
4314  out:
4315 	unlock_rsb(r);
4316 	put_rsb(r);
4317 	dlm_put_lkb(lkb);
4318 	return 0;
4319 
4320  fail:
4321 	setup_stub_lkb(ls, ms);
4322 	send_unlock_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
4323 	return error;
4324 }
4325 
4326 static int receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
4327 {
4328 	struct dlm_lkb *lkb;
4329 	struct dlm_rsb *r;
4330 	int error;
4331 
4332 	error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
4333 	if (error)
4334 		goto fail;
4335 
4336 	receive_flags(lkb, ms);
4337 
4338 	r = lkb->lkb_resource;
4339 
4340 	hold_rsb(r);
4341 	lock_rsb(r);
4342 
4343 	error = validate_message(lkb, ms);
4344 	if (error)
4345 		goto out;
4346 
4347 	error = do_cancel(r, lkb);
4348 	send_cancel_reply(r, lkb, error);
4349 	do_cancel_effects(r, lkb, error);
4350  out:
4351 	unlock_rsb(r);
4352 	put_rsb(r);
4353 	dlm_put_lkb(lkb);
4354 	return 0;
4355 
4356  fail:
4357 	setup_stub_lkb(ls, ms);
4358 	send_cancel_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
4359 	return error;
4360 }
4361 
4362 static int receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
4363 {
4364 	struct dlm_lkb *lkb;
4365 	struct dlm_rsb *r;
4366 	int error;
4367 
4368 	error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
4369 	if (error)
4370 		return error;
4371 
4372 	r = lkb->lkb_resource;
4373 
4374 	hold_rsb(r);
4375 	lock_rsb(r);
4376 
4377 	error = validate_message(lkb, ms);
4378 	if (error)
4379 		goto out;
4380 
4381 	receive_flags_reply(lkb, ms);
4382 	if (is_altmode(lkb))
4383 		munge_altmode(lkb, ms);
4384 	grant_lock_pc(r, lkb, ms);
4385 	queue_cast(r, lkb, 0);
4386  out:
4387 	unlock_rsb(r);
4388 	put_rsb(r);
4389 	dlm_put_lkb(lkb);
4390 	return 0;
4391 }
4392 
4393 static int receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
4394 {
4395 	struct dlm_lkb *lkb;
4396 	struct dlm_rsb *r;
4397 	int error;
4398 
4399 	error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
4400 	if (error)
4401 		return error;
4402 
4403 	r = lkb->lkb_resource;
4404 
4405 	hold_rsb(r);
4406 	lock_rsb(r);
4407 
4408 	error = validate_message(lkb, ms);
4409 	if (error)
4410 		goto out;
4411 
4412 	queue_bast(r, lkb, le32_to_cpu(ms->m_bastmode));
4413 	lkb->lkb_highbast = le32_to_cpu(ms->m_bastmode);
4414  out:
4415 	unlock_rsb(r);
4416 	put_rsb(r);
4417 	dlm_put_lkb(lkb);
4418 	return 0;
4419 }
4420 
4421 static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms)
4422 {
4423 	int len, error, ret_nodeid, from_nodeid, our_nodeid;
4424 
4425 	from_nodeid = le32_to_cpu(ms->m_header.h_nodeid);
4426 	our_nodeid = dlm_our_nodeid();
4427 
4428 	len = receive_extralen(ms);
4429 
4430 	error = dlm_master_lookup(ls, from_nodeid, ms->m_extra, len, 0,
4431 				  &ret_nodeid, NULL);
4432 
4433 	/* Optimization: we're master so treat lookup as a request */
4434 	if (!error && ret_nodeid == our_nodeid) {
4435 		receive_request(ls, ms);
4436 		return;
4437 	}
4438 	send_lookup_reply(ls, ms, ret_nodeid, error);
4439 }
4440 
4441 static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms)
4442 {
4443 	char name[DLM_RESNAME_MAXLEN+1];
4444 	struct dlm_rsb *r;
4445 	uint32_t hash, b;
4446 	int rv, len, dir_nodeid, from_nodeid;
4447 
4448 	from_nodeid = le32_to_cpu(ms->m_header.h_nodeid);
4449 
4450 	len = receive_extralen(ms);
4451 
4452 	if (len > DLM_RESNAME_MAXLEN) {
4453 		log_error(ls, "receive_remove from %d bad len %d",
4454 			  from_nodeid, len);
4455 		return;
4456 	}
4457 
4458 	dir_nodeid = dlm_hash2nodeid(ls, le32_to_cpu(ms->m_hash));
4459 	if (dir_nodeid != dlm_our_nodeid()) {
4460 		log_error(ls, "receive_remove from %d bad nodeid %d",
4461 			  from_nodeid, dir_nodeid);
4462 		return;
4463 	}
4464 
4465 	/* Look for name on rsbtbl.toss, if it's there, kill it.
4466 	   If it's on rsbtbl.keep, it's being used, and we should ignore this
4467 	   message.  This is an expected race between the dir node sending a
4468 	   request to the master node at the same time as the master node sends
4469 	   a remove to the dir node.  The resolution to that race is for the
4470 	   dir node to ignore the remove message, and the master node to
4471 	   recreate the master rsb when it gets a request from the dir node for
4472 	   an rsb it doesn't have. */
4473 
4474 	memset(name, 0, sizeof(name));
4475 	memcpy(name, ms->m_extra, len);
4476 
4477 	hash = jhash(name, len, 0);
4478 	b = hash & (ls->ls_rsbtbl_size - 1);
4479 
4480 	spin_lock(&ls->ls_rsbtbl[b].lock);
4481 
4482 	rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
4483 	if (rv) {
4484 		/* verify the rsb is on keep list per comment above */
4485 		rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
4486 		if (rv) {
4487 			/* should not happen */
4488 			log_error(ls, "receive_remove from %d not found %s",
4489 				  from_nodeid, name);
4490 			spin_unlock(&ls->ls_rsbtbl[b].lock);
4491 			return;
4492 		}
4493 		if (r->res_master_nodeid != from_nodeid) {
4494 			/* should not happen */
4495 			log_error(ls, "receive_remove keep from %d master %d",
4496 				  from_nodeid, r->res_master_nodeid);
4497 			dlm_print_rsb(r);
4498 			spin_unlock(&ls->ls_rsbtbl[b].lock);
4499 			return;
4500 		}
4501 
4502 		log_debug(ls, "receive_remove from %d master %d first %x %s",
4503 			  from_nodeid, r->res_master_nodeid, r->res_first_lkid,
4504 			  name);
4505 		spin_unlock(&ls->ls_rsbtbl[b].lock);
4506 		return;
4507 	}
4508 
4509 	if (r->res_master_nodeid != from_nodeid) {
4510 		log_error(ls, "receive_remove toss from %d master %d",
4511 			  from_nodeid, r->res_master_nodeid);
4512 		dlm_print_rsb(r);
4513 		spin_unlock(&ls->ls_rsbtbl[b].lock);
4514 		return;
4515 	}
4516 
4517 	if (kref_put(&r->res_ref, kill_rsb)) {
4518 		rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
4519 		spin_unlock(&ls->ls_rsbtbl[b].lock);
4520 		dlm_free_rsb(r);
4521 	} else {
4522 		log_error(ls, "receive_remove from %d rsb ref error",
4523 			  from_nodeid);
4524 		dlm_print_rsb(r);
4525 		spin_unlock(&ls->ls_rsbtbl[b].lock);
4526 	}
4527 }
4528 
4529 static void receive_purge(struct dlm_ls *ls, struct dlm_message *ms)
4530 {
4531 	do_purge(ls, le32_to_cpu(ms->m_nodeid), le32_to_cpu(ms->m_pid));
4532 }
4533 
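/* Process the master's reply to our request: take the lkb off the
   waiters list, then act on the do_request() result from the master
   (granted, queued, would block, or retry after a fresh lookup). */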
4534 static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
4535 {
4536 	struct dlm_lkb *lkb;
4537 	struct dlm_rsb *r;
4538 	int error, mstype, result;
4539 	int from_nodeid = le32_to_cpu(ms->m_header.h_nodeid);
4540 
4541 	error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
4542 	if (error)
4543 		return error;
4544 
4545 	r = lkb->lkb_resource;
4546 	hold_rsb(r);
4547 	lock_rsb(r);
4548 
4549 	error = validate_message(lkb, ms);
4550 	if (error)
4551 		goto out;
4552 
4553 	mstype = lkb->lkb_wait_type;
4554 	error = remove_from_waiters(lkb, DLM_MSG_REQUEST_REPLY);
4555 	if (error) {
4556 		log_error(ls, "receive_request_reply %x remote %d %x result %d",
4557 			  lkb->lkb_id, from_nodeid, le32_to_cpu(ms->m_lkid),
4558 			  from_dlm_errno(le32_to_cpu(ms->m_result)));
4559 		dlm_dump_rsb(r);
4560 		goto out;
4561 	}
4562 
4563 	/* Optimization: the dir node was also the master, so it took our
4564 	   lookup as a request and sent request reply instead of lookup reply */
4565 	if (mstype == DLM_MSG_LOOKUP) {
4566 		r->res_master_nodeid = from_nodeid;
4567 		r->res_nodeid = from_nodeid;
4568 		lkb->lkb_nodeid = from_nodeid;
4569 	}
4570 
4571 	/* this is the value returned from do_request() on the master */
4572 	result = from_dlm_errno(le32_to_cpu(ms->m_result));
4573 
4574 	switch (result) {
4575 	case -EAGAIN:
4576 		/* request would block (be queued) on remote master */
4577 		queue_cast(r, lkb, -EAGAIN);
4578 		confirm_master(r, -EAGAIN);
4579 		unhold_lkb(lkb); /* undoes create_lkb() */
4580 		break;
4581 
4582 	case -EINPROGRESS:
4583 	case 0:
4584 		/* request was queued or granted on remote master */
4585 		receive_flags_reply(lkb, ms);
4586 		lkb->lkb_remid = le32_to_cpu(ms->m_lkid);
4587 		if (is_altmode(lkb))
4588 			munge_altmode(lkb, ms);
4589 		if (result) {
4590 			add_lkb(r, lkb, DLM_LKSTS_WAITING);
4591 			add_timeout(lkb);
4592 		} else {
4593 			grant_lock_pc(r, lkb, ms);
4594 			queue_cast(r, lkb, 0);
4595 		}
4596 		confirm_master(r, result);
4597 		break;
4598 
4599 	case -EBADR:
4600 	case -ENOTBLK:
4601 		/* find_rsb failed to find rsb or rsb wasn't master */
4602 		log_limit(ls, "receive_request_reply %x from %d %d "
4603 			  "master %d dir %d first %x %s", lkb->lkb_id,
4604 			  from_nodeid, result, r->res_master_nodeid,
4605 			  r->res_dir_nodeid, r->res_first_lkid, r->res_name);
4606 
4607 		if (r->res_dir_nodeid != dlm_our_nodeid() &&
4608 		    r->res_master_nodeid != dlm_our_nodeid()) {
4609 			/* cause _request_lock->set_master->send_lookup */
4610 			r->res_master_nodeid = 0;
4611 			r->res_nodeid = -1;
4612 			lkb->lkb_nodeid = -1;
4613 		}
4614 
4615 		if (is_overlap(lkb)) {
4616 			/* we'll ignore error in cancel/unlock reply */
4617 			queue_cast_overlap(r, lkb);
4618 			confirm_master(r, result);
4619 			unhold_lkb(lkb); /* undoes create_lkb() */
4620 		} else {
4621 			_request_lock(r, lkb);
4622 
4623 			if (r->res_master_nodeid == dlm_our_nodeid())
4624 				confirm_master(r, 0);
4625 		}
4626 		break;
4627 
4628 	default:
4629 		log_error(ls, "receive_request_reply %x error %d",
4630 			  lkb->lkb_id, result);
4631 	}
4632 
4633 	if (is_overlap_unlock(lkb) && (result == 0 || result == -EINPROGRESS)) {
4634 		log_debug(ls, "receive_request_reply %x result %d unlock",
4635 			  lkb->lkb_id, result);
4636 		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
4637 		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
4638 		send_unlock(r, lkb);
4639 	} else if (is_overlap_cancel(lkb) && (result == -EINPROGRESS)) {
4640 		log_debug(ls, "receive_request_reply %x cancel", lkb->lkb_id);
4641 		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
4642 		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
4643 		send_cancel(r, lkb);
4644 	} else {
4645 		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
4646 		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
4647 	}
4648  out:
4649 	unlock_rsb(r);
4650 	put_rsb(r);
4651 	dlm_put_lkb(lkb);
4652 	return 0;
4653 }
4654 
4655 static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
4656 				    struct dlm_message *ms)
4657 {
4658 	/* this is the value returned from do_convert() on the master */
4659 	switch (from_dlm_errno(le32_to_cpu(ms->m_result))) {
4660 	case -EAGAIN:
4661 		/* convert would block (be queued) on remote master */
4662 		queue_cast(r, lkb, -EAGAIN);
4663 		break;
4664 
4665 	case -EDEADLK:
4666 		receive_flags_reply(lkb, ms);
4667 		revert_lock_pc(r, lkb);
4668 		queue_cast(r, lkb, -EDEADLK);
4669 		break;
4670 
4671 	case -EINPROGRESS:
4672 		/* convert was queued on remote master */
4673 		receive_flags_reply(lkb, ms);
4674 		if (is_demoted(lkb))
4675 			munge_demoted(lkb);
4676 		del_lkb(r, lkb);
4677 		add_lkb(r, lkb, DLM_LKSTS_CONVERT);
4678 		add_timeout(lkb);
4679 		break;
4680 
4681 	case 0:
4682 		/* convert was granted on remote master */
4683 		receive_flags_reply(lkb, ms);
4684 		if (is_demoted(lkb))
4685 			munge_demoted(lkb);
4686 		grant_lock_pc(r, lkb, ms);
4687 		queue_cast(r, lkb, 0);
4688 		break;
4689 
4690 	default:
4691 		log_error(r->res_ls, "receive_convert_reply %x remote %d %x %d",
4692 			  lkb->lkb_id, le32_to_cpu(ms->m_header.h_nodeid),
4693 			  le32_to_cpu(ms->m_lkid),
4694 			  from_dlm_errno(le32_to_cpu(ms->m_result)));
4695 		dlm_print_rsb(r);
4696 		dlm_print_lkb(lkb);
4697 	}
4698 }
4699 
4700 static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
4701 {
4702 	struct dlm_rsb *r = lkb->lkb_resource;
4703 	int error;
4704 
4705 	hold_rsb(r);
4706 	lock_rsb(r);
4707 
4708 	error = validate_message(lkb, ms);
4709 	if (error)
4710 		goto out;
4711 
4712 	/* stub reply can happen with waiters_mutex held */
4713 	error = remove_from_waiters_ms(lkb, ms);
4714 	if (error)
4715 		goto out;
4716 
4717 	__receive_convert_reply(r, lkb, ms);
4718  out:
4719 	unlock_rsb(r);
4720 	put_rsb(r);
4721 }
4722 
4723 static int receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
4724 {
4725 	struct dlm_lkb *lkb;
4726 	int error;
4727 
4728 	error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
4729 	if (error)
4730 		return error;
4731 
4732 	_receive_convert_reply(lkb, ms);
4733 	dlm_put_lkb(lkb);
4734 	return 0;
4735 }
4736 
4737 static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
4738 {
4739 	struct dlm_rsb *r = lkb->lkb_resource;
4740 	int error;
4741 
4742 	hold_rsb(r);
4743 	lock_rsb(r);
4744 
4745 	error = validate_message(lkb, ms);
4746 	if (error)
4747 		goto out;
4748 
4749 	/* stub reply can happen with waiters_mutex held */
4750 	error = remove_from_waiters_ms(lkb, ms);
4751 	if (error)
4752 		goto out;
4753 
4754 	/* this is the value returned from do_unlock() on the master */
4755 
4756 	switch (from_dlm_errno(le32_to_cpu(ms->m_result))) {
4757 	case -DLM_EUNLOCK:
4758 		receive_flags_reply(lkb, ms);
4759 		remove_lock_pc(r, lkb);
4760 		queue_cast(r, lkb, -DLM_EUNLOCK);
4761 		break;
4762 	case -ENOENT:
4763 		break;
4764 	default:
4765 		log_error(r->res_ls, "receive_unlock_reply %x error %d",
4766 			  lkb->lkb_id, from_dlm_errno(le32_to_cpu(ms->m_result)));
4767 	}
4768  out:
4769 	unlock_rsb(r);
4770 	put_rsb(r);
4771 }
4772 
4773 static int receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
4774 {
4775 	struct dlm_lkb *lkb;
4776 	int error;
4777 
4778 	error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
4779 	if (error)
4780 		return error;
4781 
4782 	_receive_unlock_reply(lkb, ms);
4783 	dlm_put_lkb(lkb);
4784 	return 0;
4785 }
4786 
4787 static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
4788 {
4789 	struct dlm_rsb *r = lkb->lkb_resource;
4790 	int error;
4791 
4792 	hold_rsb(r);
4793 	lock_rsb(r);
4794 
4795 	error = validate_message(lkb, ms);
4796 	if (error)
4797 		goto out;
4798 
4799 	/* stub reply can happen with waiters_mutex held */
4800 	error = remove_from_waiters_ms(lkb, ms);
4801 	if (error)
4802 		goto out;
4803 
4804 	/* this is the value returned from do_cancel() on the master */
4805 
4806 	switch (from_dlm_errno(le32_to_cpu(ms->m_result))) {
4807 	case -DLM_ECANCEL:
4808 		receive_flags_reply(lkb, ms);
4809 		revert_lock_pc(r, lkb);
4810 		queue_cast(r, lkb, -DLM_ECANCEL);
4811 		break;
4812 	case 0:
4813 		break;
4814 	default:
4815 		log_error(r->res_ls, "receive_cancel_reply %x error %d",
4816 			  lkb->lkb_id,
4817 			  from_dlm_errno(le32_to_cpu(ms->m_result)));
4818 	}
4819  out:
4820 	unlock_rsb(r);
4821 	put_rsb(r);
4822 }
4823 
4824 static int receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
4825 {
4826 	struct dlm_lkb *lkb;
4827 	int error;
4828 
4829 	error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
4830 	if (error)
4831 		return error;
4832 
4833 	_receive_cancel_reply(lkb, ms);
4834 	dlm_put_lkb(lkb);
4835 	return 0;
4836 }
4837 
4838 static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
4839 {
4840 	struct dlm_lkb *lkb;
4841 	struct dlm_rsb *r;
4842 	int error, ret_nodeid;
4843 	int do_lookup_list = 0;
4844 
4845 	error = find_lkb(ls, le32_to_cpu(ms->m_lkid), &lkb);
4846 	if (error) {
4847 		log_error(ls, "%s no lkid %x", __func__,
4848 			  le32_to_cpu(ms->m_lkid));
4849 		return;
4850 	}
4851 
4852 	/* ms->m_result is the value returned by dlm_master_lookup on dir node
4853 	   FIXME: will a non-zero error ever be returned? */
4854 
4855 	r = lkb->lkb_resource;
4856 	hold_rsb(r);
4857 	lock_rsb(r);
4858 
4859 	error = remove_from_waiters(lkb, DLM_MSG_LOOKUP_REPLY);
4860 	if (error)
4861 		goto out;
4862 
4863 	ret_nodeid = le32_to_cpu(ms->m_nodeid);
4864 
4865 	/* We sometimes receive a request from the dir node for this
4866 	   rsb before we've received the dir node's lookup_reply for it.
4867 	   The request from the dir node implies we're the master, so we set
4868 	   ourselves as master in receive_request_reply, and verify here that
4869 	   we are indeed the master. */
4870 
4871 	if (r->res_master_nodeid && (r->res_master_nodeid != ret_nodeid)) {
4872 		/* This should never happen */
4873 		log_error(ls, "receive_lookup_reply %x from %d ret %d "
4874 			  "master %d dir %d our %d first %x %s",
4875 			  lkb->lkb_id, le32_to_cpu(ms->m_header.h_nodeid),
4876 			  ret_nodeid, r->res_master_nodeid, r->res_dir_nodeid,
4877 			  dlm_our_nodeid(), r->res_first_lkid, r->res_name);
4878 	}
4879 
4880 	if (ret_nodeid == dlm_our_nodeid()) {
4881 		r->res_master_nodeid = ret_nodeid;
4882 		r->res_nodeid = 0;
4883 		do_lookup_list = 1;
4884 		r->res_first_lkid = 0;
4885 	} else if (ret_nodeid == -1) {
4886 		/* the remote node doesn't believe it's the dir node */
4887 		log_error(ls, "receive_lookup_reply %x from %d bad ret_nodeid",
4888 			  lkb->lkb_id, le32_to_cpu(ms->m_header.h_nodeid));
4889 		r->res_master_nodeid = 0;
4890 		r->res_nodeid = -1;
4891 		lkb->lkb_nodeid = -1;
4892 	} else {
4893 		/* set_master() will set lkb_nodeid from r */
4894 		r->res_master_nodeid = ret_nodeid;
4895 		r->res_nodeid = ret_nodeid;
4896 	}
4897 
4898 	if (is_overlap(lkb)) {
4899 		log_debug(ls, "receive_lookup_reply %x unlock %x",
4900 			  lkb->lkb_id, lkb->lkb_flags);
4901 		queue_cast_overlap(r, lkb);
4902 		unhold_lkb(lkb); /* undoes create_lkb() */
4903 		goto out_list;
4904 	}
4905 
4906 	_request_lock(r, lkb);
4907 
4908  out_list:
4909 	if (do_lookup_list)
4910 		process_lookup_list(r);
4911  out:
4912 	unlock_rsb(r);
4913 	put_rsb(r);
4914 	dlm_put_lkb(lkb);
4915 }
4916 
4917 static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms,
4918 			     uint32_t saved_seq)
4919 {
4920 	int error = 0, noent = 0;
4921 
4922 	if (!dlm_is_member(ls, le32_to_cpu(ms->m_header.h_nodeid))) {
4923 		log_limit(ls, "receive %d from non-member %d %x %x %d",
4924 			  le32_to_cpu(ms->m_type),
4925 			  le32_to_cpu(ms->m_header.h_nodeid),
4926 			  le32_to_cpu(ms->m_lkid), le32_to_cpu(ms->m_remid),
4927 			  from_dlm_errno(le32_to_cpu(ms->m_result)));
4928 		return;
4929 	}
4930 
4931 	switch (ms->m_type) {
4932 
4933 	/* messages sent to a master node */
4934 
4935 	case cpu_to_le32(DLM_MSG_REQUEST):
4936 		error = receive_request(ls, ms);
4937 		break;
4938 
4939 	case cpu_to_le32(DLM_MSG_CONVERT):
4940 		error = receive_convert(ls, ms);
4941 		break;
4942 
4943 	case cpu_to_le32(DLM_MSG_UNLOCK):
4944 		error = receive_unlock(ls, ms);
4945 		break;
4946 
4947 	case cpu_to_le32(DLM_MSG_CANCEL):
4948 		noent = 1;
4949 		error = receive_cancel(ls, ms);
4950 		break;
4951 
4952 	/* messages sent from a master node (replies to above) */
4953 
4954 	case cpu_to_le32(DLM_MSG_REQUEST_REPLY):
4955 		error = receive_request_reply(ls, ms);
4956 		break;
4957 
4958 	case cpu_to_le32(DLM_MSG_CONVERT_REPLY):
4959 		error = receive_convert_reply(ls, ms);
4960 		break;
4961 
4962 	case cpu_to_le32(DLM_MSG_UNLOCK_REPLY):
4963 		error = receive_unlock_reply(ls, ms);
4964 		break;
4965 
4966 	case cpu_to_le32(DLM_MSG_CANCEL_REPLY):
4967 		error = receive_cancel_reply(ls, ms);
4968 		break;
4969 
4970 	/* messages sent from a master node (only two types of async msg) */
4971 
4972 	case cpu_to_le32(DLM_MSG_GRANT):
4973 		noent = 1;
4974 		error = receive_grant(ls, ms);
4975 		break;
4976 
4977 	case cpu_to_le32(DLM_MSG_BAST):
4978 		noent = 1;
4979 		error = receive_bast(ls, ms);
4980 		break;
4981 
4982 	/* messages sent to a dir node */
4983 
4984 	case cpu_to_le32(DLM_MSG_LOOKUP):
4985 		receive_lookup(ls, ms);
4986 		break;
4987 
4988 	case cpu_to_le32(DLM_MSG_REMOVE):
4989 		receive_remove(ls, ms);
4990 		break;
4991 
4992 	/* messages sent from a dir node (remove has no reply) */
4993 
4994 	case cpu_to_le32(DLM_MSG_LOOKUP_REPLY):
4995 		receive_lookup_reply(ls, ms);
4996 		break;
4997 
4998 	/* other messages */
4999 
5000 	case cpu_to_le32(DLM_MSG_PURGE):
5001 		receive_purge(ls, ms);
5002 		break;
5003 
5004 	default:
5005 		log_error(ls, "unknown message type %d",
5006 			  le32_to_cpu(ms->m_type));
5007 	}
5008 
5009 	/*
5010 	 * When checking for ENOENT, we're checking the result of
5011 	 * find_lkb(m_remid):
5012 	 *
5013 	 * The lock id referenced in the message wasn't found.  This may
5014 	 * happen in normal usage for the async messages and cancel, so
5015 	 * only use log_debug for them.
5016 	 *
5017 	 * Some errors are expected and normal.
5018 	 */
5019 
5020 	if (error == -ENOENT && noent) {
5021 		log_debug(ls, "receive %d no %x remote %d %x saved_seq %u",
5022 			  le32_to_cpu(ms->m_type), le32_to_cpu(ms->m_remid),
5023 			  le32_to_cpu(ms->m_header.h_nodeid),
5024 			  le32_to_cpu(ms->m_lkid), saved_seq);
5025 	} else if (error == -ENOENT) {
5026 		log_error(ls, "receive %d no %x remote %d %x saved_seq %u",
5027 			  le32_to_cpu(ms->m_type), le32_to_cpu(ms->m_remid),
5028 			  le32_to_cpu(ms->m_header.h_nodeid),
5029 			  le32_to_cpu(ms->m_lkid), saved_seq);
5030 
5031 		if (ms->m_type == cpu_to_le32(DLM_MSG_CONVERT))
5032 			dlm_dump_rsb_hash(ls, le32_to_cpu(ms->m_hash));
5033 	}
5034 
5035 	if (error == -EINVAL) {
5036 		log_error(ls, "receive %d inval from %d lkid %x remid %x "
5037 			  "saved_seq %u",
5038 			  le32_to_cpu(ms->m_type),
5039 			  le32_to_cpu(ms->m_header.h_nodeid),
5040 			  le32_to_cpu(ms->m_lkid), le32_to_cpu(ms->m_remid),
5041 			  saved_seq);
5042 	}
5043 }
5044 
5045 /* If the lockspace is in recovery mode (locking stopped), then normal
5046    messages are saved on the requestqueue for processing after recovery is
5047    done.  When not in recovery mode, we wait for dlm_recoverd to drain saved
5048    messages off the requestqueue before we process new ones. This occurs right
5049    after recovery completes when we transition from saving all messages on
5050    requestqueue, to processing all the saved messages, to processing new
5051    messages as they arrive. */
5052 
5053 static void dlm_receive_message(struct dlm_ls *ls, struct dlm_message *ms,
5054 				int nodeid)
5055 {
5056 	if (dlm_locking_stopped(ls)) {
5057 		/* If we were a member of this lockspace, left, and rejoined,
5058 		   other nodes may still be sending us messages from the
5059 		   lockspace generation before we left. */
5060 		if (!ls->ls_generation) {
5061 			log_limit(ls, "receive %d from %d ignore old gen",
5062 				  le32_to_cpu(ms->m_type), nodeid);
5063 			return;
5064 		}
5065 
5066 		dlm_add_requestqueue(ls, nodeid, ms);
5067 	} else {
5068 		dlm_wait_requestqueue(ls);
5069 		_receive_message(ls, ms, 0);
5070 	}
5071 }
5072 
5073 /* This is called by dlm_recoverd to process messages that were saved on
5074    the requestqueue. */
5075 
5076 void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms,
5077 			       uint32_t saved_seq)
5078 {
5079 	_receive_message(ls, ms, saved_seq);
5080 }
5081 
5082 /* This is called by the midcomms layer when something is received for
5083    the lockspace.  It could be either a MSG (normal message sent as part of
5084    standard locking activity) or an RCOM (recovery message sent as part of
5085    lockspace recovery). */
5086 
5087 void dlm_receive_buffer(union dlm_packet *p, int nodeid)
5088 {
5089 	struct dlm_header *hd = &p->header;
5090 	struct dlm_ls *ls;
5091 	int type = 0;
5092 
5093 	switch (hd->h_cmd) {
5094 	case DLM_MSG:
5095 		type = le32_to_cpu(p->message.m_type);
5096 		break;
5097 	case DLM_RCOM:
5098 		type = le32_to_cpu(p->rcom.rc_type);
5099 		break;
5100 	default:
5101 		log_print("invalid h_cmd %d from %u", hd->h_cmd, nodeid);
5102 		return;
5103 	}
5104 
5105 	if (le32_to_cpu(hd->h_nodeid) != nodeid) {
5106 		log_print("invalid h_nodeid %d from %d lockspace %x",
5107 			  le32_to_cpu(hd->h_nodeid), nodeid,
5108 			  le32_to_cpu(hd->u.h_lockspace));
5109 		return;
5110 	}
5111 
5112 	ls = dlm_find_lockspace_global(le32_to_cpu(hd->u.h_lockspace));
5113 	if (!ls) {
5114 		if (dlm_config.ci_log_debug) {
5115 			printk_ratelimited(KERN_DEBUG "dlm: invalid lockspace "
5116 				"%u from %d cmd %d type %d\n",
5117 				le32_to_cpu(hd->u.h_lockspace), nodeid,
5118 				hd->h_cmd, type);
5119 		}
5120 
5121 		if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS)
5122 			dlm_send_ls_not_ready(nodeid, &p->rcom);
5123 		return;
5124 	}
5125 
5126 	/* this rwsem allows dlm_ls_stop() to wait for all dlm_recv threads to
5127 	   be inactive (in this ls) before transitioning to recovery mode */
5128 
5129 	down_read(&ls->ls_recv_active);
5130 	if (hd->h_cmd == DLM_MSG)
5131 		dlm_receive_message(ls, &p->message, nodeid);
5132 	else
5133 		dlm_receive_rcom(ls, &p->rcom, nodeid);
5134 	up_read(&ls->ls_recv_active);
5135 
5136 	dlm_put_lockspace(ls);
5137 }
5138 
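/* Handle a conversion that was waiting on a reply from a failed node.
   Conversions caught between PR and CW (the middle_conversion() case)
   cannot simply be resent, so they are completed with a faked
   -EINPROGRESS reply and rebuilt by recovery; other up-conversions are
   flagged for resend. */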
5139 static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb,
5140 				   struct dlm_message *ms_stub)
5141 {
5142 	if (middle_conversion(lkb)) {
5143 		hold_lkb(lkb);
5144 		memset(ms_stub, 0, sizeof(struct dlm_message));
5145 		ms_stub->m_flags = cpu_to_le32(DLM_IFL_STUB_MS);
5146 		ms_stub->m_type = cpu_to_le32(DLM_MSG_CONVERT_REPLY);
5147 		ms_stub->m_result = cpu_to_le32(to_dlm_errno(-EINPROGRESS));
5148 		ms_stub->m_header.h_nodeid = cpu_to_le32(lkb->lkb_nodeid);
5149 		_receive_convert_reply(lkb, ms_stub);
5150 
5151 		/* Same special case as in receive_rcom_lock_args() */
5152 		lkb->lkb_grmode = DLM_LOCK_IV;
5153 		rsb_set_flag(lkb->lkb_resource, RSB_RECOVER_CONVERT);
5154 		unhold_lkb(lkb);
5155 
5156 	} else if (lkb->lkb_rqmode >= lkb->lkb_grmode) {
5157 		lkb->lkb_flags |= DLM_IFL_RESEND;
5158 	}
5159 
5160 	/* lkb->lkb_rqmode < lkb->lkb_grmode shouldn't happen since down
5161 	   conversions are async; there's no reply from the remote master */
5162 }
5163 
5164 /* A waiting lkb needs recovery if the master node has failed, or
5165    the master node is changing (only when no directory is used) */
5166 
5167 static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb,
5168 				 int dir_nodeid)
5169 {
5170 	if (dlm_no_directory(ls))
5171 		return 1;
5172 
5173 	if (dlm_is_removed(ls, lkb->lkb_wait_nodeid))
5174 		return 1;
5175 
5176 	return 0;
5177 }
5178 
5179 /* Recovery for locks that are waiting for replies from nodes that are now
5180    gone.  We can just complete unlocks and cancels by faking a reply from the
5181    dead node.  Requests and up-conversions we flag to be resent after
5182    recovery.  Down-conversions can just be completed with a fake reply like
5183    unlocks.  Conversions between PR and CW need special attention. */
5184 
5185 void dlm_recover_waiters_pre(struct dlm_ls *ls)
5186 {
5187 	struct dlm_lkb *lkb, *safe;
5188 	struct dlm_message *ms_stub;
5189 	int wait_type, stub_unlock_result, stub_cancel_result;
5190 	int dir_nodeid;
5191 
5192 	ms_stub = kmalloc(sizeof(*ms_stub), GFP_KERNEL);
5193 	if (!ms_stub)
5194 		return;
5195 
5196 	mutex_lock(&ls->ls_waiters_mutex);
5197 
5198 	list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) {
5199 
5200 		dir_nodeid = dlm_dir_nodeid(lkb->lkb_resource);
5201 
5202 		/* exclude debug messages about unlocks because there can be so
5203 		   many and they aren't very interesting */
5204 
5205 		if (lkb->lkb_wait_type != DLM_MSG_UNLOCK) {
5206 			log_debug(ls, "waiter %x remote %x msg %d r_nodeid %d "
5207 				  "lkb_nodeid %d wait_nodeid %d dir_nodeid %d",
5208 				  lkb->lkb_id,
5209 				  lkb->lkb_remid,
5210 				  lkb->lkb_wait_type,
5211 				  lkb->lkb_resource->res_nodeid,
5212 				  lkb->lkb_nodeid,
5213 				  lkb->lkb_wait_nodeid,
5214 				  dir_nodeid);
5215 		}
5216 
5217 		/* all outstanding lookups, regardless of destination, will be
5218 		   resent after recovery is done */
5219 
5220 		if (lkb->lkb_wait_type == DLM_MSG_LOOKUP) {
5221 			lkb->lkb_flags |= DLM_IFL_RESEND;
5222 			continue;
5223 		}
5224 
5225 		if (!waiter_needs_recovery(ls, lkb, dir_nodeid))
5226 			continue;
5227 
5228 		wait_type = lkb->lkb_wait_type;
5229 		stub_unlock_result = -DLM_EUNLOCK;
5230 		stub_cancel_result = -DLM_ECANCEL;
5231 
5232 		/* Main reply may have been received leaving a zero wait_type,
5233 		   but a reply for the overlapping op may not have been
5234 		   received.  In that case we need to fake the appropriate
5235 		   reply for the overlap op. */
5236 
5237 		if (!wait_type) {
5238 			if (is_overlap_cancel(lkb)) {
5239 				wait_type = DLM_MSG_CANCEL;
5240 				if (lkb->lkb_grmode == DLM_LOCK_IV)
5241 					stub_cancel_result = 0;
5242 			}
5243 			if (is_overlap_unlock(lkb)) {
5244 				wait_type = DLM_MSG_UNLOCK;
5245 				if (lkb->lkb_grmode == DLM_LOCK_IV)
5246 					stub_unlock_result = -ENOENT;
5247 			}
5248 
5249 			log_debug(ls, "rwpre overlap %x %x %d %d %d",
5250 				  lkb->lkb_id, lkb->lkb_flags, wait_type,
5251 				  stub_cancel_result, stub_unlock_result);
5252 		}
5253 
5254 		switch (wait_type) {
5255 
5256 		case DLM_MSG_REQUEST:
5257 			lkb->lkb_flags |= DLM_IFL_RESEND;
5258 			break;
5259 
5260 		case DLM_MSG_CONVERT:
5261 			recover_convert_waiter(ls, lkb, ms_stub);
5262 			break;
5263 
5264 		case DLM_MSG_UNLOCK:
5265 			hold_lkb(lkb);
5266 			memset(ms_stub, 0, sizeof(struct dlm_message));
5267 			ms_stub->m_flags = cpu_to_le32(DLM_IFL_STUB_MS);
5268 			ms_stub->m_type = cpu_to_le32(DLM_MSG_UNLOCK_REPLY);
5269 			ms_stub->m_result = cpu_to_le32(to_dlm_errno(stub_unlock_result));
5270 			ms_stub->m_header.h_nodeid = cpu_to_le32(lkb->lkb_nodeid);
5271 			_receive_unlock_reply(lkb, ms_stub);
5272 			dlm_put_lkb(lkb);
5273 			break;
5274 
5275 		case DLM_MSG_CANCEL:
5276 			hold_lkb(lkb);
5277 			memset(ms_stub, 0, sizeof(struct dlm_message));
5278 			ms_stub->m_flags = cpu_to_le32(DLM_IFL_STUB_MS);
5279 			ms_stub->m_type = cpu_to_le32(DLM_MSG_CANCEL_REPLY);
5280 			ms_stub->m_result = cpu_to_le32(to_dlm_errno(stub_cancel_result));
5281 			ms_stub->m_header.h_nodeid = cpu_to_le32(lkb->lkb_nodeid);
5282 			_receive_cancel_reply(lkb, ms_stub);
5283 			dlm_put_lkb(lkb);
5284 			break;
5285 
5286 		default:
5287 			log_error(ls, "invalid lkb wait_type %d %d",
5288 				  lkb->lkb_wait_type, wait_type);
5289 		}
5290 		schedule();
5291 	}
5292 	mutex_unlock(&ls->ls_waiters_mutex);
5293 	kfree(ms_stub);
5294 }
5295 
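/* Find the next lkb on the waiters list flagged for resend, taking a
   reference on it while ls_waiters_mutex is held. */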
5296 static struct dlm_lkb *find_resend_waiter(struct dlm_ls *ls)
5297 {
5298 	struct dlm_lkb *lkb = NULL, *iter;
5299 
5300 	mutex_lock(&ls->ls_waiters_mutex);
5301 	list_for_each_entry(iter, &ls->ls_waiters, lkb_wait_reply) {
5302 		if (iter->lkb_flags & DLM_IFL_RESEND) {
5303 			hold_lkb(iter);
5304 			lkb = iter;
5305 			break;
5306 		}
5307 	}
5308 	mutex_unlock(&ls->ls_waiters_mutex);
5309 
5310 	return lkb;
5311 }
5312 
5313 /* Deal with lookups and lkb's marked RESEND from _pre.  We may now be the
5314    master or dir-node for r.  Processing the lkb may result in it being placed
5315    back on waiters. */
5316 
5317 /* We do this after normal locking has been enabled and any saved messages
5318    (in requestqueue) have been processed.  We should be confident that at
5319    this point we won't get or process a reply to any of these waiting
5320    operations.  But, new ops may be coming in on the rsbs/locks here from
5321    userspace or remotely. */
5322 
5323 /* there may have been an overlap unlock/cancel prior to recovery or after
5324    recovery.  if before, the lkb may still have a positive wait_count; if after,
5325    the overlap flag would just have been set and nothing new sent.  we can be
5326    confident here that any replies to either the initial op or overlap ops
5327    prior to recovery have been received. */
5328 
5329 int dlm_recover_waiters_post(struct dlm_ls *ls)
5330 {
5331 	struct dlm_lkb *lkb;
5332 	struct dlm_rsb *r;
5333 	int error = 0, mstype, err, oc, ou;
5334 
5335 	while (1) {
5336 		if (dlm_locking_stopped(ls)) {
5337 			log_debug(ls, "recover_waiters_post aborted");
5338 			error = -EINTR;
5339 			break;
5340 		}
5341 
5342 		lkb = find_resend_waiter(ls);
5343 		if (!lkb)
5344 			break;
5345 
5346 		r = lkb->lkb_resource;
5347 		hold_rsb(r);
5348 		lock_rsb(r);
5349 
5350 		mstype = lkb->lkb_wait_type;
5351 		oc = is_overlap_cancel(lkb);
5352 		ou = is_overlap_unlock(lkb);
5353 		err = 0;
5354 
5355 		log_debug(ls, "waiter %x remote %x msg %d r_nodeid %d "
5356 			  "lkb_nodeid %d wait_nodeid %d dir_nodeid %d "
5357 			  "overlap %d %d", lkb->lkb_id, lkb->lkb_remid, mstype,
5358 			  r->res_nodeid, lkb->lkb_nodeid, lkb->lkb_wait_nodeid,
5359 			  dlm_dir_nodeid(r), oc, ou);
5360 
5361 		/* At this point we assume that we won't get a reply to any
5362 		   previous op or overlap op on this lock.  First, do a big
5363 		   remove_from_waiters() for all previous ops. */
5364 
5365 		lkb->lkb_flags &= ~DLM_IFL_RESEND;
5366 		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
5367 		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
5368 		lkb->lkb_wait_type = 0;
5369 		/* drop all wait_count references; we still
5370 		 * hold a reference for this iteration.
5371 		 */
5372 		while (lkb->lkb_wait_count) {
5373 			lkb->lkb_wait_count--;
5374 			unhold_lkb(lkb);
5375 		}
5376 		mutex_lock(&ls->ls_waiters_mutex);
5377 		list_del_init(&lkb->lkb_wait_reply);
5378 		mutex_unlock(&ls->ls_waiters_mutex);
5379 
5380 		if (oc || ou) {
5381 			/* do an unlock or cancel instead of resending */
5382 			switch (mstype) {
5383 			case DLM_MSG_LOOKUP:
5384 			case DLM_MSG_REQUEST:
5385 				queue_cast(r, lkb, ou ? -DLM_EUNLOCK :
5386 							-DLM_ECANCEL);
5387 				unhold_lkb(lkb); /* undoes create_lkb() */
5388 				break;
5389 			case DLM_MSG_CONVERT:
5390 				if (oc) {
5391 					queue_cast(r, lkb, -DLM_ECANCEL);
5392 				} else {
5393 					lkb->lkb_exflags |= DLM_LKF_FORCEUNLOCK;
5394 					_unlock_lock(r, lkb);
5395 				}
5396 				break;
5397 			default:
5398 				err = 1;
5399 			}
5400 		} else {
5401 			switch (mstype) {
5402 			case DLM_MSG_LOOKUP:
5403 			case DLM_MSG_REQUEST:
5404 				_request_lock(r, lkb);
5405 				if (is_master(r))
5406 					confirm_master(r, 0);
5407 				break;
5408 			case DLM_MSG_CONVERT:
5409 				_convert_lock(r, lkb);
5410 				break;
5411 			default:
5412 				err = 1;
5413 			}
5414 		}
5415 
5416 		if (err) {
5417 			log_error(ls, "waiter %x msg %d r_nodeid %d "
5418 				  "dir_nodeid %d overlap %d %d",
5419 				  lkb->lkb_id, mstype, r->res_nodeid,
5420 				  dlm_dir_nodeid(r), oc, ou);
5421 		}
5422 		unlock_rsb(r);
5423 		put_rsb(r);
5424 		dlm_put_lkb(lkb);
5425 	}
5426 
5427 	return error;
5428 }
5429 
5430 static void purge_mstcpy_list(struct dlm_ls *ls, struct dlm_rsb *r,
5431 			      struct list_head *list)
5432 {
5433 	struct dlm_lkb *lkb, *safe;
5434 
5435 	list_for_each_entry_safe(lkb, safe, list, lkb_statequeue) {
5436 		if (!is_master_copy(lkb))
5437 			continue;
5438 
5439 		/* don't purge lkbs we've added in recover_master_copy for
5440 		   the current recovery seq */
5441 
5442 		if (lkb->lkb_recover_seq == ls->ls_recover_seq)
5443 			continue;
5444 
5445 		del_lkb(r, lkb);
5446 
5447 		/* this put should free the lkb */
5448 		if (!dlm_put_lkb(lkb))
5449 			log_error(ls, "purged mstcpy lkb not released");
5450 	}
5451 }
5452 
5453 void dlm_purge_mstcpy_locks(struct dlm_rsb *r)
5454 {
5455 	struct dlm_ls *ls = r->res_ls;
5456 
5457 	purge_mstcpy_list(ls, r, &r->res_grantqueue);
5458 	purge_mstcpy_list(ls, r, &r->res_convertqueue);
5459 	purge_mstcpy_list(ls, r, &r->res_waitqueue);
5460 }
5461 
5462 static void purge_dead_list(struct dlm_ls *ls, struct dlm_rsb *r,
5463 			    struct list_head *list,
5464 			    int nodeid_gone, unsigned int *count)
5465 {
5466 	struct dlm_lkb *lkb, *safe;
5467 
5468 	list_for_each_entry_safe(lkb, safe, list, lkb_statequeue) {
5469 		if (!is_master_copy(lkb))
5470 			continue;
5471 
5472 		if ((lkb->lkb_nodeid == nodeid_gone) ||
5473 		    dlm_is_removed(ls, lkb->lkb_nodeid)) {
5474 
5475 			/* tell recover_lvb to invalidate the lvb
5476 			   because a node holding EX/PW failed */
5477 			if ((lkb->lkb_exflags & DLM_LKF_VALBLK) &&
5478 			    (lkb->lkb_grmode >= DLM_LOCK_PW)) {
5479 				rsb_set_flag(r, RSB_RECOVER_LVB_INVAL);
5480 			}
5481 
5482 			del_lkb(r, lkb);
5483 
5484 			/* this put should free the lkb */
5485 			if (!dlm_put_lkb(lkb))
5486 				log_error(ls, "purged dead lkb not released");
5487 
5488 			rsb_set_flag(r, RSB_RECOVER_GRANT);
5489 
5490 			(*count)++;
5491 		}
5492 	}
5493 }
5494 
5495 /* Get rid of locks held by nodes that are gone. */
5496 
5497 void dlm_recover_purge(struct dlm_ls *ls)
5498 {
5499 	struct dlm_rsb *r;
5500 	struct dlm_member *memb;
5501 	int nodes_count = 0;
5502 	int nodeid_gone = 0;
5503 	unsigned int lkb_count = 0;
5504 
5505 	/* cache one removed nodeid to optimize the common
5506 	   case of a single node removed */
5507 
5508 	list_for_each_entry(memb, &ls->ls_nodes_gone, list) {
5509 		nodes_count++;
5510 		nodeid_gone = memb->nodeid;
5511 	}
5512 
5513 	if (!nodes_count)
5514 		return;
5515 
5516 	down_write(&ls->ls_root_sem);
5517 	list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
5518 		hold_rsb(r);
5519 		lock_rsb(r);
5520 		if (is_master(r)) {
5521 			purge_dead_list(ls, r, &r->res_grantqueue,
5522 					nodeid_gone, &lkb_count);
5523 			purge_dead_list(ls, r, &r->res_convertqueue,
5524 					nodeid_gone, &lkb_count);
5525 			purge_dead_list(ls, r, &r->res_waitqueue,
5526 					nodeid_gone, &lkb_count);
5527 		}
5528 		unlock_rsb(r);
5529 		unhold_rsb(r);
5530 		cond_resched();
5531 	}
5532 	up_write(&ls->ls_root_sem);
5533 
5534 	if (lkb_count)
5535 		log_rinfo(ls, "dlm_recover_purge %u locks for %u nodes",
5536 			  lkb_count, nodes_count);
5537 }
5538 
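/* Return (with a reference held) the next rsb in the given hash bucket
   that is flagged RECOVER_GRANT and that we master; the flag is cleared
   on rsbs we no longer master. */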
5539 static struct dlm_rsb *find_grant_rsb(struct dlm_ls *ls, int bucket)
5540 {
5541 	struct rb_node *n;
5542 	struct dlm_rsb *r;
5543 
5544 	spin_lock(&ls->ls_rsbtbl[bucket].lock);
5545 	for (n = rb_first(&ls->ls_rsbtbl[bucket].keep); n; n = rb_next(n)) {
5546 		r = rb_entry(n, struct dlm_rsb, res_hashnode);
5547 
5548 		if (!rsb_flag(r, RSB_RECOVER_GRANT))
5549 			continue;
5550 		if (!is_master(r)) {
5551 			rsb_clear_flag(r, RSB_RECOVER_GRANT);
5552 			continue;
5553 		}
5554 		hold_rsb(r);
5555 		spin_unlock(&ls->ls_rsbtbl[bucket].lock);
5556 		return r;
5557 	}
5558 	spin_unlock(&ls->ls_rsbtbl[bucket].lock);
5559 	return NULL;
5560 }
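
/* The shape of find_grant_rsb() -- take a reference under the table lock,
   drop the lock, then hand the pinned object to the caller -- is a common
   kernel pattern.  A generic sketch (illustrative, not dlm code):

	spin_lock(&tbl->lock);
	obj = lookup(tbl, key);
	if (obj)
		hold(obj);		// pin before dropping the lock
	spin_unlock(&tbl->lock);
	return obj;			// caller must drop the reference
*/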
5561 
5562 /*
5563  * Attempt to grant locks on resources that we are the master of.
5564  * Locks may have become grantable during recovery because locks
5565  * from departed nodes have been purged (or not rebuilt), allowing
5566  * previously blocked locks to now be granted.  The subset of rsb's
5567  * we are interested in are those with lkb's on either the convert or
5568  * waiting queues.
5569  *
5570  * Simplest would be to go through each master rsb and check for non-empty
5571  * convert or waiting queues, and attempt to grant on those rsbs.
5572  * Checking the queues requires lock_rsb, though, for which we'd need
5573  * to release the rsbtbl lock.  This would make iterating through all
5574  * rsb's very inefficient.  So, we rely on earlier recovery routines
5575  * to set RECOVER_GRANT on any rsb's that we should attempt to grant
5576  * locks for.
5577  */
5578 
5579 void dlm_recover_grant(struct dlm_ls *ls)
5580 {
5581 	struct dlm_rsb *r;
5582 	int bucket = 0;
5583 	unsigned int count = 0;
5584 	unsigned int rsb_count = 0;
5585 	unsigned int lkb_count = 0;
5586 
5587 	while (1) {
5588 		r = find_grant_rsb(ls, bucket);
5589 		if (!r) {
5590 			if (bucket == ls->ls_rsbtbl_size - 1)
5591 				break;
5592 			bucket++;
5593 			continue;
5594 		}
5595 		rsb_count++;
5596 		count = 0;
5597 		lock_rsb(r);
5598 		/* the RECOVER_GRANT flag is checked in the grant path */
5599 		grant_pending_locks(r, &count);
5600 		rsb_clear_flag(r, RSB_RECOVER_GRANT);
5601 		lkb_count += count;
5602 		confirm_master(r, 0);
5603 		unlock_rsb(r);
5604 		put_rsb(r);
5605 		cond_resched();
5606 	}
5607 
5608 	if (lkb_count)
5609 		log_rinfo(ls, "dlm_recover_grant %u locks on %u resources",
5610 			  lkb_count, rsb_count);
5611 }
5612 
5613 static struct dlm_lkb *search_remid_list(struct list_head *head, int nodeid,
5614 					 uint32_t remid)
5615 {
5616 	struct dlm_lkb *lkb;
5617 
5618 	list_for_each_entry(lkb, head, lkb_statequeue) {
5619 		if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid)
5620 			return lkb;
5621 	}
5622 	return NULL;
5623 }
5624 
5625 static struct dlm_lkb *search_remid(struct dlm_rsb *r, int nodeid,
5626 				    uint32_t remid)
5627 {
5628 	struct dlm_lkb *lkb;
5629 
5630 	lkb = search_remid_list(&r->res_grantqueue, nodeid, remid);
5631 	if (lkb)
5632 		return lkb;
5633 	lkb = search_remid_list(&r->res_convertqueue, nodeid, remid);
5634 	if (lkb)
5635 		return lkb;
5636 	lkb = search_remid_list(&r->res_waitqueue, nodeid, remid);
5637 	if (lkb)
5638 		return lkb;
5639 	return NULL;
5640 }
5641 
5642 /* needs at least dlm_rcom + rcom_lock */
5643 static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
5644 				  struct dlm_rsb *r, struct dlm_rcom *rc)
5645 {
5646 	struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
5647 
5648 	lkb->lkb_nodeid = le32_to_cpu(rc->rc_header.h_nodeid);
5649 	lkb->lkb_ownpid = le32_to_cpu(rl->rl_ownpid);
5650 	lkb->lkb_remid = le32_to_cpu(rl->rl_lkid);
5651 	lkb->lkb_exflags = le32_to_cpu(rl->rl_exflags);
5652 	lkb->lkb_flags = le32_to_cpu(rl->rl_flags) & 0x0000FFFF;
5653 	lkb->lkb_flags |= DLM_IFL_MSTCPY;
5654 	lkb->lkb_lvbseq = le32_to_cpu(rl->rl_lvbseq);
5655 	lkb->lkb_rqmode = rl->rl_rqmode;
5656 	lkb->lkb_grmode = rl->rl_grmode;
5657 	/* don't set lkb_status because add_lkb wants to set it itself */
5658 
5659 	lkb->lkb_bastfn = (rl->rl_asts & DLM_CB_BAST) ? &fake_bastfn : NULL;
5660 	lkb->lkb_astfn = (rl->rl_asts & DLM_CB_CAST) ? &fake_astfn : NULL;
5661 
5662 	if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
5663 		int lvblen = le16_to_cpu(rc->rc_header.h_length) -
5664 			sizeof(struct dlm_rcom) - sizeof(struct rcom_lock);
5665 		if (lvblen > ls->ls_lvblen)
5666 			return -EINVAL;
5667 		lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
5668 		if (!lkb->lkb_lvbptr)
5669 			return -ENOMEM;
5670 		memcpy(lkb->lkb_lvbptr, rl->rl_lvb, lvblen);
5671 	}
5672 
5673 	/* Conversions between PR and CW (middle modes; neither is stronger than
5674 	   the other) need special handling.  Their real granted mode cannot be
5675 	   determined until all locks have been rebuilt on the rsb (recover_conversion) */
5676 
5677 	if (rl->rl_wait_type == cpu_to_le16(DLM_MSG_CONVERT) &&
5678 	    middle_conversion(lkb)) {
5679 		rl->rl_status = DLM_LKSTS_CONVERT;
5680 		lkb->lkb_grmode = DLM_LOCK_IV;
5681 		rsb_set_flag(r, RSB_RECOVER_CONVERT);
5682 	}
5683 
5684 	return 0;
5685 }
5686 
5687 /* This lkb may have been recovered in a previous aborted recovery so we need
5688    to check if the rsb already has an lkb with the given remote nodeid/lkid.
5689    If so we just send back a standard reply.  If not, we create a new lkb with
5690    the given values and send back our lkid.  We send back our lkid by sending
5691    back the rcom_lock struct we got but with the remid field filled in. */
5692 
5693 /* needs at least dlm_rcom + rcom_lock */
5694 int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
5695 {
5696 	struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
5697 	struct dlm_rsb *r;
5698 	struct dlm_lkb *lkb;
5699 	uint32_t remid = 0;
5700 	int from_nodeid = le32_to_cpu(rc->rc_header.h_nodeid);
5701 	int error;
5702 
5703 	if (rl->rl_parent_lkid) {
5704 		error = -EOPNOTSUPP;
5705 		goto out;
5706 	}
5707 
5708 	remid = le32_to_cpu(rl->rl_lkid);
5709 
5710 	/* In general we expect the rsb returned to be R_MASTER, but we don't
5711 	   have to require it.  Recovery of masters on one node can overlap
5712 	   recovery of locks on another node, so one node can send us MSTCPY
5713 	   locks before we've made ourselves master of this rsb.  We can still
5714 	   add new MSTCPY locks that we receive here without any harm; when
5715 	   we make ourselves master, dlm_recover_masters() won't touch the
5716 	   MSTCPY locks we've received early. */
5717 
5718 	error = find_rsb(ls, rl->rl_name, le16_to_cpu(rl->rl_namelen),
5719 			 from_nodeid, R_RECEIVE_RECOVER, &r);
5720 	if (error)
5721 		goto out;
5722 
5723 	lock_rsb(r);
5724 
5725 	if (dlm_no_directory(ls) && (dlm_dir_nodeid(r) != dlm_our_nodeid())) {
5726 		log_error(ls, "dlm_recover_master_copy remote %d %x not dir",
5727 			  from_nodeid, remid);
5728 		error = -EBADR;
5729 		goto out_unlock;
5730 	}
5731 
5732 	lkb = search_remid(r, from_nodeid, remid);
5733 	if (lkb) {
5734 		error = -EEXIST;
5735 		goto out_remid;
5736 	}
5737 
5738 	error = create_lkb(ls, &lkb);
5739 	if (error)
5740 		goto out_unlock;
5741 
5742 	error = receive_rcom_lock_args(ls, lkb, r, rc);
5743 	if (error) {
5744 		__put_lkb(ls, lkb);
5745 		goto out_unlock;
5746 	}
5747 
5748 	attach_lkb(r, lkb);
5749 	add_lkb(r, lkb, rl->rl_status);
5750 	ls->ls_recover_locks_in++;
5751 
5752 	if (!list_empty(&r->res_waitqueue) || !list_empty(&r->res_convertqueue))
5753 		rsb_set_flag(r, RSB_RECOVER_GRANT);
5754 
5755  out_remid:
5756 	/* this is the new value returned to the lock holder for
5757 	   saving in its process-copy lkb */
5758 	rl->rl_remid = cpu_to_le32(lkb->lkb_id);
5759 
5760 	lkb->lkb_recover_seq = ls->ls_recover_seq;
5761 
5762  out_unlock:
5763 	unlock_rsb(r);
5764 	put_rsb(r);
5765  out:
5766 	if (error && error != -EEXIST)
5767 		log_rinfo(ls, "dlm_recover_master_copy remote %d %x error %d",
5768 			  from_nodeid, remid, error);
5769 	rl->rl_result = cpu_to_le32(error);
5770 	return error;
5771 }
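
/* Message flow sketch for the above (illustrative); L = node holding the
   process copy, M = new master:

   L: dlm_send_rcom_lock()        ->  M: dlm_recover_master_copy()
                                         (fills in rl_remid, rl_result)
   L: dlm_recover_process_copy()  <-  M: same rcom_lock sent back
*/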
5772 
5773 /* needs at least dlm_rcom + rcom_lock */
5774 int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
5775 {
5776 	struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
5777 	struct dlm_rsb *r;
5778 	struct dlm_lkb *lkb;
5779 	uint32_t lkid, remid;
5780 	int error, result;
5781 
5782 	lkid = le32_to_cpu(rl->rl_lkid);
5783 	remid = le32_to_cpu(rl->rl_remid);
5784 	result = le32_to_cpu(rl->rl_result);
5785 
5786 	error = find_lkb(ls, lkid, &lkb);
5787 	if (error) {
5788 		log_error(ls, "dlm_recover_process_copy no %x remote %d %x %d",
5789 			  lkid, le32_to_cpu(rc->rc_header.h_nodeid), remid,
5790 			  result);
5791 		return error;
5792 	}
5793 
5794 	r = lkb->lkb_resource;
5795 	hold_rsb(r);
5796 	lock_rsb(r);
5797 
5798 	if (!is_process_copy(lkb)) {
5799 		log_error(ls, "dlm_recover_process_copy bad %x remote %d %x %d",
5800 			  lkid, le32_to_cpu(rc->rc_header.h_nodeid), remid,
5801 			  result);
5802 		dlm_dump_rsb(r);
5803 		unlock_rsb(r);
5804 		put_rsb(r);
5805 		dlm_put_lkb(lkb);
5806 		return -EINVAL;
5807 	}
5808 
5809 	switch (result) {
5810 	case -EBADR:
5811 		/* There's a chance the new master received our lock before
5812 		   dlm_recover_master_reply(); this wouldn't happen if we did
5813 		   a barrier between recover_masters and recover_locks. */
5814 
5815 		log_debug(ls, "dlm_recover_process_copy %x remote %d %x %d",
5816 			  lkid, le32_to_cpu(rc->rc_header.h_nodeid), remid,
5817 			  result);
5818 
5819 		dlm_send_rcom_lock(r, lkb);
5820 		goto out;
5821 	case -EEXIST:
5822 	case 0:
5823 		lkb->lkb_remid = remid;
5824 		break;
5825 	default:
5826 		log_error(ls, "dlm_recover_process_copy %x remote %d %x %d unk",
5827 			  lkid, le32_to_cpu(rc->rc_header.h_nodeid), remid,
5828 			  result);
5829 	}
5830 
5831 	/* an ack for dlm_recover_locks() which waits for replies for
5832 	   all the locks it sends to new masters */
5833 	dlm_recovered_lock(r);
5834  out:
5835 	unlock_rsb(r);
5836 	put_rsb(r);
5837 	dlm_put_lkb(lkb);
5838 
5839 	return 0;
5840 }
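
/* Timeline of the -EBADR retry above (illustrative): the rcom_lock can
   arrive before the new master has fully taken over the rsb:

   L: dlm_send_rcom_lock()         arrives "early" on M
   M: dlm_recover_master_copy()    rejects it (e.g. the "not dir" check),
                                   rl_result = -EBADR
   L: dlm_recover_process_copy()   resends via dlm_send_rcom_lock(r, lkb)
*/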
5841 
5842 int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
5843 		     int mode, uint32_t flags, void *name, unsigned int namelen,
5844 		     unsigned long timeout_cs)
5845 {
5846 	struct dlm_lkb *lkb;
5847 	struct dlm_args args;
5848 	int error;
5849 
5850 	dlm_lock_recovery(ls);
5851 
5852 	error = create_lkb(ls, &lkb);
5853 	if (error) {
5854 		kfree(ua);
5855 		goto out;
5856 	}
5857 
5858 	if (flags & DLM_LKF_VALBLK) {
5859 		ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS);
5860 		if (!ua->lksb.sb_lvbptr) {
5861 			kfree(ua);
5862 			__put_lkb(ls, lkb);
5863 			error = -ENOMEM;
5864 			goto out;
5865 		}
5866 	}
5867 	error = set_lock_args(mode, &ua->lksb, flags, namelen, timeout_cs,
5868 			      fake_astfn, ua, fake_bastfn, &args);
5869 	if (error) {
5870 		kfree(ua->lksb.sb_lvbptr);
5871 		ua->lksb.sb_lvbptr = NULL;
5872 		kfree(ua);
5873 		__put_lkb(ls, lkb);
5874 		goto out;
5875 	}
5876 
5877 	/* After ua is attached to lkb it will be freed by dlm_free_lkb().
5878 	   When DLM_IFL_USER is set, the dlm knows that this is a userspace
5879 	   lock and that lkb_astparam is the dlm_user_args structure. */
5880 	lkb->lkb_flags |= DLM_IFL_USER;
5881 	error = request_lock(ls, lkb, name, namelen, &args);
5882 
5883 	switch (error) {
5884 	case 0:
5885 		break;
5886 	case -EINPROGRESS:
5887 		error = 0;
5888 		break;
5889 	case -EAGAIN:
5890 		error = 0;
5891 		fallthrough;
5892 	default:
5893 		__put_lkb(ls, lkb);
5894 		goto out;
5895 	}
5896 
5897 	/* add this new lkb to the per-process list of locks */
5898 	spin_lock(&ua->proc->locks_spin);
5899 	hold_lkb(lkb);
5900 	list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks);
5901 	spin_unlock(&ua->proc->locks_spin);
5902  out:
5903 	dlm_unlock_recovery(ls);
5904 	return error;
5905 }
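
/* Hypothetical caller sketch (the real caller lives in user.c; the setup
   shown is illustrative only).  Note that on failure dlm_user_request()
   has already freed ua, so the caller must not free it again:

	struct dlm_user_args *ua;
	int error;

	ua = kzalloc(sizeof(*ua), GFP_NOFS);
	if (!ua)
		return -ENOMEM;
	ua->proc = proc;
	ua->user_lksb = user_lksb;

	error = dlm_user_request(ls, ua, DLM_LOCK_PR, 0, name, namelen, 0);
*/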
5906 
5907 int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
5908 		     int mode, uint32_t flags, uint32_t lkid, char *lvb_in,
5909 		     unsigned long timeout_cs)
5910 {
5911 	struct dlm_lkb *lkb;
5912 	struct dlm_args args;
5913 	struct dlm_user_args *ua;
5914 	int error;
5915 
5916 	dlm_lock_recovery(ls);
5917 
5918 	error = find_lkb(ls, lkid, &lkb);
5919 	if (error)
5920 		goto out;
5921 
5922 	/* user can change the params on its lock when it converts it, or
5923 	   add an lvb that didn't exist before */
5924 
5925 	ua = lkb->lkb_ua;
5926 
5927 	if (flags & DLM_LKF_VALBLK && !ua->lksb.sb_lvbptr) {
5928 		ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS);
5929 		if (!ua->lksb.sb_lvbptr) {
5930 			error = -ENOMEM;
5931 			goto out_put;
5932 		}
5933 	}
5934 	if (lvb_in && ua->lksb.sb_lvbptr)
5935 		memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN);
5936 
5937 	ua->xid = ua_tmp->xid;
5938 	ua->castparam = ua_tmp->castparam;
5939 	ua->castaddr = ua_tmp->castaddr;
5940 	ua->bastparam = ua_tmp->bastparam;
5941 	ua->bastaddr = ua_tmp->bastaddr;
5942 	ua->user_lksb = ua_tmp->user_lksb;
5943 
5944 	error = set_lock_args(mode, &ua->lksb, flags, 0, timeout_cs,
5945 			      fake_astfn, ua, fake_bastfn, &args);
5946 	if (error)
5947 		goto out_put;
5948 
5949 	error = convert_lock(ls, lkb, &args);
5950 
5951 	if (error == -EINPROGRESS || error == -EAGAIN || error == -EDEADLK)
5952 		error = 0;
5953  out_put:
5954 	dlm_put_lkb(lkb);
5955  out:
5956 	dlm_unlock_recovery(ls);
5957 	kfree(ua_tmp);
5958 	return error;
5959 }
5960 
5961 /*
5962  * The caller asks for an orphan lock on a given resource with a given mode.
5963  * If a matching lock exists, it's moved to the owner's list of locks and
5964  * the lkid is returned.
5965  */
5966 
5967 int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
5968 		     int mode, uint32_t flags, void *name, unsigned int namelen,
5969 		     unsigned long timeout_cs, uint32_t *lkid)
5970 {
5971 	struct dlm_lkb *lkb = NULL, *iter;
5972 	struct dlm_user_args *ua;
5973 	int found_other_mode = 0;
5974 	int rv = 0;
5975 
5976 	mutex_lock(&ls->ls_orphans_mutex);
5977 	list_for_each_entry(iter, &ls->ls_orphans, lkb_ownqueue) {
5978 		if (iter->lkb_resource->res_length != namelen)
5979 			continue;
5980 		if (memcmp(iter->lkb_resource->res_name, name, namelen))
5981 			continue;
5982 		if (iter->lkb_grmode != mode) {
5983 			found_other_mode = 1;
5984 			continue;
5985 		}
5986 
5987 		lkb = iter;
5988 		list_del_init(&iter->lkb_ownqueue);
5989 		iter->lkb_flags &= ~DLM_IFL_ORPHAN;
5990 		*lkid = iter->lkb_id;
5991 		break;
5992 	}
5993 	mutex_unlock(&ls->ls_orphans_mutex);
5994 
5995 	if (!lkb && found_other_mode) {
5996 		rv = -EAGAIN;
5997 		goto out;
5998 	}
5999 
6000 	if (!lkb) {
6001 		rv = -ENOENT;
6002 		goto out;
6003 	}
6004 
6005 	lkb->lkb_exflags = flags;
6006 	lkb->lkb_ownpid = (int) current->pid;
6007 
6008 	ua = lkb->lkb_ua;
6009 
6010 	ua->proc = ua_tmp->proc;
6011 	ua->xid = ua_tmp->xid;
6012 	ua->castparam = ua_tmp->castparam;
6013 	ua->castaddr = ua_tmp->castaddr;
6014 	ua->bastparam = ua_tmp->bastparam;
6015 	ua->bastaddr = ua_tmp->bastaddr;
6016 	ua->user_lksb = ua_tmp->user_lksb;
6017 
6018 	/*
6019 	 * The lkb reference from the ls_orphans list was not
6020 	 * removed above, and is now considered the reference
6021 	 * for the proc locks list.
6022 	 */
6023 
6024 	spin_lock(&ua->proc->locks_spin);
6025 	list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks);
6026 	spin_unlock(&ua->proc->locks_spin);
6027  out:
6028 	kfree(ua_tmp);
6029 	return rv;
6030 }
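
/* Usage sketch (illustrative): a new process re-acquiring a PERSISTENT
   lock orphaned by its previous owner:

	uint32_t lkid;
	int rv;

	rv = dlm_user_adopt_orphan(ls, ua_tmp, DLM_LOCK_PR, flags,
				   name, namelen, 0, &lkid);
	// rv == -ENOENT: no orphan with this name
	// rv == -EAGAIN: an orphan exists but with a different granted mode
	// note: dlm_user_adopt_orphan() frees ua_tmp itself
*/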
6031 
6032 int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
6033 		    uint32_t flags, uint32_t lkid, char *lvb_in)
6034 {
6035 	struct dlm_lkb *lkb;
6036 	struct dlm_args args;
6037 	struct dlm_user_args *ua;
6038 	int error;
6039 
6040 	dlm_lock_recovery(ls);
6041 
6042 	error = find_lkb(ls, lkid, &lkb);
6043 	if (error)
6044 		goto out;
6045 
6046 	ua = lkb->lkb_ua;
6047 
6048 	if (lvb_in && ua->lksb.sb_lvbptr)
6049 		memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN);
6050 	if (ua_tmp->castparam)
6051 		ua->castparam = ua_tmp->castparam;
6052 	ua->user_lksb = ua_tmp->user_lksb;
6053 
6054 	error = set_unlock_args(flags, ua, &args);
6055 	if (error)
6056 		goto out_put;
6057 
6058 	error = unlock_lock(ls, lkb, &args);
6059 
6060 	if (error == -DLM_EUNLOCK)
6061 		error = 0;
6062 	/* from validate_unlock_args() */
6063 	if (error == -EBUSY && (flags & DLM_LKF_FORCEUNLOCK))
6064 		error = 0;
6065 	if (error)
6066 		goto out_put;
6067 
6068 	spin_lock(&ua->proc->locks_spin);
6069 	/* dlm_user_add_cb() may have already taken lkb off the proc list */
6070 	if (!list_empty(&lkb->lkb_ownqueue))
6071 		list_move(&lkb->lkb_ownqueue, &ua->proc->unlocking);
6072 	spin_unlock(&ua->proc->locks_spin);
6073  out_put:
6074 	dlm_put_lkb(lkb);
6075  out:
6076 	dlm_unlock_recovery(ls);
6077 	kfree(ua_tmp);
6078 	return error;
6079 }
6080 
6081 int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
6082 		    uint32_t flags, uint32_t lkid)
6083 {
6084 	struct dlm_lkb *lkb;
6085 	struct dlm_args args;
6086 	struct dlm_user_args *ua;
6087 	int error;
6088 
6089 	dlm_lock_recovery(ls);
6090 
6091 	error = find_lkb(ls, lkid, &lkb);
6092 	if (error)
6093 		goto out;
6094 
6095 	ua = lkb->lkb_ua;
6096 	if (ua_tmp->castparam)
6097 		ua->castparam = ua_tmp->castparam;
6098 	ua->user_lksb = ua_tmp->user_lksb;
6099 
6100 	error = set_unlock_args(flags, ua, &args);
6101 	if (error)
6102 		goto out_put;
6103 
6104 	error = cancel_lock(ls, lkb, &args);
6105 
6106 	if (error == -DLM_ECANCEL)
6107 		error = 0;
6108 	/* from validate_unlock_args() */
6109 	if (error == -EBUSY)
6110 		error = 0;
6111  out_put:
6112 	dlm_put_lkb(lkb);
6113  out:
6114 	dlm_unlock_recovery(ls);
6115 	kfree(ua_tmp);
6116 	return error;
6117 }
6118 
6119 int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid)
6120 {
6121 	struct dlm_lkb *lkb;
6122 	struct dlm_args args;
6123 	struct dlm_user_args *ua;
6124 	struct dlm_rsb *r;
6125 	int error;
6126 
6127 	dlm_lock_recovery(ls);
6128 
6129 	error = find_lkb(ls, lkid, &lkb);
6130 	if (error)
6131 		goto out;
6132 
6133 	ua = lkb->lkb_ua;
6134 
6135 	error = set_unlock_args(flags, ua, &args);
6136 	if (error)
6137 		goto out_put;
6138 
6139 	/* same as cancel_lock(), but set DEADLOCK_CANCEL after lock_rsb */
6140 
6141 	r = lkb->lkb_resource;
6142 	hold_rsb(r);
6143 	lock_rsb(r);
6144 
6145 	error = validate_unlock_args(lkb, &args);
6146 	if (error)
6147 		goto out_r;
6148 	lkb->lkb_flags |= DLM_IFL_DEADLOCK_CANCEL;
6149 
6150 	error = _cancel_lock(r, lkb);
6151  out_r:
6152 	unlock_rsb(r);
6153 	put_rsb(r);
6154 
6155 	if (error == -DLM_ECANCEL)
6156 		error = 0;
6157 	/* from validate_unlock_args() */
6158 	if (error == -EBUSY)
6159 		error = 0;
6160  out_put:
6161 	dlm_put_lkb(lkb);
6162  out:
6163 	dlm_unlock_recovery(ls);
6164 	return error;
6165 }
6166 
6167 /* lkb's that are removed from the waiters list by revert are just left on the
6168    orphans list with the granted orphan locks, to be freed by purge */
6169 
6170 static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
6171 {
6172 	struct dlm_args args;
6173 	int error;
6174 
6175 	hold_lkb(lkb); /* reference for the ls_orphans list */
6176 	mutex_lock(&ls->ls_orphans_mutex);
6177 	list_add_tail(&lkb->lkb_ownqueue, &ls->ls_orphans);
6178 	mutex_unlock(&ls->ls_orphans_mutex);
6179 
6180 	set_unlock_args(0, lkb->lkb_ua, &args);
6181 
6182 	error = cancel_lock(ls, lkb, &args);
6183 	if (error == -DLM_ECANCEL)
6184 		error = 0;
6185 	return error;
6186 }
6187 
6188 /* The FORCEUNLOCK flag allows the unlock to go ahead even if the lkb isn't
6189    granted.  Regardless of what rsb queue the lock is on, it's removed and
6190    freed.  The IVVALBLK flag causes the lvb on the resource to be invalidated
6191    if our lock is PW/EX (it's ignored if our granted mode is smaller). */
6192 
6193 static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
6194 {
6195 	struct dlm_args args;
6196 	int error;
6197 
6198 	set_unlock_args(DLM_LKF_FORCEUNLOCK | DLM_LKF_IVVALBLK,
6199 			lkb->lkb_ua, &args);
6200 
6201 	error = unlock_lock(ls, lkb, &args);
6202 	if (error == -DLM_EUNLOCK)
6203 		error = 0;
6204 	return error;
6205 }
6206 
6207 /* We have to release clear_proc_locks mutex before calling unlock_proc_lock()
6208    (which does lock_rsb) due to deadlock with receiving a message that does
6209    lock_rsb followed by dlm_user_add_cb() */
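
/* Spelled out, the lock ordering being avoided (this assumes that
   dlm_user_add_cb() takes ls_clear_proc_locks, as the comments below imply):

       clear path:    ls_clear_proc_locks -> lock_rsb    (if mutex were held)
       receive path:  lock_rsb -> dlm_user_add_cb() -> ls_clear_proc_locks

   Releasing the mutex before unlock_proc_lock() keeps a single lock order. */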
6210 
6211 static struct dlm_lkb *del_proc_lock(struct dlm_ls *ls,
6212 				     struct dlm_user_proc *proc)
6213 {
6214 	struct dlm_lkb *lkb = NULL;
6215 
6216 	mutex_lock(&ls->ls_clear_proc_locks);
6217 	if (list_empty(&proc->locks))
6218 		goto out;
6219 
6220 	lkb = list_entry(proc->locks.next, struct dlm_lkb, lkb_ownqueue);
6221 	list_del_init(&lkb->lkb_ownqueue);
6222 
6223 	if (lkb->lkb_exflags & DLM_LKF_PERSISTENT)
6224 		lkb->lkb_flags |= DLM_IFL_ORPHAN;
6225 	else
6226 		lkb->lkb_flags |= DLM_IFL_DEAD;
6227  out:
6228 	mutex_unlock(&ls->ls_clear_proc_locks);
6229 	return lkb;
6230 }
6231 
6232 /* The ls_clear_proc_locks mutex protects against dlm_user_add_cb() which
6233    1) references lkb->ua which we free here and 2) adds lkbs to proc->asts,
6234    which we clear here. */
6235 
6236 /* proc CLOSING flag is set so no more device_reads should look at proc->asts
6237    list, and no more device_writes should add lkb's to proc->locks list; so we
6238    shouldn't need to take asts_spin or locks_spin here.  This assumes that
6239    device reads/writes/closes are serialized -- FIXME: we may need to serialize
6240    them ourselves. */
6241 
6242 void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
6243 {
6244 	struct dlm_lkb *lkb, *safe;
6245 
6246 	dlm_lock_recovery(ls);
6247 
6248 	while (1) {
6249 		lkb = del_proc_lock(ls, proc);
6250 		if (!lkb)
6251 			break;
6252 		del_timeout(lkb);
6253 		if (lkb->lkb_exflags & DLM_LKF_PERSISTENT)
6254 			orphan_proc_lock(ls, lkb);
6255 		else
6256 			unlock_proc_lock(ls, lkb);
6257 
6258 		/* this removes the reference for the proc->locks list
6259 		   added by dlm_user_request; it may result in the lkb
6260 		   being freed */
6261 
6262 		dlm_put_lkb(lkb);
6263 	}
6264 
6265 	mutex_lock(&ls->ls_clear_proc_locks);
6266 
6267 	/* in-progress unlocks */
6268 	list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) {
6269 		list_del_init(&lkb->lkb_ownqueue);
6270 		lkb->lkb_flags |= DLM_IFL_DEAD;
6271 		dlm_put_lkb(lkb);
6272 	}
6273 
6274 	list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_cb_list) {
6275 		memset(&lkb->lkb_callbacks, 0,
6276 		       sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE);
6277 		list_del_init(&lkb->lkb_cb_list);
6278 		dlm_put_lkb(lkb);
6279 	}
6280 
6281 	mutex_unlock(&ls->ls_clear_proc_locks);
6282 	dlm_unlock_recovery(ls);
6283 }
6284 
6285 static void purge_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
6286 {
6287 	struct dlm_lkb *lkb, *safe;
6288 
6289 	while (1) {
6290 		lkb = NULL;
6291 		spin_lock(&proc->locks_spin);
6292 		if (!list_empty(&proc->locks)) {
6293 			lkb = list_entry(proc->locks.next, struct dlm_lkb,
6294 					 lkb_ownqueue);
6295 			list_del_init(&lkb->lkb_ownqueue);
6296 		}
6297 		spin_unlock(&proc->locks_spin);
6298 
6299 		if (!lkb)
6300 			break;
6301 
6302 		lkb->lkb_flags |= DLM_IFL_DEAD;
6303 		unlock_proc_lock(ls, lkb);
6304 		dlm_put_lkb(lkb); /* ref from proc->locks list */
6305 	}
6306 
6307 	spin_lock(&proc->locks_spin);
6308 	list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) {
6309 		list_del_init(&lkb->lkb_ownqueue);
6310 		lkb->lkb_flags |= DLM_IFL_DEAD;
6311 		dlm_put_lkb(lkb);
6312 	}
6313 	spin_unlock(&proc->locks_spin);
6314 
6315 	spin_lock(&proc->asts_spin);
6316 	list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_cb_list) {
6317 		memset(&lkb->lkb_callbacks, 0,
6318 		       sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE);
6319 		list_del_init(&lkb->lkb_cb_list);
6320 		dlm_put_lkb(lkb);
6321 	}
6322 	spin_unlock(&proc->asts_spin);
6323 }
6324 
6325 /* pid of 0 means purge all orphans */
6326 
6327 static void do_purge(struct dlm_ls *ls, int nodeid, int pid)
6328 {
6329 	struct dlm_lkb *lkb, *safe;
6330 
6331 	mutex_lock(&ls->ls_orphans_mutex);
6332 	list_for_each_entry_safe(lkb, safe, &ls->ls_orphans, lkb_ownqueue) {
6333 		if (pid && lkb->lkb_ownpid != pid)
6334 			continue;
6335 		unlock_proc_lock(ls, lkb);
6336 		list_del_init(&lkb->lkb_ownqueue);
6337 		dlm_put_lkb(lkb);
6338 	}
6339 	mutex_unlock(&ls->ls_orphans_mutex);
6340 }
6341 
6342 static int send_purge(struct dlm_ls *ls, int nodeid, int pid)
6343 {
6344 	struct dlm_message *ms;
6345 	struct dlm_mhandle *mh;
6346 	int error;
6347 
6348 	error = _create_message(ls, sizeof(struct dlm_message), nodeid,
6349 				DLM_MSG_PURGE, &ms, &mh);
6350 	if (error)
6351 		return error;
6352 	ms->m_nodeid = cpu_to_le32(nodeid);
6353 	ms->m_pid = cpu_to_le32(pid);
6354 
6355 	return send_message(mh, ms);
6356 }
6357 
6358 int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc,
6359 		   int nodeid, int pid)
6360 {
6361 	int error = 0;
6362 
6363 	if (nodeid && (nodeid != dlm_our_nodeid())) {
6364 		error = send_purge(ls, nodeid, pid);
6365 	} else {
6366 		dlm_lock_recovery(ls);
6367 		if (pid == current->pid)
6368 			purge_proc_locks(ls, proc);
6369 		else
6370 			do_purge(ls, nodeid, pid);
6371 		dlm_unlock_recovery(ls);
6372 	}
6373 	return error;
6374 }
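
/* Usage sketch (illustrative): purge orphans left by a dead pid on this
   node, then all orphans on node 3 (assumed remote):

	dlm_user_purge(ls, proc, 0, dead_pid);	// local: do_purge()
	dlm_user_purge(ls, proc, 3, 0);		// remote: send_purge()
*/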
6375 
6376 /* debug functionality */
6377 int dlm_debug_add_lkb(struct dlm_ls *ls, uint32_t lkb_id, char *name, int len,
6378 		      int lkb_nodeid, unsigned int lkb_flags, int lkb_status)
6379 {
6380 	struct dlm_lksb *lksb;
6381 	struct dlm_lkb *lkb;
6382 	struct dlm_rsb *r;
6383 	int error;
6384 
6385 	/* we currently can't set a valid user lock */
6386 	if (lkb_flags & DLM_IFL_USER)
6387 		return -EOPNOTSUPP;
6388 
6389 	lksb = kzalloc(sizeof(*lksb), GFP_NOFS);
6390 	if (!lksb)
6391 		return -ENOMEM;
6392 
6393 	error = _create_lkb(ls, &lkb, lkb_id, lkb_id + 1);
6394 	if (error) {
6395 		kfree(lksb);
6396 		return error;
6397 	}
6398 
6399 	lkb->lkb_flags = lkb_flags;
6400 	lkb->lkb_nodeid = lkb_nodeid;
6401 	lkb->lkb_lksb = lksb;
6402 	/* user specific pointer, just don't have it NULL for kernel locks */
6403 	/* user-specific pointer; just don't leave it NULL for kernel locks */
6404 		lkb->lkb_astparam = (void *)0xDEADBEEF;
6405 
6406 	error = find_rsb(ls, name, len, 0, R_REQUEST, &r);
6407 	if (error) {
6408 		kfree(lksb);
6409 		__put_lkb(ls, lkb);
6410 		return error;
6411 	}
6412 
6413 	lock_rsb(r);
6414 	attach_lkb(r, lkb);
6415 	add_lkb(r, lkb, lkb_status);
6416 	unlock_rsb(r);
6417 	put_rsb(r);
6418 
6419 	return 0;
6420 }
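
/* Debug usage sketch (illustrative values): fabricate an lkb from node 2
   on resource "test" and place it on the grant queue:

	error = dlm_debug_add_lkb(ls, lkb_id, "test", 4, 2, 0,
				  DLM_LKSTS_GRANTED);
*/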
6421 
6422 int dlm_debug_add_lkb_to_waiters(struct dlm_ls *ls, uint32_t lkb_id,
6423 				 int mstype, int to_nodeid)
6424 {
6425 	struct dlm_lkb *lkb;
6426 	int error;
6427 
6428 	error = find_lkb(ls, lkb_id, &lkb);
6429 	if (error)
6430 		return error;
6431 
6432 	error = add_to_waiters(lkb, mstype, to_nodeid);
6433 	dlm_put_lkb(lkb);
6434 	return error;
6435 }
6436 
6437