/******************************************************************************
*******************************************************************************
**
**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
**
**  This copyrighted material is made available to anyone wishing to use,
**  modify, copy, or redistribute it subject to the terms and conditions
**  of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/

/* Central locking logic has four stages:

   dlm_lock()
   dlm_unlock()

   request_lock(ls, lkb)
   convert_lock(ls, lkb)
   unlock_lock(ls, lkb)
   cancel_lock(ls, lkb)

   _request_lock(r, lkb)
   _convert_lock(r, lkb)
   _unlock_lock(r, lkb)
   _cancel_lock(r, lkb)

   do_request(r, lkb)
   do_convert(r, lkb)
   do_unlock(r, lkb)
   do_cancel(r, lkb)

   Stage 1 (lock, unlock) is mainly about checking input args and
   splitting into one of the four main operations:

       dlm_lock          = request_lock
       dlm_lock+CONVERT  = convert_lock
       dlm_unlock        = unlock_lock
       dlm_unlock+CANCEL = cancel_lock

   Stage 2, xxxx_lock(), just finds and locks the relevant rsb which is
   provided to the next stage.

   Stage 3, _xxxx_lock(), determines if the operation is local or remote.
   When remote, it calls send_xxxx(), when local it calls do_xxxx().

   Stage 4, do_xxxx(), is the guts of the operation.  It manipulates the
   given rsb and lkb and queues callbacks.

   For remote operations, send_xxxx() results in the corresponding do_xxxx()
   function being executed on the remote node.  The connecting send/receive
   calls on local (L) and remote (R) nodes:

   L: send_xxxx()              ->  R: receive_xxxx()
                                   R: do_xxxx()
   L: receive_xxxx_reply()     <-  R: send_xxxx_reply()
*/
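
/* A minimal caller sketch (illustrative, not part of this file).  It
   assumes the public API declared in <linux/dlm.h> and an existing
   lockspace "ls"; the callback names are hypothetical:

	static void my_ast(void *arg) { }
	static void my_bast(void *arg, int mode) { }

	struct dlm_lksb lksb;
	int error;

	// dlm_lock -> request_lock -> _request_lock -> do_request
	error = dlm_lock(ls, DLM_LOCK_EX, &lksb, 0, "myres", 5, 0,
			 my_ast, NULL, my_bast);

	// dlm_lock+CONVERT -> convert_lock -> _convert_lock -> do_convert
	error = dlm_lock(ls, DLM_LOCK_PR, &lksb, DLM_LKF_CONVERT, "myres",
			 5, 0, my_ast, NULL, my_bast);

	// dlm_unlock -> unlock_lock; with DLM_LKF_CANCEL -> cancel_lock
	error = dlm_unlock(ls, lksb.sb_lkid, 0, &lksb, NULL);
*/
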
#include <linux/types.h>
#include "dlm_internal.h"
#include <linux/dlm_device.h>
#include "memory.h"
#include "lowcomms.h"
#include "requestqueue.h"
#include "util.h"
#include "dir.h"
#include "member.h"
#include "lockspace.h"
#include "ast.h"
#include "lock.h"
#include "rcom.h"
#include "recover.h"
#include "lvb_table.h"
#include "user.h"
#include "config.h"

static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb);
static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb);
static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb);
static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb);
static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb);
static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode);
static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb);
static int send_remove(struct dlm_rsb *r);
static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
				    struct dlm_message *ms);
static int receive_extralen(struct dlm_message *ms);
static void do_purge(struct dlm_ls *ls, int nodeid, int pid);
static void del_timeout(struct dlm_lkb *lkb);

/*
 * Lock compatibility matrix - thanks Steve
 * UN = Unlocked state. Not really a state, used as a flag
 * PD = Padding. Used to make the matrix a nice power of two in size
 * Other states are the same as the VMS DLM.
 * Usage: matrix[grmode+1][rqmode+1]  (although m[rq+1][gr+1] is the same)
 */

static const int __dlm_compat_matrix[8][8] = {
      /* UN NL CR CW PR PW EX PD */
        {1, 1, 1, 1, 1, 1, 1, 0},       /* UN */
        {1, 1, 1, 1, 1, 1, 1, 0},       /* NL */
        {1, 1, 1, 1, 1, 1, 0, 0},       /* CR */
        {1, 1, 1, 1, 0, 0, 0, 0},       /* CW */
        {1, 1, 1, 0, 1, 0, 0, 0},       /* PR */
        {1, 1, 1, 0, 0, 0, 0, 0},       /* PW */
        {1, 1, 0, 0, 0, 0, 0, 0},       /* EX */
        {0, 0, 0, 0, 0, 0, 0, 0}        /* PD */
};

/*
 * This defines the direction of transfer of LVB data.
 * Granted mode is the row; requested mode is the column.
 * Usage: matrix[grmode+1][rqmode+1]
 * 1 = LVB is returned to the caller
 * 0 = LVB is written to the resource
 * -1 = nothing happens to the LVB
 */

const int dlm_lvb_operations[8][8] = {
        /* UN   NL  CR  CW  PR  PW  EX  PD*/
        {  -1,  1,  1,  1,  1,  1,  1, -1 }, /* UN */
        {  -1,  1,  1,  1,  1,  1,  1,  0 }, /* NL */
        {  -1, -1,  1,  1,  1,  1,  1,  0 }, /* CR */
        {  -1, -1, -1,  1,  1,  1,  1,  0 }, /* CW */
        {  -1, -1, -1, -1,  1,  1,  1,  0 }, /* PR */
        {  -1,  0,  0,  0,  0,  0,  1,  0 }, /* PW */
        {  -1,  0,  0,  0,  0,  0,  0,  0 }, /* EX */
        {  -1,  0,  0,  0,  0,  0,  0,  0 }  /* PD */
};
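
/* A worked reading of the table above, for illustration: converting down
   EX->NL indexes row EX, column NL, which is 0, so the lock's LVB is
   written to the resource; converting up NL->EX indexes row NL, column
   EX, which is 1, so the resource's LVB is returned to the caller.
   set_lvb_lock() below acts on exactly these values. */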

#define modes_compat(gr, rq) \
	__dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1]

int dlm_modes_compat(int mode1, int mode2)
{
	return __dlm_compat_matrix[mode1 + 1][mode2 + 1];
}
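
/* For illustration, reading __dlm_compat_matrix:
   dlm_modes_compat(DLM_LOCK_PR, DLM_LOCK_PR) is 1 (concurrent readers
   coexist), while dlm_modes_compat(DLM_LOCK_PR, DLM_LOCK_PW) is 0 (a
   protected-write request conflicts with a granted protected-read). */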

/*
 * Compatibility matrix for conversions with QUECVT set.
 * Granted mode is the row; requested mode is the column.
 * Usage: matrix[grmode+1][rqmode+1]
 */

static const int __quecvt_compat_matrix[8][8] = {
      /* UN NL CR CW PR PW EX PD */
        {0, 0, 0, 0, 0, 0, 0, 0},       /* UN */
        {0, 0, 1, 1, 1, 1, 1, 0},       /* NL */
        {0, 0, 0, 1, 1, 1, 1, 0},       /* CR */
        {0, 0, 0, 0, 1, 1, 1, 0},       /* CW */
        {0, 0, 0, 1, 0, 1, 1, 0},       /* PR */
        {0, 0, 0, 0, 0, 0, 1, 0},       /* PW */
        {0, 0, 0, 0, 0, 0, 0, 0},       /* EX */
        {0, 0, 0, 0, 0, 0, 0, 0}        /* PD */
};

void dlm_print_lkb(struct dlm_lkb *lkb)
{
	printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x\n"
	       "     status %d rqmode %d grmode %d wait_type %d ast_type %d\n",
	       lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags,
	       lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode,
	       lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_ast_type);
}

static void dlm_print_rsb(struct dlm_rsb *r)
{
	printk(KERN_ERR "rsb: nodeid %d flags %lx first %x rlc %d name %s\n",
	       r->res_nodeid, r->res_flags, r->res_first_lkid,
	       r->res_recover_locks_count, r->res_name);
}

void dlm_dump_rsb(struct dlm_rsb *r)
{
	struct dlm_lkb *lkb;

	dlm_print_rsb(r);

	printk(KERN_ERR "rsb: root_list empty %d recover_list empty %d\n",
	       list_empty(&r->res_root_list), list_empty(&r->res_recover_list));
	printk(KERN_ERR "rsb lookup list\n");
	list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup)
		dlm_print_lkb(lkb);
	printk(KERN_ERR "rsb grant queue:\n");
	list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue)
		dlm_print_lkb(lkb);
	printk(KERN_ERR "rsb convert queue:\n");
	list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue)
		dlm_print_lkb(lkb);
	printk(KERN_ERR "rsb wait queue:\n");
	list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue)
		dlm_print_lkb(lkb);
}

/* Threads cannot use the lockspace while it's being recovered */

static inline void dlm_lock_recovery(struct dlm_ls *ls)
{
	down_read(&ls->ls_in_recovery);
}

void dlm_unlock_recovery(struct dlm_ls *ls)
{
	up_read(&ls->ls_in_recovery);
}

int dlm_lock_recovery_try(struct dlm_ls *ls)
{
	return down_read_trylock(&ls->ls_in_recovery);
}

static inline int can_be_queued(struct dlm_lkb *lkb)
{
	return !(lkb->lkb_exflags & DLM_LKF_NOQUEUE);
}

static inline int force_blocking_asts(struct dlm_lkb *lkb)
{
	return (lkb->lkb_exflags & DLM_LKF_NOQUEUEBAST);
}

static inline int is_demoted(struct dlm_lkb *lkb)
{
	return (lkb->lkb_sbflags & DLM_SBF_DEMOTED);
}

static inline int is_altmode(struct dlm_lkb *lkb)
{
	return (lkb->lkb_sbflags & DLM_SBF_ALTMODE);
}

static inline int is_granted(struct dlm_lkb *lkb)
{
	return (lkb->lkb_status == DLM_LKSTS_GRANTED);
}

static inline int is_remote(struct dlm_rsb *r)
{
	DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r););
	return !!r->res_nodeid;
}

static inline int is_process_copy(struct dlm_lkb *lkb)
{
	return (lkb->lkb_nodeid && !(lkb->lkb_flags & DLM_IFL_MSTCPY));
}

static inline int is_master_copy(struct dlm_lkb *lkb)
{
	if (lkb->lkb_flags & DLM_IFL_MSTCPY)
		DLM_ASSERT(lkb->lkb_nodeid, dlm_print_lkb(lkb););
	return (lkb->lkb_flags & DLM_IFL_MSTCPY) ? 1 : 0;
}

static inline int middle_conversion(struct dlm_lkb *lkb)
{
	if ((lkb->lkb_grmode==DLM_LOCK_PR && lkb->lkb_rqmode==DLM_LOCK_CW) ||
	    (lkb->lkb_rqmode==DLM_LOCK_PR && lkb->lkb_grmode==DLM_LOCK_CW))
		return 1;
	return 0;
}

static inline int down_conversion(struct dlm_lkb *lkb)
{
	return (!middle_conversion(lkb) && lkb->lkb_rqmode < lkb->lkb_grmode);
}

static inline int is_overlap_unlock(struct dlm_lkb *lkb)
{
	return lkb->lkb_flags & DLM_IFL_OVERLAP_UNLOCK;
}

static inline int is_overlap_cancel(struct dlm_lkb *lkb)
{
	return lkb->lkb_flags & DLM_IFL_OVERLAP_CANCEL;
}

static inline int is_overlap(struct dlm_lkb *lkb)
{
	return (lkb->lkb_flags & (DLM_IFL_OVERLAP_UNLOCK |
				  DLM_IFL_OVERLAP_CANCEL));
}

static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
{
	if (is_master_copy(lkb))
		return;

	del_timeout(lkb);

	DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb););

	/* if the operation was a cancel, return -DLM_ECANCEL; if a
	   timeout caused the cancel, return -ETIMEDOUT */
	if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_TIMEOUT_CANCEL)) {
		lkb->lkb_flags &= ~DLM_IFL_TIMEOUT_CANCEL;
		rv = -ETIMEDOUT;
	}

	if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_DEADLOCK_CANCEL)) {
		lkb->lkb_flags &= ~DLM_IFL_DEADLOCK_CANCEL;
		rv = -EDEADLK;
	}

	lkb->lkb_lksb->sb_status = rv;
	lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;

	dlm_add_ast(lkb, AST_COMP, 0);
}

static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	queue_cast(r, lkb,
		   is_overlap_unlock(lkb) ? -DLM_EUNLOCK : -DLM_ECANCEL);
}

static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
{
	lkb->lkb_time_bast = ktime_get();

	if (is_master_copy(lkb))
		send_bast(r, lkb, rqmode);
	else
		dlm_add_ast(lkb, AST_BAST, rqmode);
}

/*
 * Basic operations on rsb's and lkb's
 */

static struct dlm_rsb *create_rsb(struct dlm_ls *ls, char *name, int len)
{
	struct dlm_rsb *r;

	r = dlm_allocate_rsb(ls, len);
	if (!r)
		return NULL;

	r->res_ls = ls;
	r->res_length = len;
	memcpy(r->res_name, name, len);
	mutex_init(&r->res_mutex);

	INIT_LIST_HEAD(&r->res_lookup);
	INIT_LIST_HEAD(&r->res_grantqueue);
	INIT_LIST_HEAD(&r->res_convertqueue);
	INIT_LIST_HEAD(&r->res_waitqueue);
	INIT_LIST_HEAD(&r->res_root_list);
	INIT_LIST_HEAD(&r->res_recover_list);

	return r;
}

static int search_rsb_list(struct list_head *head, char *name, int len,
			   unsigned int flags, struct dlm_rsb **r_ret)
{
	struct dlm_rsb *r;
	int error = 0;

	list_for_each_entry(r, head, res_hashchain) {
		if (len == r->res_length && !memcmp(name, r->res_name, len))
			goto found;
	}
	*r_ret = NULL;
	return -EBADR;

 found:
	if (r->res_nodeid && (flags & R_MASTER))
		error = -ENOTBLK;
	*r_ret = r;
	return error;
}

static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b,
		       unsigned int flags, struct dlm_rsb **r_ret)
{
	struct dlm_rsb *r;
	int error;

	error = search_rsb_list(&ls->ls_rsbtbl[b].list, name, len, flags, &r);
	if (!error) {
		kref_get(&r->res_ref);
		goto out;
	}
	error = search_rsb_list(&ls->ls_rsbtbl[b].toss, name, len, flags, &r);
	if (error)
		goto out;

	list_move(&r->res_hashchain, &ls->ls_rsbtbl[b].list);

	if (dlm_no_directory(ls))
		goto out;

	if (r->res_nodeid == -1) {
		rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
		r->res_first_lkid = 0;
	} else if (r->res_nodeid > 0) {
		rsb_set_flag(r, RSB_MASTER_UNCERTAIN);
		r->res_first_lkid = 0;
	} else {
		DLM_ASSERT(r->res_nodeid == 0, dlm_print_rsb(r););
		DLM_ASSERT(!rsb_flag(r, RSB_MASTER_UNCERTAIN),);
	}
 out:
	*r_ret = r;
	return error;
}

static int search_rsb(struct dlm_ls *ls, char *name, int len, int b,
		      unsigned int flags, struct dlm_rsb **r_ret)
{
	int error;
	spin_lock(&ls->ls_rsbtbl[b].lock);
	error = _search_rsb(ls, name, len, b, flags, r_ret);
	spin_unlock(&ls->ls_rsbtbl[b].lock);
	return error;
}

/*
 * Find rsb in rsbtbl and potentially create/add one
 *
 * Delaying the release of rsb's has a similar benefit to applications keeping
 * NL locks on an rsb, but without the guarantee that the cached master value
 * will still be valid when the rsb is reused.  Apps aren't always smart enough
 * to keep NL locks on an rsb that they may lock again shortly; this can lead
 * to excessive master lookups and removals if we don't delay the release.
 *
 * Searching for an rsb means looking through both the normal list and toss
 * list.  When found on the toss list the rsb is moved to the normal list with
 * ref count of 1; when found on normal list the ref count is incremented.
 */
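
/* A sketch of the typical internal call (for illustration): a stage-2
   function resolving the resource for a new request does roughly

	struct dlm_rsb *r;
	error = find_rsb(ls, name, namelen, R_CREATE, &r);

   and gets back either an existing rsb (possibly revived from the toss
   list) or a freshly created one, in every case holding a reference that
   is later dropped with put_rsb(). */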

static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
		    unsigned int flags, struct dlm_rsb **r_ret)
{
	struct dlm_rsb *r = NULL, *tmp;
	uint32_t hash, bucket;
	int error = -EINVAL;

	if (namelen > DLM_RESNAME_MAXLEN)
		goto out;

	if (dlm_no_directory(ls))
		flags |= R_CREATE;

	error = 0;
	hash = jhash(name, namelen, 0);
	bucket = hash & (ls->ls_rsbtbl_size - 1);

	error = search_rsb(ls, name, namelen, bucket, flags, &r);
	if (!error)
		goto out;

	if (error == -EBADR && !(flags & R_CREATE))
		goto out;

	/* the rsb was found but wasn't a master copy */
	if (error == -ENOTBLK)
		goto out;

	error = -ENOMEM;
	r = create_rsb(ls, name, namelen);
	if (!r)
		goto out;

	r->res_hash = hash;
	r->res_bucket = bucket;
	r->res_nodeid = -1;
	kref_init(&r->res_ref);

	/* With no directory, the master can be set immediately */
	if (dlm_no_directory(ls)) {
		int nodeid = dlm_dir_nodeid(r);
		if (nodeid == dlm_our_nodeid())
			nodeid = 0;
		r->res_nodeid = nodeid;
	}

	spin_lock(&ls->ls_rsbtbl[bucket].lock);
	error = _search_rsb(ls, name, namelen, bucket, 0, &tmp);
	if (!error) {
		spin_unlock(&ls->ls_rsbtbl[bucket].lock);
		dlm_free_rsb(r);
		r = tmp;
		goto out;
	}
	list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list);
	spin_unlock(&ls->ls_rsbtbl[bucket].lock);
	error = 0;
 out:
	*r_ret = r;
	return error;
}

/* This is only called to add a reference when the code already holds
   a valid reference to the rsb, so there's no need for locking. */

static inline void hold_rsb(struct dlm_rsb *r)
{
	kref_get(&r->res_ref);
}

void dlm_hold_rsb(struct dlm_rsb *r)
{
	hold_rsb(r);
}

static void toss_rsb(struct kref *kref)
{
	struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
	struct dlm_ls *ls = r->res_ls;

	DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r););
	kref_init(&r->res_ref);
	list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss);
	r->res_toss_time = jiffies;
	if (r->res_lvbptr) {
		dlm_free_lvb(r->res_lvbptr);
		r->res_lvbptr = NULL;
	}
}

/* When all references to the rsb are gone it's transferred to
   the tossed list for later disposal. */

static void put_rsb(struct dlm_rsb *r)
{
	struct dlm_ls *ls = r->res_ls;
	uint32_t bucket = r->res_bucket;

	spin_lock(&ls->ls_rsbtbl[bucket].lock);
	kref_put(&r->res_ref, toss_rsb);
	spin_unlock(&ls->ls_rsbtbl[bucket].lock);
}

void dlm_put_rsb(struct dlm_rsb *r)
{
	put_rsb(r);
}

/* See comment for unhold_lkb */

static void unhold_rsb(struct dlm_rsb *r)
{
	int rv;
	rv = kref_put(&r->res_ref, toss_rsb);
	DLM_ASSERT(!rv, dlm_dump_rsb(r););
}

static void kill_rsb(struct kref *kref)
{
	struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);

	/* All work is done after the return from kref_put() so we
	   can release the write_lock before the remove and free. */

	DLM_ASSERT(list_empty(&r->res_lookup), dlm_dump_rsb(r););
	DLM_ASSERT(list_empty(&r->res_grantqueue), dlm_dump_rsb(r););
	DLM_ASSERT(list_empty(&r->res_convertqueue), dlm_dump_rsb(r););
	DLM_ASSERT(list_empty(&r->res_waitqueue), dlm_dump_rsb(r););
	DLM_ASSERT(list_empty(&r->res_root_list), dlm_dump_rsb(r););
	DLM_ASSERT(list_empty(&r->res_recover_list), dlm_dump_rsb(r););
}

/* Attaching/detaching lkb's from rsb's is for rsb reference counting.
   The rsb must exist as long as any lkb's for it do. */

static void attach_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	hold_rsb(r);
	lkb->lkb_resource = r;
}

static void detach_lkb(struct dlm_lkb *lkb)
{
	if (lkb->lkb_resource) {
		put_rsb(lkb->lkb_resource);
		lkb->lkb_resource = NULL;
	}
}

static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
{
	struct dlm_lkb *lkb, *tmp;
	uint32_t lkid = 0;
	uint16_t bucket;

	lkb = dlm_allocate_lkb(ls);
	if (!lkb)
		return -ENOMEM;

	lkb->lkb_nodeid = -1;
	lkb->lkb_grmode = DLM_LOCK_IV;
	kref_init(&lkb->lkb_ref);
	INIT_LIST_HEAD(&lkb->lkb_ownqueue);
	INIT_LIST_HEAD(&lkb->lkb_rsb_lookup);
	INIT_LIST_HEAD(&lkb->lkb_time_list);

	get_random_bytes(&bucket, sizeof(bucket));
	bucket &= (ls->ls_lkbtbl_size - 1);

	write_lock(&ls->ls_lkbtbl[bucket].lock);

	/* counter can roll over so we must verify lkid is not in use */

	while (lkid == 0) {
		lkid = (bucket << 16) | ls->ls_lkbtbl[bucket].counter++;

		list_for_each_entry(tmp, &ls->ls_lkbtbl[bucket].list,
				    lkb_idtbl_list) {
			if (tmp->lkb_id != lkid)
				continue;
			lkid = 0;
			break;
		}
	}

	lkb->lkb_id = lkid;
	list_add(&lkb->lkb_idtbl_list, &ls->ls_lkbtbl[bucket].list);
	write_unlock(&ls->ls_lkbtbl[bucket].lock);

	*lkb_ret = lkb;
	return 0;
}
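
/* The lkid built above encodes the bucket in the upper 16 bits and the
   per-bucket counter in the lower 16; e.g. bucket 0x0003 with counter
   0x0001 yields lkid 0x00030001.  __find_lkb() and find_lkb() below
   recover the bucket with (lkid >> 16). */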

static struct dlm_lkb *__find_lkb(struct dlm_ls *ls, uint32_t lkid)
{
	struct dlm_lkb *lkb;
	uint16_t bucket = (lkid >> 16);

	list_for_each_entry(lkb, &ls->ls_lkbtbl[bucket].list, lkb_idtbl_list) {
		if (lkb->lkb_id == lkid)
			return lkb;
	}
	return NULL;
}

static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret)
{
	struct dlm_lkb *lkb;
	uint16_t bucket = (lkid >> 16);

	if (bucket >= ls->ls_lkbtbl_size)
		return -EBADSLT;

	read_lock(&ls->ls_lkbtbl[bucket].lock);
	lkb = __find_lkb(ls, lkid);
	if (lkb)
		kref_get(&lkb->lkb_ref);
	read_unlock(&ls->ls_lkbtbl[bucket].lock);

	*lkb_ret = lkb;
	return lkb ? 0 : -ENOENT;
}

static void kill_lkb(struct kref *kref)
{
	struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref);

	/* All work is done after the return from kref_put() so we
	   can release the write_lock before the detach_lkb */

	DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
}

/* __put_lkb() is used when an lkb may not have an rsb attached to
   it so we need to provide the lockspace explicitly */

static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb)
{
	uint16_t bucket = (lkb->lkb_id >> 16);

	write_lock(&ls->ls_lkbtbl[bucket].lock);
	if (kref_put(&lkb->lkb_ref, kill_lkb)) {
		list_del(&lkb->lkb_idtbl_list);
		write_unlock(&ls->ls_lkbtbl[bucket].lock);

		detach_lkb(lkb);

		/* for local/process lkbs, lvbptr points to caller's lksb */
		if (lkb->lkb_lvbptr && is_master_copy(lkb))
			dlm_free_lvb(lkb->lkb_lvbptr);
		dlm_free_lkb(lkb);
		return 1;
	} else {
		write_unlock(&ls->ls_lkbtbl[bucket].lock);
		return 0;
	}
}

int dlm_put_lkb(struct dlm_lkb *lkb)
{
	struct dlm_ls *ls;

	DLM_ASSERT(lkb->lkb_resource, dlm_print_lkb(lkb););
	DLM_ASSERT(lkb->lkb_resource->res_ls, dlm_print_lkb(lkb););

	ls = lkb->lkb_resource->res_ls;
	return __put_lkb(ls, lkb);
}

/* This is only called to add a reference when the code already holds
   a valid reference to the lkb, so there's no need for locking. */

static inline void hold_lkb(struct dlm_lkb *lkb)
{
	kref_get(&lkb->lkb_ref);
}

/* This is called when we need to remove a reference and are certain
   it's not the last ref.  e.g. del_lkb is always called between a
   find_lkb/put_lkb and is always the inverse of a previous add_lkb.
   put_lkb would work fine, but would involve unnecessary locking */

static inline void unhold_lkb(struct dlm_lkb *lkb)
{
	int rv;
	rv = kref_put(&lkb->lkb_ref, kill_lkb);
	DLM_ASSERT(!rv, dlm_print_lkb(lkb););
}

static void lkb_add_ordered(struct list_head *new, struct list_head *head,
			    int mode)
{
	struct dlm_lkb *lkb = NULL;

	list_for_each_entry(lkb, head, lkb_statequeue)
		if (lkb->lkb_rqmode < mode)
			break;

	if (!lkb)
		list_add_tail(new, head);
	else
		__list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
}
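
/* For illustration: lock modes order EX > PW > PR > CW > CR > NL, so
   adding a PR lock to a queue currently holding [EX, CW, NL] stops at the
   first entry with a lesser mode (CW) and yields [EX, PR, CW, NL],
   keeping the queue sorted from most to least restrictive mode. */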

/* add/remove lkb to rsb's grant/convert/wait queue */

static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status)
{
	kref_get(&lkb->lkb_ref);

	DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););

	lkb->lkb_timestamp = ktime_get();

	lkb->lkb_status = status;

	switch (status) {
	case DLM_LKSTS_WAITING:
		if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
			list_add(&lkb->lkb_statequeue, &r->res_waitqueue);
		else
			list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue);
		break;
	case DLM_LKSTS_GRANTED:
		/* convention says granted locks kept in order of grmode */
		lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue,
				lkb->lkb_grmode);
		break;
	case DLM_LKSTS_CONVERT:
		if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
			list_add(&lkb->lkb_statequeue, &r->res_convertqueue);
		else
			list_add_tail(&lkb->lkb_statequeue,
				      &r->res_convertqueue);
		break;
	default:
		DLM_ASSERT(0, dlm_print_lkb(lkb); printk("sts=%d\n", status););
	}
}

static void del_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	lkb->lkb_status = 0;
	list_del(&lkb->lkb_statequeue);
	unhold_lkb(lkb);
}

static void move_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int sts)
{
	hold_lkb(lkb);
	del_lkb(r, lkb);
	add_lkb(r, lkb, sts);
	unhold_lkb(lkb);
}

static int msg_reply_type(int mstype)
{
	switch (mstype) {
	case DLM_MSG_REQUEST:
		return DLM_MSG_REQUEST_REPLY;
	case DLM_MSG_CONVERT:
		return DLM_MSG_CONVERT_REPLY;
	case DLM_MSG_UNLOCK:
		return DLM_MSG_UNLOCK_REPLY;
	case DLM_MSG_CANCEL:
		return DLM_MSG_CANCEL_REPLY;
	case DLM_MSG_LOOKUP:
		return DLM_MSG_LOOKUP_REPLY;
	}
	return -1;
}

/* add/remove lkb from global waiters list of lkb's waiting for
   a reply from a remote node */

static int add_to_waiters(struct dlm_lkb *lkb, int mstype)
{
	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
	int error = 0;

	mutex_lock(&ls->ls_waiters_mutex);

	if (is_overlap_unlock(lkb) ||
	    (is_overlap_cancel(lkb) && (mstype == DLM_MSG_CANCEL))) {
		error = -EINVAL;
		goto out;
	}

	if (lkb->lkb_wait_type || is_overlap_cancel(lkb)) {
		switch (mstype) {
		case DLM_MSG_UNLOCK:
			lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK;
			break;
		case DLM_MSG_CANCEL:
			lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL;
			break;
		default:
			error = -EBUSY;
			goto out;
		}
		lkb->lkb_wait_count++;
		hold_lkb(lkb);

		log_debug(ls, "addwait %x cur %d overlap %d count %d f %x",
			  lkb->lkb_id, lkb->lkb_wait_type, mstype,
			  lkb->lkb_wait_count, lkb->lkb_flags);
		goto out;
	}

	DLM_ASSERT(!lkb->lkb_wait_count,
		   dlm_print_lkb(lkb);
		   printk("wait_count %d\n", lkb->lkb_wait_count););

	lkb->lkb_wait_count++;
	lkb->lkb_wait_type = mstype;
	hold_lkb(lkb);
	list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
 out:
	if (error)
		log_error(ls, "addwait error %x %d flags %x %d %d %s",
			  lkb->lkb_id, error, lkb->lkb_flags, mstype,
			  lkb->lkb_wait_type, lkb->lkb_resource->res_name);
	mutex_unlock(&ls->ls_waiters_mutex);
	return error;
}

/* We clear the RESEND flag because we might be taking an lkb off the waiters
   list as part of process_requestqueue (e.g. a lookup that has an optimized
   request reply on the requestqueue) between dlm_recover_waiters_pre() which
   set RESEND and dlm_recover_waiters_post() */

static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype,
				struct dlm_message *ms)
{
	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
	int overlap_done = 0;

	if (is_overlap_unlock(lkb) && (mstype == DLM_MSG_UNLOCK_REPLY)) {
		log_debug(ls, "remwait %x unlock_reply overlap", lkb->lkb_id);
		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
		overlap_done = 1;
		goto out_del;
	}

	if (is_overlap_cancel(lkb) && (mstype == DLM_MSG_CANCEL_REPLY)) {
		log_debug(ls, "remwait %x cancel_reply overlap", lkb->lkb_id);
		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
		overlap_done = 1;
		goto out_del;
	}

	/* Cancel state was preemptively cleared by a successful convert,
	   see next comment, nothing to do. */

	if ((mstype == DLM_MSG_CANCEL_REPLY) &&
	    (lkb->lkb_wait_type != DLM_MSG_CANCEL)) {
		log_debug(ls, "remwait %x cancel_reply wait_type %d",
			  lkb->lkb_id, lkb->lkb_wait_type);
		return -1;
	}

	/* Remove for the convert reply, and preemptively remove for the
	   cancel reply.  A convert has been granted while there's still
	   an outstanding cancel on it (the cancel is moot and the result
	   in the cancel reply should be 0).  We preempt the cancel reply
	   because the app gets the convert result and then can follow up
	   with another op, like convert.  This subsequent op would see the
	   lingering state of the cancel and fail with -EBUSY. */

	if ((mstype == DLM_MSG_CONVERT_REPLY) &&
	    (lkb->lkb_wait_type == DLM_MSG_CONVERT) &&
	    is_overlap_cancel(lkb) && ms && !ms->m_result) {
		log_debug(ls, "remwait %x convert_reply zap overlap_cancel",
			  lkb->lkb_id);
		lkb->lkb_wait_type = 0;
		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
		lkb->lkb_wait_count--;
		goto out_del;
	}

	/* N.B. type of reply may not always correspond to type of original
	   msg due to lookup->request optimization, verify others? */

	if (lkb->lkb_wait_type) {
		lkb->lkb_wait_type = 0;
		goto out_del;
	}

	log_error(ls, "remwait error %x reply %d flags %x no wait_type",
		  lkb->lkb_id, mstype, lkb->lkb_flags);
	return -1;

 out_del:
	/* the force-unlock/cancel has completed and we haven't received a
	   reply to the op that was in progress prior to the unlock/cancel;
	   we give up on any reply to the earlier op.  FIXME: not sure
	   when/how this would happen */

	if (overlap_done && lkb->lkb_wait_type) {
		log_error(ls, "remwait error %x reply %d wait_type %d overlap",
			  lkb->lkb_id, mstype, lkb->lkb_wait_type);
		lkb->lkb_wait_count--;
		lkb->lkb_wait_type = 0;
	}

	DLM_ASSERT(lkb->lkb_wait_count, dlm_print_lkb(lkb););

	lkb->lkb_flags &= ~DLM_IFL_RESEND;
	lkb->lkb_wait_count--;
	if (!lkb->lkb_wait_count)
		list_del_init(&lkb->lkb_wait_reply);
	unhold_lkb(lkb);
	return 0;
}

static int remove_from_waiters(struct dlm_lkb *lkb, int mstype)
{
	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
	int error;

	mutex_lock(&ls->ls_waiters_mutex);
	error = _remove_from_waiters(lkb, mstype, NULL);
	mutex_unlock(&ls->ls_waiters_mutex);
	return error;
}

/* Handles situations where we might be processing a "fake" or "stub" reply in
   which we can't try to take waiters_mutex again. */

static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms)
{
	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
	int error;

	if (ms != &ls->ls_stub_ms)
		mutex_lock(&ls->ls_waiters_mutex);
	error = _remove_from_waiters(lkb, ms->m_type, ms);
	if (ms != &ls->ls_stub_ms)
		mutex_unlock(&ls->ls_waiters_mutex);
	return error;
}

static void dir_remove(struct dlm_rsb *r)
{
	int to_nodeid;

	if (dlm_no_directory(r->res_ls))
		return;

	to_nodeid = dlm_dir_nodeid(r);
	if (to_nodeid != dlm_our_nodeid())
		send_remove(r);
	else
		dlm_dir_remove_entry(r->res_ls, to_nodeid,
				     r->res_name, r->res_length);
}

/* FIXME: shouldn't this be able to exit as soon as one non-due rsb is
   found since they are in order of newest to oldest? */

static int shrink_bucket(struct dlm_ls *ls, int b)
{
	struct dlm_rsb *r;
	int count = 0, found;

	for (;;) {
		found = 0;
		spin_lock(&ls->ls_rsbtbl[b].lock);
		list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss,
					    res_hashchain) {
			if (!time_after_eq(jiffies, r->res_toss_time +
					   dlm_config.ci_toss_secs * HZ))
				continue;
			found = 1;
			break;
		}

		if (!found) {
			spin_unlock(&ls->ls_rsbtbl[b].lock);
			break;
		}

		if (kref_put(&r->res_ref, kill_rsb)) {
			list_del(&r->res_hashchain);
			spin_unlock(&ls->ls_rsbtbl[b].lock);

			if (is_master(r))
				dir_remove(r);
			dlm_free_rsb(r);
			count++;
		} else {
			spin_unlock(&ls->ls_rsbtbl[b].lock);
			log_error(ls, "tossed rsb in use %s", r->res_name);
		}
	}

	return count;
}

void dlm_scan_rsbs(struct dlm_ls *ls)
{
	int i;

	for (i = 0; i < ls->ls_rsbtbl_size; i++) {
		shrink_bucket(ls, i);
		if (dlm_locking_stopped(ls))
			break;
		cond_resched();
	}
}

static void add_timeout(struct dlm_lkb *lkb)
{
	struct dlm_ls *ls = lkb->lkb_resource->res_ls;

	if (is_master_copy(lkb))
		return;

	if (test_bit(LSFL_TIMEWARN, &ls->ls_flags) &&
	    !(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) {
		lkb->lkb_flags |= DLM_IFL_WATCH_TIMEWARN;
		goto add_it;
	}
	if (lkb->lkb_exflags & DLM_LKF_TIMEOUT)
		goto add_it;
	return;

 add_it:
	DLM_ASSERT(list_empty(&lkb->lkb_time_list), dlm_print_lkb(lkb););
	mutex_lock(&ls->ls_timeout_mutex);
	hold_lkb(lkb);
	list_add_tail(&lkb->lkb_time_list, &ls->ls_timeout);
	mutex_unlock(&ls->ls_timeout_mutex);
}

static void del_timeout(struct dlm_lkb *lkb)
{
	struct dlm_ls *ls = lkb->lkb_resource->res_ls;

	mutex_lock(&ls->ls_timeout_mutex);
	if (!list_empty(&lkb->lkb_time_list)) {
		list_del_init(&lkb->lkb_time_list);
		unhold_lkb(lkb);
	}
	mutex_unlock(&ls->ls_timeout_mutex);
}

/* FIXME: is it safe to look at lkb_exflags, lkb_flags, lkb_timestamp, and
   lkb_lksb_timeout without lock_rsb?  Note: we can't lock timeout_mutex
   and then lock rsb because of lock ordering in add_timeout.  We may need
   to specify some special timeout-related bits in the lkb that are just to
   be accessed under the timeout_mutex. */

void dlm_scan_timeout(struct dlm_ls *ls)
{
	struct dlm_rsb *r;
	struct dlm_lkb *lkb;
	int do_cancel, do_warn;
	s64 wait_us;

	for (;;) {
		if (dlm_locking_stopped(ls))
			break;

		do_cancel = 0;
		do_warn = 0;
		mutex_lock(&ls->ls_timeout_mutex);
		list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) {

			wait_us = ktime_to_us(ktime_sub(ktime_get(),
							lkb->lkb_timestamp));

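			/* lkb_timeout_cs and ci_timewarn_cs are in
			   centiseconds; 1 cs = 10,000 us, hence the
			   scaling in the comparisons below */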
			if ((lkb->lkb_exflags & DLM_LKF_TIMEOUT) &&
			    wait_us >= (lkb->lkb_timeout_cs * 10000))
				do_cancel = 1;

			if ((lkb->lkb_flags & DLM_IFL_WATCH_TIMEWARN) &&
			    wait_us >= dlm_config.ci_timewarn_cs * 10000)
				do_warn = 1;

			if (!do_cancel && !do_warn)
				continue;
			hold_lkb(lkb);
			break;
		}
		mutex_unlock(&ls->ls_timeout_mutex);

		if (!do_cancel && !do_warn)
			break;

		r = lkb->lkb_resource;
		hold_rsb(r);
		lock_rsb(r);

		if (do_warn) {
			/* clear flag so we only warn once */
			lkb->lkb_flags &= ~DLM_IFL_WATCH_TIMEWARN;
			if (!(lkb->lkb_exflags & DLM_LKF_TIMEOUT))
				del_timeout(lkb);
			dlm_timeout_warn(lkb);
		}

		if (do_cancel) {
			log_debug(ls, "timeout cancel %x node %d %s",
				  lkb->lkb_id, lkb->lkb_nodeid, r->res_name);
			lkb->lkb_flags &= ~DLM_IFL_WATCH_TIMEWARN;
			lkb->lkb_flags |= DLM_IFL_TIMEOUT_CANCEL;
			del_timeout(lkb);
			_cancel_lock(r, lkb);
		}

		unlock_rsb(r);
		unhold_rsb(r);
		dlm_put_lkb(lkb);
	}
}

/* This is only called by dlm_recoverd, and we rely on dlm_ls_stop() stopping
   dlm_recoverd before checking/setting ls_recover_begin. */

void dlm_adjust_timeouts(struct dlm_ls *ls)
{
	struct dlm_lkb *lkb;
	u64 adj_us = jiffies_to_usecs(jiffies - ls->ls_recover_begin);

	ls->ls_recover_begin = 0;
	mutex_lock(&ls->ls_timeout_mutex);
	list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list)
		lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us);
	mutex_unlock(&ls->ls_timeout_mutex);
}

/* lkb is master or local copy */

static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	int b, len = r->res_ls->ls_lvblen;

	/* b=1 lvb returned to caller
	   b=0 lvb written to rsb or invalidated
	   b=-1 do nothing */

	b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];

	if (b == 1) {
		if (!lkb->lkb_lvbptr)
			return;

		if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
			return;

		if (!r->res_lvbptr)
			return;

		memcpy(lkb->lkb_lvbptr, r->res_lvbptr, len);
		lkb->lkb_lvbseq = r->res_lvbseq;

	} else if (b == 0) {
		if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
			rsb_set_flag(r, RSB_VALNOTVALID);
			return;
		}

		if (!lkb->lkb_lvbptr)
			return;

		if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
			return;

		if (!r->res_lvbptr)
			r->res_lvbptr = dlm_allocate_lvb(r->res_ls);

		if (!r->res_lvbptr)
			return;

		memcpy(r->res_lvbptr, lkb->lkb_lvbptr, len);
		r->res_lvbseq++;
		lkb->lkb_lvbseq = r->res_lvbseq;
		rsb_clear_flag(r, RSB_VALNOTVALID);
	}

	if (rsb_flag(r, RSB_VALNOTVALID))
		lkb->lkb_sbflags |= DLM_SBF_VALNOTVALID;
}

static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	if (lkb->lkb_grmode < DLM_LOCK_PW)
		return;

	if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
		rsb_set_flag(r, RSB_VALNOTVALID);
		return;
	}

	if (!lkb->lkb_lvbptr)
		return;

	if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
		return;

	if (!r->res_lvbptr)
		r->res_lvbptr = dlm_allocate_lvb(r->res_ls);

	if (!r->res_lvbptr)
		return;

	memcpy(r->res_lvbptr, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
	r->res_lvbseq++;
	rsb_clear_flag(r, RSB_VALNOTVALID);
}

/* lkb is process copy (pc) */

static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
			    struct dlm_message *ms)
{
	int b;

	if (!lkb->lkb_lvbptr)
		return;

	if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
		return;

	b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
	if (b == 1) {
		int len = receive_extralen(ms);
		if (len > DLM_RESNAME_MAXLEN)
			len = DLM_RESNAME_MAXLEN;
		memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
		lkb->lkb_lvbseq = ms->m_lvbseq;
	}
}

/* Manipulate lkb's on rsb's convert/granted/waiting queues
   remove_lock -- used for unlock, removes lkb from granted
   revert_lock -- used for cancel, moves lkb from convert to granted
   grant_lock  -- used for request and convert, adds lkb to granted or
                  moves lkb from convert or waiting to granted

   Each of these is used for master or local copy lkb's.  There is
   also a _pc() variation used to make the corresponding change on
   a process copy (pc) lkb. */
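
/* For example (a sketch of the pairing; the callers appear later in this
   file): do_unlock() on the master calls remove_lock(), while the node
   that sent the unlock applies the same change to its process copy via
   remove_lock_pc() when the unlock reply arrives. */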

static void _remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	del_lkb(r, lkb);
	lkb->lkb_grmode = DLM_LOCK_IV;
	/* this unhold undoes the original ref from create_lkb()
	   so this leads to the lkb being freed */
	unhold_lkb(lkb);
}

static void remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	set_lvb_unlock(r, lkb);
	_remove_lock(r, lkb);
}

static void remove_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	_remove_lock(r, lkb);
}

/* returns: 0 did nothing
	    1 moved lock to granted
	   -1 removed lock */

static int revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	int rv = 0;

	lkb->lkb_rqmode = DLM_LOCK_IV;

	switch (lkb->lkb_status) {
	case DLM_LKSTS_GRANTED:
		break;
	case DLM_LKSTS_CONVERT:
		move_lkb(r, lkb, DLM_LKSTS_GRANTED);
		rv = 1;
		break;
	case DLM_LKSTS_WAITING:
		del_lkb(r, lkb);
		lkb->lkb_grmode = DLM_LOCK_IV;
		/* this unhold undoes the original ref from create_lkb()
		   so this leads to the lkb being freed */
		unhold_lkb(lkb);
		rv = -1;
		break;
	default:
		log_print("invalid status for revert %d", lkb->lkb_status);
	}
	return rv;
}

static int revert_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	return revert_lock(r, lkb);
}

static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	if (lkb->lkb_grmode != lkb->lkb_rqmode) {
		lkb->lkb_grmode = lkb->lkb_rqmode;
		if (lkb->lkb_status)
			move_lkb(r, lkb, DLM_LKSTS_GRANTED);
		else
			add_lkb(r, lkb, DLM_LKSTS_GRANTED);
	}

	lkb->lkb_rqmode = DLM_LOCK_IV;
}

static void grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	set_lvb_lock(r, lkb);
	_grant_lock(r, lkb);
	lkb->lkb_highbast = 0;
}

static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
			  struct dlm_message *ms)
{
	set_lvb_lock_pc(r, lkb, ms);
	_grant_lock(r, lkb);
}

/* called by grant_pending_locks() which means an async grant message must
   be sent to the requesting node in addition to granting the lock if the
   lkb belongs to a remote node. */

static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	grant_lock(r, lkb);
	if (is_master_copy(lkb))
		send_grant(r, lkb);
	else
		queue_cast(r, lkb, 0);
}

/* The special CONVDEADLK, ALTPR and ALTCW flags allow the master to
   change the granted/requested modes.  We're munging things accordingly in
   the process copy.
   CONVDEADLK: our grmode may have been forced down to NL to resolve a
   conversion deadlock
   ALTPR/ALTCW: our rqmode may have been changed to PR or CW to become
   compatible with other granted locks */

static void munge_demoted(struct dlm_lkb *lkb, struct dlm_message *ms)
{
	if (ms->m_type != DLM_MSG_CONVERT_REPLY) {
		log_print("munge_demoted %x invalid reply type %d",
			  lkb->lkb_id, ms->m_type);
		return;
	}

	if (lkb->lkb_rqmode == DLM_LOCK_IV || lkb->lkb_grmode == DLM_LOCK_IV) {
		log_print("munge_demoted %x invalid modes gr %d rq %d",
			  lkb->lkb_id, lkb->lkb_grmode, lkb->lkb_rqmode);
		return;
	}

	lkb->lkb_grmode = DLM_LOCK_NL;
}

static void munge_altmode(struct dlm_lkb *lkb, struct dlm_message *ms)
{
	if (ms->m_type != DLM_MSG_REQUEST_REPLY &&
	    ms->m_type != DLM_MSG_GRANT) {
		log_print("munge_altmode %x invalid reply type %d",
			  lkb->lkb_id, ms->m_type);
		return;
	}

	if (lkb->lkb_exflags & DLM_LKF_ALTPR)
		lkb->lkb_rqmode = DLM_LOCK_PR;
	else if (lkb->lkb_exflags & DLM_LKF_ALTCW)
		lkb->lkb_rqmode = DLM_LOCK_CW;
	else {
		log_print("munge_altmode invalid exflags %x", lkb->lkb_exflags);
		dlm_print_lkb(lkb);
	}
}

static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head)
{
	struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb,
					   lkb_statequeue);
	if (lkb->lkb_id == first->lkb_id)
		return 1;

	return 0;
}

/* Check if the given lkb conflicts with another lkb on the queue. */

static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb)
{
	struct dlm_lkb *this;

	list_for_each_entry(this, head, lkb_statequeue) {
		if (this == lkb)
			continue;
		if (!modes_compat(this, lkb))
			return 1;
	}
	return 0;
}

/*
 * "A conversion deadlock arises with a pair of lock requests in the converting
 * queue for one resource.  The granted mode of each lock blocks the requested
 * mode of the other lock."
 *
 * Part 2: if the granted mode of lkb is preventing an earlier lkb in the
 * convert queue from being granted, then deadlk/demote lkb.
 *
 * Example:
 * Granted Queue: empty
 * Convert Queue: NL->EX (first lock)
 *                PR->EX (second lock)
 *
 * The first lock can't be granted because of the granted mode of the second
 * lock and the second lock can't be granted because it's not first in the
 * list.  We either cancel lkb's conversion (PR->EX) and return EDEADLK, or we
 * demote the granted mode of lkb (from PR to NL) if it has the CONVDEADLK
 * flag set and return DEMOTED in the lksb flags.
 *
 * Originally, this function detected conv-deadlk in a more limited scope:
 * - if !modes_compat(lkb1, lkb2) && !modes_compat(lkb2, lkb1), or
 * - if lkb1 was the first entry in the queue (not just earlier), and was
 *   blocked by the granted mode of lkb2, and there was nothing on the
 *   granted queue preventing lkb1 from being granted immediately, i.e.
 *   lkb2 was the only thing preventing lkb1 from being granted.
 *
 * That second condition meant we'd only say there was conv-deadlk if
 * resolving it (by demotion) would lead to the first lock on the convert
 * queue being granted right away.  It allowed conversion deadlocks to exist
 * between locks on the convert queue while they couldn't be granted anyway.
 *
 * Now, we detect and take action on conversion deadlocks immediately when
 * they're created, even if they may not be immediately consequential.  If
 * lkb1 exists anywhere in the convert queue and lkb2 comes in with a granted
 * mode that would prevent lkb1's conversion from being granted, we do a
 * deadlk/demote on lkb2 right away and don't let it onto the convert queue.
 * I think this means that the lkb_is_ahead condition below should always
 * be zero, i.e. there will never be conv-deadlk between two locks that are
 * both already on the convert queue.
 */

static int conversion_deadlock_detect(struct dlm_rsb *r, struct dlm_lkb *lkb2)
{
	struct dlm_lkb *lkb1;
	int lkb_is_ahead = 0;

	list_for_each_entry(lkb1, &r->res_convertqueue, lkb_statequeue) {
		if (lkb1 == lkb2) {
			lkb_is_ahead = 1;
			continue;
		}

		if (!lkb_is_ahead) {
			if (!modes_compat(lkb2, lkb1))
				return 1;
		} else {
			if (!modes_compat(lkb2, lkb1) &&
			    !modes_compat(lkb1, lkb2))
				return 1;
		}
	}
	return 0;
}

/*
 * Return 1 if the lock can be granted, 0 otherwise.
 * Also detect and resolve conversion deadlocks.
 *
 * lkb is the lock to be granted
 *
 * now is 1 if the function is being called in the context of the
 * immediate request, it is 0 if called later, after the lock has been
 * queued.
 *
 * References are from chapter 6 of "VAXcluster Principles" by Roy Davis
 */

static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
{
	int8_t conv = (lkb->lkb_grmode != DLM_LOCK_IV);

	/*
	 * 6-10: Version 5.4 introduced an option to address the phenomenon of
	 * a new request for a NL mode lock being blocked.
	 *
	 * 6-11: If the optional EXPEDITE flag is used with the new NL mode
	 * request, then it would be granted.  In essence, the use of this flag
	 * tells the Lock Manager to expedite this request by not considering
	 * what may be in the CONVERTING or WAITING queues...  As of this
	 * writing, the EXPEDITE flag can be used only with new requests for NL
	 * mode locks.  This flag is not valid for conversion requests.
	 *
	 * A shortcut.  Earlier checks return an error if EXPEDITE is used in a
	 * conversion or used with a non-NL requested mode.  We also know an
	 * EXPEDITE request is always granted immediately, so now must always
	 * be 1.  The full condition to grant an expedite request: (now &&
	 * !conv && lkb->rqmode == DLM_LOCK_NL && (flags & EXPEDITE)) can
	 * therefore be shortened to just checking the flag.
	 */

	if (lkb->lkb_exflags & DLM_LKF_EXPEDITE)
		return 1;

	/*
	 * A shortcut. Without this, !queue_conflict(grantqueue, lkb) would be
	 * added to the remaining conditions.
	 */

	if (queue_conflict(&r->res_grantqueue, lkb))
		goto out;

	/*
	 * 6-3: By default, a conversion request is immediately granted if the
	 * requested mode is compatible with the modes of all other granted
	 * locks
	 */

	if (queue_conflict(&r->res_convertqueue, lkb))
		goto out;

	/*
	 * 6-5: But the default algorithm for deciding whether to grant or
	 * queue conversion requests does not by itself guarantee that such
	 * requests are serviced on a "first come first serve" basis.  This, in
	 * turn, can lead to a phenomenon known as "indefinite postponement".
	 *
	 * 6-7: This issue is dealt with by using the optional QUECVT flag with
	 * the system service employed to request a lock conversion.  This flag
	 * forces certain conversion requests to be queued, even if they are
	 * compatible with the granted modes of other locks on the same
	 * resource.  Thus, the use of this flag results in conversion requests
	 * being ordered on a "first come first serve" basis.
	 *
	 * DCT: This condition is all about new conversions being able to occur
	 * "in place" while the lock remains on the granted queue (assuming
	 * nothing else conflicts.)  IOW if QUECVT isn't set, a conversion
	 * doesn't _have_ to go onto the convert queue where it's processed in
	 * order.  The "now" variable is necessary to distinguish converts
	 * being received and processed for the first time now, because once a
	 * convert is moved to the conversion queue the condition below applies
	 * requiring fifo granting.
	 */

	if (now && conv && !(lkb->lkb_exflags & DLM_LKF_QUECVT))
		return 1;

	/*
	 * The NOORDER flag is set to avoid the standard vms rules on grant
	 * order.
	 */

	if (lkb->lkb_exflags & DLM_LKF_NOORDER)
		return 1;

	/*
	 * 6-3: Once in that queue [CONVERTING], a conversion request cannot be
	 * granted until all other conversion requests ahead of it are granted
	 * and/or canceled.
	 */

	if (!now && conv && first_in_list(lkb, &r->res_convertqueue))
		return 1;

	/*
	 * 6-4: By default, a new request is immediately granted only if all
	 * three of the following conditions are satisfied when the request is
	 * issued:
	 * - The queue of ungranted conversion requests for the resource is
	 *   empty.
	 * - The queue of ungranted new requests for the resource is empty.
	 * - The mode of the new request is compatible with the most
	 *   restrictive mode of all granted locks on the resource.
	 */

	if (now && !conv && list_empty(&r->res_convertqueue) &&
	    list_empty(&r->res_waitqueue))
		return 1;

	/*
	 * 6-4: Once a lock request is in the queue of ungranted new requests,
	 * it cannot be granted until the queue of ungranted conversion
	 * requests is empty, all ungranted new requests ahead of it are
	 * granted and/or canceled, and it is compatible with the granted mode
	 * of the most restrictive lock granted on the resource.
	 */

	if (!now && !conv && list_empty(&r->res_convertqueue) &&
	    first_in_list(lkb, &r->res_waitqueue))
		return 1;
 out:
	return 0;
}

static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now,
			  int *err)
{
	int rv;
	int8_t alt = 0, rqmode = lkb->lkb_rqmode;
	int8_t is_convert = (lkb->lkb_grmode != DLM_LOCK_IV);

	if (err)
		*err = 0;

	rv = _can_be_granted(r, lkb, now);
	if (rv)
		goto out;

	/*
	 * The CONVDEADLK flag is non-standard and tells the dlm to resolve
	 * conversion deadlocks by demoting grmode to NL, otherwise the dlm
	 * cancels one of the locks.
	 */

	if (is_convert && can_be_queued(lkb) &&
	    conversion_deadlock_detect(r, lkb)) {
		if (lkb->lkb_exflags & DLM_LKF_CONVDEADLK) {
			lkb->lkb_grmode = DLM_LOCK_NL;
			lkb->lkb_sbflags |= DLM_SBF_DEMOTED;
		} else if (!(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) {
			if (err)
				*err = -EDEADLK;
			else {
				log_print("can_be_granted deadlock %x now %d",
					  lkb->lkb_id, now);
				dlm_dump_rsb(r);
			}
		}
		goto out;
	}

	/*
	 * The ALTPR and ALTCW flags are non-standard and tell the dlm to try
	 * to grant a request in a mode other than the normal rqmode.  It's a
	 * simple way to provide a big optimization to applications that can
	 * use them.
	 */

	if (rqmode != DLM_LOCK_PR && (lkb->lkb_exflags & DLM_LKF_ALTPR))
		alt = DLM_LOCK_PR;
	else if (rqmode != DLM_LOCK_CW && (lkb->lkb_exflags & DLM_LKF_ALTCW))
		alt = DLM_LOCK_CW;

	if (alt) {
		lkb->lkb_rqmode = alt;
		rv = _can_be_granted(r, lkb, now);
		if (rv)
			lkb->lkb_sbflags |= DLM_SBF_ALTMODE;
		else
			lkb->lkb_rqmode = rqmode;
	}
 out:
	return rv;
}

/* FIXME: I don't think that can_be_granted() can/will demote or find deadlock
   for locks pending on the convert list.  Once verified (watch for these
   log_prints), we should be able to just call _can_be_granted() and not
   bother with the demote/deadlk cases here (and there's no easy way to deal
   with a deadlk here, we'd have to generate something like grant_lock with
   the deadlk error.) */

/* Returns the highest requested mode of all blocked conversions; sets
   cw if there's a blocked conversion to DLM_LOCK_CW. */

static int grant_pending_convert(struct dlm_rsb *r, int high, int *cw)
{
	struct dlm_lkb *lkb, *s;
	int hi, demoted, quit, grant_restart, demote_restart;
	int deadlk;

	quit = 0;
 restart:
	grant_restart = 0;
	demote_restart = 0;
	hi = DLM_LOCK_IV;

	list_for_each_entry_safe(lkb, s, &r->res_convertqueue, lkb_statequeue) {
		demoted = is_demoted(lkb);
		deadlk = 0;

		if (can_be_granted(r, lkb, 0, &deadlk)) {
			grant_lock_pending(r, lkb);
			grant_restart = 1;
			continue;
		}

		if (!demoted && is_demoted(lkb)) {
			log_print("WARN: pending demoted %x node %d %s",
				  lkb->lkb_id, lkb->lkb_nodeid, r->res_name);
			demote_restart = 1;
			continue;
		}

		if (deadlk) {
			log_print("WARN: pending deadlock %x node %d %s",
				  lkb->lkb_id, lkb->lkb_nodeid, r->res_name);
			dlm_dump_rsb(r);
			continue;
		}

		hi = max_t(int, lkb->lkb_rqmode, hi);

		if (cw && lkb->lkb_rqmode == DLM_LOCK_CW)
			*cw = 1;
	}

	if (grant_restart)
		goto restart;
	if (demote_restart && !quit) {
		quit = 1;
		goto restart;
	}

	return max_t(int, high, hi);
}

static int grant_pending_wait(struct dlm_rsb *r, int high, int *cw)
{
	struct dlm_lkb *lkb, *s;

	list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) {
		if (can_be_granted(r, lkb, 0, NULL))
			grant_lock_pending(r, lkb);
		else {
			high = max_t(int, lkb->lkb_rqmode, high);
			if (lkb->lkb_rqmode == DLM_LOCK_CW)
				*cw = 1;
		}
	}

	return high;
}

/* cw of 1 means there's a lock with a rqmode of DLM_LOCK_CW that's blocked
   on either the convert or waiting queue.
   high is the largest rqmode of all locks blocked on the convert or
   waiting queue. */
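
/* E.g. (the special case below): a granted PR lock and a blocked CW
   request conflict even though CW is numerically lower than PR, so a PR
   holder must still get a blocking AST when cw is set, independent of
   the "high" comparison. */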
1782 
1783 static int lock_requires_bast(struct dlm_lkb *gr, int high, int cw)
1784 {
1785 	if (gr->lkb_grmode == DLM_LOCK_PR && cw) {
1786 		if (gr->lkb_highbast < DLM_LOCK_EX)
1787 			return 1;
1788 		return 0;
1789 	}
1790 
1791 	if (gr->lkb_highbast < high &&
1792 	    !__dlm_compat_matrix[gr->lkb_grmode+1][high+1])
1793 		return 1;
1794 	return 0;
1795 }
1796 
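/* Reading the checks above with a concrete case (hedged example): a
   holder granted PW while the highest blocked rqmode is EX gets a
   bast, because __dlm_compat_matrix says PW and EX conflict and
   lkb_highbast is still below EX.  lkb_highbast throttles repeats: a
   holder already basted for EX isn't basted again for EX or anything
   weaker. */
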
1797 static void grant_pending_locks(struct dlm_rsb *r)
1798 {
1799 	struct dlm_lkb *lkb, *s;
1800 	int high = DLM_LOCK_IV;
1801 	int cw = 0;
1802 
1803 	DLM_ASSERT(is_master(r), dlm_dump_rsb(r););
1804 
1805 	high = grant_pending_convert(r, high, &cw);
1806 	high = grant_pending_wait(r, high, &cw);
1807 
1808 	if (high == DLM_LOCK_IV)
1809 		return;
1810 
1811 	/*
1812 	 * If there are locks left on the wait/convert queue then send blocking
1813 	 * ASTs to granted locks based on the largest requested mode (high)
1814 	 * found above.
1815 	 */
1816 
1817 	list_for_each_entry_safe(lkb, s, &r->res_grantqueue, lkb_statequeue) {
1818 		if (lkb->lkb_bastfn && lock_requires_bast(lkb, high, cw)) {
1819 			if (cw && high == DLM_LOCK_PR &&
1820 			    lkb->lkb_grmode == DLM_LOCK_PR)
1821 				queue_bast(r, lkb, DLM_LOCK_CW);
1822 			else
1823 				queue_bast(r, lkb, high);
1824 			lkb->lkb_highbast = high;
1825 		}
1826 	}
1827 }
1828 
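/* Worked example of the CW special case above (hypothetical): if the
   highest blocked mode is PR but a CW request is also blocked, a PR
   holder won't conflict with PR (PR is self-compatible), so basting
   it with PR would be useless; it's sent DLM_LOCK_CW instead, the
   mode it actually has to make way for. */
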
1829 static int modes_require_bast(struct dlm_lkb *gr, struct dlm_lkb *rq)
1830 {
1831 	if ((gr->lkb_grmode == DLM_LOCK_PR && rq->lkb_rqmode == DLM_LOCK_CW) ||
1832 	    (gr->lkb_grmode == DLM_LOCK_CW && rq->lkb_rqmode == DLM_LOCK_PR)) {
1833 		if (gr->lkb_highbast < DLM_LOCK_EX)
1834 			return 1;
1835 		return 0;
1836 	}
1837 
1838 	if (gr->lkb_highbast < rq->lkb_rqmode && !modes_compat(gr, rq))
1839 		return 1;
1840 	return 0;
1841 }
1842 
1843 static void send_bast_queue(struct dlm_rsb *r, struct list_head *head,
1844 			    struct dlm_lkb *lkb)
1845 {
1846 	struct dlm_lkb *gr;
1847 
1848 	list_for_each_entry(gr, head, lkb_statequeue) {
1849 		if (gr->lkb_bastfn && modes_require_bast(gr, lkb)) {
1850 			queue_bast(r, gr, lkb->lkb_rqmode);
1851 			gr->lkb_highbast = lkb->lkb_rqmode;
1852 		}
1853 	}
1854 }
1855 
1856 static void send_blocking_asts(struct dlm_rsb *r, struct dlm_lkb *lkb)
1857 {
1858 	send_bast_queue(r, &r->res_grantqueue, lkb);
1859 }
1860 
1861 static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb)
1862 {
1863 	send_bast_queue(r, &r->res_grantqueue, lkb);
1864 	send_bast_queue(r, &r->res_convertqueue, lkb);
1865 }
1866 
1867 /* set_master(r, lkb) -- set the master nodeid of a resource
1868 
1869    The purpose of this function is to set the nodeid field in the given
1870    lkb using the nodeid field in the given rsb.  If the rsb's nodeid is
1871    known, it can just be copied to the lkb and the function will return
1872    0.  If the rsb's nodeid is _not_ known, it needs to be looked up
1873    before it can be copied to the lkb.
1874 
1875    When the rsb nodeid is being looked up remotely, the initial lkb
1876    causing the lookup is kept on the ls_waiters list waiting for the
1877    lookup reply.  Other lkb's waiting for the same rsb lookup are kept
1878    on the rsb's res_lookup list until the master is verified.
1879 
1880    Return values:
1881    0: nodeid is set in rsb/lkb and the caller should go ahead and use it
1882    1: the rsb master is not available and the lkb has been placed on
1883       a wait queue
   <0: a local directory lookup failed; nodeids are left unset
1884 */
1885 
1886 static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
1887 {
1888 	struct dlm_ls *ls = r->res_ls;
1889 	int i, error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
1890 
1891 	if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) {
1892 		rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
1893 		r->res_first_lkid = lkb->lkb_id;
1894 		lkb->lkb_nodeid = r->res_nodeid;
1895 		return 0;
1896 	}
1897 
1898 	if (r->res_first_lkid && r->res_first_lkid != lkb->lkb_id) {
1899 		list_add_tail(&lkb->lkb_rsb_lookup, &r->res_lookup);
1900 		return 1;
1901 	}
1902 
1903 	if (r->res_nodeid == 0) {
1904 		lkb->lkb_nodeid = 0;
1905 		return 0;
1906 	}
1907 
1908 	if (r->res_nodeid > 0) {
1909 		lkb->lkb_nodeid = r->res_nodeid;
1910 		return 0;
1911 	}
1912 
1913 	DLM_ASSERT(r->res_nodeid == -1, dlm_dump_rsb(r););
1914 
1915 	dir_nodeid = dlm_dir_nodeid(r);
1916 
1917 	if (dir_nodeid != our_nodeid) {
1918 		r->res_first_lkid = lkb->lkb_id;
1919 		send_lookup(r, lkb);
1920 		return 1;
1921 	}
1922 
1923 	for (i = 0; i < 2; i++) {
1924 		/* It's possible for dlm_scand to remove an old rsb for
1925 		   this same resource from the toss list, for us to create
1926 		   a new one, look up the master locally, and find the dir
1927 		   entry already exists just before dlm_scand does the
1928 		   dir_remove() on the previous rsb. */
1929 
1930 		error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
1931 				       r->res_length, &ret_nodeid);
1932 		if (!error)
1933 			break;
1934 		log_debug(ls, "dir_lookup error %d %s", error, r->res_name);
1935 		schedule();
1936 	}
1937 	if (error && error != -EEXIST)
1938 		return error;
1939 
1940 	if (ret_nodeid == our_nodeid) {
1941 		r->res_first_lkid = 0;
1942 		r->res_nodeid = 0;
1943 		lkb->lkb_nodeid = 0;
1944 	} else {
1945 		r->res_first_lkid = lkb->lkb_id;
1946 		r->res_nodeid = ret_nodeid;
1947 		lkb->lkb_nodeid = ret_nodeid;
1948 	}
1949 	return 0;
1950 }
1951 
1952 static void process_lookup_list(struct dlm_rsb *r)
1953 {
1954 	struct dlm_lkb *lkb, *safe;
1955 
1956 	list_for_each_entry_safe(lkb, safe, &r->res_lookup, lkb_rsb_lookup) {
1957 		list_del_init(&lkb->lkb_rsb_lookup);
1958 		_request_lock(r, lkb);
1959 		schedule();
1960 	}
1961 }
1962 
1963 /* confirm_master -- confirm (or deny) an rsb's master nodeid */
1964 
1965 static void confirm_master(struct dlm_rsb *r, int error)
1966 {
1967 	struct dlm_lkb *lkb;
1968 
1969 	if (!r->res_first_lkid)
1970 		return;
1971 
1972 	switch (error) {
1973 	case 0:
1974 	case -EINPROGRESS:
1975 		r->res_first_lkid = 0;
1976 		process_lookup_list(r);
1977 		break;
1978 
1979 	case -EAGAIN:
1980 	case -EBADR:
1981 	case -ENOTBLK:
1982 		/* the remote request failed and won't be retried (it was
1983 		   a NOQUEUE, or has been canceled/unlocked); make a waiting
1984 		   lkb the first_lkid */
1985 
1986 		r->res_first_lkid = 0;
1987 
1988 		if (!list_empty(&r->res_lookup)) {
1989 			lkb = list_entry(r->res_lookup.next, struct dlm_lkb,
1990 					 lkb_rsb_lookup);
1991 			list_del_init(&lkb->lkb_rsb_lookup);
1992 			r->res_first_lkid = lkb->lkb_id;
1993 			_request_lock(r, lkb);
1994 		}
1995 		break;
1996 
1997 	default:
1998 		log_error(r->res_ls, "confirm_master unknown error %d", error);
1999 	}
2000 }
2001 
2002 static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
2003 			 int namelen, unsigned long timeout_cs,
2004 			 void (*ast) (void *astparam),
2005 			 void *astparam,
2006 			 void (*bast) (void *astparam, int mode),
2007 			 struct dlm_args *args)
2008 {
2009 	int rv = -EINVAL;
2010 
2011 	/* check for invalid arg usage */
2012 
2013 	if (mode < 0 || mode > DLM_LOCK_EX)
2014 		goto out;
2015 
2016 	if (!(flags & DLM_LKF_CONVERT) && (namelen > DLM_RESNAME_MAXLEN))
2017 		goto out;
2018 
2019 	if (flags & DLM_LKF_CANCEL)
2020 		goto out;
2021 
2022 	if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT))
2023 		goto out;
2024 
2025 	if (flags & DLM_LKF_CONVDEADLK && !(flags & DLM_LKF_CONVERT))
2026 		goto out;
2027 
2028 	if (flags & DLM_LKF_CONVDEADLK && flags & DLM_LKF_NOQUEUE)
2029 		goto out;
2030 
2031 	if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_CONVERT)
2032 		goto out;
2033 
2034 	if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT)
2035 		goto out;
2036 
2037 	if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE)
2038 		goto out;
2039 
2040 	if (flags & DLM_LKF_EXPEDITE && mode != DLM_LOCK_NL)
2041 		goto out;
2042 
2043 	if (!ast || !lksb)
2044 		goto out;
2045 
2046 	if (flags & DLM_LKF_VALBLK && !lksb->sb_lvbptr)
2047 		goto out;
2048 
2049 	if (flags & DLM_LKF_CONVERT && !lksb->sb_lkid)
2050 		goto out;
2051 
2052 	/* these args will be copied to the lkb in validate_lock_args;
2053 	   it cannot be done now because when converting locks, fields in
2054 	   an active lkb cannot be modified before locking the rsb */
2055 
2056 	args->flags = flags;
2057 	args->astfn = ast;
2058 	args->astparam = astparam;
2059 	args->bastfn = bast;
2060 	args->timeout = timeout_cs;
2061 	args->mode = mode;
2062 	args->lksb = lksb;
2063 	rv = 0;
2064  out:
2065 	return rv;
2066 }
2067 
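/* Two samples of what the checks above reject (illustrative, not
   exhaustive): DLM_LKF_EXPEDITE with any mode other than DLM_LOCK_NL,
   since expedited grants only make sense for NL requests, and
   DLM_LKF_QUECVT without DLM_LKF_CONVERT, since QUECVT only affects
   how a conversion is queued. */
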
2068 static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args)
2069 {
2070 	if (flags & ~(DLM_LKF_CANCEL | DLM_LKF_VALBLK | DLM_LKF_IVVALBLK |
2071 		      DLM_LKF_FORCEUNLOCK))
2072 		return -EINVAL;
2073 
2074 	if (flags & DLM_LKF_CANCEL && flags & DLM_LKF_FORCEUNLOCK)
2075 		return -EINVAL;
2076 
2077 	args->flags = flags;
2078 	args->astparam = astarg;
2079 	return 0;
2080 }
2081 
2082 static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
2083 			      struct dlm_args *args)
2084 {
2085 	int rv = -EINVAL;
2086 
2087 	if (args->flags & DLM_LKF_CONVERT) {
2088 		if (lkb->lkb_flags & DLM_IFL_MSTCPY)
2089 			goto out;
2090 
2091 		if (args->flags & DLM_LKF_QUECVT &&
2092 		    !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1])
2093 			goto out;
2094 
2095 		rv = -EBUSY;
2096 		if (lkb->lkb_status != DLM_LKSTS_GRANTED)
2097 			goto out;
2098 
2099 		if (lkb->lkb_wait_type)
2100 			goto out;
2101 
2102 		if (is_overlap(lkb))
2103 			goto out;
2104 	}
2105 
2106 	lkb->lkb_exflags = args->flags;
2107 	lkb->lkb_sbflags = 0;
2108 	lkb->lkb_astfn = args->astfn;
2109 	lkb->lkb_astparam = args->astparam;
2110 	lkb->lkb_bastfn = args->bastfn;
2111 	lkb->lkb_rqmode = args->mode;
2112 	lkb->lkb_lksb = args->lksb;
2113 	lkb->lkb_lvbptr = args->lksb->sb_lvbptr;
2114 	lkb->lkb_ownpid = (int) current->pid;
2115 	lkb->lkb_timeout_cs = args->timeout;
2116 	rv = 0;
2117  out:
2118 	if (rv)
2119 		log_debug(ls, "validate_lock_args %d %x %x %x %d %d %s",
2120 			  rv, lkb->lkb_id, lkb->lkb_flags, args->flags,
2121 			  lkb->lkb_status, lkb->lkb_wait_type,
2122 			  lkb->lkb_resource->res_name);
2123 	return rv;
2124 }
2125 
2126 /* when dlm_unlock() sees -EBUSY with CANCEL/FORCEUNLOCK it returns 0
2127    for success */
2128 
2129 /* note: it's valid for lkb_nodeid/res_nodeid to be -1 when we get here
2130    because there may be a lookup in progress and it's valid to do
2131    cancel/force-unlock on it */
2132 
2133 static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
2134 {
2135 	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
2136 	int rv = -EINVAL;
2137 
2138 	if (lkb->lkb_flags & DLM_IFL_MSTCPY) {
2139 		log_error(ls, "unlock on MSTCPY %x", lkb->lkb_id);
2140 		dlm_print_lkb(lkb);
2141 		goto out;
2142 	}
2143 
2144 	/* an lkb may still exist even though the lock is EOL'ed due to a
2145 	   cancel, unlock or failed noqueue request; an app can't use these
2146 	   locks; return same error as if the lkid had not been found at all */
2147 
2148 	if (lkb->lkb_flags & DLM_IFL_ENDOFLIFE) {
2149 		log_debug(ls, "unlock on ENDOFLIFE %x", lkb->lkb_id);
2150 		rv = -ENOENT;
2151 		goto out;
2152 	}
2153 
2154 	/* an lkb may be waiting for an rsb lookup to complete where the
2155 	   lookup was initiated by another lock */
2156 
2157 	if (!list_empty(&lkb->lkb_rsb_lookup)) {
2158 		if (args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) {
2159 			log_debug(ls, "unlock on rsb_lookup %x", lkb->lkb_id);
2160 			list_del_init(&lkb->lkb_rsb_lookup);
2161 			queue_cast(lkb->lkb_resource, lkb,
2162 				   args->flags & DLM_LKF_CANCEL ?
2163 				   -DLM_ECANCEL : -DLM_EUNLOCK);
2164 			unhold_lkb(lkb); /* undoes create_lkb() */
2165 		}
2166 		/* caller changes -EBUSY to 0 for CANCEL and FORCEUNLOCK */
2167 		rv = -EBUSY;
2168 		goto out;
2169 	}
2170 
2171 	/* cancel not allowed with another cancel/unlock in progress */
2172 
2173 	if (args->flags & DLM_LKF_CANCEL) {
2174 		if (lkb->lkb_exflags & DLM_LKF_CANCEL)
2175 			goto out;
2176 
2177 		if (is_overlap(lkb))
2178 			goto out;
2179 
2180 		/* don't let scand try to do a cancel */
2181 		del_timeout(lkb);
2182 
2183 		if (lkb->lkb_flags & DLM_IFL_RESEND) {
2184 			lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL;
2185 			rv = -EBUSY;
2186 			goto out;
2187 		}
2188 
2189 		/* there's nothing to cancel */
2190 		if (lkb->lkb_status == DLM_LKSTS_GRANTED &&
2191 		    !lkb->lkb_wait_type) {
2192 			rv = -EBUSY;
2193 			goto out;
2194 		}
2195 
2196 		switch (lkb->lkb_wait_type) {
2197 		case DLM_MSG_LOOKUP:
2198 		case DLM_MSG_REQUEST:
2199 			lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL;
2200 			rv = -EBUSY;
2201 			goto out;
2202 		case DLM_MSG_UNLOCK:
2203 		case DLM_MSG_CANCEL:
2204 			goto out;
2205 		}
2206 		/* add_to_waiters() will set OVERLAP_CANCEL */
2207 		goto out_ok;
2208 	}
2209 
2210 	/* do we need to allow a force-unlock if there's a normal unlock
2211 	   already in progress?  in what conditions could the normal unlock
2212 	   fail such that we'd want to send a force-unlock to be sure? */
2213 
2214 	if (args->flags & DLM_LKF_FORCEUNLOCK) {
2215 		if (lkb->lkb_exflags & DLM_LKF_FORCEUNLOCK)
2216 			goto out;
2217 
2218 		if (is_overlap_unlock(lkb))
2219 			goto out;
2220 
2221 		/* don't let scand try to do a cancel */
2222 		del_timeout(lkb);
2223 
2224 		if (lkb->lkb_flags & DLM_IFL_RESEND) {
2225 			lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK;
2226 			rv = -EBUSY;
2227 			goto out;
2228 		}
2229 
2230 		switch (lkb->lkb_wait_type) {
2231 		case DLM_MSG_LOOKUP:
2232 		case DLM_MSG_REQUEST:
2233 			lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK;
2234 			rv = -EBUSY;
2235 			goto out;
2236 		case DLM_MSG_UNLOCK:
2237 			goto out;
2238 		}
2239 		/* add_to_waiters() will set OVERLAP_UNLOCK */
2240 		goto out_ok;
2241 	}
2242 
2243 	/* normal unlock not allowed if there's any op in progress */
2244 	rv = -EBUSY;
2245 	if (lkb->lkb_wait_type || lkb->lkb_wait_count)
2246 		goto out;
2247 
2248  out_ok:
2249 	/* an overlapping op shouldn't blow away exflags from other op */
2250 	lkb->lkb_exflags |= args->flags;
2251 	lkb->lkb_sbflags = 0;
2252 	lkb->lkb_astparam = args->astparam;
2253 	rv = 0;
2254  out:
2255 	if (rv)
2256 		log_debug(ls, "validate_unlock_args %d %x %x %x %x %d %s", rv,
2257 			  lkb->lkb_id, lkb->lkb_flags, lkb->lkb_exflags,
2258 			  args->flags, lkb->lkb_wait_type,
2259 			  lkb->lkb_resource->res_name);
2260 	return rv;
2261 }
2262 
2263 /*
2264  * Four stage 4 varieties:
2265  * do_request(), do_convert(), do_unlock(), do_cancel()
2266  * These are called on the master node for the given lock and
2267  * from the central locking logic.
2268  */
2269 
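/* The value a do_xxxx() returns doubles as the reply status: for a
   remote operation the same value is sent back as m_result by
   send_xxxx_reply() and interpreted by receive_xxxx_reply() on the
   requesting node, e.g. -EINPROGRESS for "queued" and -EAGAIN for
   "would block with NOQUEUE set". */
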
2270 static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
2271 {
2272 	int error = 0;
2273 
2274 	if (can_be_granted(r, lkb, 1, NULL)) {
2275 		grant_lock(r, lkb);
2276 		queue_cast(r, lkb, 0);
2277 		goto out;
2278 	}
2279 
2280 	if (can_be_queued(lkb)) {
2281 		error = -EINPROGRESS;
2282 		add_lkb(r, lkb, DLM_LKSTS_WAITING);
2283 		send_blocking_asts(r, lkb);
2284 		add_timeout(lkb);
2285 		goto out;
2286 	}
2287 
2288 	error = -EAGAIN;
2289 	if (force_blocking_asts(lkb))
2290 		send_blocking_asts_all(r, lkb);
2291 	queue_cast(r, lkb, -EAGAIN);
2292 
2293  out:
2294 	return error;
2295 }
2296 
2297 static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
2298 {
2299 	int error = 0;
2300 	int deadlk = 0;
2301 
2302 	/* changing an existing lock may allow others to be granted */
2303 
2304 	if (can_be_granted(r, lkb, 1, &deadlk)) {
2305 		grant_lock(r, lkb);
2306 		queue_cast(r, lkb, 0);
2307 		grant_pending_locks(r);
2308 		goto out;
2309 	}
2310 
2311 	/* can_be_granted() detected that this lock would block in a conversion
2312 	   deadlock, so we leave it on the granted queue and return EDEADLK in
2313 	   the ast for the convert. */
2314 
2315 	if (deadlk) {
2316 		/* it's left on the granted queue */
2317 		log_debug(r->res_ls, "deadlock %x node %d sts%d g%d r%d %s",
2318 			  lkb->lkb_id, lkb->lkb_nodeid, lkb->lkb_status,
2319 			  lkb->lkb_grmode, lkb->lkb_rqmode, r->res_name);
2320 		revert_lock(r, lkb);
2321 		queue_cast(r, lkb, -EDEADLK);
2322 		error = -EDEADLK;
2323 		goto out;
2324 	}
2325 
2326 	/* is_demoted() means the can_be_granted() above set the grmode
2327 	   to NL, and left us on the granted queue.  This auto-demotion
2328 	   (due to CONVDEADLK) might mean other locks, and/or this lock, are
2329 	   now grantable.  We have to try to grant other converting locks
2330 	   before we try again to grant this one. */
2331 
2332 	if (is_demoted(lkb)) {
2333 		grant_pending_convert(r, DLM_LOCK_IV, NULL);
2334 		if (_can_be_granted(r, lkb, 1)) {
2335 			grant_lock(r, lkb);
2336 			queue_cast(r, lkb, 0);
2337 			grant_pending_locks(r);
2338 			goto out;
2339 		}
2340 		/* else fall through and move to convert queue */
2341 	}
2342 
2343 	if (can_be_queued(lkb)) {
2344 		error = -EINPROGRESS;
2345 		del_lkb(r, lkb);
2346 		add_lkb(r, lkb, DLM_LKSTS_CONVERT);
2347 		send_blocking_asts(r, lkb);
2348 		add_timeout(lkb);
2349 		goto out;
2350 	}
2351 
2352 	error = -EAGAIN;
2353 	if (force_blocking_asts(lkb))
2354 		send_blocking_asts_all(r, lkb);
2355 	queue_cast(r, lkb, -EAGAIN);
2356 
2357  out:
2358 	return error;
2359 }
2360 
2361 static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2362 {
2363 	remove_lock(r, lkb);
2364 	queue_cast(r, lkb, -DLM_EUNLOCK);
2365 	grant_pending_locks(r);
2366 	return -DLM_EUNLOCK;
2367 }
2368 
2369 /* returns: 0 did nothing, -DLM_ECANCEL canceled lock */
2370 
2371 static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
2372 {
2373 	int error;
2374 
2375 	error = revert_lock(r, lkb);
2376 	if (error) {
2377 		queue_cast(r, lkb, -DLM_ECANCEL);
2378 		grant_pending_locks(r);
2379 		return -DLM_ECANCEL;
2380 	}
2381 	return 0;
2382 }
2383 
2384 /*
2385  * Four stage 3 varieties:
2386  * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock()
2387  */
2388 
2389 /* add a new lkb to a possibly new rsb, called by requesting process */
2390 
2391 static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2392 {
2393 	int error;
2394 
2395 	/* set_master: sets lkb nodeid from r */
2396 
2397 	error = set_master(r, lkb);
2398 	if (error < 0)
2399 		goto out;
2400 	if (error) {
2401 		error = 0;
2402 		goto out;
2403 	}
2404 
2405 	if (is_remote(r))
2406 		/* receive_request() calls do_request() on remote node */
2407 		error = send_request(r, lkb);
2408 	else
2409 		error = do_request(r, lkb);
2410  out:
2411 	return error;
2412 }
2413 
2414 /* change some property of an existing lkb, e.g. mode */
2415 
2416 static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2417 {
2418 	int error;
2419 
2420 	if (is_remote(r))
2421 		/* receive_convert() calls do_convert() on remote node */
2422 		error = send_convert(r, lkb);
2423 	else
2424 		error = do_convert(r, lkb);
2425 
2426 	return error;
2427 }
2428 
2429 /* remove an existing lkb from the granted queue */
2430 
2431 static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2432 {
2433 	int error;
2434 
2435 	if (is_remote(r))
2436 		/* receive_unlock() calls do_unlock() on remote node */
2437 		error = send_unlock(r, lkb);
2438 	else
2439 		error = do_unlock(r, lkb);
2440 
2441 	return error;
2442 }
2443 
2444 /* remove an existing lkb from the convert or wait queue */
2445 
2446 static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2447 {
2448 	int error;
2449 
2450 	if (is_remote(r))
2451 		/* receive_cancel() calls do_cancel() on remote node */
2452 		error = send_cancel(r, lkb);
2453 	else
2454 		error = do_cancel(r, lkb);
2455 
2456 	return error;
2457 }
2458 
2459 /*
2460  * Four stage 2 varieties:
2461  * request_lock(), convert_lock(), unlock_lock(), cancel_lock()
2462  */
2463 
2464 static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, char *name,
2465 			int len, struct dlm_args *args)
2466 {
2467 	struct dlm_rsb *r;
2468 	int error;
2469 
2470 	error = validate_lock_args(ls, lkb, args);
2471 	if (error)
2472 		goto out;
2473 
2474 	error = find_rsb(ls, name, len, R_CREATE, &r);
2475 	if (error)
2476 		goto out;
2477 
2478 	lock_rsb(r);
2479 
2480 	attach_lkb(r, lkb);
2481 	lkb->lkb_lksb->sb_lkid = lkb->lkb_id;
2482 
2483 	error = _request_lock(r, lkb);
2484 
2485 	unlock_rsb(r);
2486 	put_rsb(r);
2487 
2488  out:
2489 	return error;
2490 }
2491 
2492 static int convert_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
2493 			struct dlm_args *args)
2494 {
2495 	struct dlm_rsb *r;
2496 	int error;
2497 
2498 	r = lkb->lkb_resource;
2499 
2500 	hold_rsb(r);
2501 	lock_rsb(r);
2502 
2503 	error = validate_lock_args(ls, lkb, args);
2504 	if (error)
2505 		goto out;
2506 
2507 	error = _convert_lock(r, lkb);
2508  out:
2509 	unlock_rsb(r);
2510 	put_rsb(r);
2511 	return error;
2512 }
2513 
2514 static int unlock_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
2515 		       struct dlm_args *args)
2516 {
2517 	struct dlm_rsb *r;
2518 	int error;
2519 
2520 	r = lkb->lkb_resource;
2521 
2522 	hold_rsb(r);
2523 	lock_rsb(r);
2524 
2525 	error = validate_unlock_args(lkb, args);
2526 	if (error)
2527 		goto out;
2528 
2529 	error = _unlock_lock(r, lkb);
2530  out:
2531 	unlock_rsb(r);
2532 	put_rsb(r);
2533 	return error;
2534 }
2535 
2536 static int cancel_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
2537 		       struct dlm_args *args)
2538 {
2539 	struct dlm_rsb *r;
2540 	int error;
2541 
2542 	r = lkb->lkb_resource;
2543 
2544 	hold_rsb(r);
2545 	lock_rsb(r);
2546 
2547 	error = validate_unlock_args(lkb, args);
2548 	if (error)
2549 		goto out;
2550 
2551 	error = _cancel_lock(r, lkb);
2552  out:
2553 	unlock_rsb(r);
2554 	put_rsb(r);
2555 	return error;
2556 }
2557 
2558 /*
2559  * Two stage 1 varieties:  dlm_lock() and dlm_unlock()
2560  */
2561 
2562 int dlm_lock(dlm_lockspace_t *lockspace,
2563 	     int mode,
2564 	     struct dlm_lksb *lksb,
2565 	     uint32_t flags,
2566 	     void *name,
2567 	     unsigned int namelen,
2568 	     uint32_t parent_lkid,
2569 	     void (*ast) (void *astarg),
2570 	     void *astarg,
2571 	     void (*bast) (void *astarg, int mode))
2572 {
2573 	struct dlm_ls *ls;
2574 	struct dlm_lkb *lkb;
2575 	struct dlm_args args;
2576 	int error, convert = flags & DLM_LKF_CONVERT;
2577 
2578 	ls = dlm_find_lockspace_local(lockspace);
2579 	if (!ls)
2580 		return -EINVAL;
2581 
2582 	dlm_lock_recovery(ls);
2583 
2584 	if (convert)
2585 		error = find_lkb(ls, lksb->sb_lkid, &lkb);
2586 	else
2587 		error = create_lkb(ls, &lkb);
2588 
2589 	if (error)
2590 		goto out;
2591 
2592 	error = set_lock_args(mode, lksb, flags, namelen, 0, ast,
2593 			      astarg, bast, &args);
2594 	if (error)
2595 		goto out_put;
2596 
2597 	if (convert)
2598 		error = convert_lock(ls, lkb, &args);
2599 	else
2600 		error = request_lock(ls, lkb, name, namelen, &args);
2601 
2602 	if (error == -EINPROGRESS)
2603 		error = 0;
2604  out_put:
2605 	if (convert || error)
2606 		__put_lkb(ls, lkb);
2607 	if (error == -EAGAIN || error == -EDEADLK)
2608 		error = 0;
2609  out:
2610 	dlm_unlock_recovery(ls);
2611 	dlm_put_lockspace(ls);
2612 	return error;
2613 }
2614 
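/* A minimal caller sketch (hypothetical code, not from this file;
   my_ast/my_bast are caller-supplied callbacks and error handling is
   omitted):

	struct dlm_lksb lksb;
	int error;

	error = dlm_lock(ls, DLM_LOCK_PR, &lksb, 0, "myres", 5, 0,
			 my_ast, &lksb, my_bast);

   Completion arrives asynchronously through my_ast(); lksb.sb_status
   then holds the result and lksb.sb_lkid the lock id.  A conversion
   reuses that id through the same lksb:

	error = dlm_lock(ls, DLM_LOCK_EX, &lksb, DLM_LKF_CONVERT, NULL,
			 0, 0, my_ast, &lksb, my_bast);
*/
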
2615 int dlm_unlock(dlm_lockspace_t *lockspace,
2616 	       uint32_t lkid,
2617 	       uint32_t flags,
2618 	       struct dlm_lksb *lksb,
2619 	       void *astarg)
2620 {
2621 	struct dlm_ls *ls;
2622 	struct dlm_lkb *lkb;
2623 	struct dlm_args args;
2624 	int error;
2625 
2626 	ls = dlm_find_lockspace_local(lockspace);
2627 	if (!ls)
2628 		return -EINVAL;
2629 
2630 	dlm_lock_recovery(ls);
2631 
2632 	error = find_lkb(ls, lkid, &lkb);
2633 	if (error)
2634 		goto out;
2635 
2636 	error = set_unlock_args(flags, astarg, &args);
2637 	if (error)
2638 		goto out_put;
2639 
2640 	if (flags & DLM_LKF_CANCEL)
2641 		error = cancel_lock(ls, lkb, &args);
2642 	else
2643 		error = unlock_lock(ls, lkb, &args);
2644 
2645 	if (error == -DLM_EUNLOCK || error == -DLM_ECANCEL)
2646 		error = 0;
2647 	if (error == -EBUSY && (flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)))
2648 		error = 0;
2649  out_put:
2650 	dlm_put_lkb(lkb);
2651  out:
2652 	dlm_unlock_recovery(ls);
2653 	dlm_put_lockspace(ls);
2654 	return error;
2655 }
2656 
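/* Continuing the sketch above (hypothetical caller code):

	error = dlm_unlock(ls, lksb.sb_lkid, 0, &lksb, &lksb);

   completes through the lock's ast with lksb.sb_status == -DLM_EUNLOCK,
   while

	error = dlm_unlock(ls, lksb.sb_lkid, DLM_LKF_CANCEL, &lksb, &lksb);

   tries to cancel a request still in progress and completes with
   -DLM_ECANCEL if the cancel won the race. */
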
2657 /*
2658  * send/receive routines for remote operations and replies
2659  *
2660  * send_args
2661  * send_common
2662  * send_request			receive_request
2663  * send_convert			receive_convert
2664  * send_unlock			receive_unlock
2665  * send_cancel			receive_cancel
2666  * send_grant			receive_grant
2667  * send_bast			receive_bast
2668  * send_lookup			receive_lookup
2669  * send_remove			receive_remove
2670  *
2671  * 				send_common_reply
2672  * receive_request_reply	send_request_reply
2673  * receive_convert_reply	send_convert_reply
2674  * receive_unlock_reply		send_unlock_reply
2675  * receive_cancel_reply		send_cancel_reply
2676  * receive_lookup_reply		send_lookup_reply
2677  */
2678 
2679 static int _create_message(struct dlm_ls *ls, int mb_len,
2680 			   int to_nodeid, int mstype,
2681 			   struct dlm_message **ms_ret,
2682 			   struct dlm_mhandle **mh_ret)
2683 {
2684 	struct dlm_message *ms;
2685 	struct dlm_mhandle *mh;
2686 	char *mb;
2687 
2688 	/* get_buffer gives us a message handle (mh) that we need to
2689 	   pass into lowcomms_commit and a message buffer (mb) that we
2690 	   write our data into */
2691 
2692 	mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb);
2693 	if (!mh)
2694 		return -ENOBUFS;
2695 
2696 	memset(mb, 0, mb_len);
2697 
2698 	ms = (struct dlm_message *) mb;
2699 
2700 	ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
2701 	ms->m_header.h_lockspace = ls->ls_global_id;
2702 	ms->m_header.h_nodeid = dlm_our_nodeid();
2703 	ms->m_header.h_length = mb_len;
2704 	ms->m_header.h_cmd = DLM_MSG;
2705 
2706 	ms->m_type = mstype;
2707 
2708 	*mh_ret = mh;
2709 	*ms_ret = ms;
2710 	return 0;
2711 }
2712 
2713 static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
2714 			  int to_nodeid, int mstype,
2715 			  struct dlm_message **ms_ret,
2716 			  struct dlm_mhandle **mh_ret)
2717 {
2718 	int mb_len = sizeof(struct dlm_message);
2719 
2720 	switch (mstype) {
2721 	case DLM_MSG_REQUEST:
2722 	case DLM_MSG_LOOKUP:
2723 	case DLM_MSG_REMOVE:
2724 		mb_len += r->res_length;
2725 		break;
2726 	case DLM_MSG_CONVERT:
2727 	case DLM_MSG_UNLOCK:
2728 	case DLM_MSG_REQUEST_REPLY:
2729 	case DLM_MSG_CONVERT_REPLY:
2730 	case DLM_MSG_GRANT:
2731 		if (lkb && lkb->lkb_lvbptr)
2732 			mb_len += r->res_ls->ls_lvblen;
2733 		break;
2734 	}
2735 
2736 	return _create_message(r->res_ls, mb_len, to_nodeid, mstype,
2737 			       ms_ret, mh_ret);
2738 }
2739 
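/* Sizing example for the switch above: a DLM_MSG_REQUEST for a 10-byte
   resource name is allocated as sizeof(struct dlm_message) + 10;
   send_args() copies the name into the m_extra tail and the receiver
   recovers the 10 with receive_extralen(). */
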
2740 /* further lowcomms enhancements or alternate implementations may make
2741    the return value from this function useful at some point */
2742 
2743 static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms)
2744 {
2745 	dlm_message_out(ms);
2746 	dlm_lowcomms_commit_buffer(mh);
2747 	return 0;
2748 }
2749 
2750 static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb,
2751 		      struct dlm_message *ms)
2752 {
2753 	ms->m_nodeid   = lkb->lkb_nodeid;
2754 	ms->m_pid      = lkb->lkb_ownpid;
2755 	ms->m_lkid     = lkb->lkb_id;
2756 	ms->m_remid    = lkb->lkb_remid;
2757 	ms->m_exflags  = lkb->lkb_exflags;
2758 	ms->m_sbflags  = lkb->lkb_sbflags;
2759 	ms->m_flags    = lkb->lkb_flags;
2760 	ms->m_lvbseq   = lkb->lkb_lvbseq;
2761 	ms->m_status   = lkb->lkb_status;
2762 	ms->m_grmode   = lkb->lkb_grmode;
2763 	ms->m_rqmode   = lkb->lkb_rqmode;
2764 	ms->m_hash     = r->res_hash;
2765 
2766 	/* m_result and m_bastmode are set from function args,
2767 	   not from lkb fields */
2768 
2769 	if (lkb->lkb_bastfn)
2770 		ms->m_asts |= AST_BAST;
2771 	if (lkb->lkb_astfn)
2772 		ms->m_asts |= AST_COMP;
2773 
2774 	/* compare with switch in create_message; send_remove() doesn't
2775 	   use send_args() */
2776 
2777 	switch (ms->m_type) {
2778 	case DLM_MSG_REQUEST:
2779 	case DLM_MSG_LOOKUP:
2780 		memcpy(ms->m_extra, r->res_name, r->res_length);
2781 		break;
2782 	case DLM_MSG_CONVERT:
2783 	case DLM_MSG_UNLOCK:
2784 	case DLM_MSG_REQUEST_REPLY:
2785 	case DLM_MSG_CONVERT_REPLY:
2786 	case DLM_MSG_GRANT:
2787 		if (!lkb->lkb_lvbptr)
2788 			break;
2789 		memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
2790 		break;
2791 	}
2792 }
2793 
2794 static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
2795 {
2796 	struct dlm_message *ms;
2797 	struct dlm_mhandle *mh;
2798 	int to_nodeid, error;
2799 
2800 	error = add_to_waiters(lkb, mstype);
2801 	if (error)
2802 		return error;
2803 
2804 	to_nodeid = r->res_nodeid;
2805 
2806 	error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
2807 	if (error)
2808 		goto fail;
2809 
2810 	send_args(r, lkb, ms);
2811 
2812 	error = send_message(mh, ms);
2813 	if (error)
2814 		goto fail;
2815 	return 0;
2816 
2817  fail:
2818 	remove_from_waiters(lkb, msg_reply_type(mstype));
2819 	return error;
2820 }
2821 
2822 static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
2823 {
2824 	return send_common(r, lkb, DLM_MSG_REQUEST);
2825 }
2826 
2827 static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
2828 {
2829 	int error;
2830 
2831 	error = send_common(r, lkb, DLM_MSG_CONVERT);
2832 
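	/* A note on why skipping the reply is safe (our reading, not a
	   quote from the author): a conversion to a lower mode is always
	   immediately grantable on the master, so the reply's content is
	   known in advance and is fabricated locally via ls_stub_ms. */
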
2833 	/* down conversions go without a reply from the master */
2834 	if (!error && down_conversion(lkb)) {
2835 		remove_from_waiters(lkb, DLM_MSG_CONVERT_REPLY);
2836 		r->res_ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY;
2837 		r->res_ls->ls_stub_ms.m_result = 0;
2838 		r->res_ls->ls_stub_ms.m_flags = lkb->lkb_flags;
2839 		__receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms);
2840 	}
2841 
2842 	return error;
2843 }
2844 
2845 /* FIXME: if this lkb is the only lock we hold on the rsb, then set
2846    MASTER_UNCERTAIN to force the next request on the rsb to confirm
2847    that the master is still correct. */
2848 
2849 static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2850 {
2851 	return send_common(r, lkb, DLM_MSG_UNLOCK);
2852 }
2853 
2854 static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
2855 {
2856 	return send_common(r, lkb, DLM_MSG_CANCEL);
2857 }
2858 
2859 static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb)
2860 {
2861 	struct dlm_message *ms;
2862 	struct dlm_mhandle *mh;
2863 	int to_nodeid, error;
2864 
2865 	to_nodeid = lkb->lkb_nodeid;
2866 
2867 	error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh);
2868 	if (error)
2869 		goto out;
2870 
2871 	send_args(r, lkb, ms);
2872 
2873 	ms->m_result = 0;
2874 
2875 	error = send_message(mh, ms);
2876  out:
2877 	return error;
2878 }
2879 
2880 static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode)
2881 {
2882 	struct dlm_message *ms;
2883 	struct dlm_mhandle *mh;
2884 	int to_nodeid, error;
2885 
2886 	to_nodeid = lkb->lkb_nodeid;
2887 
2888 	error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh);
2889 	if (error)
2890 		goto out;
2891 
2892 	send_args(r, lkb, ms);
2893 
2894 	ms->m_bastmode = mode;
2895 
2896 	error = send_message(mh, ms);
2897  out:
2898 	return error;
2899 }
2900 
2901 static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
2902 {
2903 	struct dlm_message *ms;
2904 	struct dlm_mhandle *mh;
2905 	int to_nodeid, error;
2906 
2907 	error = add_to_waiters(lkb, DLM_MSG_LOOKUP);
2908 	if (error)
2909 		return error;
2910 
2911 	to_nodeid = dlm_dir_nodeid(r);
2912 
2913 	error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh);
2914 	if (error)
2915 		goto fail;
2916 
2917 	send_args(r, lkb, ms);
2918 
2919 	error = send_message(mh, ms);
2920 	if (error)
2921 		goto fail;
2922 	return 0;
2923 
2924  fail:
2925 	remove_from_waiters(lkb, DLM_MSG_LOOKUP_REPLY);
2926 	return error;
2927 }
2928 
2929 static int send_remove(struct dlm_rsb *r)
2930 {
2931 	struct dlm_message *ms;
2932 	struct dlm_mhandle *mh;
2933 	int to_nodeid, error;
2934 
2935 	to_nodeid = dlm_dir_nodeid(r);
2936 
2937 	error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh);
2938 	if (error)
2939 		goto out;
2940 
2941 	memcpy(ms->m_extra, r->res_name, r->res_length);
2942 	ms->m_hash = r->res_hash;
2943 
2944 	error = send_message(mh, ms);
2945  out:
2946 	return error;
2947 }
2948 
2949 static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
2950 			     int mstype, int rv)
2951 {
2952 	struct dlm_message *ms;
2953 	struct dlm_mhandle *mh;
2954 	int to_nodeid, error;
2955 
2956 	to_nodeid = lkb->lkb_nodeid;
2957 
2958 	error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
2959 	if (error)
2960 		goto out;
2961 
2962 	send_args(r, lkb, ms);
2963 
2964 	ms->m_result = rv;
2965 
2966 	error = send_message(mh, ms);
2967  out:
2968 	return error;
2969 }
2970 
2971 static int send_request_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2972 {
2973 	return send_common_reply(r, lkb, DLM_MSG_REQUEST_REPLY, rv);
2974 }
2975 
2976 static int send_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2977 {
2978 	return send_common_reply(r, lkb, DLM_MSG_CONVERT_REPLY, rv);
2979 }
2980 
2981 static int send_unlock_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2982 {
2983 	return send_common_reply(r, lkb, DLM_MSG_UNLOCK_REPLY, rv);
2984 }
2985 
2986 static int send_cancel_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2987 {
2988 	return send_common_reply(r, lkb, DLM_MSG_CANCEL_REPLY, rv);
2989 }
2990 
2991 static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in,
2992 			     int ret_nodeid, int rv)
2993 {
2994 	struct dlm_rsb *r = &ls->ls_stub_rsb;
2995 	struct dlm_message *ms;
2996 	struct dlm_mhandle *mh;
2997 	int error, nodeid = ms_in->m_header.h_nodeid;
2998 
2999 	error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh);
3000 	if (error)
3001 		goto out;
3002 
3003 	ms->m_lkid = ms_in->m_lkid;
3004 	ms->m_result = rv;
3005 	ms->m_nodeid = ret_nodeid;
3006 
3007 	error = send_message(mh, ms);
3008  out:
3009 	return error;
3010 }
3011 
3012 /* which args we save from a received message depends heavily on the type
3013    of message, unlike the send side where we can safely send everything about
3014    the lkb for any type of message */
3015 
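/* Only the low 16 bits of lkb_flags travel in messages; the high 16
   bits (DLM_IFL_MSTCPY, DLM_IFL_RESEND, ...) are node-local state that
   an incoming message must not clobber, hence the masking below. */
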
3016 static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
3017 {
3018 	lkb->lkb_exflags = ms->m_exflags;
3019 	lkb->lkb_sbflags = ms->m_sbflags;
3020 	lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
3021 		         (ms->m_flags & 0x0000FFFF);
3022 }
3023 
3024 static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
3025 {
3026 	lkb->lkb_sbflags = ms->m_sbflags;
3027 	lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
3028 		         (ms->m_flags & 0x0000FFFF);
3029 }
3030 
3031 static int receive_extralen(struct dlm_message *ms)
3032 {
3033 	return (ms->m_header.h_length - sizeof(struct dlm_message));
3034 }
3035 
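/* i.e. the number of bytes send_args() appended to m_extra: a resource
   name for REQUEST/LOOKUP, an lvb for CONVERT/UNLOCK/GRANT and the
   reply types (compare the switch in create_message). */
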
3036 static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
3037 		       struct dlm_message *ms)
3038 {
3039 	int len;
3040 
3041 	if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
3042 		if (!lkb->lkb_lvbptr)
3043 			lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
3044 		if (!lkb->lkb_lvbptr)
3045 			return -ENOMEM;
3046 		len = receive_extralen(ms);
3047 		if (len > DLM_RESNAME_MAXLEN)
3048 			len = DLM_RESNAME_MAXLEN;
3049 		memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
3050 	}
3051 	return 0;
3052 }
3053 
3054 static void fake_bastfn(void *astparam, int mode)
3055 {
3056 	log_print("fake_bastfn should not be called");
3057 }
3058 
3059 static void fake_astfn(void *astparam)
3060 {
3061 	log_print("fake_astfn should not be called");
3062 }
3063 
3064 static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
3065 				struct dlm_message *ms)
3066 {
3067 	lkb->lkb_nodeid = ms->m_header.h_nodeid;
3068 	lkb->lkb_ownpid = ms->m_pid;
3069 	lkb->lkb_remid = ms->m_lkid;
3070 	lkb->lkb_grmode = DLM_LOCK_IV;
3071 	lkb->lkb_rqmode = ms->m_rqmode;
3072 
3073 	lkb->lkb_bastfn = (ms->m_asts & AST_BAST) ? &fake_bastfn : NULL;
3074 	lkb->lkb_astfn = (ms->m_asts & AST_COMP) ? &fake_astfn : NULL;
3075 
3076 	if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
3077 		/* lkb was just created so there won't be an lvb yet */
3078 		lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
3079 		if (!lkb->lkb_lvbptr)
3080 			return -ENOMEM;
3081 	}
3082 
3083 	return 0;
3084 }
3085 
3086 static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
3087 				struct dlm_message *ms)
3088 {
3089 	if (lkb->lkb_status != DLM_LKSTS_GRANTED)
3090 		return -EBUSY;
3091 
3092 	if (receive_lvb(ls, lkb, ms))
3093 		return -ENOMEM;
3094 
3095 	lkb->lkb_rqmode = ms->m_rqmode;
3096 	lkb->lkb_lvbseq = ms->m_lvbseq;
3097 
3098 	return 0;
3099 }
3100 
3101 static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
3102 			       struct dlm_message *ms)
3103 {
3104 	if (receive_lvb(ls, lkb, ms))
3105 		return -ENOMEM;
3106 	return 0;
3107 }
3108 
3109 /* We fill in the stub-lkb fields with the info that send_xxxx_reply()
3110    uses to send a reply and that the remote end uses to process the reply. */
3111 
3112 static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms)
3113 {
3114 	struct dlm_lkb *lkb = &ls->ls_stub_lkb;
3115 	lkb->lkb_nodeid = ms->m_header.h_nodeid;
3116 	lkb->lkb_remid = ms->m_lkid;
3117 }
3118 
3119 /* This is called after the rsb is locked so that we can safely inspect
3120    fields in the lkb. */
3121 
3122 static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms)
3123 {
3124 	int from = ms->m_header.h_nodeid;
3125 	int error = 0;
3126 
3127 	switch (ms->m_type) {
3128 	case DLM_MSG_CONVERT:
3129 	case DLM_MSG_UNLOCK:
3130 	case DLM_MSG_CANCEL:
3131 		if (!is_master_copy(lkb) || lkb->lkb_nodeid != from)
3132 			error = -EINVAL;
3133 		break;
3134 
3135 	case DLM_MSG_CONVERT_REPLY:
3136 	case DLM_MSG_UNLOCK_REPLY:
3137 	case DLM_MSG_CANCEL_REPLY:
3138 	case DLM_MSG_GRANT:
3139 	case DLM_MSG_BAST:
3140 		if (!is_process_copy(lkb) || lkb->lkb_nodeid != from)
3141 			error = -EINVAL;
3142 		break;
3143 
3144 	case DLM_MSG_REQUEST_REPLY:
3145 		if (!is_process_copy(lkb))
3146 			error = -EINVAL;
3147 		else if (lkb->lkb_nodeid != -1 && lkb->lkb_nodeid != from)
3148 			error = -EINVAL;
3149 		break;
3150 
3151 	default:
3152 		error = -EINVAL;
3153 	}
3154 
3155 	if (error)
3156 		log_error(lkb->lkb_resource->res_ls,
3157 			  "ignore invalid message %d from %d %x %x %x %d",
3158 			  ms->m_type, from, lkb->lkb_id, lkb->lkb_remid,
3159 			  lkb->lkb_flags, lkb->lkb_nodeid);
3160 	return error;
3161 }
3162 
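/* Our reading of the checks above (not original commentary): a
   CONVERT/UNLOCK/CANCEL must arrive at the master copy of the lkb from
   the node holding the process copy, while replies, GRANT and BAST
   must arrive at the process copy from the master; anything else is a
   stale or misrouted message (typically recovery fallout) and is
   dropped. */
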
3163 static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
3164 {
3165 	struct dlm_lkb *lkb;
3166 	struct dlm_rsb *r;
3167 	int error, namelen;
3168 
3169 	error = create_lkb(ls, &lkb);
3170 	if (error)
3171 		goto fail;
3172 
3173 	receive_flags(lkb, ms);
3174 	lkb->lkb_flags |= DLM_IFL_MSTCPY;
3175 	error = receive_request_args(ls, lkb, ms);
3176 	if (error) {
3177 		__put_lkb(ls, lkb);
3178 		goto fail;
3179 	}
3180 
3181 	namelen = receive_extralen(ms);
3182 
3183 	error = find_rsb(ls, ms->m_extra, namelen, R_MASTER, &r);
3184 	if (error) {
3185 		__put_lkb(ls, lkb);
3186 		goto fail;
3187 	}
3188 
3189 	lock_rsb(r);
3190 
3191 	attach_lkb(r, lkb);
3192 	error = do_request(r, lkb);
3193 	send_request_reply(r, lkb, error);
3194 
3195 	unlock_rsb(r);
3196 	put_rsb(r);
3197 
3198 	if (error == -EINPROGRESS)
3199 		error = 0;
3200 	if (error)
3201 		dlm_put_lkb(lkb);
3202 	return;
3203 
3204  fail:
3205 	setup_stub_lkb(ls, ms);
3206 	send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
3207 }
3208 
3209 static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
3210 {
3211 	struct dlm_lkb *lkb;
3212 	struct dlm_rsb *r;
3213 	int error, reply = 1;
3214 
3215 	error = find_lkb(ls, ms->m_remid, &lkb);
3216 	if (error)
3217 		goto fail;
3218 
3219 	r = lkb->lkb_resource;
3220 
3221 	hold_rsb(r);
3222 	lock_rsb(r);
3223 
3224 	error = validate_message(lkb, ms);
3225 	if (error)
3226 		goto out;
3227 
3228 	receive_flags(lkb, ms);
3229 	error = receive_convert_args(ls, lkb, ms);
3230 	if (error)
3231 		goto out_reply;
3232 	reply = !down_conversion(lkb);
3233 
3234 	error = do_convert(r, lkb);
3235  out_reply:
3236 	if (reply)
3237 		send_convert_reply(r, lkb, error);
3238  out:
3239 	unlock_rsb(r);
3240 	put_rsb(r);
3241 	dlm_put_lkb(lkb);
3242 	return;
3243 
3244  fail:
3245 	setup_stub_lkb(ls, ms);
3246 	send_convert_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
3247 }
3248 
3249 static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
3250 {
3251 	struct dlm_lkb *lkb;
3252 	struct dlm_rsb *r;
3253 	int error;
3254 
3255 	error = find_lkb(ls, ms->m_remid, &lkb);
3256 	if (error)
3257 		goto fail;
3258 
3259 	r = lkb->lkb_resource;
3260 
3261 	hold_rsb(r);
3262 	lock_rsb(r);
3263 
3264 	error = validate_message(lkb, ms);
3265 	if (error)
3266 		goto out;
3267 
3268 	receive_flags(lkb, ms);
3269 	error = receive_unlock_args(ls, lkb, ms);
3270 	if (error)
3271 		goto out_reply;
3272 
3273 	error = do_unlock(r, lkb);
3274  out_reply:
3275 	send_unlock_reply(r, lkb, error);
3276  out:
3277 	unlock_rsb(r);
3278 	put_rsb(r);
3279 	dlm_put_lkb(lkb);
3280 	return;
3281 
3282  fail:
3283 	setup_stub_lkb(ls, ms);
3284 	send_unlock_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
3285 }
3286 
3287 static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
3288 {
3289 	struct dlm_lkb *lkb;
3290 	struct dlm_rsb *r;
3291 	int error;
3292 
3293 	error = find_lkb(ls, ms->m_remid, &lkb);
3294 	if (error)
3295 		goto fail;
3296 
3297 	receive_flags(lkb, ms);
3298 
3299 	r = lkb->lkb_resource;
3300 
3301 	hold_rsb(r);
3302 	lock_rsb(r);
3303 
3304 	error = validate_message(lkb, ms);
3305 	if (error)
3306 		goto out;
3307 
3308 	error = do_cancel(r, lkb);
3309 	send_cancel_reply(r, lkb, error);
3310  out:
3311 	unlock_rsb(r);
3312 	put_rsb(r);
3313 	dlm_put_lkb(lkb);
3314 	return;
3315 
3316  fail:
3317 	setup_stub_lkb(ls, ms);
3318 	send_cancel_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
3319 }
3320 
3321 static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
3322 {
3323 	struct dlm_lkb *lkb;
3324 	struct dlm_rsb *r;
3325 	int error;
3326 
3327 	error = find_lkb(ls, ms->m_remid, &lkb);
3328 	if (error) {
3329 		log_debug(ls, "receive_grant from %d no lkb %x",
3330 			  ms->m_header.h_nodeid, ms->m_remid);
3331 		return;
3332 	}
3333 
3334 	r = lkb->lkb_resource;
3335 
3336 	hold_rsb(r);
3337 	lock_rsb(r);
3338 
3339 	error = validate_message(lkb, ms);
3340 	if (error)
3341 		goto out;
3342 
3343 	receive_flags_reply(lkb, ms);
3344 	if (is_altmode(lkb))
3345 		munge_altmode(lkb, ms);
3346 	grant_lock_pc(r, lkb, ms);
3347 	queue_cast(r, lkb, 0);
3348  out:
3349 	unlock_rsb(r);
3350 	put_rsb(r);
3351 	dlm_put_lkb(lkb);
3352 }
3353 
3354 static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
3355 {
3356 	struct dlm_lkb *lkb;
3357 	struct dlm_rsb *r;
3358 	int error;
3359 
3360 	error = find_lkb(ls, ms->m_remid, &lkb);
3361 	if (error) {
3362 		log_debug(ls, "receive_bast from %d no lkb %x",
3363 			  ms->m_header.h_nodeid, ms->m_remid);
3364 		return;
3365 	}
3366 
3367 	r = lkb->lkb_resource;
3368 
3369 	hold_rsb(r);
3370 	lock_rsb(r);
3371 
3372 	error = validate_message(lkb, ms);
3373 	if (error)
3374 		goto out;
3375 
3376 	queue_bast(r, lkb, ms->m_bastmode);
3377  out:
3378 	unlock_rsb(r);
3379 	put_rsb(r);
3380 	dlm_put_lkb(lkb);
3381 }
3382 
3383 static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms)
3384 {
3385 	int len, error, ret_nodeid, dir_nodeid, from_nodeid, our_nodeid;
3386 
3387 	from_nodeid = ms->m_header.h_nodeid;
3388 	our_nodeid = dlm_our_nodeid();
3389 
3390 	len = receive_extralen(ms);
3391 
3392 	dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
3393 	if (dir_nodeid != our_nodeid) {
3394 		log_error(ls, "lookup dir_nodeid %d from %d",
3395 			  dir_nodeid, from_nodeid);
3396 		error = -EINVAL;
3397 		ret_nodeid = -1;
3398 		goto out;
3399 	}
3400 
3401 	error = dlm_dir_lookup(ls, from_nodeid, ms->m_extra, len, &ret_nodeid);
3402 
3403 	/* Optimization: we're the master, so treat the lookup as a request */
3404 	if (!error && ret_nodeid == our_nodeid) {
3405 		receive_request(ls, ms);
3406 		return;
3407 	}
3408  out:
3409 	send_lookup_reply(ls, ms, ret_nodeid, error);
3410 }
3411 
3412 static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms)
3413 {
3414 	int len, dir_nodeid, from_nodeid;
3415 
3416 	from_nodeid = ms->m_header.h_nodeid;
3417 
3418 	len = receive_extralen(ms);
3419 
3420 	dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
3421 	if (dir_nodeid != dlm_our_nodeid()) {
3422 		log_error(ls, "remove dir entry dir_nodeid %d from %d",
3423 			  dir_nodeid, from_nodeid);
3424 		return;
3425 	}
3426 
3427 	dlm_dir_remove_entry(ls, from_nodeid, ms->m_extra, len);
3428 }
3429 
3430 static void receive_purge(struct dlm_ls *ls, struct dlm_message *ms)
3431 {
3432 	do_purge(ls, ms->m_nodeid, ms->m_pid);
3433 }
3434 
3435 static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
3436 {
3437 	struct dlm_lkb *lkb;
3438 	struct dlm_rsb *r;
3439 	int error, mstype, result;
3440 
3441 	error = find_lkb(ls, ms->m_remid, &lkb);
3442 	if (error) {
3443 		log_debug(ls, "receive_request_reply from %d no lkb %x",
3444 			  ms->m_header.h_nodeid, ms->m_remid);
3445 		return;
3446 	}
3447 
3448 	r = lkb->lkb_resource;
3449 	hold_rsb(r);
3450 	lock_rsb(r);
3451 
3452 	error = validate_message(lkb, ms);
3453 	if (error)
3454 		goto out;
3455 
3456 	mstype = lkb->lkb_wait_type;
3457 	error = remove_from_waiters(lkb, DLM_MSG_REQUEST_REPLY);
3458 	if (error)
3459 		goto out;
3460 
3461 	/* Optimization: the dir node was also the master, so it took our
3462 	   lookup as a request and sent a request reply, not a lookup reply */
3463 	if (mstype == DLM_MSG_LOOKUP) {
3464 		r->res_nodeid = ms->m_header.h_nodeid;
3465 		lkb->lkb_nodeid = r->res_nodeid;
3466 	}
3467 
3468 	/* this is the value returned from do_request() on the master */
3469 	result = ms->m_result;
3470 
3471 	switch (result) {
3472 	case -EAGAIN:
3473 		/* request would block (be queued) on remote master */
3474 		queue_cast(r, lkb, -EAGAIN);
3475 		confirm_master(r, -EAGAIN);
3476 		unhold_lkb(lkb); /* undoes create_lkb() */
3477 		break;
3478 
3479 	case -EINPROGRESS:
3480 	case 0:
3481 		/* request was queued or granted on remote master */
3482 		receive_flags_reply(lkb, ms);
3483 		lkb->lkb_remid = ms->m_lkid;
3484 		if (is_altmode(lkb))
3485 			munge_altmode(lkb, ms);
3486 		if (result) {
3487 			add_lkb(r, lkb, DLM_LKSTS_WAITING);
3488 			add_timeout(lkb);
3489 		} else {
3490 			grant_lock_pc(r, lkb, ms);
3491 			queue_cast(r, lkb, 0);
3492 		}
3493 		confirm_master(r, result);
3494 		break;
3495 
3496 	case -EBADR:
3497 	case -ENOTBLK:
3498 		/* find_rsb failed to find rsb or rsb wasn't master */
3499 		log_debug(ls, "receive_request_reply %x %x master diff %d %d",
3500 			  lkb->lkb_id, lkb->lkb_flags, r->res_nodeid, result);
3501 		r->res_nodeid = -1;
3502 		lkb->lkb_nodeid = -1;
3503 
3504 		if (is_overlap(lkb)) {
3505 			/* we'll ignore error in cancel/unlock reply */
3506 			queue_cast_overlap(r, lkb);
3507 			confirm_master(r, result);
3508 			unhold_lkb(lkb); /* undoes create_lkb() */
3509 		} else
3510 			_request_lock(r, lkb);
3511 		break;
3512 
3513 	default:
3514 		log_error(ls, "receive_request_reply %x error %d",
3515 			  lkb->lkb_id, result);
3516 	}
3517 
3518 	if (is_overlap_unlock(lkb) && (result == 0 || result == -EINPROGRESS)) {
3519 		log_debug(ls, "receive_request_reply %x result %d unlock",
3520 			  lkb->lkb_id, result);
3521 		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
3522 		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
3523 		send_unlock(r, lkb);
3524 	} else if (is_overlap_cancel(lkb) && (result == -EINPROGRESS)) {
3525 		log_debug(ls, "receive_request_reply %x cancel", lkb->lkb_id);
3526 		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
3527 		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
3528 		send_cancel(r, lkb);
3529 	} else {
3530 		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
3531 		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
3532 	}
3533  out:
3534 	unlock_rsb(r);
3535 	put_rsb(r);
3536 	dlm_put_lkb(lkb);
3537 }
3538 
3539 static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
3540 				    struct dlm_message *ms)
3541 {
3542 	/* this is the value returned from do_convert() on the master */
3543 	switch (ms->m_result) {
3544 	case -EAGAIN:
3545 		/* convert would block (be queued) on remote master */
3546 		queue_cast(r, lkb, -EAGAIN);
3547 		break;
3548 
3549 	case -EDEADLK:
3550 		receive_flags_reply(lkb, ms);
3551 		revert_lock_pc(r, lkb);
3552 		queue_cast(r, lkb, -EDEADLK);
3553 		break;
3554 
3555 	case -EINPROGRESS:
3556 		/* convert was queued on remote master */
3557 		receive_flags_reply(lkb, ms);
3558 		if (is_demoted(lkb))
3559 			munge_demoted(lkb, ms);
3560 		del_lkb(r, lkb);
3561 		add_lkb(r, lkb, DLM_LKSTS_CONVERT);
3562 		add_timeout(lkb);
3563 		break;
3564 
3565 	case 0:
3566 		/* convert was granted on remote master */
3567 		receive_flags_reply(lkb, ms);
3568 		if (is_demoted(lkb))
3569 			munge_demoted(lkb, ms);
3570 		grant_lock_pc(r, lkb, ms);
3571 		queue_cast(r, lkb, 0);
3572 		break;
3573 
3574 	default:
3575 		log_error(r->res_ls, "receive_convert_reply %x error %d",
3576 			  lkb->lkb_id, ms->m_result);
3577 	}
3578 }
3579 
3580 static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
3581 {
3582 	struct dlm_rsb *r = lkb->lkb_resource;
3583 	int error;
3584 
3585 	hold_rsb(r);
3586 	lock_rsb(r);
3587 
3588 	error = validate_message(lkb, ms);
3589 	if (error)
3590 		goto out;
3591 
3592 	/* stub reply can happen with waiters_mutex held */
3593 	error = remove_from_waiters_ms(lkb, ms);
3594 	if (error)
3595 		goto out;
3596 
3597 	__receive_convert_reply(r, lkb, ms);
3598  out:
3599 	unlock_rsb(r);
3600 	put_rsb(r);
3601 }
3602 
3603 static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
3604 {
3605 	struct dlm_lkb *lkb;
3606 	int error;
3607 
3608 	error = find_lkb(ls, ms->m_remid, &lkb);
3609 	if (error) {
3610 		log_debug(ls, "receive_convert_reply from %d no lkb %x",
3611 			  ms->m_header.h_nodeid, ms->m_remid);
3612 		return;
3613 	}
3614 
3615 	_receive_convert_reply(lkb, ms);
3616 	dlm_put_lkb(lkb);
3617 }
3618 
3619 static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
3620 {
3621 	struct dlm_rsb *r = lkb->lkb_resource;
3622 	int error;
3623 
3624 	hold_rsb(r);
3625 	lock_rsb(r);
3626 
3627 	error = validate_message(lkb, ms);
3628 	if (error)
3629 		goto out;
3630 
3631 	/* stub reply can happen with waiters_mutex held */
3632 	error = remove_from_waiters_ms(lkb, ms);
3633 	if (error)
3634 		goto out;
3635 
3636 	/* this is the value returned from do_unlock() on the master */
3637 
3638 	switch (ms->m_result) {
3639 	case -DLM_EUNLOCK:
3640 		receive_flags_reply(lkb, ms);
3641 		remove_lock_pc(r, lkb);
3642 		queue_cast(r, lkb, -DLM_EUNLOCK);
3643 		break;
3644 	case -ENOENT:
3645 		break;
3646 	default:
3647 		log_error(r->res_ls, "receive_unlock_reply %x error %d",
3648 			  lkb->lkb_id, ms->m_result);
3649 	}
3650  out:
3651 	unlock_rsb(r);
3652 	put_rsb(r);
3653 }
3654 
3655 static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
3656 {
3657 	struct dlm_lkb *lkb;
3658 	int error;
3659 
3660 	error = find_lkb(ls, ms->m_remid, &lkb);
3661 	if (error) {
3662 		log_debug(ls, "receive_unlock_reply from %d no lkb %x",
3663 			  ms->m_header.h_nodeid, ms->m_remid);
3664 		return;
3665 	}
3666 
3667 	_receive_unlock_reply(lkb, ms);
3668 	dlm_put_lkb(lkb);
3669 }
3670 
3671 static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
3672 {
3673 	struct dlm_rsb *r = lkb->lkb_resource;
3674 	int error;
3675 
3676 	hold_rsb(r);
3677 	lock_rsb(r);
3678 
3679 	error = validate_message(lkb, ms);
3680 	if (error)
3681 		goto out;
3682 
3683 	/* stub reply can happen with waiters_mutex held */
3684 	error = remove_from_waiters_ms(lkb, ms);
3685 	if (error)
3686 		goto out;
3687 
3688 	/* this is the value returned from do_cancel() on the master */
3689 
3690 	switch (ms->m_result) {
3691 	case -DLM_ECANCEL:
3692 		receive_flags_reply(lkb, ms);
3693 		revert_lock_pc(r, lkb);
3694 		queue_cast(r, lkb, -DLM_ECANCEL);
3695 		break;
3696 	case 0:
3697 		break;
3698 	default:
3699 		log_error(r->res_ls, "receive_cancel_reply %x error %d",
3700 			  lkb->lkb_id, ms->m_result);
3701 	}
3702  out:
3703 	unlock_rsb(r);
3704 	put_rsb(r);
3705 }
3706 
3707 static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
3708 {
3709 	struct dlm_lkb *lkb;
3710 	int error;
3711 
3712 	error = find_lkb(ls, ms->m_remid, &lkb);
3713 	if (error) {
3714 		log_debug(ls, "receive_cancel_reply from %d no lkb %x",
3715 			  ms->m_header.h_nodeid, ms->m_remid);
3716 		return;
3717 	}
3718 
3719 	_receive_cancel_reply(lkb, ms);
3720 	dlm_put_lkb(lkb);
3721 }
3722 
3723 static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
3724 {
3725 	struct dlm_lkb *lkb;
3726 	struct dlm_rsb *r;
3727 	int error, ret_nodeid;
3728 
3729 	error = find_lkb(ls, ms->m_lkid, &lkb);
3730 	if (error) {
3731 		log_error(ls, "receive_lookup_reply no lkb");
3732 		return;
3733 	}
3734 
3735 	/* ms->m_result is the value returned by dlm_dir_lookup on dir node
3736 	   FIXME: will a non-zero error ever be returned? */
3737 
3738 	r = lkb->lkb_resource;
3739 	hold_rsb(r);
3740 	lock_rsb(r);
3741 
3742 	error = remove_from_waiters(lkb, DLM_MSG_LOOKUP_REPLY);
3743 	if (error)
3744 		goto out;
3745 
3746 	ret_nodeid = ms->m_nodeid;
3747 	if (ret_nodeid == dlm_our_nodeid()) {
3748 		r->res_nodeid = 0;
3749 		ret_nodeid = 0;
3750 		r->res_first_lkid = 0;
3751 	} else {
3752 		/* set_master() will copy res_nodeid to lkb_nodeid */
3753 		r->res_nodeid = ret_nodeid;
3754 	}
3755 
3756 	if (is_overlap(lkb)) {
3757 		log_debug(ls, "receive_lookup_reply %x unlock %x",
3758 			  lkb->lkb_id, lkb->lkb_flags);
3759 		queue_cast_overlap(r, lkb);
3760 		unhold_lkb(lkb); /* undoes create_lkb() */
3761 		goto out_list;
3762 	}
3763 
3764 	_request_lock(r, lkb);
3765 
3766  out_list:
3767 	if (!ret_nodeid)
3768 		process_lookup_list(r);
3769  out:
3770 	unlock_rsb(r);
3771 	put_rsb(r);
3772 	dlm_put_lkb(lkb);
3773 }
3774 
3775 static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms)
3776 {
3777 	if (!dlm_is_member(ls, ms->m_header.h_nodeid)) {
3778 		log_debug(ls, "ignore non-member message %d from %d %x %x %d",
3779 			  ms->m_type, ms->m_header.h_nodeid, ms->m_lkid,
3780 			  ms->m_remid, ms->m_result);
3781 		return;
3782 	}
3783 
3784 	switch (ms->m_type) {
3785 
3786 	/* messages sent to a master node */
3787 
3788 	case DLM_MSG_REQUEST:
3789 		receive_request(ls, ms);
3790 		break;
3791 
3792 	case DLM_MSG_CONVERT:
3793 		receive_convert(ls, ms);
3794 		break;
3795 
3796 	case DLM_MSG_UNLOCK:
3797 		receive_unlock(ls, ms);
3798 		break;
3799 
3800 	case DLM_MSG_CANCEL:
3801 		receive_cancel(ls, ms);
3802 		break;
3803 
3804 	/* messages sent from a master node (replies to above) */
3805 
3806 	case DLM_MSG_REQUEST_REPLY:
3807 		receive_request_reply(ls, ms);
3808 		break;
3809 
3810 	case DLM_MSG_CONVERT_REPLY:
3811 		receive_convert_reply(ls, ms);
3812 		break;
3813 
3814 	case DLM_MSG_UNLOCK_REPLY:
3815 		receive_unlock_reply(ls, ms);
3816 		break;
3817 
3818 	case DLM_MSG_CANCEL_REPLY:
3819 		receive_cancel_reply(ls, ms);
3820 		break;
3821 
3822 	/* messages sent from a master node (only two types of async msg) */
3823 
3824 	case DLM_MSG_GRANT:
3825 		receive_grant(ls, ms);
3826 		break;
3827 
3828 	case DLM_MSG_BAST:
3829 		receive_bast(ls, ms);
3830 		break;
3831 
3832 	/* messages sent to a dir node */
3833 
3834 	case DLM_MSG_LOOKUP:
3835 		receive_lookup(ls, ms);
3836 		break;
3837 
3838 	case DLM_MSG_REMOVE:
3839 		receive_remove(ls, ms);
3840 		break;
3841 
3842 	/* messages sent from a dir node (remove has no reply) */
3843 
3844 	case DLM_MSG_LOOKUP_REPLY:
3845 		receive_lookup_reply(ls, ms);
3846 		break;
3847 
3848 	/* other messages */
3849 
3850 	case DLM_MSG_PURGE:
3851 		receive_purge(ls, ms);
3852 		break;
3853 
3854 	default:
3855 		log_error(ls, "unknown message type %d", ms->m_type);
3856 	}
3857 
3858 	dlm_astd_wake();
3859 }
3860 
3861 /* If the lockspace is in recovery mode (locking stopped), then normal
3862    messages are saved on the requestqueue for processing after recovery is
3863    done.  When not in recovery mode, we wait for dlm_recoverd to drain saved
3864    messages off the requestqueue before we process new ones. This occurs right
3865    after recovery completes when we transition from saving all messages on
3866    the requestqueue, to processing all the saved messages, to processing new
3867    messages as they arrive. */
3868 
3869 static void dlm_receive_message(struct dlm_ls *ls, struct dlm_message *ms,
3870 				int nodeid)
3871 {
3872 	if (dlm_locking_stopped(ls)) {
3873 		dlm_add_requestqueue(ls, nodeid, ms);
3874 	} else {
3875 		dlm_wait_requestqueue(ls);
3876 		_receive_message(ls, ms);
3877 	}
3878 }
3879 
3880 /* This is called by dlm_recoverd to process messages that were saved on
3881    the requestqueue. */
3882 
3883 void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms)
3884 {
3885 	_receive_message(ls, ms);
3886 }
3887 
3888 /* This is called by the midcomms layer when something is received for
3889    the lockspace.  It could be either a MSG (normal message sent as part of
3890    standard locking activity) or an RCOM (recovery message sent as part of
3891    lockspace recovery). */
3892 
3893 void dlm_receive_buffer(union dlm_packet *p, int nodeid)
3894 {
3895 	struct dlm_header *hd = &p->header;
3896 	struct dlm_ls *ls;
3897 	int type = 0;
3898 
3899 	switch (hd->h_cmd) {
3900 	case DLM_MSG:
3901 		dlm_message_in(&p->message);
3902 		type = p->message.m_type;
3903 		break;
3904 	case DLM_RCOM:
3905 		dlm_rcom_in(&p->rcom);
3906 		type = p->rcom.rc_type;
3907 		break;
3908 	default:
3909 		log_print("invalid h_cmd %d from %u", hd->h_cmd, nodeid);
3910 		return;
3911 	}
3912 
3913 	if (hd->h_nodeid != nodeid) {
3914 		log_print("invalid h_nodeid %d from %d lockspace %x",
3915 			  hd->h_nodeid, nodeid, hd->h_lockspace);
3916 		return;
3917 	}
3918 
3919 	ls = dlm_find_lockspace_global(hd->h_lockspace);
3920 	if (!ls) {
3921 		if (dlm_config.ci_log_debug)
3922 			log_print("invalid lockspace %x from %d cmd %d type %d",
3923 				  hd->h_lockspace, nodeid, hd->h_cmd, type);
3924 
3925 		if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS)
3926 			dlm_send_ls_not_ready(nodeid, &p->rcom);
3927 		return;
3928 	}
3929 
3930 	/* this rwsem allows dlm_ls_stop() to wait for all dlm_recv threads to
3931 	   be inactive (in this ls) before transitioning to recovery mode */
3932 
3933 	down_read(&ls->ls_recv_active);
3934 	if (hd->h_cmd == DLM_MSG)
3935 		dlm_receive_message(ls, &p->message, nodeid);
3936 	else
3937 		dlm_receive_rcom(ls, &p->rcom, nodeid);
3938 	up_read(&ls->ls_recv_active);
3939 
3940 	dlm_put_lockspace(ls);
3941 }
3942 
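/* A conversion that was waiting on a failed master: a conversion between
   the middle modes (PR/CW) is completed with a faked -EINPROGRESS reply and
   flagged for recover_conversion(); an up-conversion is simply flagged for
   resend after recovery. */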
3943 static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
3944 {
3945 	if (middle_conversion(lkb)) {
3946 		hold_lkb(lkb);
3947 		ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY;
3948 		ls->ls_stub_ms.m_result = -EINPROGRESS;
3949 		ls->ls_stub_ms.m_flags = lkb->lkb_flags;
3950 		ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
3951 		_receive_convert_reply(lkb, &ls->ls_stub_ms);
3952 
3953 		/* Same special case as in receive_rcom_lock_args() */
3954 		lkb->lkb_grmode = DLM_LOCK_IV;
3955 		rsb_set_flag(lkb->lkb_resource, RSB_RECOVER_CONVERT);
3956 		unhold_lkb(lkb);
3957 
3958 	} else if (lkb->lkb_rqmode >= lkb->lkb_grmode) {
3959 		lkb->lkb_flags |= DLM_IFL_RESEND;
3960 	}
3961 
3962 	/* lkb->lkb_rqmode < lkb->lkb_grmode shouldn't happen since down
3963 	   conversions are async; there's no reply from the remote master */
3964 }
3965 
3966 /* A waiting lkb needs recovery if the master node has failed, or
3967    the master node is changing (only when no directory is used) */
3968 
3969 static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb)
3970 {
3971 	if (dlm_is_removed(ls, lkb->lkb_nodeid))
3972 		return 1;
3973 
3974 	if (!dlm_no_directory(ls))
3975 		return 0;
3976 
3977 	if (dlm_dir_nodeid(lkb->lkb_resource) != lkb->lkb_nodeid)
3978 		return 1;
3979 
3980 	return 0;
3981 }
3982 
3983 /* Recovery for locks that are waiting for replies from nodes that are now
3984    gone.  We can just complete unlocks and cancels by faking a reply from the
3985    dead node.  Requests and up-conversions we flag to be resent after
3986    recovery.  Down-conversions can just be completed with a fake reply like
3987    unlocks.  Conversions between PR and CW need special attention. */
3988 
3989 void dlm_recover_waiters_pre(struct dlm_ls *ls)
3990 {
3991 	struct dlm_lkb *lkb, *safe;
3992 	int wait_type, stub_unlock_result, stub_cancel_result;
3993 
3994 	mutex_lock(&ls->ls_waiters_mutex);
3995 
3996 	list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) {
3997 		log_debug(ls, "pre recover waiter lkid %x type %d flags %x",
3998 			  lkb->lkb_id, lkb->lkb_wait_type, lkb->lkb_flags);
3999 
4000 		/* all outstanding lookups, regardless of destination, will be
4001 		   resent after recovery is done */
4002 
4003 		if (lkb->lkb_wait_type == DLM_MSG_LOOKUP) {
4004 			lkb->lkb_flags |= DLM_IFL_RESEND;
4005 			continue;
4006 		}
4007 
4008 		if (!waiter_needs_recovery(ls, lkb))
4009 			continue;
4010 
4011 		wait_type = lkb->lkb_wait_type;
4012 		stub_unlock_result = -DLM_EUNLOCK;
4013 		stub_cancel_result = -DLM_ECANCEL;
4014 
4015 		/* The main reply may have been received, leaving a zero
4016 		   wait_type, but a reply for the overlapping op may not have
4017 		   been received.  In that case we need to fake the appropriate
4018 		   reply for the overlap op. */
4019 
4020 		if (!wait_type) {
4021 			if (is_overlap_cancel(lkb)) {
4022 				wait_type = DLM_MSG_CANCEL;
4023 				if (lkb->lkb_grmode == DLM_LOCK_IV)
4024 					stub_cancel_result = 0;
4025 			}
4026 			if (is_overlap_unlock(lkb)) {
4027 				wait_type = DLM_MSG_UNLOCK;
4028 				if (lkb->lkb_grmode == DLM_LOCK_IV)
4029 					stub_unlock_result = -ENOENT;
4030 			}
4031 
4032 			log_debug(ls, "rwpre overlap %x %x %d %d %d",
4033 				  lkb->lkb_id, lkb->lkb_flags, wait_type,
4034 				  stub_cancel_result, stub_unlock_result);
4035 		}
4036 
4037 		switch (wait_type) {
4038 
4039 		case DLM_MSG_REQUEST:
4040 			lkb->lkb_flags |= DLM_IFL_RESEND;
4041 			break;
4042 
4043 		case DLM_MSG_CONVERT:
4044 			recover_convert_waiter(ls, lkb);
4045 			break;
4046 
4047 		case DLM_MSG_UNLOCK:
4048 			hold_lkb(lkb);
4049 			ls->ls_stub_ms.m_type = DLM_MSG_UNLOCK_REPLY;
4050 			ls->ls_stub_ms.m_result = stub_unlock_result;
4051 			ls->ls_stub_ms.m_flags = lkb->lkb_flags;
4052 			ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
4053 			_receive_unlock_reply(lkb, &ls->ls_stub_ms);
4054 			dlm_put_lkb(lkb);
4055 			break;
4056 
4057 		case DLM_MSG_CANCEL:
4058 			hold_lkb(lkb);
4059 			ls->ls_stub_ms.m_type = DLM_MSG_CANCEL_REPLY;
4060 			ls->ls_stub_ms.m_result = stub_cancel_result;
4061 			ls->ls_stub_ms.m_flags = lkb->lkb_flags;
4062 			ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
4063 			_receive_cancel_reply(lkb, &ls->ls_stub_ms);
4064 			dlm_put_lkb(lkb);
4065 			break;
4066 
4067 		default:
4068 			log_error(ls, "invalid lkb wait_type %d %d",
4069 				  lkb->lkb_wait_type, wait_type);
4070 		}
4071 		schedule();
4072 	}
4073 	mutex_unlock(&ls->ls_waiters_mutex);
4074 }
4075 
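/* Find the next waiter flagged DLM_IFL_RESEND by dlm_recover_waiters_pre(),
   taking a hold on it; the caller drops the reference with dlm_put_lkb(). */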
4076 static struct dlm_lkb *find_resend_waiter(struct dlm_ls *ls)
4077 {
4078 	struct dlm_lkb *lkb;
4079 	int found = 0;
4080 
4081 	mutex_lock(&ls->ls_waiters_mutex);
4082 	list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
4083 		if (lkb->lkb_flags & DLM_IFL_RESEND) {
4084 			hold_lkb(lkb);
4085 			found = 1;
4086 			break;
4087 		}
4088 	}
4089 	mutex_unlock(&ls->ls_waiters_mutex);
4090 
4091 	if (!found)
4092 		lkb = NULL;
4093 	return lkb;
4094 }
4095 
4096 /* Deal with lookups and lkb's marked RESEND from dlm_recover_waiters_pre().
4097    We may now be the master or dir-node for r.  Processing the lkb may result
4098    in it being placed back on waiters. */
4099 
4100 /* We do this after normal locking has been enabled and any saved messages
4101    (in requestqueue) have been processed.  We should be confident that at
4102    this point we won't get or process a reply to any of these waiting
4103    operations.  But new ops may be coming in on the rsbs/locks here from
4104    userspace or remotely. */
4105 
4106 /* There may have been an overlap unlock/cancel prior to recovery or after
4107    recovery.  If before, the lkb may still have a positive wait_count; if
4108    after, the overlap flag would just have been set and nothing new sent.
4109    We can be confident here that any replies to either the initial op or
4110    overlap ops prior to recovery have been received. */
4111 
4112 int dlm_recover_waiters_post(struct dlm_ls *ls)
4113 {
4114 	struct dlm_lkb *lkb;
4115 	struct dlm_rsb *r;
4116 	int error = 0, mstype, err, oc, ou;
4117 
4118 	while (1) {
4119 		if (dlm_locking_stopped(ls)) {
4120 			log_debug(ls, "recover_waiters_post aborted");
4121 			error = -EINTR;
4122 			break;
4123 		}
4124 
4125 		lkb = find_resend_waiter(ls);
4126 		if (!lkb)
4127 			break;
4128 
4129 		r = lkb->lkb_resource;
4130 		hold_rsb(r);
4131 		lock_rsb(r);
4132 
4133 		mstype = lkb->lkb_wait_type;
4134 		oc = is_overlap_cancel(lkb);
4135 		ou = is_overlap_unlock(lkb);
4136 		err = 0;
4137 
4138 		log_debug(ls, "recover_waiters_post %x type %d flags %x %s",
4139 			  lkb->lkb_id, mstype, lkb->lkb_flags, r->res_name);
4140 
4141 		/* At this point we assume that we won't get a reply to any
4142 		   previous op or overlap op on this lock.  First, do a big
4143 		   remove_from_waiters() for all previous ops. */
4144 
4145 		lkb->lkb_flags &= ~DLM_IFL_RESEND;
4146 		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
4147 		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
4148 		lkb->lkb_wait_type = 0;
4149 		lkb->lkb_wait_count = 0;
4150 		mutex_lock(&ls->ls_waiters_mutex);
4151 		list_del_init(&lkb->lkb_wait_reply);
4152 		mutex_unlock(&ls->ls_waiters_mutex);
4153 		unhold_lkb(lkb); /* for waiters list */
4154 
4155 		if (oc || ou) {
4156 			/* do an unlock or cancel instead of resending */
4157 			switch (mstype) {
4158 			case DLM_MSG_LOOKUP:
4159 			case DLM_MSG_REQUEST:
4160 				queue_cast(r, lkb, ou ? -DLM_EUNLOCK :
4161 							-DLM_ECANCEL);
4162 				unhold_lkb(lkb); /* undoes create_lkb() */
4163 				break;
4164 			case DLM_MSG_CONVERT:
4165 				if (oc) {
4166 					queue_cast(r, lkb, -DLM_ECANCEL);
4167 				} else {
4168 					lkb->lkb_exflags |= DLM_LKF_FORCEUNLOCK;
4169 					_unlock_lock(r, lkb);
4170 				}
4171 				break;
4172 			default:
4173 				err = 1;
4174 			}
4175 		} else {
4176 			switch (mstype) {
4177 			case DLM_MSG_LOOKUP:
4178 			case DLM_MSG_REQUEST:
4179 				_request_lock(r, lkb);
4180 				if (is_master(r))
4181 					confirm_master(r, 0);
4182 				break;
4183 			case DLM_MSG_CONVERT:
4184 				_convert_lock(r, lkb);
4185 				break;
4186 			default:
4187 				err = 1;
4188 			}
4189 		}
4190 
4191 		if (err)
4192 			log_error(ls, "recover_waiters_post %x %d %x %d %d",
4193 				  lkb->lkb_id, mstype, lkb->lkb_flags, oc, ou);
4194 		unlock_rsb(r);
4195 		put_rsb(r);
4196 		dlm_put_lkb(lkb);
4197 	}
4198 
4199 	return error;
4200 }
4201 
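/* Remove from an rsb queue each lkb that the test function selects, dropping
   what should be the final reference; the rsb is flagged LOCKS_PURGED so
   dlm_grant_after_purge() will revisit it. */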
4202 static void purge_queue(struct dlm_rsb *r, struct list_head *queue,
4203 			int (*test)(struct dlm_ls *ls, struct dlm_lkb *lkb))
4204 {
4205 	struct dlm_ls *ls = r->res_ls;
4206 	struct dlm_lkb *lkb, *safe;
4207 
4208 	list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) {
4209 		if (test(ls, lkb)) {
4210 			rsb_set_flag(r, RSB_LOCKS_PURGED);
4211 			del_lkb(r, lkb);
4212 			/* this put should free the lkb */
4213 			if (!dlm_put_lkb(lkb))
4214 				log_error(ls, "purged lkb not released");
4215 		}
4216 	}
4217 }
4218 
4219 static int purge_dead_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
4220 {
4221 	return (is_master_copy(lkb) && dlm_is_removed(ls, lkb->lkb_nodeid));
4222 }
4223 
4224 static int purge_mstcpy_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
4225 {
4226 	return is_master_copy(lkb);
4227 }
4228 
4229 static void purge_dead_locks(struct dlm_rsb *r)
4230 {
4231 	purge_queue(r, &r->res_grantqueue, &purge_dead_test);
4232 	purge_queue(r, &r->res_convertqueue, &purge_dead_test);
4233 	purge_queue(r, &r->res_waitqueue, &purge_dead_test);
4234 }
4235 
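/* Remove every master-copy lkb from all three queues of an rsb. */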
4236 void dlm_purge_mstcpy_locks(struct dlm_rsb *r)
4237 {
4238 	purge_queue(r, &r->res_grantqueue, &purge_mstcpy_test);
4239 	purge_queue(r, &r->res_convertqueue, &purge_mstcpy_test);
4240 	purge_queue(r, &r->res_waitqueue, &purge_mstcpy_test);
4241 }
4242 
4243 /* Get rid of locks held by nodes that are gone. */
4244 
4245 int dlm_purge_locks(struct dlm_ls *ls)
4246 {
4247 	struct dlm_rsb *r;
4248 
4249 	log_debug(ls, "dlm_purge_locks");
4250 
4251 	down_write(&ls->ls_root_sem);
4252 	list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
4253 		hold_rsb(r);
4254 		lock_rsb(r);
4255 		if (is_master(r))
4256 			purge_dead_locks(r);
4257 		unlock_rsb(r);
4258 		unhold_rsb(r);
4259 
4260 		schedule();
4261 	}
4262 	up_write(&ls->ls_root_sem);
4263 
4264 	return 0;
4265 }
4266 
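/* Find the next rsb in the given hash bucket that had locks purged, clearing
   RSB_LOCKS_PURGED and holding the rsb for the caller. */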
4267 static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket)
4268 {
4269 	struct dlm_rsb *r, *r_ret = NULL;
4270 
4271 	spin_lock(&ls->ls_rsbtbl[bucket].lock);
4272 	list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list, res_hashchain) {
4273 		if (!rsb_flag(r, RSB_LOCKS_PURGED))
4274 			continue;
4275 		hold_rsb(r);
4276 		rsb_clear_flag(r, RSB_LOCKS_PURGED);
4277 		r_ret = r;
4278 		break;
4279 	}
4280 	spin_unlock(&ls->ls_rsbtbl[bucket].lock);
4281 	return r_ret;
4282 }
4283 
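/* After dead locks have been purged, grant any locks they were blocking on
   the rsb's we still master. */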
4284 void dlm_grant_after_purge(struct dlm_ls *ls)
4285 {
4286 	struct dlm_rsb *r;
4287 	int bucket = 0;
4288 
4289 	while (1) {
4290 		r = find_purged_rsb(ls, bucket);
4291 		if (!r) {
4292 			if (bucket == ls->ls_rsbtbl_size - 1)
4293 				break;
4294 			bucket++;
4295 			continue;
4296 		}
4297 		lock_rsb(r);
4298 		if (is_master(r)) {
4299 			grant_pending_locks(r);
4300 			confirm_master(r, 0);
4301 		}
4302 		unlock_rsb(r);
4303 		put_rsb(r);
4304 		schedule();
4305 	}
4306 }
4307 
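/* Find the master-copy lkb that corresponds to a remote node's lkid. */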
4308 static struct dlm_lkb *search_remid_list(struct list_head *head, int nodeid,
4309 					 uint32_t remid)
4310 {
4311 	struct dlm_lkb *lkb;
4312 
4313 	list_for_each_entry(lkb, head, lkb_statequeue) {
4314 		if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid)
4315 			return lkb;
4316 	}
4317 	return NULL;
4318 }
4319 
4320 static struct dlm_lkb *search_remid(struct dlm_rsb *r, int nodeid,
4321 				    uint32_t remid)
4322 {
4323 	struct dlm_lkb *lkb;
4324 
4325 	lkb = search_remid_list(&r->res_grantqueue, nodeid, remid);
4326 	if (lkb)
4327 		return lkb;
4328 	lkb = search_remid_list(&r->res_convertqueue, nodeid, remid);
4329 	if (lkb)
4330 		return lkb;
4331 	lkb = search_remid_list(&r->res_waitqueue, nodeid, remid);
4332 	if (lkb)
4333 		return lkb;
4334 	return NULL;
4335 }
4336 
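/* Initialize a new master-copy lkb from the rcom_lock struct sent by the
   lock holder during recovery. */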
4337 /* needs at least dlm_rcom + rcom_lock */
4338 static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
4339 				  struct dlm_rsb *r, struct dlm_rcom *rc)
4340 {
4341 	struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
4342 
4343 	lkb->lkb_nodeid = rc->rc_header.h_nodeid;
4344 	lkb->lkb_ownpid = le32_to_cpu(rl->rl_ownpid);
4345 	lkb->lkb_remid = le32_to_cpu(rl->rl_lkid);
4346 	lkb->lkb_exflags = le32_to_cpu(rl->rl_exflags);
4347 	lkb->lkb_flags = le32_to_cpu(rl->rl_flags) & 0x0000FFFF;
4348 	lkb->lkb_flags |= DLM_IFL_MSTCPY;
4349 	lkb->lkb_lvbseq = le32_to_cpu(rl->rl_lvbseq);
4350 	lkb->lkb_rqmode = rl->rl_rqmode;
4351 	lkb->lkb_grmode = rl->rl_grmode;
4352 	/* don't set lkb_status because add_lkb wants to set it itself */
4353 
4354 	lkb->lkb_bastfn = (rl->rl_asts & AST_BAST) ? &fake_bastfn : NULL;
4355 	lkb->lkb_astfn = (rl->rl_asts & AST_COMP) ? &fake_astfn : NULL;
4356 
4357 	if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
4358 		int lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) -
4359 			 sizeof(struct rcom_lock);
4360 		if (lvblen > ls->ls_lvblen)
4361 			return -EINVAL;
4362 		lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
4363 		if (!lkb->lkb_lvbptr)
4364 			return -ENOMEM;
4365 		memcpy(lkb->lkb_lvbptr, rl->rl_lvb, lvblen);
4366 	}
4367 
4368 	/* Conversions between PR and CW (middle modes) need special handling.
4369 	   The real granted mode of these converting locks cannot be determined
4370 	   until all locks have been rebuilt on the rsb (recover_conversion) */
4371 
4372 	if (rl->rl_wait_type == cpu_to_le16(DLM_MSG_CONVERT) &&
4373 	    middle_conversion(lkb)) {
4374 		rl->rl_status = DLM_LKSTS_CONVERT;
4375 		lkb->lkb_grmode = DLM_LOCK_IV;
4376 		rsb_set_flag(r, RSB_RECOVER_CONVERT);
4377 	}
4378 
4379 	return 0;
4380 }
4381 
4382 /* This lkb may have been recovered in a previous aborted recovery so we need
4383    to check if the rsb already has an lkb with the given remote nodeid/lkid.
4384    If so we just send back a standard reply.  If not, we create a new lkb with
4385    the given values and send back our lkid.  We send back our lkid by sending
4386    back the rcom_lock struct we got but with the remid field filled in. */
4387 
4388 /* needs at least dlm_rcom + rcom_lock */
4389 int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
4390 {
4391 	struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
4392 	struct dlm_rsb *r;
4393 	struct dlm_lkb *lkb;
4394 	int error;
4395 
4396 	if (rl->rl_parent_lkid) {
4397 		error = -EOPNOTSUPP;
4398 		goto out;
4399 	}
4400 
4401 	error = find_rsb(ls, rl->rl_name, le16_to_cpu(rl->rl_namelen),
4402 			 R_MASTER, &r);
4403 	if (error)
4404 		goto out;
4405 
4406 	lock_rsb(r);
4407 
4408 	lkb = search_remid(r, rc->rc_header.h_nodeid, le32_to_cpu(rl->rl_lkid));
4409 	if (lkb) {
4410 		error = -EEXIST;
4411 		goto out_remid;
4412 	}
4413 
4414 	error = create_lkb(ls, &lkb);
4415 	if (error)
4416 		goto out_unlock;
4417 
4418 	error = receive_rcom_lock_args(ls, lkb, r, rc);
4419 	if (error) {
4420 		__put_lkb(ls, lkb);
4421 		goto out_unlock;
4422 	}
4423 
4424 	attach_lkb(r, lkb);
4425 	add_lkb(r, lkb, rl->rl_status);
4426 	error = 0;
4427 
4428  out_remid:
4429 	/* this is the new value returned to the lock holder for
4430 	   saving in its process-copy lkb */
4431 	rl->rl_remid = cpu_to_le32(lkb->lkb_id);
4432 
4433  out_unlock:
4434 	unlock_rsb(r);
4435 	put_rsb(r);
4436  out:
4437 	if (error)
4438 		log_debug(ls, "recover_master_copy %d %x", error,
4439 			  le32_to_cpu(rl->rl_lkid));
4440 	rl->rl_result = cpu_to_le32(error);
4441 	return error;
4442 }
4443 
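/* Process the new master's reply to the rcom lock we sent it during
   recovery: save the master's lkid (remid), or resend the lock if the
   master wasn't ready for it. */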
4444 /* needs at least dlm_rcom + rcom_lock */
4445 int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
4446 {
4447 	struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
4448 	struct dlm_rsb *r;
4449 	struct dlm_lkb *lkb;
4450 	int error;
4451 
4452 	error = find_lkb(ls, le32_to_cpu(rl->rl_lkid), &lkb);
4453 	if (error) {
4454 		log_error(ls, "recover_process_copy no lkid %x",
4455 				le32_to_cpu(rl->rl_lkid));
4456 		return error;
4457 	}
4458 
4459 	DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
4460 
4461 	error = le32_to_cpu(rl->rl_result);
4462 
4463 	r = lkb->lkb_resource;
4464 	hold_rsb(r);
4465 	lock_rsb(r);
4466 
4467 	switch (error) {
4468 	case -EBADR:
4469 		/* There's a chance the new master received our lock before
4470 		   dlm_recover_master_reply(); this wouldn't happen if we did
4471 		   a barrier between recover_masters and recover_locks. */
4472 		log_debug(ls, "master copy not ready %x r %lx %s", lkb->lkb_id,
4473 			  (unsigned long)r, r->res_name);
4474 		dlm_send_rcom_lock(r, lkb);
4475 		goto out;
4476 	case -EEXIST:
4477 		log_debug(ls, "master copy exists %x", lkb->lkb_id);
4478 		/* fall through */
4479 	case 0:
4480 		lkb->lkb_remid = le32_to_cpu(rl->rl_remid);
4481 		break;
4482 	default:
4483 		log_error(ls, "dlm_recover_process_copy unknown error %d %x",
4484 			  error, lkb->lkb_id);
4485 	}
4486 
4487 	/* an ack for dlm_recover_locks() which waits for replies from
4488 	   all the locks it sends to new masters */
4489 	dlm_recovered_lock(r);
4490  out:
4491 	unlock_rsb(r);
4492 	put_rsb(r);
4493 	dlm_put_lkb(lkb);
4494 
4495 	return 0;
4496 }
4497 
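/* Acquire a new lock on behalf of a userspace process.  The ua struct is
   consumed: attached to the new lkb on success (and freed later with the
   lkb), freed here on error. */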
4498 int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
4499 		     int mode, uint32_t flags, void *name, unsigned int namelen,
4500 		     unsigned long timeout_cs)
4501 {
4502 	struct dlm_lkb *lkb;
4503 	struct dlm_args args;
4504 	int error;
4505 
4506 	dlm_lock_recovery(ls);
4507 
4508 	error = create_lkb(ls, &lkb);
4509 	if (error) {
4510 		kfree(ua);
4511 		goto out;
4512 	}
4513 
4514 	if (flags & DLM_LKF_VALBLK) {
4515 		ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS);
4516 		if (!ua->lksb.sb_lvbptr) {
4517 			kfree(ua);
4518 			__put_lkb(ls, lkb);
4519 			error = -ENOMEM;
4520 			goto out;
4521 		}
4522 	}
4523 
4524 	/* After ua is attached to lkb it will be freed by dlm_free_lkb().
4525 	   When DLM_IFL_USER is set, the dlm knows that this is a userspace
4526 	   lock and that lkb_astparam is the dlm_user_args structure. */
4527 
4528 	error = set_lock_args(mode, &ua->lksb, flags, namelen, timeout_cs,
4529 			      fake_astfn, ua, fake_bastfn, &args);
4530 	lkb->lkb_flags |= DLM_IFL_USER;
4531 	ua->old_mode = DLM_LOCK_IV;
4532 
4533 	if (error) {
4534 		__put_lkb(ls, lkb);
4535 		goto out;
4536 	}
4537 
4538 	error = request_lock(ls, lkb, name, namelen, &args);
4539 
4540 	switch (error) {
4541 	case 0:
4542 		break;
4543 	case -EINPROGRESS:
4544 		error = 0;
4545 		break;
4546 	case -EAGAIN:
4547 		error = 0;
4548 		/* fall through */
4549 	default:
4550 		__put_lkb(ls, lkb);
4551 		goto out;
4552 	}
4553 
4554 	/* add this new lkb to the per-process list of locks */
4555 	spin_lock(&ua->proc->locks_spin);
4556 	hold_lkb(lkb);
4557 	list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks);
4558 	spin_unlock(&ua->proc->locks_spin);
4559  out:
4560 	dlm_unlock_recovery(ls);
4561 	return error;
4562 }
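
/* For illustration only -- a hypothetical caller, not part of this file.
   The usual lifecycle is to allocate a dlm_user_args, point it at the
   owning proc, and hand it to dlm_user_request(), which consumes the ua
   on both success and failure ("example_res" is 11 bytes long; the final
   0 means no timeout):

	static int example_user_lock(struct dlm_ls *ls,
				     struct dlm_user_proc *proc,
				     struct dlm_user_args *ua)
	{
		ua->proc = proc;
		return dlm_user_request(ls, ua, DLM_LOCK_EX, DLM_LKF_VALBLK,
					"example_res", 11, 0);
	}

   A result of -EINPROGRESS is collapsed to 0 above; completion arrives
   later through the proc's ast queue. */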
4563 
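/* Convert an existing userspace lock, refreshing the dlm_user_args fields
   that userspace may have changed since the original request. */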
4564 int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
4565 		     int mode, uint32_t flags, uint32_t lkid, char *lvb_in,
4566 		     unsigned long timeout_cs)
4567 {
4568 	struct dlm_lkb *lkb;
4569 	struct dlm_args args;
4570 	struct dlm_user_args *ua;
4571 	int error;
4572 
4573 	dlm_lock_recovery(ls);
4574 
4575 	error = find_lkb(ls, lkid, &lkb);
4576 	if (error)
4577 		goto out;
4578 
4579 	/* user can change the params on its lock when it converts it, or
4580 	   add an lvb that didn't exist before */
4581 
4582 	ua = lkb->lkb_ua;
4583 
4584 	if (flags & DLM_LKF_VALBLK && !ua->lksb.sb_lvbptr) {
4585 		ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS);
4586 		if (!ua->lksb.sb_lvbptr) {
4587 			error = -ENOMEM;
4588 			goto out_put;
4589 		}
4590 	}
4591 	if (lvb_in && ua->lksb.sb_lvbptr)
4592 		memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN);
4593 
4594 	ua->xid = ua_tmp->xid;
4595 	ua->castparam = ua_tmp->castparam;
4596 	ua->castaddr = ua_tmp->castaddr;
4597 	ua->bastparam = ua_tmp->bastparam;
4598 	ua->bastaddr = ua_tmp->bastaddr;
4599 	ua->user_lksb = ua_tmp->user_lksb;
4600 	ua->old_mode = lkb->lkb_grmode;
4601 
4602 	error = set_lock_args(mode, &ua->lksb, flags, 0, timeout_cs,
4603 			      fake_astfn, ua, fake_bastfn, &args);
4604 	if (error)
4605 		goto out_put;
4606 
4607 	error = convert_lock(ls, lkb, &args);
4608 
4609 	if (error == -EINPROGRESS || error == -EAGAIN || error == -EDEADLK)
4610 		error = 0;
4611  out_put:
4612 	dlm_put_lkb(lkb);
4613  out:
4614 	dlm_unlock_recovery(ls);
4615 	kfree(ua_tmp);
4616 	return error;
4617 }
4618 
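/* Unlock a userspace lock.  On success the lkb is moved from the proc's
   locks list to its unlocking list; dlm_user_add_ast() completes the
   cleanup. */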
4619 int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
4620 		    uint32_t flags, uint32_t lkid, char *lvb_in)
4621 {
4622 	struct dlm_lkb *lkb;
4623 	struct dlm_args args;
4624 	struct dlm_user_args *ua;
4625 	int error;
4626 
4627 	dlm_lock_recovery(ls);
4628 
4629 	error = find_lkb(ls, lkid, &lkb);
4630 	if (error)
4631 		goto out;
4632 
4633 	ua = lkb->lkb_ua;
4634 
4635 	if (lvb_in && ua->lksb.sb_lvbptr)
4636 		memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN);
4637 	if (ua_tmp->castparam)
4638 		ua->castparam = ua_tmp->castparam;
4639 	ua->user_lksb = ua_tmp->user_lksb;
4640 
4641 	error = set_unlock_args(flags, ua, &args);
4642 	if (error)
4643 		goto out_put;
4644 
4645 	error = unlock_lock(ls, lkb, &args);
4646 
4647 	if (error == -DLM_EUNLOCK)
4648 		error = 0;
4649 	/* from validate_unlock_args() */
4650 	if (error == -EBUSY && (flags & DLM_LKF_FORCEUNLOCK))
4651 		error = 0;
4652 	if (error)
4653 		goto out_put;
4654 
4655 	spin_lock(&ua->proc->locks_spin);
4656 	/* dlm_user_add_ast() may have already taken lkb off the proc list */
4657 	if (!list_empty(&lkb->lkb_ownqueue))
4658 		list_move(&lkb->lkb_ownqueue, &ua->proc->unlocking);
4659 	spin_unlock(&ua->proc->locks_spin);
4660  out_put:
4661 	dlm_put_lkb(lkb);
4662  out:
4663 	dlm_unlock_recovery(ls);
4664 	kfree(ua_tmp);
4665 	return error;
4666 }
4667 
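/* Cancel an in-progress request or conversion for a userspace lock. */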
4668 int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
4669 		    uint32_t flags, uint32_t lkid)
4670 {
4671 	struct dlm_lkb *lkb;
4672 	struct dlm_args args;
4673 	struct dlm_user_args *ua;
4674 	int error;
4675 
4676 	dlm_lock_recovery(ls);
4677 
4678 	error = find_lkb(ls, lkid, &lkb);
4679 	if (error)
4680 		goto out;
4681 
4682 	ua = lkb->lkb_ua;
4683 	if (ua_tmp->castparam)
4684 		ua->castparam = ua_tmp->castparam;
4685 	ua->user_lksb = ua_tmp->user_lksb;
4686 
4687 	error = set_unlock_args(flags, ua, &args);
4688 	if (error)
4689 		goto out_put;
4690 
4691 	error = cancel_lock(ls, lkb, &args);
4692 
4693 	if (error == -DLM_ECANCEL)
4694 		error = 0;
4695 	/* from validate_unlock_args() */
4696 	if (error == -EBUSY)
4697 		error = 0;
4698  out_put:
4699 	dlm_put_lkb(lkb);
4700  out:
4701 	dlm_unlock_recovery(ls);
4702 	kfree(ua_tmp);
4703 	return error;
4704 }
4705 
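/* Like dlm_user_cancel(), but flags the lkb DLM_IFL_DEADLOCK_CANCEL while
   the rsb is locked. */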
4706 int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid)
4707 {
4708 	struct dlm_lkb *lkb;
4709 	struct dlm_args args;
4710 	struct dlm_user_args *ua;
4711 	struct dlm_rsb *r;
4712 	int error;
4713 
4714 	dlm_lock_recovery(ls);
4715 
4716 	error = find_lkb(ls, lkid, &lkb);
4717 	if (error)
4718 		goto out;
4719 
4720 	ua = lkb->lkb_ua;
4721 
4722 	error = set_unlock_args(flags, ua, &args);
4723 	if (error)
4724 		goto out_put;
4725 
4726 	/* same as cancel_lock(), but set DEADLOCK_CANCEL after lock_rsb */
4727 
4728 	r = lkb->lkb_resource;
4729 	hold_rsb(r);
4730 	lock_rsb(r);
4731 
4732 	error = validate_unlock_args(lkb, &args);
4733 	if (error)
4734 		goto out_r;
4735 	lkb->lkb_flags |= DLM_IFL_DEADLOCK_CANCEL;
4736 
4737 	error = _cancel_lock(r, lkb);
4738  out_r:
4739 	unlock_rsb(r);
4740 	put_rsb(r);
4741 
4742 	if (error == -DLM_ECANCEL)
4743 		error = 0;
4744 	/* from validate_unlock_args() */
4745 	if (error == -EBUSY)
4746 		error = 0;
4747  out_put:
4748 	dlm_put_lkb(lkb);
4749  out:
4750 	dlm_unlock_recovery(ls);
4751 	return error;
4752 }
4753 
4754 /* lkb's that are removed from the waiters list by revert are just left on the
4755    orphans list with the granted orphan locks, to be freed by purge */
4756 
4757 static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
4758 {
4759 	struct dlm_args args;
4760 	int error;
4761 
4762 	hold_lkb(lkb);
4763 	mutex_lock(&ls->ls_orphans_mutex);
4764 	list_add_tail(&lkb->lkb_ownqueue, &ls->ls_orphans);
4765 	mutex_unlock(&ls->ls_orphans_mutex);
4766 
4767 	set_unlock_args(0, lkb->lkb_ua, &args);
4768 
4769 	error = cancel_lock(ls, lkb, &args);
4770 	if (error == -DLM_ECANCEL)
4771 		error = 0;
4772 	return error;
4773 }
4774 
4775 /* The force flag allows the unlock to go ahead even if the lkb isn't granted.
4776    Regardless of what rsb queue the lock is on, it's removed and freed. */
4777 
4778 static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
4779 {
4780 	struct dlm_args args;
4781 	int error;
4782 
4783 	set_unlock_args(DLM_LKF_FORCEUNLOCK, lkb->lkb_ua, &args);
4784 
4785 	error = unlock_lock(ls, lkb, &args);
4786 	if (error == -DLM_EUNLOCK)
4787 		error = 0;
4788 	return error;
4789 }
4790 
4791 /* We have to release clear_proc_locks mutex before calling unlock_proc_lock()
4792    (which does lock_rsb) due to deadlock with receiving a message that does
4793    lock_rsb followed by dlm_user_add_ast() */
4794 
4795 static struct dlm_lkb *del_proc_lock(struct dlm_ls *ls,
4796 				     struct dlm_user_proc *proc)
4797 {
4798 	struct dlm_lkb *lkb = NULL;
4799 
4800 	mutex_lock(&ls->ls_clear_proc_locks);
4801 	if (list_empty(&proc->locks))
4802 		goto out;
4803 
4804 	lkb = list_entry(proc->locks.next, struct dlm_lkb, lkb_ownqueue);
4805 	list_del_init(&lkb->lkb_ownqueue);
4806 
4807 	if (lkb->lkb_exflags & DLM_LKF_PERSISTENT)
4808 		lkb->lkb_flags |= DLM_IFL_ORPHAN;
4809 	else
4810 		lkb->lkb_flags |= DLM_IFL_DEAD;
4811  out:
4812 	mutex_unlock(&ls->ls_clear_proc_locks);
4813 	return lkb;
4814 }
4815 
4816 /* The ls_clear_proc_locks mutex protects against dlm_user_add_ast(), which
4817    1) references lkb->lkb_ua, which we free here, and 2) adds lkbs to
4818    proc->asts, which we clear here. */
4819 
4820 /* proc CLOSING flag is set so no more device_reads should look at proc->asts
4821    list, and no more device_writes should add lkb's to proc->locks list; so we
4822    shouldn't need to take asts_spin or locks_spin here.  This assumes that
4823    device reads/writes/closes are serialized -- FIXME: we may need to serialize
4824    them ourselves. */
4825 
4826 void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
4827 {
4828 	struct dlm_lkb *lkb, *safe;
4829 
4830 	dlm_lock_recovery(ls);
4831 
4832 	while (1) {
4833 		lkb = del_proc_lock(ls, proc);
4834 		if (!lkb)
4835 			break;
4836 		del_timeout(lkb);
4837 		if (lkb->lkb_exflags & DLM_LKF_PERSISTENT)
4838 			orphan_proc_lock(ls, lkb);
4839 		else
4840 			unlock_proc_lock(ls, lkb);
4841 
4842 		/* this removes the reference for the proc->locks list
4843 		   added by dlm_user_request; it may result in the lkb
4844 		   being freed */
4845 
4846 		dlm_put_lkb(lkb);
4847 	}
4848 
4849 	mutex_lock(&ls->ls_clear_proc_locks);
4850 
4851 	/* in-progress unlocks */
4852 	list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) {
4853 		list_del_init(&lkb->lkb_ownqueue);
4854 		lkb->lkb_flags |= DLM_IFL_DEAD;
4855 		dlm_put_lkb(lkb);
4856 	}
4857 
4858 	list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) {
4859 		lkb->lkb_ast_type = 0;
4860 		list_del(&lkb->lkb_astqueue);
4861 		dlm_put_lkb(lkb);
4862 	}
4863 
4864 	mutex_unlock(&ls->ls_clear_proc_locks);
4865 	dlm_unlock_recovery(ls);
4866 }
4867 
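/* Force-unlock everything a process still owns and drop its pending asts,
   using the proc spinlocks rather than ls_clear_proc_locks; used when a
   process purges its own locks. */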
4868 static void purge_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
4869 {
4870 	struct dlm_lkb *lkb, *safe;
4871 
4872 	while (1) {
4873 		lkb = NULL;
4874 		spin_lock(&proc->locks_spin);
4875 		if (!list_empty(&proc->locks)) {
4876 			lkb = list_entry(proc->locks.next, struct dlm_lkb,
4877 					 lkb_ownqueue);
4878 			list_del_init(&lkb->lkb_ownqueue);
4879 		}
4880 		spin_unlock(&proc->locks_spin);
4881 
4882 		if (!lkb)
4883 			break;
4884 
4885 		lkb->lkb_flags |= DLM_IFL_DEAD;
4886 		unlock_proc_lock(ls, lkb);
4887 		dlm_put_lkb(lkb); /* ref from proc->locks list */
4888 	}
4889 
4890 	spin_lock(&proc->locks_spin);
4891 	list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) {
4892 		list_del_init(&lkb->lkb_ownqueue);
4893 		lkb->lkb_flags |= DLM_IFL_DEAD;
4894 		dlm_put_lkb(lkb);
4895 	}
4896 	spin_unlock(&proc->locks_spin);
4897 
4898 	spin_lock(&proc->asts_spin);
4899 	list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) {
4900 		list_del(&lkb->lkb_astqueue);
4901 		dlm_put_lkb(lkb);
4902 	}
4903 	spin_unlock(&proc->asts_spin);
4904 }
4905 
4906 /* pid of 0 means purge all orphans */
4907 
4908 static void do_purge(struct dlm_ls *ls, int nodeid, int pid)
4909 {
4910 	struct dlm_lkb *lkb, *safe;
4911 
4912 	mutex_lock(&ls->ls_orphans_mutex);
4913 	list_for_each_entry_safe(lkb, safe, &ls->ls_orphans, lkb_ownqueue) {
4914 		if (pid && lkb->lkb_ownpid != pid)
4915 			continue;
4916 		unlock_proc_lock(ls, lkb);
4917 		list_del_init(&lkb->lkb_ownqueue);
4918 		dlm_put_lkb(lkb);
4919 	}
4920 	mutex_unlock(&ls->ls_orphans_mutex);
4921 }
4922 
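/* Ask another node to purge orphan locks for the given pid; arrives there
   as DLM_MSG_PURGE and is handled via receive_purge()/do_purge(). */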
4923 static int send_purge(struct dlm_ls *ls, int nodeid, int pid)
4924 {
4925 	struct dlm_message *ms;
4926 	struct dlm_mhandle *mh;
4927 	int error;
4928 
4929 	error = _create_message(ls, sizeof(struct dlm_message), nodeid,
4930 				DLM_MSG_PURGE, &ms, &mh);
4931 	if (error)
4932 		return error;
4933 	ms->m_nodeid = nodeid;
4934 	ms->m_pid = pid;
4935 
4936 	return send_message(mh, ms);
4937 }
4938 
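/* Purge orphan locks for the given nodeid/pid: remotely via DLM_MSG_PURGE;
   locally via purge_proc_locks() for the caller's own pid, otherwise
   do_purge(). */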
4939 int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc,
4940 		   int nodeid, int pid)
4941 {
4942 	int error = 0;
4943 
4944 	if (nodeid != dlm_our_nodeid()) {
4945 		error = send_purge(ls, nodeid, pid);
4946 	} else {
4947 		dlm_lock_recovery(ls);
4948 		if (pid == current->pid)
4949 			purge_proc_locks(ls, proc);
4950 		else
4951 			do_purge(ls, nodeid, pid);
4952 		dlm_unlock_recovery(ls);
4953 	}
4954 	return error;
4955 }
4956 
4957