// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>

#include <linux/file.h>
#include <linux/namei.h>
#include <linux/random.h>

#include "super.h"
#include "mds_client.h"
#include <linux/ceph/pagelist.h>

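/*
 * lock_secret is XORed with the lock owner's address in secure_addr()
 * below, giving each owner a stable identifier without exposing raw
 * kernel pointers to the MDS.
 */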
static u64 lock_secret;
static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
                                         struct ceph_mds_request *req);

static inline u64 secure_addr(void *addr)
{
	u64 v = lock_secret ^ (u64)(unsigned long)addr;
	/*
	 * Set the most significant bit, so that the MDS knows the 'owner'
	 * field alone is sufficient to identify the owner of a lock.
	 * (The old code used both 'owner' and 'pid'.)
	 */
	v |= (1ULL << 63);
	return v;
}

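/* Seed lock_secret once at module init time. */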
void __init ceph_flock_init(void)
{
	get_random_bytes(&lock_secret, sizeof(lock_secret));
}

static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
{
	struct ceph_file_info *fi = dst->fl_file->private_data;
	struct inode *inode = file_inode(dst->fl_file);
	atomic_inc(&ceph_inode(inode)->i_filelock_ref);
	atomic_inc(&fi->num_locks);
}

static void ceph_fl_release_lock(struct file_lock *fl)
{
	struct ceph_file_info *fi = fl->fl_file->private_data;
	struct inode *inode = file_inode(fl->fl_file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	atomic_dec(&fi->num_locks);
	if (atomic_dec_and_test(&ci->i_filelock_ref)) {
		/* clear error when all locks are released */
		spin_lock(&ci->i_ceph_lock);
		ci->i_ceph_flags &= ~CEPH_I_ERROR_FILELOCK;
		spin_unlock(&ci->i_ceph_lock);
	}
}

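/*
 * These callbacks keep the i_filelock_ref and num_locks counters above
 * balanced when the VFS duplicates or frees a file_lock on our behalf,
 * e.g. when splitting or merging lock ranges.
 */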
static const struct file_lock_operations ceph_fl_lock_ops = {
	.fl_copy_lock = ceph_fl_copy_lock,
	.fl_release_private = ceph_fl_release_lock,
};

/*
 * Implement fcntl and flock locking functions.
 */
static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
			     int cmd, u8 wait, struct file_lock *fl)
{
	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
	struct ceph_mds_request *req;
	int err;
	u64 length = 0;
	u64 owner;

	if (operation == CEPH_MDS_OP_SETFILELOCK) {
		/*
		 * Increasing i_filelock_ref closes the race window between
		 * handling the request reply and adding the file_lock struct
		 * to the inode. Otherwise, the auth caps may get trimmed in
		 * that window. The caller will decrement the counter.
		 */
		fl->fl_ops = &ceph_fl_lock_ops;
		fl->fl_ops->fl_copy_lock(fl, NULL);
	}

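	/*
	 * Only a blocking SETFILELOCK can wait on the MDS: GETFILELOCK
	 * answers immediately and unlocks never block.
	 */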
	if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK)
		wait = 0;

	req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
	if (IS_ERR(req))
		return PTR_ERR(req);
	req->r_inode = inode;
	ihold(inode);
	req->r_num_caps = 1;

	/* mds requires start and length rather than start and end */
	if (LLONG_MAX == fl->fl_end)
		length = 0;
	else
		length = fl->fl_end - fl->fl_start + 1;

	owner = secure_addr(fl->fl_owner);

	dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, "
	     "start: %llu, length: %llu, wait: %d, type: %d\n", (int)lock_type,
	     (int)operation, owner, (u64)fl->fl_pid, fl->fl_start, length,
	     wait, fl->fl_type);

	req->r_args.filelock_change.rule = lock_type;
	req->r_args.filelock_change.type = cmd;
	req->r_args.filelock_change.owner = cpu_to_le64(owner);
	req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid);
	req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
	req->r_args.filelock_change.length = cpu_to_le64(length);
	req->r_args.filelock_change.wait = wait;

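	/*
	 * For blocking requests, hook in the interruptible waiter so a
	 * signal can cancel the pending lock attempt on the MDS.
	 */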
	if (wait)
		req->r_wait_for_completion = ceph_lock_wait_for_completion;

	err = ceph_mdsc_do_request(mdsc, inode, req);
	if (!err && operation == CEPH_MDS_OP_GETFILELOCK) {
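		/*
		 * A conflicting lock may be held by a process on another
		 * client; report its pid negated to mark it as remote,
		 * as other network filesystems do for F_GETLK.
		 */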
		fl->fl_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid);
		if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
			fl->fl_type = F_RDLCK;
		else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type)
			fl->fl_type = F_WRLCK;
		else
			fl->fl_type = F_UNLCK;

		fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start);
		length = le64_to_cpu(req->r_reply_info.filelock_reply->start) +
			 le64_to_cpu(req->r_reply_info.filelock_reply->length);
		if (length >= 1)
			fl->fl_end = length - 1;
		else
			fl->fl_end = 0;

	}
	ceph_mdsc_put_request(req);
	dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
	     "length: %llu, wait: %d, type: %d, err code %d\n", (int)lock_type,
	     (int)operation, (u64)fl->fl_pid, fl->fl_start,
	     length, wait, fl->fl_type, err);
	return err;
}

static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
                                         struct ceph_mds_request *req)
{
	struct ceph_mds_request *intr_req;
	struct inode *inode = req->r_inode;
	int err, lock_type;

	BUG_ON(req->r_op != CEPH_MDS_OP_SETFILELOCK);
	if (req->r_args.filelock_change.rule == CEPH_LOCK_FCNTL)
		lock_type = CEPH_LOCK_FCNTL_INTR;
	else if (req->r_args.filelock_change.rule == CEPH_LOCK_FLOCK)
		lock_type = CEPH_LOCK_FLOCK_INTR;
	else
		BUG_ON(1);
	BUG_ON(req->r_args.filelock_change.type == CEPH_LOCK_UNLOCK);

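	/* Wait for the MDS reply; a signal interrupts the wait. */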
	err = wait_for_completion_interruptible(&req->r_completion);
	if (!err)
		return 0;

	dout("ceph_lock_wait_for_completion: request %llu was interrupted\n",
	     req->r_tid);

	mutex_lock(&mdsc->mutex);
	if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
		err = 0;
	} else {
		/*
		 * ensure we aren't running concurrently with
		 * ceph_fill_trace or ceph_readdir_prepopulate, which
		 * rely on locks (dir mutex) held by our caller.
		 */
		mutex_lock(&req->r_fill_mutex);
		req->r_err = err;
		set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
		mutex_unlock(&req->r_fill_mutex);

		if (!req->r_session) {
			/* haven't sent the request */
			err = 0;
		}
	}
	mutex_unlock(&mdsc->mutex);
	if (!err)
		return 0;

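	/*
	 * The interrupted request may still be queued on the MDS. Send a
	 * matching *_INTR unlock so the MDS drops the blocked attempt,
	 * then wait for the original request to become safe.
	 */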
	intr_req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETFILELOCK,
					    USE_AUTH_MDS);
	if (IS_ERR(intr_req))
		return PTR_ERR(intr_req);

	intr_req->r_inode = inode;
	ihold(inode);
	intr_req->r_num_caps = 1;

	intr_req->r_args.filelock_change = req->r_args.filelock_change;
	intr_req->r_args.filelock_change.rule = lock_type;
	intr_req->r_args.filelock_change.type = CEPH_LOCK_UNLOCK;

	err = ceph_mdsc_do_request(mdsc, inode, intr_req);
	ceph_mdsc_put_request(intr_req);

	if (err && err != -ERESTARTSYS)
		return err;

	wait_for_completion_killable(&req->r_safe_completion);
	return 0;
}

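/*
 * Try to drop the lock locally first. Returns 1 if a matching lock
 * existed and was released, so the MDS must be told as well; returns
 * 0 or a negative error if there is nothing more for the caller to do.
 */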
static int try_unlock_file(struct file *file, struct file_lock *fl)
{
	int err;
	unsigned int orig_flags = fl->fl_flags;
	fl->fl_flags |= FL_EXISTS;
	err = locks_lock_file_wait(file, fl);
	fl->fl_flags = orig_flags;
	if (err == -ENOENT) {
		if (!(orig_flags & FL_EXISTS))
			err = 0;
		return err;
	}
	return 1;
}

/*
 * Attempt to set an fcntl lock.
 * For now, this just gets passed to the server. Later it may be more awesome.
 */
int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
{
	struct inode *inode = file_inode(file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	int err = 0;
	u16 op = CEPH_MDS_OP_SETFILELOCK;
	u8 wait = 0;
	u8 lock_cmd;

	if (!(fl->fl_flags & FL_POSIX))
		return -ENOLCK;

	if (ceph_inode_is_shutdown(inode))
		return -ESTALE;

	dout("ceph_lock, fl_owner: %p\n", fl->fl_owner);

	/* set the wait bit as appropriate, then build the command as Ceph expects it */
	if (IS_GETLK(cmd))
		op = CEPH_MDS_OP_GETFILELOCK;
	else if (IS_SETLKW(cmd))
		wait = 1;

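	/*
	 * If the MDS has already dropped our file locks
	 * (CEPH_I_ERROR_FILELOCK), fail new requests with -EIO, but
	 * still run unlocks locally so local lock state gets cleaned up.
	 */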
	spin_lock(&ci->i_ceph_lock);
	if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
		err = -EIO;
	}
	spin_unlock(&ci->i_ceph_lock);
	if (err < 0) {
		if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK == fl->fl_type)
			posix_lock_file(file, fl, NULL);
		return err;
	}

	if (F_RDLCK == fl->fl_type)
		lock_cmd = CEPH_LOCK_SHARED;
	else if (F_WRLCK == fl->fl_type)
		lock_cmd = CEPH_LOCK_EXCL;
	else
		lock_cmd = CEPH_LOCK_UNLOCK;

	if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK == fl->fl_type) {
		err = try_unlock_file(file, fl);
		if (err <= 0)
			return err;
	}

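	/*
	 * Ask the MDS for the lock first; it is only installed locally
	 * below once the MDS has granted it, and is undone on the MDS if
	 * the local step fails.
	 */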
	err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl);
	if (!err) {
		if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK != fl->fl_type) {
			dout("mds locked, locking locally\n");
			err = posix_lock_file(file, fl, NULL);
			if (err) {
				/*
				 * undo! This should only happen if
				 * the kernel detects local deadlock.
				 */
				ceph_lock_message(CEPH_LOCK_FCNTL, op, inode,
						  CEPH_LOCK_UNLOCK, 0, fl);
				dout("got %d on posix_lock_file, undid lock\n",
				     err);
			}
		}
	}
	return err;
}


int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
{
	struct inode *inode = file_inode(file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	int err = 0;
	u8 wait = 0;
	u8 lock_cmd;

	if (!(fl->fl_flags & FL_FLOCK))
		return -ENOLCK;

	if (ceph_inode_is_shutdown(inode))
		return -ESTALE;

	dout("ceph_flock, fl_file: %p\n", fl->fl_file);

	spin_lock(&ci->i_ceph_lock);
	if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
		err = -EIO;
	}
	spin_unlock(&ci->i_ceph_lock);
	if (err < 0) {
		if (F_UNLCK == fl->fl_type)
			locks_lock_file_wait(file, fl);
		return err;
	}

	if (IS_SETLKW(cmd))
		wait = 1;

	if (F_RDLCK == fl->fl_type)
		lock_cmd = CEPH_LOCK_SHARED;
	else if (F_WRLCK == fl->fl_type)
		lock_cmd = CEPH_LOCK_EXCL;
	else
		lock_cmd = CEPH_LOCK_UNLOCK;

	if (F_UNLCK == fl->fl_type) {
		err = try_unlock_file(file, fl);
		if (err <= 0)
			return err;
	}

	err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
				inode, lock_cmd, wait, fl);
	if (!err && F_UNLCK != fl->fl_type) {
		err = locks_lock_file_wait(file, fl);
		if (err) {
			ceph_lock_message(CEPH_LOCK_FLOCK,
					  CEPH_MDS_OP_SETFILELOCK,
					  inode, CEPH_LOCK_UNLOCK, 0, fl);
			dout("got %d on locks_lock_file_wait, undid lock\n", err);
		}
	}
	return err;
}

/*
 * Fills in the passed counter variables, so you can prepare pagelist
 * metadata before calling ceph_encode_locks_to_buffer(). The counts are
 * only a snapshot: ceph_encode_locks_to_buffer() re-checks them under
 * flc_lock and returns -ENOSPC if locks were added in the meantime.
 */
void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
{
	struct file_lock *lock;
	struct file_lock_context *ctx;

	*fcntl_count = 0;
	*flock_count = 0;

	ctx = inode->i_flctx;
	if (ctx) {
		spin_lock(&ctx->flc_lock);
		list_for_each_entry(lock, &ctx->flc_posix, fl_list)
			++(*fcntl_count);
		list_for_each_entry(lock, &ctx->flc_flock, fl_list)
			++(*flock_count);
		spin_unlock(&ctx->flc_lock);
	}
	dout("counted %d flock locks and %d fcntl locks\n",
	     *flock_count, *fcntl_count);
}

/*
 * Given a pointer to a lock, convert it to a ceph filelock
 */
static int lock_to_ceph_filelock(struct file_lock *lock,
				 struct ceph_filelock *cephlock)
{
	int err = 0;
	cephlock->start = cpu_to_le64(lock->fl_start);
	cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
	cephlock->client = cpu_to_le64(0);
	cephlock->pid = cpu_to_le64((u64)lock->fl_pid);
	cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner));

	switch (lock->fl_type) {
	case F_RDLCK:
		cephlock->type = CEPH_LOCK_SHARED;
		break;
	case F_WRLCK:
		cephlock->type = CEPH_LOCK_EXCL;
		break;
	case F_UNLCK:
		cephlock->type = CEPH_LOCK_UNLOCK;
		break;
	default:
		dout("Have unknown lock type %d\n", lock->fl_type);
		err = -EINVAL;
	}

	return err;
}

/*
 * Encode the flock and fcntl locks for the given inode into the ceph_filelock
 * array. The inode's lock lists are walked under ctx->flc_lock, which is
 * taken here. If we encounter more of a specific lock type than expected,
 * return -ENOSPC.
 */
int ceph_encode_locks_to_buffer(struct inode *inode,
				struct ceph_filelock *flocks,
				int num_fcntl_locks, int num_flock_locks)
{
	struct file_lock *lock;
	struct file_lock_context *ctx = inode->i_flctx;
	int err = 0;
	int seen_fcntl = 0;
	int seen_flock = 0;
	int l = 0;

	dout("encoding %d flock and %d fcntl locks\n", num_flock_locks,
	     num_fcntl_locks);

	if (!ctx)
		return 0;

	spin_lock(&ctx->flc_lock);
	list_for_each_entry(lock, &ctx->flc_posix, fl_list) {
		++seen_fcntl;
		if (seen_fcntl > num_fcntl_locks) {
			err = -ENOSPC;
			goto fail;
		}
		err = lock_to_ceph_filelock(lock, &flocks[l]);
		if (err)
			goto fail;
		++l;
	}
	list_for_each_entry(lock, &ctx->flc_flock, fl_list) {
		++seen_flock;
		if (seen_flock > num_flock_locks) {
			err = -ENOSPC;
			goto fail;
		}
		err = lock_to_ceph_filelock(lock, &flocks[l]);
		if (err)
			goto fail;
		++l;
	}
fail:
	spin_unlock(&ctx->flc_lock);
	return err;
}

/*
 * Copy the encoded flock and fcntl locks into the pagelist.
 * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
 * sequential flock locks.
 * Returns zero on success.
 */
int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
			   struct ceph_pagelist *pagelist,
			   int num_fcntl_locks, int num_flock_locks)
{
	int err = 0;
	__le32 nlocks;

	nlocks = cpu_to_le32(num_fcntl_locks);
	err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
	if (err)
		goto out_fail;

	if (num_fcntl_locks > 0) {
		err = ceph_pagelist_append(pagelist, flocks,
					   num_fcntl_locks * sizeof(*flocks));
		if (err)
			goto out_fail;
	}

	nlocks = cpu_to_le32(num_flock_locks);
	err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
	if (err)
		goto out_fail;

	if (num_flock_locks > 0) {
		err = ceph_pagelist_append(pagelist, &flocks[num_fcntl_locks],
					   num_flock_locks * sizeof(*flocks));
	}
out_fail:
	return err;
}