1 /* 2 * Copyright (c) 2014 Christoph Hellwig. 3 */ 4 #include <linux/iomap.h> 5 #include "xfs.h" 6 #include "xfs_format.h" 7 #include "xfs_log_format.h" 8 #include "xfs_trans_resv.h" 9 #include "xfs_sb.h" 10 #include "xfs_mount.h" 11 #include "xfs_inode.h" 12 #include "xfs_trans.h" 13 #include "xfs_log.h" 14 #include "xfs_bmap.h" 15 #include "xfs_bmap_util.h" 16 #include "xfs_error.h" 17 #include "xfs_iomap.h" 18 #include "xfs_shared.h" 19 #include "xfs_bit.h" 20 #include "xfs_pnfs.h" 21 22 /* 23 * Ensure that we do not have any outstanding pNFS layouts that can be used by 24 * clients to directly read from or write to this inode. This must be called 25 * before every operation that can remove blocks from the extent map. 26 * Additionally we call it during the write operation, where aren't concerned 27 * about exposing unallocated blocks but just want to provide basic 28 * synchronization between a local writer and pNFS clients. mmap writes would 29 * also benefit from this sort of synchronization, but due to the tricky locking 30 * rules in the page fault path we don't bother. 31 */ 32 int 33 xfs_break_layouts( 34 struct inode *inode, 35 uint *iolock) 36 { 37 struct xfs_inode *ip = XFS_I(inode); 38 int error; 39 40 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)); 41 42 while ((error = break_layout(inode, false) == -EWOULDBLOCK)) { 43 xfs_iunlock(ip, *iolock); 44 error = break_layout(inode, true); 45 *iolock = XFS_IOLOCK_EXCL; 46 xfs_ilock(ip, *iolock); 47 } 48 49 return error; 50 } 51 52 /* 53 * Get a unique ID including its location so that the client can identify 54 * the exported device. 55 */ 56 int 57 xfs_fs_get_uuid( 58 struct super_block *sb, 59 u8 *buf, 60 u32 *len, 61 u64 *offset) 62 { 63 struct xfs_mount *mp = XFS_M(sb); 64 65 printk_once(KERN_NOTICE 66 "XFS (%s): using experimental pNFS feature, use at your own risk!\n", 67 mp->m_fsname); 68 69 if (*len < sizeof(uuid_t)) 70 return -EINVAL; 71 72 memcpy(buf, &mp->m_sb.sb_uuid, sizeof(uuid_t)); 73 *len = sizeof(uuid_t); 74 *offset = offsetof(struct xfs_dsb, sb_uuid); 75 return 0; 76 } 77 78 /* 79 * Get a layout for the pNFS client. 80 */ 81 int 82 xfs_fs_map_blocks( 83 struct inode *inode, 84 loff_t offset, 85 u64 length, 86 struct iomap *iomap, 87 bool write, 88 u32 *device_generation) 89 { 90 struct xfs_inode *ip = XFS_I(inode); 91 struct xfs_mount *mp = ip->i_mount; 92 struct xfs_bmbt_irec imap; 93 xfs_fileoff_t offset_fsb, end_fsb; 94 loff_t limit; 95 int bmapi_flags = XFS_BMAPI_ENTIRE; 96 int nimaps = 1; 97 uint lock_flags; 98 int error = 0; 99 100 if (XFS_FORCED_SHUTDOWN(mp)) 101 return -EIO; 102 103 /* 104 * We can't export inodes residing on the realtime device. The realtime 105 * device doesn't have a UUID to identify it, so the client has no way 106 * to find it. 107 */ 108 if (XFS_IS_REALTIME_INODE(ip)) 109 return -ENXIO; 110 111 /* 112 * The pNFS block layout spec actually supports reflink like 113 * functionality, but the Linux pNFS server doesn't implement it yet. 114 */ 115 if (xfs_is_reflink_inode(ip)) 116 return -ENXIO; 117 118 /* 119 * Lock out any other I/O before we flush and invalidate the pagecache, 120 * and then hand out a layout to the remote system. This is very 121 * similar to direct I/O, except that the synchronization is much more 122 * complicated. See the comment near xfs_break_layouts for a detailed 123 * explanation. 124 */ 125 xfs_ilock(ip, XFS_IOLOCK_EXCL); 126 127 error = -EINVAL; 128 limit = mp->m_super->s_maxbytes; 129 if (!write) 130 limit = max(limit, round_up(i_size_read(inode), 131 inode->i_sb->s_blocksize)); 132 if (offset > limit) 133 goto out_unlock; 134 if (offset > limit - length) 135 length = limit - offset; 136 137 error = filemap_write_and_wait(inode->i_mapping); 138 if (error) 139 goto out_unlock; 140 error = invalidate_inode_pages2(inode->i_mapping); 141 if (WARN_ON_ONCE(error)) 142 return error; 143 144 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + length); 145 offset_fsb = XFS_B_TO_FSBT(mp, offset); 146 147 lock_flags = xfs_ilock_data_map_shared(ip); 148 error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, 149 &imap, &nimaps, bmapi_flags); 150 xfs_iunlock(ip, lock_flags); 151 152 if (error) 153 goto out_unlock; 154 155 if (write) { 156 enum xfs_prealloc_flags flags = 0; 157 158 ASSERT(imap.br_startblock != DELAYSTARTBLOCK); 159 160 if (!nimaps || imap.br_startblock == HOLESTARTBLOCK) { 161 /* 162 * xfs_iomap_write_direct() expects to take ownership of 163 * the shared ilock. 164 */ 165 xfs_ilock(ip, XFS_ILOCK_SHARED); 166 error = xfs_iomap_write_direct(ip, offset, length, 167 &imap, nimaps); 168 if (error) 169 goto out_unlock; 170 171 /* 172 * Ensure the next transaction is committed 173 * synchronously so that the blocks allocated and 174 * handed out to the client are guaranteed to be 175 * present even after a server crash. 176 */ 177 flags |= XFS_PREALLOC_SET | XFS_PREALLOC_SYNC; 178 } 179 180 error = xfs_update_prealloc_flags(ip, flags); 181 if (error) 182 goto out_unlock; 183 } 184 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 185 186 xfs_bmbt_to_iomap(ip, iomap, &imap); 187 *device_generation = mp->m_generation; 188 return error; 189 out_unlock: 190 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 191 return error; 192 } 193 194 /* 195 * Ensure the size update falls into a valid allocated block. 196 */ 197 static int 198 xfs_pnfs_validate_isize( 199 struct xfs_inode *ip, 200 xfs_off_t isize) 201 { 202 struct xfs_bmbt_irec imap; 203 int nimaps = 1; 204 int error = 0; 205 206 xfs_ilock(ip, XFS_ILOCK_SHARED); 207 error = xfs_bmapi_read(ip, XFS_B_TO_FSBT(ip->i_mount, isize - 1), 1, 208 &imap, &nimaps, 0); 209 xfs_iunlock(ip, XFS_ILOCK_SHARED); 210 if (error) 211 return error; 212 213 if (imap.br_startblock == HOLESTARTBLOCK || 214 imap.br_startblock == DELAYSTARTBLOCK || 215 imap.br_state == XFS_EXT_UNWRITTEN) 216 return -EIO; 217 return 0; 218 } 219 220 /* 221 * Make sure the blocks described by maps are stable on disk. This includes 222 * converting any unwritten extents, flushing the disk cache and updating the 223 * time stamps. 224 * 225 * Note that we rely on the caller to always send us a timestamp update so that 226 * we always commit a transaction here. If that stops being true we will have 227 * to manually flush the cache here similar to what the fsync code path does 228 * for datasyncs on files that have no dirty metadata. 229 */ 230 int 231 xfs_fs_commit_blocks( 232 struct inode *inode, 233 struct iomap *maps, 234 int nr_maps, 235 struct iattr *iattr) 236 { 237 struct xfs_inode *ip = XFS_I(inode); 238 struct xfs_mount *mp = ip->i_mount; 239 struct xfs_trans *tp; 240 bool update_isize = false; 241 int error, i; 242 loff_t size; 243 244 ASSERT(iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)); 245 246 xfs_ilock(ip, XFS_IOLOCK_EXCL); 247 248 size = i_size_read(inode); 249 if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size > size) { 250 update_isize = true; 251 size = iattr->ia_size; 252 } 253 254 for (i = 0; i < nr_maps; i++) { 255 u64 start, length, end; 256 257 start = maps[i].offset; 258 if (start > size) 259 continue; 260 261 end = start + maps[i].length; 262 if (end > size) 263 end = size; 264 265 length = end - start; 266 if (!length) 267 continue; 268 269 /* 270 * Make sure reads through the pagecache see the new data. 271 */ 272 error = invalidate_inode_pages2_range(inode->i_mapping, 273 start >> PAGE_SHIFT, 274 (end - 1) >> PAGE_SHIFT); 275 WARN_ON_ONCE(error); 276 277 error = xfs_iomap_write_unwritten(ip, start, length, false); 278 if (error) 279 goto out_drop_iolock; 280 } 281 282 if (update_isize) { 283 error = xfs_pnfs_validate_isize(ip, size); 284 if (error) 285 goto out_drop_iolock; 286 } 287 288 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); 289 if (error) 290 goto out_drop_iolock; 291 292 xfs_ilock(ip, XFS_ILOCK_EXCL); 293 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 294 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 295 296 xfs_setattr_time(ip, iattr); 297 if (update_isize) { 298 i_size_write(inode, iattr->ia_size); 299 ip->i_d.di_size = iattr->ia_size; 300 } 301 302 xfs_trans_set_sync(tp); 303 error = xfs_trans_commit(tp); 304 305 out_drop_iolock: 306 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 307 return error; 308 } 309